/*	$OpenBSD: pf.c,v 1.634 2009/02/27 12:37:45 henning Exp $ */

/*
 * Copyright (c) 2001 Daniel Hartmeier
 * Copyright (c) 2002 - 2008 Henning Brauer
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *    - Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    - Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following
 *      disclaimer in the documentation and/or other materials provided
 *      with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Effort sponsored in part by the Defense Advanced Research Projects
 * Agency (DARPA) and Air Force Research Laboratory, Air Force
 * Materiel Command, USAF, under agreement number F30602-01-2-0537.
 *
 */

#include <sys/cdefs.h>

__FBSDID("$FreeBSD: head/sys/contrib/pf/net/pf.c 240233 2012-09-08 06:41:54Z glebius $");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_bpf.h"
#include "opt_pf.h"

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/endian.h>
#include <sys/hash.h>
#include <sys/interrupt.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/limits.h>
#include <sys/mbuf.h>
#include <sys/md5.h>
#include <sys/random.h>
#include <sys/refcount.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/taskqueue.h>
#include <sys/ucred.h>

#include <net/if.h>
#include <net/if_types.h>
#include <net/route.h>
#include <net/radix_mpath.h>
#include <net/vnet.h>

#include <net/pfvar.h>
#include <net/pf_mtag.h>
#include <net/if_pflog.h>
#include <net/if_pfsync.h>

#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/ip_fw.h>
#include <netinet/ip_icmp.h>
#include <netinet/icmp_var.h>
#include <netinet/ip_var.h>
#include <netinet/ipfw/ip_fw_private.h> /* XXX: only for DIR_IN/DIR_OUT */
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/udp.h>
#include <netinet/udp_var.h>

#ifdef INET6
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet6/nd6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_pcb.h>
#endif /* INET6 */

#include <machine/in_cksum.h>
#include <security/mac/mac_framework.h>

#define	DPFPRINTF(n, x)	if (V_pf_status.debug >= (n)) printf x

/*
 * Global variables
 */

/* state tables */
VNET_DEFINE(struct pf_altqqueue,	 pf_altqs[2]);
VNET_DEFINE(struct pf_palist,		 pf_pabuf);
VNET_DEFINE(struct pf_altqqueue *,	 pf_altqs_active);
VNET_DEFINE(struct pf_altqqueue *,	 pf_altqs_inactive);
VNET_DEFINE(struct pf_status,		 pf_status);

VNET_DEFINE(u_int32_t,			 ticket_altqs_active);
VNET_DEFINE(u_int32_t,			 ticket_altqs_inactive);
VNET_DEFINE(int,			 altqs_inactive_open);
VNET_DEFINE(u_int32_t,			 ticket_pabuf);

VNET_DEFINE(MD5_CTX,			 pf_tcp_secret_ctx);
#define	V_pf_tcp_secret_ctx		 VNET(pf_tcp_secret_ctx)
VNET_DEFINE(u_char,			 pf_tcp_secret[16]);
#define	V_pf_tcp_secret			 VNET(pf_tcp_secret)
VNET_DEFINE(int,			 pf_tcp_secret_init);
#define	V_pf_tcp_secret_init		 VNET(pf_tcp_secret_init)
VNET_DEFINE(int,			 pf_tcp_iss_off);
#define	V_pf_tcp_iss_off		 VNET(pf_tcp_iss_off)

struct pf_anchor_stackframe {
	struct pf_ruleset		*rs;
	struct pf_rule			*r;
	struct pf_anchor_node		*parent;
	struct pf_anchor		*child;
};
VNET_DEFINE(struct pf_anchor_stackframe, pf_anchor_stack[64]);
#define	V_pf_anchor_stack		 VNET(pf_anchor_stack)

/*
 * Queue for pf_intr() sends.
 */
static MALLOC_DEFINE(M_PFTEMP, "pf_temp", "pf(4) temporary allocations");
struct pf_send_entry {
	STAILQ_ENTRY(pf_send_entry)	pfse_next;
	struct mbuf			*pfse_m;
	enum {
		PFSE_IP,
		PFSE_IP6,
		PFSE_ICMP,
		PFSE_ICMP6,
	}				pfse_type;
	union {
		struct route		ro;
		struct {
			int		type;
			int		code;
			int		mtu;
		} icmpopts;
	} u;
#define	pfse_ro		u.ro
#define	pfse_icmp_type	u.icmpopts.type
#define	pfse_icmp_code	u.icmpopts.code
#define	pfse_icmp_mtu	u.icmpopts.mtu
};

STAILQ_HEAD(pf_send_head, pf_send_entry);
static VNET_DEFINE(struct pf_send_head, pf_sendqueue);
#define	V_pf_sendqueue	VNET(pf_sendqueue)

static struct mtx pf_sendqueue_mtx;
#define	PF_SENDQ_LOCK()		mtx_lock(&pf_sendqueue_mtx)
#define	PF_SENDQ_UNLOCK()	mtx_unlock(&pf_sendqueue_mtx)

/*
 * Queue for pf_flush_task() tasks.
 */
struct pf_flush_entry {
	SLIST_ENTRY(pf_flush_entry)	next;
	struct pf_addr  		addr;
	sa_family_t			af;
	uint8_t				dir;
	struct pf_rule  		*rule;  /* never dereferenced */
};

SLIST_HEAD(pf_flush_head, pf_flush_entry);
static VNET_DEFINE(struct pf_flush_head, pf_flushqueue);
#define V_pf_flushqueue	VNET(pf_flushqueue)
static VNET_DEFINE(struct task, pf_flushtask);
#define	V_pf_flushtask	VNET(pf_flushtask)

static struct mtx pf_flushqueue_mtx;
#define	PF_FLUSHQ_LOCK()	mtx_lock(&pf_flushqueue_mtx)
#define	PF_FLUSHQ_UNLOCK()	mtx_unlock(&pf_flushqueue_mtx)

VNET_DEFINE(struct pf_rulequeue, pf_unlinked_rules);
struct mtx pf_unlnkdrules_mtx;

static VNET_DEFINE(uma_zone_t,	pf_sources_z);
#define	V_pf_sources_z	VNET(pf_sources_z)
static VNET_DEFINE(uma_zone_t,	pf_mtag_z);
#define	V_pf_mtag_z	VNET(pf_mtag_z)
VNET_DEFINE(uma_zone_t,	 pf_state_z);
VNET_DEFINE(uma_zone_t,	 pf_state_key_z);

VNET_DEFINE(uint64_t, pf_stateid[MAXCPU]);
#define	PFID_CPUBITS	8
#define	PFID_CPUSHIFT	(sizeof(uint64_t) * NBBY - PFID_CPUBITS)
#define	PFID_CPUMASK	((uint64_t)((1 << PFID_CPUBITS) - 1) <<	PFID_CPUSHIFT)
#define	PFID_MAXID	(~PFID_CPUMASK)
CTASSERT((1 << PFID_CPUBITS) > MAXCPU);

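/*
 * Illustrative note (editor's sketch, not part of pf): a state ID packs
 * the allocating CPU into its top PFID_CPUBITS bits and a per-CPU
 * counter into the remaining low bits.  With PFID_CPUBITS == 8, CPU 3
 * allocating counter value 42 yields, before the htobe64() performed at
 * insert time:
 *
 *	uint64_t id = 42;			// V_pf_stateid[curcpu]++
 *	id |= (uint64_t)3 << PFID_CPUSHIFT;
 *	// id == 0x030000000000002aULL
 *
 * The CTASSERT above guarantees that PFID_CPUBITS is wide enough to
 * encode every possible CPU number.
 */
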
static void		 pf_src_tree_remove_state(struct pf_state *);
static void		 pf_init_threshold(struct pf_threshold *, u_int32_t,
			    u_int32_t);
static void		 pf_add_threshold(struct pf_threshold *);
static int		 pf_check_threshold(struct pf_threshold *);

static void		 pf_change_ap(struct pf_addr *, u_int16_t *,
			    u_int16_t *, u_int16_t *, struct pf_addr *,
			    u_int16_t, u_int8_t, sa_family_t);
static int		 pf_modulate_sack(struct mbuf *, int, struct pf_pdesc *,
			    struct tcphdr *, struct pf_state_peer *);
static void		 pf_change_icmp(struct pf_addr *, u_int16_t *,
			    struct pf_addr *, struct pf_addr *, u_int16_t,
			    u_int16_t *, u_int16_t *, u_int16_t *,
			    u_int16_t *, u_int8_t, sa_family_t);
static void		 pf_send_tcp(struct mbuf *,
			    const struct pf_rule *, sa_family_t,
			    const struct pf_addr *, const struct pf_addr *,
			    u_int16_t, u_int16_t, u_int32_t, u_int32_t,
			    u_int8_t, u_int16_t, u_int16_t, u_int8_t, int,
			    u_int16_t, struct ifnet *);
static void		 pf_send_icmp(struct mbuf *, u_int8_t, u_int8_t,
			    sa_family_t, struct pf_rule *);
static void		 pf_detach_state(struct pf_state *);
static int		 pf_state_key_attach(struct pf_state_key *,
			    struct pf_state_key *, struct pf_state *);
static void		 pf_state_key_detach(struct pf_state *, int);
static int		 pf_state_key_ctor(void *, int, void *, int);
static u_int32_t	 pf_tcp_iss(struct pf_pdesc *);
static int		 pf_test_rule(struct pf_rule **, struct pf_state **,
			    int, struct pfi_kif *, struct mbuf *, int,
			    struct pf_pdesc *, struct pf_rule **,
			    struct pf_ruleset **, struct inpcb *);
static int		 pf_create_state(struct pf_rule *, struct pf_rule *,
			    struct pf_rule *, struct pf_pdesc *,
			    struct pf_src_node *, struct pf_state_key *,
			    struct pf_state_key *, struct mbuf *, int,
			    u_int16_t, u_int16_t, int *, struct pfi_kif *,
			    struct pf_state **, int, u_int16_t, u_int16_t,
			    int);
static int		 pf_test_fragment(struct pf_rule **, int,
			    struct pfi_kif *, struct mbuf *, void *,
			    struct pf_pdesc *, struct pf_rule **,
			    struct pf_ruleset **);
static int		 pf_tcp_track_full(struct pf_state_peer *,
			    struct pf_state_peer *, struct pf_state **,
			    struct pfi_kif *, struct mbuf *, int,
			    struct pf_pdesc *, u_short *, int *);
static int		 pf_tcp_track_sloppy(struct pf_state_peer *,
			    struct pf_state_peer *, struct pf_state **,
			    struct pf_pdesc *, u_short *);
static int		 pf_test_state_tcp(struct pf_state **, int,
			    struct pfi_kif *, struct mbuf *, int,
			    void *, struct pf_pdesc *, u_short *);
static int		 pf_test_state_udp(struct pf_state **, int,
			    struct pfi_kif *, struct mbuf *, int,
			    void *, struct pf_pdesc *);
static int		 pf_test_state_icmp(struct pf_state **, int,
			    struct pfi_kif *, struct mbuf *, int,
			    void *, struct pf_pdesc *, u_short *);
static int		 pf_test_state_other(struct pf_state **, int,
			    struct pfi_kif *, struct mbuf *, struct pf_pdesc *);
static u_int8_t		 pf_get_wscale(struct mbuf *, int, u_int16_t,
			    sa_family_t);
static u_int16_t	 pf_get_mss(struct mbuf *, int, u_int16_t,
			    sa_family_t);
static u_int16_t	 pf_calc_mss(struct pf_addr *, sa_family_t,
				int, u_int16_t);
static void		 pf_set_rt_ifp(struct pf_state *,
			    struct pf_addr *);
static int		 pf_check_proto_cksum(struct mbuf *, int, int,
			    u_int8_t, sa_family_t);
static void		 pf_print_state_parts(struct pf_state *,
			    struct pf_state_key *, struct pf_state_key *);
static int		 pf_addr_wrap_neq(struct pf_addr_wrap *,
			    struct pf_addr_wrap *);
static struct pf_state	*pf_find_state(struct pfi_kif *,
			    struct pf_state_key_cmp *, u_int);
static int		 pf_src_connlimit(struct pf_state **);
static void		 pf_flush_task(void *c, int pending);
static int		 pf_insert_src_node(struct pf_src_node **,
			    struct pf_rule *, struct pf_addr *, sa_family_t);
static int		 pf_purge_expired_states(int);
static void		 pf_purge_unlinked_rules(void);
static int		 pf_mtag_init(void *, int, int);
static void		 pf_mtag_free(struct m_tag *);
#ifdef INET
static void		 pf_route(struct mbuf **, struct pf_rule *, int,
			    struct ifnet *, struct pf_state *,
			    struct pf_pdesc *);
#endif /* INET */
#ifdef INET6
static void		 pf_change_a6(struct pf_addr *, u_int16_t *,
			    struct pf_addr *, u_int8_t);
static void		 pf_route6(struct mbuf **, struct pf_rule *, int,
			    struct ifnet *, struct pf_state *,
			    struct pf_pdesc *);
#endif /* INET6 */

int in4_cksum(struct mbuf *m, u_int8_t nxt, int off, int len);

VNET_DECLARE(int, pf_end_threads);

VNET_DEFINE(struct pf_limit, pf_limits[PF_LIMIT_MAX]);

#define	PACKET_LOOPED(pd)	((pd)->pf_mtag &&			\
				 (pd)->pf_mtag->flags & PF_PACKET_LOOPED)

#define	STATE_LOOKUP(i, k, d, s, pd)					\
	do {								\
		(s) = pf_find_state((i), (k), (d));			\
		if ((s) == NULL || (s)->timeout == PFTM_PURGE)		\
			return (PF_DROP);				\
		if (PACKET_LOOPED(pd))					\
			return (PF_PASS);				\
		if ((d) == PF_OUT &&					\
		    (((s)->rule.ptr->rt == PF_ROUTETO &&		\
		    (s)->rule.ptr->direction == PF_OUT) ||		\
		    ((s)->rule.ptr->rt == PF_REPLYTO &&			\
		    (s)->rule.ptr->direction == PF_IN)) &&		\
		    (s)->rt_kif != NULL &&				\
		    (s)->rt_kif != (i))					\
			return (PF_PASS);				\
	} while (0)

#define	BOUND_IFACE(r, k) \
	((r)->rule_flag & PFRULE_IFBOUND) ? (k) : V_pfi_all

#define	STATE_INC_COUNTERS(s)				\
	do {						\
		s->rule.ptr->states_cur++;		\
		s->rule.ptr->states_tot++;		\
		if (s->anchor.ptr != NULL) {		\
			s->anchor.ptr->states_cur++;	\
			s->anchor.ptr->states_tot++;	\
		}					\
		if (s->nat_rule.ptr != NULL) {		\
			s->nat_rule.ptr->states_cur++;	\
			s->nat_rule.ptr->states_tot++;	\
		}					\
	} while (0)

#define	STATE_DEC_COUNTERS(s)				\
	do {						\
		if (s->nat_rule.ptr != NULL)		\
			s->nat_rule.ptr->states_cur--;	\
		if (s->anchor.ptr != NULL)		\
			s->anchor.ptr->states_cur--;	\
		s->rule.ptr->states_cur--;		\
	} while (0)

static MALLOC_DEFINE(M_PFHASH, "pf_hash", "pf(4) hash header structures");
VNET_DEFINE(struct pf_keyhash *, pf_keyhash);
VNET_DEFINE(struct pf_idhash *, pf_idhash);
VNET_DEFINE(u_long, pf_hashmask);
VNET_DEFINE(struct pf_srchash *, pf_srchash);
VNET_DEFINE(u_long, pf_srchashmask);

SYSCTL_NODE(_net, OID_AUTO, pf, CTLFLAG_RW, 0, "pf(4)");

VNET_DEFINE(u_long, pf_hashsize);
#define	V_pf_hashsize	VNET(pf_hashsize)
SYSCTL_VNET_UINT(_net_pf, OID_AUTO, states_hashsize, CTLFLAG_RDTUN,
    &VNET_NAME(pf_hashsize), 0, "Size of pf(4) states hashtable");

VNET_DEFINE(u_long, pf_srchashsize);
#define	V_pf_srchashsize	VNET(pf_srchashsize)
SYSCTL_VNET_UINT(_net_pf, OID_AUTO, source_nodes_hashsize, CTLFLAG_RDTUN,
    &VNET_NAME(pf_srchashsize), 0, "Size of pf(4) source nodes hashtable");

VNET_DEFINE(void *, pf_swi_cookie);

VNET_DEFINE(uint32_t, pf_hashseed);
#define	V_pf_hashseed	VNET(pf_hashseed)

static __inline uint32_t
pf_hashkey(struct pf_state_key *sk)
{
	uint32_t h;

	h = jenkins_hash32((uint32_t *)sk,
	    sizeof(struct pf_state_key_cmp)/sizeof(uint32_t),
	    V_pf_hashseed);

	return (h & V_pf_hashmask);
}

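/*
 * Illustrative note (editor's sketch, not part of pf): pf_hashkey()
 * hashes only the leading pf_state_key_cmp prefix of a key (addresses,
 * ports, af, proto), so two keys that compare equal over that prefix
 * always hash to the same row of V_pf_keyhash.  A lookup therefore
 * follows the pattern used throughout this file:
 *
 *	struct pf_keyhash *kh = &V_pf_keyhash[pf_hashkey(sk)];
 *	struct pf_state_key *cur;
 *
 *	PF_HASHROW_LOCK(kh);
 *	LIST_FOREACH(cur, &kh->keys, entry)
 *		if (bcmp(cur, sk, sizeof(struct pf_state_key_cmp)) == 0)
 *			break;	/* found; NULL at list end otherwise */
 */
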
#ifdef INET6
void
pf_addrcpy(struct pf_addr *dst, struct pf_addr *src, sa_family_t af)
{
	switch (af) {
#ifdef INET
	case AF_INET:
		dst->addr32[0] = src->addr32[0];
		break;
#endif /* INET */
	case AF_INET6:
		dst->addr32[0] = src->addr32[0];
		dst->addr32[1] = src->addr32[1];
		dst->addr32[2] = src->addr32[2];
		dst->addr32[3] = src->addr32[3];
		break;
	}
}
#endif /* INET6 */

static void
pf_init_threshold(struct pf_threshold *threshold,
    u_int32_t limit, u_int32_t seconds)
{
	threshold->limit = limit * PF_THRESHOLD_MULT;
	threshold->seconds = seconds;
	threshold->count = 0;
	threshold->last = time_uptime;
}

static void
pf_add_threshold(struct pf_threshold *threshold)
{
	u_int32_t t = time_uptime, diff = t - threshold->last;

	if (diff >= threshold->seconds)
		threshold->count = 0;
	else
		threshold->count -= threshold->count * diff /
		    threshold->seconds;
	threshold->count += PF_THRESHOLD_MULT;
	threshold->last = t;
}

static int
pf_check_threshold(struct pf_threshold *threshold)
{
	return (threshold->count > threshold->limit);
}

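/*
 * Worked example (editor's sketch, not part of pf): the threshold
 * counter is kept in fixed point, scaled by PF_THRESHOLD_MULT, and
 * decays linearly over the configured window.  Assuming the usual
 * PF_THRESHOLD_MULT of 1000, a rule with "max-src-conn-rate 10/5"
 * (limit 10, seconds 5) gives:
 *
 *	pf_init_threshold(&t, 10, 5);	// t.limit == 10 * 1000 == 10000
 *
 * pf_add_threshold() first decays t.count by diff/seconds of its value
 * and then adds PF_THRESHOLD_MULT: a count of 8000 seen again 2 seconds
 * later becomes 8000 - 8000 * 2 / 5 = 4800, then 5800 after the bump.
 * pf_check_threshold() simply reports whether the decayed count has
 * exceeded t.limit.
 */
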
static int
pf_src_connlimit(struct pf_state **state)
{
	struct pfr_addr p;
	struct pf_flush_entry *pffe;
	int bad = 0;

	PF_STATE_LOCK_ASSERT(*state);

	(*state)->src_node->conn++;
	(*state)->src.tcp_est = 1;
	pf_add_threshold(&(*state)->src_node->conn_rate);

	if ((*state)->rule.ptr->max_src_conn &&
	    (*state)->rule.ptr->max_src_conn <
	    (*state)->src_node->conn) {
		V_pf_status.lcounters[LCNT_SRCCONN]++;
		bad++;
	}

	if ((*state)->rule.ptr->max_src_conn_rate.limit &&
	    pf_check_threshold(&(*state)->src_node->conn_rate)) {
		V_pf_status.lcounters[LCNT_SRCCONNRATE]++;
		bad++;
	}

	if (!bad)
		return (0);

	/* Kill this state. */
	(*state)->timeout = PFTM_PURGE;
	(*state)->src.state = (*state)->dst.state = TCPS_CLOSED;

	if ((*state)->rule.ptr->overload_tbl == NULL)
		return (1);

	V_pf_status.lcounters[LCNT_OVERLOAD_TABLE]++;
	if (V_pf_status.debug >= PF_DEBUG_MISC) {
		printf("%s: blocking address ", __func__);
		pf_print_host(&(*state)->src_node->addr, 0,
		    (*state)->key[PF_SK_WIRE]->af);
		printf("\n");
	}

	bzero(&p, sizeof(p));
	p.pfra_af = (*state)->key[PF_SK_WIRE]->af;
	switch ((*state)->key[PF_SK_WIRE]->af) {
#ifdef INET
	case AF_INET:
		p.pfra_net = 32;
		p.pfra_ip4addr = (*state)->src_node->addr.v4;
		break;
#endif /* INET */
#ifdef INET6
	case AF_INET6:
		p.pfra_net = 128;
		p.pfra_ip6addr = (*state)->src_node->addr.v6;
		break;
#endif /* INET6 */
	}

	pfr_insert_kentry((*state)->rule.ptr->overload_tbl, &p, time_second);

	if ((*state)->rule.ptr->flush == 0)
		return (1);

	/* Schedule flushing task. */
	pffe = malloc(sizeof(*pffe), M_PFTEMP, M_NOWAIT);
	if (pffe == NULL)
		return (1);	/* too bad :( */

	bcopy(&(*state)->src_node->addr, &pffe->addr, sizeof(pffe->addr));
	pffe->af = (*state)->key[PF_SK_WIRE]->af;
	pffe->dir = (*state)->direction;
	if ((*state)->rule.ptr->flush & PF_FLUSH_GLOBAL)
		pffe->rule = NULL;
	else
		pffe->rule = (*state)->rule.ptr;
	PF_FLUSHQ_LOCK();
	SLIST_INSERT_HEAD(&V_pf_flushqueue, pffe, next);
	PF_FLUSHQ_UNLOCK();
	taskqueue_enqueue(taskqueue_swi, &V_pf_flushtask);

	return (1);
}

static void
pf_flush_task(void *c, int pending)
{
	struct pf_flush_head queue;
	struct pf_flush_entry *pffe, *pffe1;
	uint32_t killed = 0;

	PF_FLUSHQ_LOCK();
	queue = *(struct pf_flush_head *)c;
	SLIST_INIT((struct pf_flush_head *)c);
	PF_FLUSHQ_UNLOCK();

	V_pf_status.lcounters[LCNT_OVERLOAD_FLUSH]++;

	for (int i = 0; i <= V_pf_hashmask; i++) {
		struct pf_idhash *ih = &V_pf_idhash[i];
		struct pf_state_key *sk;
		struct pf_state *s;

		PF_HASHROW_LOCK(ih);
		LIST_FOREACH(s, &ih->states, entry) {
		    sk = s->key[PF_SK_WIRE];
		    SLIST_FOREACH(pffe, &queue, next)
			if (sk->af == pffe->af && (pffe->rule == NULL ||
			    pffe->rule == s->rule.ptr) &&
			    ((pffe->dir == PF_OUT &&
			    PF_AEQ(&pffe->addr, &sk->addr[1], sk->af)) ||
			    (pffe->dir == PF_IN &&
			    PF_AEQ(&pffe->addr, &sk->addr[0], sk->af)))) {
				s->timeout = PFTM_PURGE;
				s->src.state = s->dst.state = TCPS_CLOSED;
				killed++;
			}
		}
		PF_HASHROW_UNLOCK(ih);
	}
	SLIST_FOREACH_SAFE(pffe, &queue, next, pffe1)
		free(pffe, M_PFTEMP);
	if (V_pf_status.debug >= PF_DEBUG_MISC)
		printf("%s: %u states killed\n", __func__, killed);
}

/*
 * Can return with the hash row locked on lookup failure, so that the
 * caller can consistently allocate and insert a new source node.
 */
struct pf_src_node *
pf_find_src_node(struct pf_addr *src, struct pf_rule *rule, sa_family_t af,
	int returnlocked)
{
	struct pf_srchash *sh;
	struct pf_src_node *n;

	V_pf_status.scounters[SCNT_SRC_NODE_SEARCH]++;

	sh = &V_pf_srchash[pf_hashsrc(src, af)];
	PF_HASHROW_LOCK(sh);
	LIST_FOREACH(n, &sh->nodes, entry)
		if (n->rule.ptr == rule && n->af == af &&
		    ((af == AF_INET && n->addr.v4.s_addr == src->v4.s_addr) ||
		    (af == AF_INET6 && bcmp(&n->addr, src, sizeof(*src)) == 0)))
			break;
	if (n != NULL || returnlocked == 0)
		PF_HASHROW_UNLOCK(sh);

	return (n);
}

static int
pf_insert_src_node(struct pf_src_node **sn, struct pf_rule *rule,
    struct pf_addr *src, sa_family_t af)
{

	KASSERT((rule->rule_flag & PFRULE_RULESRCTRACK ||
	    rule->rpool.opts & PF_POOL_STICKYADDR),
	    ("%s for non-tracking rule %p", __func__, rule));

	if (*sn == NULL)
		*sn = pf_find_src_node(src, rule, af, 1);

	if (*sn == NULL) {
		struct pf_srchash *sh = &V_pf_srchash[pf_hashsrc(src, af)];

		PF_HASHROW_ASSERT(sh);

		if (!rule->max_src_nodes ||
		    rule->src_nodes < rule->max_src_nodes)
			(*sn) = uma_zalloc(V_pf_sources_z, M_NOWAIT | M_ZERO);
		else
			V_pf_status.lcounters[LCNT_SRCNODES]++;
		if ((*sn) == NULL) {
			PF_HASHROW_UNLOCK(sh);
			return (-1);
		}

		pf_init_threshold(&(*sn)->conn_rate,
		    rule->max_src_conn_rate.limit,
		    rule->max_src_conn_rate.seconds);

		(*sn)->af = af;
		(*sn)->rule.ptr = rule;
		PF_ACPY(&(*sn)->addr, src, af);
		LIST_INSERT_HEAD(&sh->nodes, *sn, entry);
		(*sn)->creation = time_uptime;
		(*sn)->ruletype = rule->action;
		if ((*sn)->rule.ptr != NULL)
			(*sn)->rule.ptr->src_nodes++;
		PF_HASHROW_UNLOCK(sh);
		V_pf_status.scounters[SCNT_SRC_NODE_INSERT]++;
		V_pf_status.src_nodes++;
	} else {
		if (rule->max_src_states &&
		    (*sn)->states >= rule->max_src_states) {
			V_pf_status.lcounters[LCNT_SRCSTATES]++;
			return (-1);
		}
	}
	return (0);
}

static void
pf_remove_src_node(struct pf_src_node *src)
{
	struct pf_srchash *sh;

	sh = &V_pf_srchash[pf_hashsrc(&src->addr, src->af)];
	PF_HASHROW_LOCK(sh);
	LIST_REMOVE(src, entry);
	PF_HASHROW_UNLOCK(sh);
}

/* Data storage structures initialization. */
void
pf_initialize()
{
	struct pf_keyhash	*kh;
	struct pf_idhash	*ih;
	struct pf_srchash	*sh;
	u_int i;

	TUNABLE_ULONG_FETCH("net.pf.states_hashsize", &V_pf_hashsize);
	if (V_pf_hashsize == 0 || !powerof2(V_pf_hashsize))
		V_pf_hashsize = PF_HASHSIZ;
	TUNABLE_ULONG_FETCH("net.pf.source_nodes_hashsize", &V_pf_srchashsize);
	if (V_pf_srchashsize == 0 || !powerof2(V_pf_srchashsize))
		V_pf_srchashsize = PF_HASHSIZ / 4;

	V_pf_hashseed = arc4random();

	/* States and state keys storage. */
	V_pf_state_z = uma_zcreate("pf states", sizeof(struct pf_state),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	V_pf_limits[PF_LIMIT_STATES].zone = V_pf_state_z;
	uma_zone_set_max(V_pf_state_z, PFSTATE_HIWAT);

	V_pf_state_key_z = uma_zcreate("pf state keys",
	    sizeof(struct pf_state_key), pf_state_key_ctor, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, 0);
	V_pf_keyhash = malloc(V_pf_hashsize * sizeof(struct pf_keyhash),
	    M_PFHASH, M_WAITOK | M_ZERO);
	V_pf_idhash = malloc(V_pf_hashsize * sizeof(struct pf_idhash),
	    M_PFHASH, M_WAITOK | M_ZERO);
	V_pf_hashmask = V_pf_hashsize - 1;
	for (i = 0, kh = V_pf_keyhash, ih = V_pf_idhash; i <= V_pf_hashmask;
	    i++, kh++, ih++) {
		mtx_init(&kh->lock, "pf_keyhash", NULL, MTX_DEF);
		mtx_init(&ih->lock, "pf_idhash", NULL, MTX_DEF);
	}

	/* Source nodes. */
	V_pf_sources_z = uma_zcreate("pf source nodes",
	    sizeof(struct pf_src_node), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
	    0);
	V_pf_limits[PF_LIMIT_SRC_NODES].zone = V_pf_sources_z;
	uma_zone_set_max(V_pf_sources_z, PFSNODE_HIWAT);
	V_pf_srchash = malloc(V_pf_srchashsize * sizeof(struct pf_srchash),
	  M_PFHASH, M_WAITOK|M_ZERO);
	V_pf_srchashmask = V_pf_srchashsize - 1;
	for (i = 0, sh = V_pf_srchash; i <= V_pf_srchashmask; i++, sh++)
		mtx_init(&sh->lock, "pf_srchash", NULL, MTX_DEF);

	/* ALTQ */
	TAILQ_INIT(&V_pf_altqs[0]);
	TAILQ_INIT(&V_pf_altqs[1]);
	TAILQ_INIT(&V_pf_pabuf);
	V_pf_altqs_active = &V_pf_altqs[0];
	V_pf_altqs_inactive = &V_pf_altqs[1];

	/* Mbuf tags */
	V_pf_mtag_z = uma_zcreate("pf mtags", sizeof(struct m_tag) +
	    sizeof(struct pf_mtag), NULL, NULL, pf_mtag_init, NULL,
	    UMA_ALIGN_PTR, 0);

	/* Send & flush queues. */
	STAILQ_INIT(&V_pf_sendqueue);
	SLIST_INIT(&V_pf_flushqueue);
	TASK_INIT(&V_pf_flushtask, 0, pf_flush_task, &V_pf_flushqueue);
	mtx_init(&pf_sendqueue_mtx, "pf send queue", NULL, MTX_DEF);
	mtx_init(&pf_flushqueue_mtx, "pf flush queue", NULL, MTX_DEF);

	/* Rules that are unlinked but may still be referenced. */
	TAILQ_INIT(&V_pf_unlinked_rules);
	mtx_init(&pf_unlnkdrules_mtx, "pf unlinked rules", NULL, MTX_DEF);
}

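/*
 * Sizing note (editor's sketch, not part of pf): both hash tables must
 * hold a power-of-two number of rows, since lookups mask with
 * (size - 1).  The sizes can be overridden at boot via the loader
 * tunables fetched above, e.g. in loader.conf:
 *
 *	net.pf.states_hashsize="65536"
 *	net.pf.source_nodes_hashsize="16384"
 *
 * pf_initialize() silently falls back to the PF_HASHSIZ-based defaults
 * whenever a tunable is zero or not a power of two.
 */
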
void
pf_cleanup()
{
	struct pf_keyhash	*kh;
	struct pf_idhash	*ih;
	struct pf_srchash	*sh;
	struct pf_send_entry	*pfse, *next;
	u_int i;

	for (i = 0, kh = V_pf_keyhash, ih = V_pf_idhash; i <= V_pf_hashmask;
	    i++, kh++, ih++) {
		KASSERT(LIST_EMPTY(&kh->keys), ("%s: key hash not empty",
		    __func__));
		KASSERT(LIST_EMPTY(&ih->states), ("%s: id hash not empty",
		    __func__));
		mtx_destroy(&kh->lock);
		mtx_destroy(&ih->lock);
	}
	free(V_pf_keyhash, M_PFHASH);
	free(V_pf_idhash, M_PFHASH);

	for (i = 0, sh = V_pf_srchash; i <= V_pf_srchashmask; i++, sh++) {
		KASSERT(LIST_EMPTY(&sh->nodes),
		    ("%s: source node hash not empty", __func__));
		mtx_destroy(&sh->lock);
	}
	free(V_pf_srchash, M_PFHASH);

	STAILQ_FOREACH_SAFE(pfse, &V_pf_sendqueue, pfse_next, next) {
		m_freem(pfse->pfse_m);
		free(pfse, M_PFTEMP);
	}

	mtx_destroy(&pf_sendqueue_mtx);
	mtx_destroy(&pf_flushqueue_mtx);
	mtx_destroy(&pf_unlnkdrules_mtx);

	uma_zdestroy(V_pf_mtag_z);
	uma_zdestroy(V_pf_sources_z);
	uma_zdestroy(V_pf_state_z);
	uma_zdestroy(V_pf_state_key_z);
}

static int
pf_mtag_init(void *mem, int size, int how)
{
	struct m_tag *t;

	t = (struct m_tag *)mem;
	t->m_tag_cookie = MTAG_ABI_COMPAT;
	t->m_tag_id = PACKET_TAG_PF;
	t->m_tag_len = sizeof(struct pf_mtag);
	t->m_tag_free = pf_mtag_free;

	return (0);
}

static void
pf_mtag_free(struct m_tag *t)
{

	uma_zfree(V_pf_mtag_z, t);
}

struct pf_mtag *
pf_get_mtag(struct mbuf *m)
{
	struct m_tag *mtag;

	if ((mtag = m_tag_find(m, PACKET_TAG_PF, NULL)) != NULL)
		return ((struct pf_mtag *)(mtag + 1));

	mtag = uma_zalloc(V_pf_mtag_z, M_NOWAIT);
	if (mtag == NULL)
		return (NULL);
	bzero(mtag + 1, sizeof(struct pf_mtag));
	m_tag_prepend(m, mtag);

	return ((struct pf_mtag *)(mtag + 1));
}

static int
pf_state_key_attach(struct pf_state_key *skw, struct pf_state_key *sks,
    struct pf_state *s)
{
	struct pf_keyhash	*kh;
	struct pf_state_key	*sk, *cur;
	struct pf_state		*si, *olds = NULL;
	int idx;

	KASSERT(s->refs == 0, ("%s: state not pristine", __func__));
	KASSERT(s->key[PF_SK_WIRE] == NULL, ("%s: state has key", __func__));
	KASSERT(s->key[PF_SK_STACK] == NULL, ("%s: state has key", __func__));

	/*
	 * First run: start with wire key.
	 */
	sk = skw;
	idx = PF_SK_WIRE;

keyattach:
	kh = &V_pf_keyhash[pf_hashkey(sk)];

	PF_HASHROW_LOCK(kh);
	LIST_FOREACH(cur, &kh->keys, entry)
		if (bcmp(cur, sk, sizeof(struct pf_state_key_cmp)) == 0)
			break;

	if (cur != NULL) {
		/* Key exists. Check for a conflicting state; if none, add to key. */
		TAILQ_FOREACH(si, &cur->states[idx], key_list[idx]) {
			struct pf_idhash *ih = &V_pf_idhash[PF_IDHASH(si)];

			PF_HASHROW_LOCK(ih);
			if (si->kif == s->kif &&
			    si->direction == s->direction) {
				if (sk->proto == IPPROTO_TCP &&
				    si->src.state >= TCPS_FIN_WAIT_2 &&
				    si->dst.state >= TCPS_FIN_WAIT_2) {
					si->src.state = si->dst.state =
					    TCPS_CLOSED;
					/* Unlink later or cur can go away. */
					pf_ref_state(si);
					olds = si;
				} else {
					if (V_pf_status.debug >= PF_DEBUG_MISC) {
						printf("pf: %s key attach "
						    "failed on %s: ",
						    (idx == PF_SK_WIRE) ?
						    "wire" : "stack",
						    s->kif->pfik_name);
						pf_print_state_parts(s,
						    (idx == PF_SK_WIRE) ?
						    sk : NULL,
						    (idx == PF_SK_STACK) ?
						    sk : NULL);
						printf(", existing: ");
						pf_print_state_parts(si,
						    (idx == PF_SK_WIRE) ?
						    sk : NULL,
						    (idx == PF_SK_STACK) ?
						    sk : NULL);
						printf("\n");
					}
					PF_HASHROW_UNLOCK(ih);
					PF_HASHROW_UNLOCK(kh);
					uma_zfree(V_pf_state_key_z, sk);
					if (idx == PF_SK_STACK)
						pf_detach_state(s);
					return (-1);	/* collision! */
				}
			}
			PF_HASHROW_UNLOCK(ih);
		}
		uma_zfree(V_pf_state_key_z, sk);
		s->key[idx] = cur;
	} else {
		LIST_INSERT_HEAD(&kh->keys, sk, entry);
		s->key[idx] = sk;
	}

stateattach:
	/* List is sorted, if-bound states before floating. */
	if (s->kif == V_pfi_all)
		TAILQ_INSERT_TAIL(&s->key[idx]->states[idx], s, key_list[idx]);
	else
		TAILQ_INSERT_HEAD(&s->key[idx]->states[idx], s, key_list[idx]);

	/*
	 * Attach done. Now decide whether, and how, a second
	 * key should be attached.
	 */
	if (sks == skw) {
		s->key[PF_SK_STACK] = s->key[PF_SK_WIRE];
		idx = PF_SK_STACK;
		sks = NULL;
		goto stateattach;
	} else if (sks != NULL) {
		PF_HASHROW_UNLOCK(kh);
		if (olds) {
			pf_unlink_state(olds, 0);
			pf_release_state(olds);
			olds = NULL;
		}
		/*
		 * Continue attaching with stack key.
		 */
		sk = sks;
		idx = PF_SK_STACK;
		sks = NULL;
		goto keyattach;
	} else
		PF_HASHROW_UNLOCK(kh);

	if (olds) {
		pf_unlink_state(olds, 0);
		pf_release_state(olds);
	}

	KASSERT(s->key[PF_SK_WIRE] != NULL && s->key[PF_SK_STACK] != NULL,
	    ("%s failure", __func__));

	return (0);
}

static void
pf_detach_state(struct pf_state *s)
{
	struct pf_state_key *sks = s->key[PF_SK_STACK];
	struct pf_keyhash *kh;

	if (sks != NULL) {
		kh = &V_pf_keyhash[pf_hashkey(sks)];
		PF_HASHROW_LOCK(kh);
		if (s->key[PF_SK_STACK] != NULL)
			pf_state_key_detach(s, PF_SK_STACK);
		/*
		 * If both point to same key, then we are done.
		 */
		if (sks == s->key[PF_SK_WIRE]) {
			pf_state_key_detach(s, PF_SK_WIRE);
			PF_HASHROW_UNLOCK(kh);
			return;
		}
		PF_HASHROW_UNLOCK(kh);
	}

	if (s->key[PF_SK_WIRE] != NULL) {
		kh = &V_pf_keyhash[pf_hashkey(s->key[PF_SK_WIRE])];
		PF_HASHROW_LOCK(kh);
		if (s->key[PF_SK_WIRE] != NULL)
			pf_state_key_detach(s, PF_SK_WIRE);
		PF_HASHROW_UNLOCK(kh);
	}
}

static void
pf_state_key_detach(struct pf_state *s, int idx)
{
	struct pf_state_key *sk = s->key[idx];
#ifdef INVARIANTS
	struct pf_keyhash *kh = &V_pf_keyhash[pf_hashkey(sk)];

	PF_HASHROW_ASSERT(kh);
#endif
	TAILQ_REMOVE(&sk->states[idx], s, key_list[idx]);
	s->key[idx] = NULL;

	if (TAILQ_EMPTY(&sk->states[0]) && TAILQ_EMPTY(&sk->states[1])) {
		LIST_REMOVE(sk, entry);
		uma_zfree(V_pf_state_key_z, sk);
	}
}

static int
pf_state_key_ctor(void *mem, int size, void *arg, int flags)
{
	struct pf_state_key *sk = mem;

	bzero(sk, sizeof(struct pf_state_key_cmp));
	TAILQ_INIT(&sk->states[PF_SK_WIRE]);
	TAILQ_INIT(&sk->states[PF_SK_STACK]);

	return (0);
}

struct pf_state_key *
pf_state_key_setup(struct pf_pdesc *pd, struct pf_addr *saddr,
	struct pf_addr *daddr, u_int16_t sport, u_int16_t dport)
{
	struct pf_state_key *sk;

	sk = uma_zalloc(V_pf_state_key_z, M_NOWAIT);
	if (sk == NULL)
		return (NULL);

	PF_ACPY(&sk->addr[pd->sidx], saddr, pd->af);
	PF_ACPY(&sk->addr[pd->didx], daddr, pd->af);
	sk->port[pd->sidx] = sport;
	sk->port[pd->didx] = dport;
	sk->proto = pd->proto;
	sk->af = pd->af;

	return (sk);
}

struct pf_state_key *
pf_state_key_clone(struct pf_state_key *orig)
{
	struct pf_state_key *sk;

	sk = uma_zalloc(V_pf_state_key_z, M_NOWAIT);
	if (sk == NULL)
		return (NULL);

	bcopy(orig, sk, sizeof(struct pf_state_key_cmp));

	return (sk);
}

int
pf_state_insert(struct pfi_kif *kif, struct pf_state_key *skw,
    struct pf_state_key *sks, struct pf_state *s)
{
	struct pf_idhash *ih;
	struct pf_state *cur;

	KASSERT(TAILQ_EMPTY(&sks->states[0]) && TAILQ_EMPTY(&sks->states[1]),
	    ("%s: sks not pristine", __func__));
	KASSERT(TAILQ_EMPTY(&skw->states[0]) && TAILQ_EMPTY(&skw->states[1]),
	    ("%s: skw not pristine", __func__));
	KASSERT(s->refs == 0, ("%s: state not pristine", __func__));

	s->kif = kif;

	if (pf_state_key_attach(skw, sks, s))
		return (-1);

	if (s->id == 0 && s->creatorid == 0) {
		/* XXX: should be atomic, but probability of collision low */
		if ((s->id = V_pf_stateid[curcpu]++) == PFID_MAXID)
			V_pf_stateid[curcpu] = 1;
		s->id |= (uint64_t)curcpu << PFID_CPUSHIFT;
		s->id = htobe64(s->id);
		s->creatorid = V_pf_status.hostid;
	}

	ih = &V_pf_idhash[PF_IDHASH(s)];
	PF_HASHROW_LOCK(ih);
	LIST_FOREACH(cur, &ih->states, entry)
		if (cur->id == s->id && cur->creatorid == s->creatorid)
			break;

	if (cur != NULL) {
		PF_HASHROW_UNLOCK(ih);
		if (V_pf_status.debug >= PF_DEBUG_MISC) {
			printf("pf: state insert failed: "
			    "id: %016llx creatorid: %08x",
			    (unsigned long long)be64toh(s->id),
			    ntohl(s->creatorid));
			printf("\n");
		}
		pf_detach_state(s);
		return (-1);
	}
	LIST_INSERT_HEAD(&ih->states, s, entry);
	/* One for keys, one for ID hash. */
	refcount_init(&s->refs, 2);

	V_pf_status.fcounters[FCNT_STATE_INSERT]++;
	if (pfsync_insert_state_ptr != NULL)
		pfsync_insert_state_ptr(s);

	/* Returns locked. */
	return (0);
}

/*
 * Find state by ID: returns with locked row on success.
 */
struct pf_state *
pf_find_state_byid(uint64_t id, uint32_t creatorid)
{
	struct pf_idhash *ih;
	struct pf_state *s;

	V_pf_status.fcounters[FCNT_STATE_SEARCH]++;

	ih = &V_pf_idhash[(be64toh(id) % (V_pf_hashmask + 1))];

	PF_HASHROW_LOCK(ih);
	LIST_FOREACH(s, &ih->states, entry)
		if (s->id == id && s->creatorid == creatorid)
			break;

	if (s == NULL)
		PF_HASHROW_UNLOCK(ih);

	return (s);
}

/*
 * Find state by key.
 * Returns with ID hash slot locked on success.
 */
static struct pf_state *
pf_find_state(struct pfi_kif *kif, struct pf_state_key_cmp *key, u_int dir)
{
	struct pf_keyhash	*kh;
	struct pf_state_key	*sk;
	struct pf_state		*s;
	int idx;

	V_pf_status.fcounters[FCNT_STATE_SEARCH]++;

	kh = &V_pf_keyhash[pf_hashkey((struct pf_state_key *)key)];

	PF_HASHROW_LOCK(kh);
	LIST_FOREACH(sk, &kh->keys, entry)
		if (bcmp(sk, key, sizeof(struct pf_state_key_cmp)) == 0)
			break;
	if (sk == NULL) {
		PF_HASHROW_UNLOCK(kh);
		return (NULL);
	}

	idx = (dir == PF_IN ? PF_SK_WIRE : PF_SK_STACK);

	/* List is sorted, if-bound states before floating ones. */
	TAILQ_FOREACH(s, &sk->states[idx], key_list[idx])
		if (s->kif == V_pfi_all || s->kif == kif) {
			PF_STATE_LOCK(s);
			PF_HASHROW_UNLOCK(kh);
			if (s->timeout == PFTM_UNLINKED) {
				/*
				 * State is being processed by
				 * pf_unlink_state() in another thread.
				 */
				PF_STATE_UNLOCK(s);
				return (NULL);
			}
			return (s);
		}
	PF_HASHROW_UNLOCK(kh);

	return (NULL);
}

struct pf_state *
pf_find_state_all(struct pf_state_key_cmp *key, u_int dir, int *more)
{
	struct pf_keyhash	*kh;
	struct pf_state_key	*sk;
	struct pf_state		*s, *ret = NULL;
	int			 idx, inout = 0;

	V_pf_status.fcounters[FCNT_STATE_SEARCH]++;

	kh = &V_pf_keyhash[pf_hashkey((struct pf_state_key *)key)];

	PF_HASHROW_LOCK(kh);
	LIST_FOREACH(sk, &kh->keys, entry)
		if (bcmp(sk, key, sizeof(struct pf_state_key_cmp)) == 0)
			break;
	if (sk == NULL) {
		PF_HASHROW_UNLOCK(kh);
		return (NULL);
	}
	switch (dir) {
	case PF_IN:
		idx = PF_SK_WIRE;
		break;
	case PF_OUT:
		idx = PF_SK_STACK;
		break;
	case PF_INOUT:
		idx = PF_SK_WIRE;
		inout = 1;
		break;
	default:
		panic("%s: dir %u", __func__, dir);
	}
second_run:
	TAILQ_FOREACH(s, &sk->states[idx], key_list[idx]) {
		if (more == NULL) {
			PF_HASHROW_UNLOCK(kh);
			return (s);
		}

		if (ret)
			(*more)++;
		else
			ret = s;
	}
	if (inout == 1) {
		inout = 0;
		idx = PF_SK_STACK;
		goto second_run;
	}
	PF_HASHROW_UNLOCK(kh);

	return (ret);
}

/* END state table stuff */

static void
pf_send(struct pf_send_entry *pfse)
{

	PF_SENDQ_LOCK();
	STAILQ_INSERT_TAIL(&V_pf_sendqueue, pfse, pfse_next);
	PF_SENDQ_UNLOCK();
	swi_sched(V_pf_swi_cookie, 0);
}

void
pf_intr(void *v)
{
	struct pf_send_head queue;
	struct pf_send_entry *pfse, *next;

	CURVNET_SET((struct vnet *)v);

	PF_SENDQ_LOCK();
	queue = V_pf_sendqueue;
	STAILQ_INIT(&V_pf_sendqueue);
	PF_SENDQ_UNLOCK();

	STAILQ_FOREACH_SAFE(pfse, &queue, pfse_next, next) {
		switch (pfse->pfse_type) {
#ifdef INET
		case PFSE_IP:
			ip_output(pfse->pfse_m, NULL, NULL, 0, NULL, NULL);
			break;
		case PFSE_ICMP:
			icmp_error(pfse->pfse_m, pfse->pfse_icmp_type,
			    pfse->pfse_icmp_code, 0, pfse->pfse_icmp_mtu);
			break;
#endif /* INET */
#ifdef INET6
		case PFSE_IP6:
			ip6_output(pfse->pfse_m, NULL, NULL, 0, NULL, NULL,
			    NULL);
			break;
		case PFSE_ICMP6:
			icmp6_error(pfse->pfse_m, pfse->pfse_icmp_type,
			    pfse->pfse_icmp_code, pfse->pfse_icmp_mtu);
			break;
#endif /* INET6 */
		default:
			panic("%s: unknown type", __func__);
		}
		free(pfse, M_PFTEMP);
	}
	CURVNET_RESTORE();
}

void
pf_purge_thread(void *v)
{
	int fullrun;

	CURVNET_SET((struct vnet *)v);

	for (;;) {
		PF_RULES_RLOCK();
		rw_sleep(pf_purge_thread, &pf_rules_lock, 0, "pftm", hz / 10);

		if (V_pf_end_threads) {
			/*
			 * To clean up all kifs and rules we need
			 * two runs: the first one clears reference
			 * flags, then pf_purge_expired_states() no
			 * longer raises them, and the second run frees.
			 */
			PF_RULES_RUNLOCK();
			pf_purge_unlinked_rules();
			pfi_kif_purge();

			/*
			 * Now purge everything.
			 */
			pf_purge_expired_states(V_pf_hashmask + 1);
			pf_purge_expired_fragments();
			pf_purge_expired_src_nodes();

			/*
			 * Now all kifs & rules should be unreferenced,
			 * thus should be successfully freed.
			 */
			pf_purge_unlinked_rules();
			pfi_kif_purge();

			/*
			 * Announce success and exit.
			 */
			PF_RULES_RLOCK();
			V_pf_end_threads++;
			PF_RULES_RUNLOCK();
			wakeup(pf_purge_thread);
			kproc_exit(0);
		}
		PF_RULES_RUNLOCK();

		/* Process 1/interval fraction of the state table every run. */
		fullrun = pf_purge_expired_states(V_pf_hashmask /
			    (V_pf_default_rule.timeout[PFTM_INTERVAL] * 10));

		/* Purge other expired types every PFTM_INTERVAL seconds. */
		if (fullrun) {
			/*
			 * Order is important:
			 * - states and src nodes reference rules
			 * - states and rules reference kifs
			 */
			pf_purge_expired_fragments();
			pf_purge_expired_src_nodes();
			pf_purge_unlinked_rules();
			pfi_kif_purge();
		}
	}
	/* not reached */
	CURVNET_RESTORE();
}

u_int32_t
pf_state_expires(const struct pf_state *state)
{
	u_int32_t	timeout;
	u_int32_t	start;
	u_int32_t	end;
	u_int32_t	states;

	/* handle all PFTM_* > PFTM_MAX here */
	if (state->timeout == PFTM_PURGE)
		return (time_uptime);
	if (state->timeout == PFTM_UNTIL_PACKET)
		return (0);
	KASSERT(state->timeout != PFTM_UNLINKED,
	    ("pf_state_expires: timeout == PFTM_UNLINKED"));
	KASSERT((state->timeout < PFTM_MAX),
	    ("pf_state_expires: timeout > PFTM_MAX"));
	timeout = state->rule.ptr->timeout[state->timeout];
	if (!timeout)
		timeout = V_pf_default_rule.timeout[state->timeout];
	start = state->rule.ptr->timeout[PFTM_ADAPTIVE_START];
	if (start) {
		end = state->rule.ptr->timeout[PFTM_ADAPTIVE_END];
		states = state->rule.ptr->states_cur;	/* XXXGL */
	} else {
		start = V_pf_default_rule.timeout[PFTM_ADAPTIVE_START];
		end = V_pf_default_rule.timeout[PFTM_ADAPTIVE_END];
		states = V_pf_status.states;
	}
	if (end && states > start && start < end) {
		if (states < end)
			return (state->expire + timeout * (end - states) /
			    (end - start));
		else
			return (time_uptime);
	}
	return (state->expire + timeout);
}

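/*
 * Worked example (editor's sketch, not part of pf): with adaptive
 * timeouts configured as "adaptive.start 6000, adaptive.end 12000" and
 * a nominal timeout of 60 seconds, a table holding 9000 states scales
 * the remaining timeout to
 *
 *	60 * (12000 - 9000) / (12000 - 6000) == 30 seconds,
 *
 * i.e. timeouts shrink linearly from 100% at adaptive.start down to 0%
 * at adaptive.end, where pf_state_expires() returns time_uptime and the
 * state is expired immediately.
 */
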
void
pf_purge_expired_src_nodes()
{
	struct pf_srchash	*sh;
	struct pf_src_node	*cur, *next;
	int i;

	for (i = 0, sh = V_pf_srchash; i <= V_pf_srchashmask; i++, sh++) {
	    PF_HASHROW_LOCK(sh);
	    LIST_FOREACH_SAFE(cur, &sh->nodes, entry, next)
		if (cur->states <= 0 && cur->expire <= time_uptime) {
			if (cur->rule.ptr != NULL)
				cur->rule.ptr->src_nodes--;
			LIST_REMOVE(cur, entry);
			V_pf_status.scounters[SCNT_SRC_NODE_REMOVALS]++;
			V_pf_status.src_nodes--;
			uma_zfree(V_pf_sources_z, cur);
		} else if (cur->rule.ptr != NULL)
			cur->rule.ptr->rule_flag |= PFRULE_REFS;
	    PF_HASHROW_UNLOCK(sh);
	}
}

static void
pf_src_tree_remove_state(struct pf_state *s)
{
	u_int32_t timeout;

	if (s->src_node != NULL) {
		if (s->src.tcp_est)
			--s->src_node->conn;
		if (--s->src_node->states <= 0) {
			timeout = s->rule.ptr->timeout[PFTM_SRC_NODE];
			if (!timeout)
				timeout =
				    V_pf_default_rule.timeout[PFTM_SRC_NODE];
			s->src_node->expire = time_uptime + timeout;
		}
	}
	if (s->nat_src_node != s->src_node && s->nat_src_node != NULL) {
		if (--s->nat_src_node->states <= 0) {
			timeout = s->rule.ptr->timeout[PFTM_SRC_NODE];
			if (!timeout)
				timeout =
				    V_pf_default_rule.timeout[PFTM_SRC_NODE];
			s->nat_src_node->expire = time_uptime + timeout;
		}
	}
	s->src_node = s->nat_src_node = NULL;
}

/*
 * Unlink and potentially free a state. The function may be
 * called with the ID hash row locked, but always returns
 * unlocked, since it needs to go through key hash locking.
 */
int
pf_unlink_state(struct pf_state *s, u_int flags)
{
	struct pf_idhash *ih = &V_pf_idhash[PF_IDHASH(s)];

	if ((flags & PF_ENTER_LOCKED) == 0)
		PF_HASHROW_LOCK(ih);
	else
		PF_HASHROW_ASSERT(ih);

	if (s->timeout == PFTM_UNLINKED) {
		/*
		 * State is being processed by pf_unlink_state()
		 * in another thread.
		 */
		PF_HASHROW_UNLOCK(ih);
		return (0);	/* XXXGL: undefined actually */
	}

	s->timeout = PFTM_UNLINKED;

	if (s->src.state == PF_TCPS_PROXY_DST) {
		/* XXX wire key the right one? */
		pf_send_tcp(NULL, s->rule.ptr, s->key[PF_SK_WIRE]->af,
		    &s->key[PF_SK_WIRE]->addr[1],
		    &s->key[PF_SK_WIRE]->addr[0],
		    s->key[PF_SK_WIRE]->port[1],
		    s->key[PF_SK_WIRE]->port[0],
		    s->src.seqhi, s->src.seqlo + 1,
		    TH_RST|TH_ACK, 0, 0, 0, 1, s->tag, NULL);
	}

	LIST_REMOVE(s, entry);
	pf_src_tree_remove_state(s);
	PF_HASHROW_UNLOCK(ih);

	if (pfsync_delete_state_ptr != NULL)
		pfsync_delete_state_ptr(s);

	pf_detach_state(s);
	refcount_release(&s->refs);

	return (pf_release_state(s));
}

void
pf_free_state(struct pf_state *cur)
{

	KASSERT(cur->refs == 0, ("%s: %p has refs", __func__, cur));
	KASSERT(cur->timeout == PFTM_UNLINKED, ("%s: timeout %u", __func__,
	    cur->timeout));
	--cur->rule.ptr->states_cur;
	if (cur->nat_rule.ptr != NULL)
		--cur->nat_rule.ptr->states_cur;
	if (cur->anchor.ptr != NULL)
		--cur->anchor.ptr->states_cur;
	pf_normalize_tcp_cleanup(cur);
	uma_zfree(V_pf_state_z, cur);
	V_pf_status.fcounters[FCNT_STATE_REMOVALS]++;
}

/*
 * Called only from pf_purge_thread(), thus serialized.
 */
static int
pf_purge_expired_states(int maxcheck)
{
	static u_int i = 0;

	struct pf_idhash *ih;
	struct pf_state *s;
	int rv = 0;

	V_pf_status.states = uma_zone_get_cur(V_pf_state_z);

	/*
	 * Go through hash and unlink states that expire now.
	 */
	while (maxcheck > 0) {

		/* Wrap to start of hash when we hit the end. */
		if (i > V_pf_hashmask) {
			i = 0;
			rv = 1;
		}

		ih = &V_pf_idhash[i];
relock:
		PF_HASHROW_LOCK(ih);
		LIST_FOREACH(s, &ih->states, entry) {
			if (pf_state_expires(s) <= time_uptime) {
				V_pf_status.states -=
				    pf_unlink_state(s, PF_ENTER_LOCKED);
				goto relock;
			}
			s->rule.ptr->rule_flag |= PFRULE_REFS;
			if (s->nat_rule.ptr != NULL)
				s->nat_rule.ptr->rule_flag |= PFRULE_REFS;
			if (s->anchor.ptr != NULL)
				s->anchor.ptr->rule_flag |= PFRULE_REFS;
			s->kif->pfik_flags |= PFI_IFLAG_REFS;
			if (s->rt_kif)
				s->rt_kif->pfik_flags |= PFI_IFLAG_REFS;
		}
		PF_HASHROW_UNLOCK(ih);
		i++;
		maxcheck--;
	}

	V_pf_status.states = uma_zone_get_cur(V_pf_state_z);

	return (rv);
}

static void
pf_purge_unlinked_rules()
{
	struct pf_rulequeue tmpq;
	struct pf_rule *r, *r1;

	/*
	 * Do naive mark-and-sweep garbage collecting of old rules.
	 * Reference flag is raised by pf_purge_expired_states()
	 * and pf_purge_expired_src_nodes().
	 *
	 * To avoid LOR between PF_UNLNKDRULES_LOCK/PF_RULES_WLOCK,
	 * use a temporary queue.
	 */
	TAILQ_INIT(&tmpq);
	PF_UNLNKDRULES_LOCK();
	TAILQ_FOREACH_SAFE(r, &V_pf_unlinked_rules, entries, r1) {
		if (!(r->rule_flag & PFRULE_REFS)) {
			TAILQ_REMOVE(&V_pf_unlinked_rules, r, entries);
			TAILQ_INSERT_TAIL(&tmpq, r, entries);
		} else
			r->rule_flag &= ~PFRULE_REFS;
	}
	PF_UNLNKDRULES_UNLOCK();

	if (!TAILQ_EMPTY(&tmpq)) {
		PF_RULES_WLOCK();
		TAILQ_FOREACH_SAFE(r, &tmpq, entries, r1) {
			TAILQ_REMOVE(&tmpq, r, entries);
			pf_free_rule(r);
		}
		PF_RULES_WUNLOCK();
	}
}

void
pf_print_host(struct pf_addr *addr, u_int16_t p, sa_family_t af)
{
	switch (af) {
#ifdef INET
	case AF_INET: {
		u_int32_t a = ntohl(addr->addr32[0]);
		printf("%u.%u.%u.%u", (a>>24)&255, (a>>16)&255,
		    (a>>8)&255, a&255);
		if (p) {
			p = ntohs(p);
			printf(":%u", p);
		}
		break;
	}
#endif /* INET */
#ifdef INET6
	case AF_INET6: {
		u_int16_t b;
		u_int8_t i, curstart, curend, maxstart, maxend;
		curstart = curend = maxstart = maxend = 255;
		for (i = 0; i < 8; i++) {
			if (!addr->addr16[i]) {
				if (curstart == 255)
					curstart = i;
				curend = i;
			} else {
				if ((curend - curstart) >
				    (maxend - maxstart)) {
					maxstart = curstart;
					maxend = curend;
				}
				curstart = curend = 255;
			}
		}
		if ((curend - curstart) >
		    (maxend - maxstart)) {
			maxstart = curstart;
			maxend = curend;
		}
		for (i = 0; i < 8; i++) {
			if (i >= maxstart && i <= maxend) {
				if (i == 0)
					printf(":");
				if (i == maxend)
					printf(":");
			} else {
				b = ntohs(addr->addr16[i]);
				printf("%x", b);
				if (i < 7)
					printf(":");
			}
		}
		if (p) {
			p = ntohs(p);
			printf("[%u]", p);
		}
		break;
	}
#endif /* INET6 */
	}
}

void
pf_print_state(struct pf_state *s)
{
	pf_print_state_parts(s, NULL, NULL);
}

static void
pf_print_state_parts(struct pf_state *s,
    struct pf_state_key *skwp, struct pf_state_key *sksp)
{
	struct pf_state_key *skw, *sks;
	u_int8_t proto, dir;

	/* Do our best to fill these, but they're skipped if NULL */
	skw = skwp ? skwp : (s ? s->key[PF_SK_WIRE] : NULL);
	sks = sksp ? sksp : (s ? s->key[PF_SK_STACK] : NULL);
	proto = skw ? skw->proto : (sks ? sks->proto : 0);
	dir = s ? s->direction : 0;

	switch (proto) {
	case IPPROTO_IPV4:
		printf("IPv4");
		break;
	case IPPROTO_IPV6:
		printf("IPv6");
		break;
	case IPPROTO_TCP:
		printf("TCP");
		break;
	case IPPROTO_UDP:
		printf("UDP");
		break;
	case IPPROTO_ICMP:
		printf("ICMP");
		break;
	case IPPROTO_ICMPV6:
		printf("ICMPv6");
		break;
	default:
		printf("%u", skw->proto);
		break;
	}
	switch (dir) {
	case PF_IN:
		printf(" in");
		break;
	case PF_OUT:
		printf(" out");
		break;
	}
	if (skw) {
		printf(" wire: ");
		pf_print_host(&skw->addr[0], skw->port[0], skw->af);
		printf(" ");
		pf_print_host(&skw->addr[1], skw->port[1], skw->af);
	}
	if (sks) {
		printf(" stack: ");
		if (sks != skw) {
			pf_print_host(&sks->addr[0], sks->port[0], sks->af);
			printf(" ");
			pf_print_host(&sks->addr[1], sks->port[1], sks->af);
		} else
			printf("-");
	}
	if (s) {
		if (proto == IPPROTO_TCP) {
			printf(" [lo=%u high=%u win=%u modulator=%u",
			    s->src.seqlo, s->src.seqhi,
			    s->src.max_win, s->src.seqdiff);
			if (s->src.wscale && s->dst.wscale)
				printf(" wscale=%u",
				    s->src.wscale & PF_WSCALE_MASK);
			printf("]");
			printf(" [lo=%u high=%u win=%u modulator=%u",
			    s->dst.seqlo, s->dst.seqhi,
			    s->dst.max_win, s->dst.seqdiff);
			if (s->src.wscale && s->dst.wscale)
				printf(" wscale=%u",
				s->dst.wscale & PF_WSCALE_MASK);
			printf("]");
		}
		printf(" %u:%u", s->src.state, s->dst.state);
	}
}

void
pf_print_flags(u_int8_t f)
{
	if (f)
		printf(" ");
	if (f & TH_FIN)
		printf("F");
	if (f & TH_SYN)
		printf("S");
	if (f & TH_RST)
		printf("R");
	if (f & TH_PUSH)
		printf("P");
	if (f & TH_ACK)
		printf("A");
	if (f & TH_URG)
		printf("U");
	if (f & TH_ECE)
		printf("E");
	if (f & TH_CWR)
		printf("W");
}

#define	PF_SET_SKIP_STEPS(i)					\
	do {							\
		while (head[i] != cur) {			\
			head[i]->skip[i].ptr = cur;		\
			head[i] = TAILQ_NEXT(head[i], entries);	\
		}						\
	} while (0)

void
pf_calc_skip_steps(struct pf_rulequeue *rules)
{
	struct pf_rule *cur, *prev, *head[PF_SKIP_COUNT];
	int i;

	cur = TAILQ_FIRST(rules);
	prev = cur;
	for (i = 0; i < PF_SKIP_COUNT; ++i)
		head[i] = cur;
	while (cur != NULL) {

		if (cur->kif != prev->kif || cur->ifnot != prev->ifnot)
			PF_SET_SKIP_STEPS(PF_SKIP_IFP);
		if (cur->direction != prev->direction)
			PF_SET_SKIP_STEPS(PF_SKIP_DIR);
		if (cur->af != prev->af)
			PF_SET_SKIP_STEPS(PF_SKIP_AF);
		if (cur->proto != prev->proto)
			PF_SET_SKIP_STEPS(PF_SKIP_PROTO);
		if (cur->src.neg != prev->src.neg ||
		    pf_addr_wrap_neq(&cur->src.addr, &prev->src.addr))
			PF_SET_SKIP_STEPS(PF_SKIP_SRC_ADDR);
		if (cur->src.port[0] != prev->src.port[0] ||
		    cur->src.port[1] != prev->src.port[1] ||
		    cur->src.port_op != prev->src.port_op)
			PF_SET_SKIP_STEPS(PF_SKIP_SRC_PORT);
		if (cur->dst.neg != prev->dst.neg ||
		    pf_addr_wrap_neq(&cur->dst.addr, &prev->dst.addr))
			PF_SET_SKIP_STEPS(PF_SKIP_DST_ADDR);
		if (cur->dst.port[0] != prev->dst.port[0] ||
		    cur->dst.port[1] != prev->dst.port[1] ||
		    cur->dst.port_op != prev->dst.port_op)
			PF_SET_SKIP_STEPS(PF_SKIP_DST_PORT);

		prev = cur;
		cur = TAILQ_NEXT(cur, entries);
	}
	for (i = 0; i < PF_SKIP_COUNT; ++i)
		PF_SET_SKIP_STEPS(i);
}

static int
pf_addr_wrap_neq(struct pf_addr_wrap *aw1, struct pf_addr_wrap *aw2)
{
	if (aw1->type != aw2->type)
		return (1);
	switch (aw1->type) {
	case PF_ADDR_ADDRMASK:
	case PF_ADDR_RANGE:
		if (PF_ANEQ(&aw1->v.a.addr, &aw2->v.a.addr, 0))
			return (1);
		if (PF_ANEQ(&aw1->v.a.mask, &aw2->v.a.mask, 0))
			return (1);
		return (0);
	case PF_ADDR_DYNIFTL:
		return (aw1->p.dyn->pfid_kt != aw2->p.dyn->pfid_kt);
	case PF_ADDR_NOROUTE:
	case PF_ADDR_URPFFAILED:
		return (0);
	case PF_ADDR_TABLE:
		return (aw1->p.tbl != aw2->p.tbl);
	default:
		printf("invalid address type: %d\n", aw1->type);
		return (1);
	}
}

u_int16_t
pf_cksum_fixup(u_int16_t cksum, u_int16_t old, u_int16_t new, u_int8_t udp)
{
	u_int32_t	l;

	if (udp && !cksum)
		return (0x0000);
	l = cksum + old - new;
	l = (l >> 16) + (l & 65535);
	l = l & 65535;
	if (udp && !l)
		return (0xFFFF);
	return (l);
}

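/*
 * Usage note (editor's sketch, not part of pf): pf_cksum_fixup()
 * implements the incremental Internet checksum update of RFC 1624,
 * folding the difference between an old and a new 16-bit field into an
 * existing checksum instead of recomputing it over the whole packet.
 * Rewriting one field of a hypothetical UDP header "uh" would look
 * roughly like:
 *
 *	u_int16_t old = uh->uh_dport;
 *
 *	uh->uh_dport = htons(8080);
 *	uh->uh_sum = pf_cksum_fixup(uh->uh_sum, old, uh->uh_dport, 1);
 *
 * The "udp" argument preserves the UDP convention that an all-zero
 * checksum means "no checksum": a zero input stays zero, and a zero
 * result is returned as 0xFFFF.
 */
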
1855static void
1856pf_change_ap(struct pf_addr *a, u_int16_t *p, u_int16_t *ic, u_int16_t *pc,
1857    struct pf_addr *an, u_int16_t pn, u_int8_t u, sa_family_t af)
1858{
1859	struct pf_addr	ao;
1860	u_int16_t	po = *p;
1861
1862	PF_ACPY(&ao, a, af);
1863	PF_ACPY(a, an, af);
1864
1865	*p = pn;
1866
1867	switch (af) {
1868#ifdef INET
1869	case AF_INET:
1870		*ic = pf_cksum_fixup(pf_cksum_fixup(*ic,
1871		    ao.addr16[0], an->addr16[0], 0),
1872		    ao.addr16[1], an->addr16[1], 0);
1873		*p = pn;
1874		*pc = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(*pc,
1875		    ao.addr16[0], an->addr16[0], u),
1876		    ao.addr16[1], an->addr16[1], u),
1877		    po, pn, u);
1878		break;
1879#endif /* INET */
1880#ifdef INET6
1881	case AF_INET6:
1882		*pc = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
1883		    pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
1884		    pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(*pc,
1885		    ao.addr16[0], an->addr16[0], u),
1886		    ao.addr16[1], an->addr16[1], u),
1887		    ao.addr16[2], an->addr16[2], u),
1888		    ao.addr16[3], an->addr16[3], u),
1889		    ao.addr16[4], an->addr16[4], u),
1890		    ao.addr16[5], an->addr16[5], u),
1891		    ao.addr16[6], an->addr16[6], u),
1892		    ao.addr16[7], an->addr16[7], u),
1893		    po, pn, u);
1894		break;
1895#endif /* INET6 */
1896	}
1897}
1898
1899
1900/* Changes a u_int32_t.  Uses a void * so there are no align restrictions */
1901void
1902pf_change_a(void *a, u_int16_t *c, u_int32_t an, u_int8_t u)
1903{
1904	u_int32_t	ao;
1905
1906	memcpy(&ao, a, sizeof(ao));
1907	memcpy(a, &an, sizeof(u_int32_t));
1908	*c = pf_cksum_fixup(pf_cksum_fixup(*c, ao / 65536, an / 65536, u),
1909	    ao % 65536, an % 65536, u);
1910}
1911
1912#ifdef INET6
1913static void
1914pf_change_a6(struct pf_addr *a, u_int16_t *c, struct pf_addr *an, u_int8_t u)
1915{
1916	struct pf_addr	ao;
1917
1918	PF_ACPY(&ao, a, AF_INET6);
1919	PF_ACPY(a, an, AF_INET6);
1920
1921	*c = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
1922	    pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
1923	    pf_cksum_fixup(pf_cksum_fixup(*c,
1924	    ao.addr16[0], an->addr16[0], u),
1925	    ao.addr16[1], an->addr16[1], u),
1926	    ao.addr16[2], an->addr16[2], u),
1927	    ao.addr16[3], an->addr16[3], u),
1928	    ao.addr16[4], an->addr16[4], u),
1929	    ao.addr16[5], an->addr16[5], u),
1930	    ao.addr16[6], an->addr16[6], u),
1931	    ao.addr16[7], an->addr16[7], u);
1932}
1933#endif /* INET6 */
1934
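/*
 * Rewrite an address (and optionally a port) quoted inside an ICMP
 * error and repair every checksum that covers it: ia/ip are the inner
 * packet's address and port, oa is the outer header's address, and
 * pc/h2c/ic/hc point at the inner-protocol, inner-IP, ICMP and
 * outer-IP checksums respectively.
 */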
1935static void
1936pf_change_icmp(struct pf_addr *ia, u_int16_t *ip, struct pf_addr *oa,
1937    struct pf_addr *na, u_int16_t np, u_int16_t *pc, u_int16_t *h2c,
1938    u_int16_t *ic, u_int16_t *hc, u_int8_t u, sa_family_t af)
1939{
1940	struct pf_addr	oia, ooa;
1941
1942	PF_ACPY(&oia, ia, af);
1943	if (oa)
1944		PF_ACPY(&ooa, oa, af);
1945
1946	/* Change inner protocol port, fix inner protocol checksum. */
1947	if (ip != NULL) {
1948		u_int16_t	oip = *ip;
1949		u_int32_t	opc;
1950
1951		if (pc != NULL)
1952			opc = *pc;
1953		*ip = np;
1954		if (pc != NULL)
1955			*pc = pf_cksum_fixup(*pc, oip, *ip, u);
1956		*ic = pf_cksum_fixup(*ic, oip, *ip, 0);
1957		if (pc != NULL)
1958			*ic = pf_cksum_fixup(*ic, opc, *pc, 0);
1959	}
1960	/* Change inner ip address, fix inner ip and icmp checksums. */
1961	PF_ACPY(ia, na, af);
1962	switch (af) {
1963#ifdef INET
1964	case AF_INET: {
1965		u_int32_t	 oh2c = *h2c;
1966
1967		*h2c = pf_cksum_fixup(pf_cksum_fixup(*h2c,
1968		    oia.addr16[0], ia->addr16[0], 0),
1969		    oia.addr16[1], ia->addr16[1], 0);
1970		*ic = pf_cksum_fixup(pf_cksum_fixup(*ic,
1971		    oia.addr16[0], ia->addr16[0], 0),
1972		    oia.addr16[1], ia->addr16[1], 0);
1973		*ic = pf_cksum_fixup(*ic, oh2c, *h2c, 0);
1974		break;
1975	}
1976#endif /* INET */
1977#ifdef INET6
1978	case AF_INET6:
1979		*ic = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
1980		    pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
1981		    pf_cksum_fixup(pf_cksum_fixup(*ic,
1982		    oia.addr16[0], ia->addr16[0], u),
1983		    oia.addr16[1], ia->addr16[1], u),
1984		    oia.addr16[2], ia->addr16[2], u),
1985		    oia.addr16[3], ia->addr16[3], u),
1986		    oia.addr16[4], ia->addr16[4], u),
1987		    oia.addr16[5], ia->addr16[5], u),
1988		    oia.addr16[6], ia->addr16[6], u),
1989		    oia.addr16[7], ia->addr16[7], u);
1990		break;
1991#endif /* INET6 */
1992	}
1993	/* Outer ip address, fix outer ip or icmpv6 checksum, if necessary. */
1994	if (oa) {
1995		PF_ACPY(oa, na, af);
1996		switch (af) {
1997#ifdef INET
1998		case AF_INET:
1999			*hc = pf_cksum_fixup(pf_cksum_fixup(*hc,
2000			    ooa.addr16[0], oa->addr16[0], 0),
2001			    ooa.addr16[1], oa->addr16[1], 0);
2002			break;
2003#endif /* INET */
2004#ifdef INET6
2005		case AF_INET6:
2006			*ic = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
2007			    pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
2008			    pf_cksum_fixup(pf_cksum_fixup(*ic,
2009			    ooa.addr16[0], oa->addr16[0], u),
2010			    ooa.addr16[1], oa->addr16[1], u),
2011			    ooa.addr16[2], oa->addr16[2], u),
2012			    ooa.addr16[3], oa->addr16[3], u),
2013			    ooa.addr16[4], oa->addr16[4], u),
2014			    ooa.addr16[5], oa->addr16[5], u),
2015			    ooa.addr16[6], oa->addr16[6], u),
2016			    ooa.addr16[7], oa->addr16[7], u);
2017			break;
2018#endif /* INET6 */
2019		}
2020	}
2021}
2022
2023
2024/*
2025 * Need to modulate the sequence numbers in the TCP SACK option
2026 * (credits to Krzysztof Pfaff for report and patch)
2027 */
2028static int
2029pf_modulate_sack(struct mbuf *m, int off, struct pf_pdesc *pd,
2030    struct tcphdr *th, struct pf_state_peer *dst)
2031{
2032	int hlen = (th->th_off << 2) - sizeof(*th), thoptlen = hlen;
2033	u_int8_t opts[TCP_MAXOLEN], *opt = opts;
2034	int copyback = 0, i, olen;
2035	struct sackblk sack;
2036
2037#define	TCPOLEN_SACKLEN	(TCPOLEN_SACK + 2)
2038	if (hlen < TCPOLEN_SACKLEN ||
2039	    !pf_pull_hdr(m, off + sizeof(*th), opts, hlen, NULL, NULL, pd->af))
2040		return (0);
2041
2042	while (hlen >= TCPOLEN_SACKLEN) {
2043		olen = opt[1];
2044		switch (*opt) {
2045		case TCPOPT_EOL:	/* FALLTHROUGH */
2046		case TCPOPT_NOP:
2047			opt++;
2048			hlen--;
2049			break;
2050		case TCPOPT_SACK:
2051			if (olen > hlen)
2052				olen = hlen;
2053			if (olen >= TCPOLEN_SACKLEN) {
2054				for (i = 2; i + TCPOLEN_SACK <= olen;
2055				    i += TCPOLEN_SACK) {
2056					memcpy(&sack, &opt[i], sizeof(sack));
2057					pf_change_a(&sack.start, &th->th_sum,
2058					    htonl(ntohl(sack.start) -
2059					    dst->seqdiff), 0);
2060					pf_change_a(&sack.end, &th->th_sum,
2061					    htonl(ntohl(sack.end) -
2062					    dst->seqdiff), 0);
2063					memcpy(&opt[i], &sack, sizeof(sack));
2064				}
2065				copyback = 1;
2066			}
2067			/* FALLTHROUGH */
2068		default:
2069			if (olen < 2)
2070				olen = 2;
2071			hlen -= olen;
2072			opt += olen;
2073		}
2074	}
2075
2076	if (copyback)
2077		m_copyback(m, off + sizeof(*th), thoptlen, (caddr_t)opts);
2078	return (copyback);
2079}
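/*
 * For reference, the SACK option (RFC 2018) parsed above is laid out
 * as kind (5), length, then up to four 32-bit start/end pairs:
 *
 *	+------+------+-----------+-----------+ ...
 *	| 0x05 | len  |  start 1  |   end 1   |
 *	+------+------+-----------+-----------+ ...
 *
 * Every edge must be shifted by the same seqdiff already applied to
 * th_ack, or the peer would see SACK blocks outside its own sequence
 * space.
 */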
2080
2081static void
2082pf_send_tcp(struct mbuf *replyto, const struct pf_rule *r, sa_family_t af,
2083    const struct pf_addr *saddr, const struct pf_addr *daddr,
2084    u_int16_t sport, u_int16_t dport, u_int32_t seq, u_int32_t ack,
2085    u_int8_t flags, u_int16_t win, u_int16_t mss, u_int8_t ttl, int tag,
2086    u_int16_t rtag, struct ifnet *ifp)
2087{
2088	struct pf_send_entry *pfse;
2089	struct mbuf	*m;
2090	int		 len, tlen;
2091#ifdef INET
2092	struct ip	*h = NULL;
2093#endif /* INET */
2094#ifdef INET6
2095	struct ip6_hdr	*h6 = NULL;
2096#endif /* INET6 */
2097	struct tcphdr	*th;
2098	char		*opt;
2099	struct pf_mtag  *pf_mtag;
2100
2101	len = 0;
2102	th = NULL;
2103
2104	/* maximum segment size tcp option */
2105	tlen = sizeof(struct tcphdr);
2106	if (mss)
2107		tlen += 4;
2108
2109	switch (af) {
2110#ifdef INET
2111	case AF_INET:
2112		len = sizeof(struct ip) + tlen;
2113		break;
2114#endif /* INET */
2115#ifdef INET6
2116	case AF_INET6:
2117		len = sizeof(struct ip6_hdr) + tlen;
2118		break;
2119#endif /* INET6 */
2120	default:
2121		panic("%s: unsupported af %d", __func__, af);
2122	}
2123
2124	/* Allocate outgoing queue entry, mbuf and mbuf tag. */
2125	pfse = malloc(sizeof(*pfse), M_PFTEMP, M_NOWAIT);
2126	if (pfse == NULL)
2127		return;
2128	m = m_gethdr(M_NOWAIT, MT_HEADER);
2129	if (m == NULL) {
2130		free(pfse, M_PFTEMP);
2131		return;
2132	}
2133#ifdef MAC
2134	mac_netinet_firewall_send(m);
2135#endif
2136	if ((pf_mtag = pf_get_mtag(m)) == NULL) {
2137		free(pfse, M_PFTEMP);
2138		m_freem(m);
2139		return;
2140	}
2141	if (tag)
2142		m->m_flags |= M_SKIP_FIREWALL;
2143	pf_mtag->tag = rtag;
2144
2145	if (r != NULL && r->rtableid >= 0)
2146		M_SETFIB(m, r->rtableid);
2147
2148#ifdef ALTQ
2149	if (r != NULL && r->qid) {
2150		pf_mtag->qid = r->qid;
2151
2152		/* add hints for ecn */
2153		pf_mtag->hdr = mtod(m, struct ip *);
2154	}
2155#endif /* ALTQ */
2156	m->m_data += max_linkhdr;
2157	m->m_pkthdr.len = m->m_len = len;
2158	m->m_pkthdr.rcvif = NULL;
2159	bzero(m->m_data, len);
2160	switch (af) {
2161#ifdef INET
2162	case AF_INET:
2163		h = mtod(m, struct ip *);
2164
2165		/* IP header fields included in the TCP checksum */
2166		h->ip_p = IPPROTO_TCP;
2167		h->ip_len = htons(tlen);
2168		h->ip_src.s_addr = saddr->v4.s_addr;
2169		h->ip_dst.s_addr = daddr->v4.s_addr;
2170
2171		th = (struct tcphdr *)((caddr_t)h + sizeof(struct ip));
2172		break;
2173#endif /* INET */
2174#ifdef INET6
2175	case AF_INET6:
2176		h6 = mtod(m, struct ip6_hdr *);
2177
2178		/* IP header fields included in the TCP checksum */
2179		h6->ip6_nxt = IPPROTO_TCP;
2180		h6->ip6_plen = htons(tlen);
2181		memcpy(&h6->ip6_src, &saddr->v6, sizeof(struct in6_addr));
2182		memcpy(&h6->ip6_dst, &daddr->v6, sizeof(struct in6_addr));
2183
2184		th = (struct tcphdr *)((caddr_t)h6 + sizeof(struct ip6_hdr));
2185		break;
2186#endif /* INET6 */
2187	}
2188
2189	/* TCP header */
2190	th->th_sport = sport;
2191	th->th_dport = dport;
2192	th->th_seq = htonl(seq);
2193	th->th_ack = htonl(ack);
2194	th->th_off = tlen >> 2;
2195	th->th_flags = flags;
2196	th->th_win = htons(win);
2197
2198	if (mss) {
2199		opt = (char *)(th + 1);
2200		opt[0] = TCPOPT_MAXSEG;
2201		opt[1] = 4;
2202		HTONS(mss);
2203		bcopy((caddr_t)&mss, (caddr_t)(opt + 2), 2);
2204	}
2205
2206	switch (af) {
2207#ifdef INET
2208	case AF_INET:
2209		/* TCP checksum */
2210		th->th_sum = in_cksum(m, len);
2211
2212		/* Finish the IP header */
2213		h->ip_v = 4;
2214		h->ip_hl = sizeof(*h) >> 2;
2215		h->ip_tos = IPTOS_LOWDELAY;
2216		h->ip_off = V_path_mtu_discovery ? IP_DF : 0;
2217		h->ip_len = len;
2218		h->ip_ttl = ttl ? ttl : V_ip_defttl;
2219		h->ip_sum = 0;
2220
2221		pfse->pfse_type = PFSE_IP;
2222		break;
2223#endif /* INET */
2224#ifdef INET6
2225	case AF_INET6:
2226		/* TCP checksum */
2227		th->th_sum = in6_cksum(m, IPPROTO_TCP,
2228		    sizeof(struct ip6_hdr), tlen);
2229
2230		h6->ip6_vfc |= IPV6_VERSION;
2231		h6->ip6_hlim = IPV6_DEFHLIM;
2232
2233		pfse->pfse_type = PFSE_IP6;
2234		break;
2235#endif /* INET6 */
2236	}
2237	pfse->pfse_m = m;
2238	pf_send(pfse);
2239}
2240
2241static void
2242pf_send_icmp(struct mbuf *m, u_int8_t type, u_int8_t code, sa_family_t af,
2243    struct pf_rule *r)
2244{
2245	struct pf_send_entry *pfse;
2246	struct mbuf *m0;
2247	struct pf_mtag *pf_mtag;
2248
2249	/* Allocate outgoing queue entry, mbuf and mbuf tag. */
2250	pfse = malloc(sizeof(*pfse), M_PFTEMP, M_NOWAIT);
2251	if (pfse == NULL)
2252		return;
2253
2254	if ((m0 = m_copypacket(m, M_NOWAIT)) == NULL) {
2255		free(pfse, M_PFTEMP);
2256		return;
2257	}
2258
2259	if ((pf_mtag = pf_get_mtag(m0)) == NULL) {
2260		free(pfse, M_PFTEMP);
		m_freem(m0);
2261		return;
2262	}
2263	/* XXX: revisit */
2264	m0->m_flags |= M_SKIP_FIREWALL;
2265
2266	if (r->rtableid >= 0)
2267		M_SETFIB(m0, r->rtableid);
2268
2269#ifdef ALTQ
2270	if (r->qid) {
2271		pf_mtag->qid = r->qid;
2272		/* add hints for ecn */
2273		pf_mtag->hdr = mtod(m0, struct ip *);
2274	}
2275#endif /* ALTQ */
2276
2277	switch (af) {
2278#ifdef INET
2279	case AF_INET:
2280	    {
2281		struct ip *ip;
2282
2283		/* icmp_error() expects host byte ordering */
2284		ip = mtod(m0, struct ip *);
2285		NTOHS(ip->ip_len);
2286		NTOHS(ip->ip_off);
2287
2288		pfse->pfse_type = PFSE_ICMP;
2289		break;
2290	    }
2291#endif /* INET */
2292#ifdef INET6
2293	case AF_INET6:
2294		pfse->pfse_type = PFSE_ICMP6;
2295		break;
2296#endif /* INET6 */
2297	}
2298	pfse->pfse_m = m0;
2299	pfse->pfse_icmp_type = type;
2300	pfse->pfse_icmp_code = code;
2301	pf_send(pfse);
2302}
2303
2304/*
2305 * Compare addresses a and b under mask m.  If n is 0, return 1 when the
2306 * masked addresses are equal and 0 otherwise.  If n is non-zero, the sense
2307 * is inverted: return 1 when the masked addresses differ.
2308 */
2309int
2310pf_match_addr(u_int8_t n, struct pf_addr *a, struct pf_addr *m,
2311    struct pf_addr *b, sa_family_t af)
2312{
2313	int	match = 0;
2314
2315	switch (af) {
2316#ifdef INET
2317	case AF_INET:
2318		if ((a->addr32[0] & m->addr32[0]) ==
2319		    (b->addr32[0] & m->addr32[0]))
2320			match++;
2321		break;
2322#endif /* INET */
2323#ifdef INET6
2324	case AF_INET6:
2325		if (((a->addr32[0] & m->addr32[0]) ==
2326		     (b->addr32[0] & m->addr32[0])) &&
2327		    ((a->addr32[1] & m->addr32[1]) ==
2328		     (b->addr32[1] & m->addr32[1])) &&
2329		    ((a->addr32[2] & m->addr32[2]) ==
2330		     (b->addr32[2] & m->addr32[2])) &&
2331		    ((a->addr32[3] & m->addr32[3]) ==
2332		     (b->addr32[3] & m->addr32[3])))
2333			match++;
2334		break;
2335#endif /* INET6 */
2336	}
2337	if (match) {
2338		if (n)
2339			return (0);
2340		else
2341			return (1);
2342	} else {
2343		if (n)
2344			return (1);
2345		else
2346			return (0);
2347	}
2348}
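/*
 * Example: with af == AF_INET, a = 10.0.0.1, m = 255.255.255.0 and
 * b = 10.0.0.200, the masked compare sees 10.0.0.0 == 10.0.0.0, so
 * pf_match_addr(0, ...) returns 1 and pf_match_addr(1, ...) returns 0.
 */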
2349
2350/*
2351 * Return 1 if b <= a <= e, otherwise return 0.
2352 */
2353int
2354pf_match_addr_range(struct pf_addr *b, struct pf_addr *e,
2355    struct pf_addr *a, sa_family_t af)
2356{
2357	switch (af) {
2358#ifdef INET
2359	case AF_INET:
2360		if ((ntohl(a->addr32[0]) < ntohl(b->addr32[0])) ||
2361		    (ntohl(a->addr32[0]) > ntohl(e->addr32[0])))
2362			return (0);
2363		break;
2364#endif /* INET */
2365#ifdef INET6
2366	case AF_INET6: {
2367		int	i;
2368
2369		/* check a >= b */
2370		for (i = 0; i < 4; ++i)
2371			if (ntohl(a->addr32[i]) > ntohl(b->addr32[i]))
2372				break;
2373			else if (ntohl(a->addr32[i]) < ntohl(b->addr32[i]))
2374				return (0);
2375		/* check a <= e */
2376		for (i = 0; i < 4; ++i)
2377			if (ntohl(a->addr32[i]) < ntohl(e->addr32[i]))
2378				break;
2379			else if (ntohl(a->addr32[i]) > ntohl(e->addr32[i]))
2380				return (0);
2381		break;
2382	}
2383#endif /* INET6 */
2384	}
2385	return (1);
2386}
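/*
 * Example: a rule range 10.0.0.5 - 10.0.0.9 arrives as b and e;
 * 10.0.0.7 matches, 10.0.0.10 does not.  The words are compared in
 * host byte order so the ordering is endian-safe.
 */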
2387
2388static int
2389pf_match(u_int8_t op, u_int32_t a1, u_int32_t a2, u_int32_t p)
2390{
2391	switch (op) {
2392	case PF_OP_IRG:
2393		return ((p > a1) && (p < a2));
2394	case PF_OP_XRG:
2395		return ((p < a1) || (p > a2));
2396	case PF_OP_RRG:
2397		return ((p >= a1) && (p <= a2));
2398	case PF_OP_EQ:
2399		return (p == a1);
2400	case PF_OP_NE:
2401		return (p != a1);
2402	case PF_OP_LT:
2403		return (p < a1);
2404	case PF_OP_LE:
2405		return (p <= a1);
2406	case PF_OP_GT:
2407		return (p > a1);
2408	case PF_OP_GE:
2409		return (p >= a1);
2410	}
2411	return (0); /* never reached */
2412}
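/*
 * Example: "port 1000:2000" in a rule becomes (roughly) PF_OP_RRG with
 * a1 = 1000 and a2 = 2000, so pf_match(PF_OP_RRG, 1000, 2000, 1500)
 * returns 1 and pf_match(PF_OP_RRG, 1000, 2000, 2001) returns 0.
 */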
2413
2414int
2415pf_match_port(u_int8_t op, u_int16_t a1, u_int16_t a2, u_int16_t p)
2416{
2417	NTOHS(a1);
2418	NTOHS(a2);
2419	NTOHS(p);
2420	return (pf_match(op, a1, a2, p));
2421}
2422
2423static int
2424pf_match_uid(u_int8_t op, uid_t a1, uid_t a2, uid_t u)
2425{
2426	if (u == UID_MAX && op != PF_OP_EQ && op != PF_OP_NE)
2427		return (0);
2428	return (pf_match(op, a1, a2, u));
2429}
2430
2431static int
2432pf_match_gid(u_int8_t op, gid_t a1, gid_t a2, gid_t g)
2433{
2434	if (g == GID_MAX && op != PF_OP_EQ && op != PF_OP_NE)
2435		return (0);
2436	return (pf_match(op, a1, a2, g));
2437}
2438
2439int
2440pf_match_tag(struct mbuf *m, struct pf_rule *r, int *tag, int mtag)
2441{
2442	if (*tag == -1)
2443		*tag = mtag;
2444
2445	return ((!r->match_tag_not && r->match_tag == *tag) ||
2446	    (r->match_tag_not && r->match_tag != *tag));
2447}
2448
2449int
2450pf_tag_packet(struct mbuf *m, struct pf_pdesc *pd, int tag)
2451{
2452
2453	KASSERT(tag > 0, ("%s: tag %d", __func__, tag));
2454
2455	if (pd->pf_mtag == NULL && ((pd->pf_mtag = pf_get_mtag(m)) == NULL))
2456		return (ENOMEM);
2457
2458	pd->pf_mtag->tag = tag;
2459
2460	return (0);
2461}
2462
2463void
2464pf_step_into_anchor(int *depth, struct pf_ruleset **rs, int n,
2465    struct pf_rule **r, struct pf_rule **a, int *match)
2466{
2467	struct pf_anchor_stackframe	*f;
2468
2469	PF_RULES_RASSERT();
2470
2471	(*r)->anchor->match = 0;
2472	if (match)
2473		*match = 0;
2474	if (*depth >= sizeof(V_pf_anchor_stack) /
2475	    sizeof(V_pf_anchor_stack[0])) {
2476		printf("pf_step_into_anchor: stack overflow\n");
2477		*r = TAILQ_NEXT(*r, entries);
2478		return;
2479	} else if (*depth == 0 && a != NULL)
2480		*a = *r;
2481	f = V_pf_anchor_stack + (*depth)++;
2482	f->rs = *rs;
2483	f->r = *r;
2484	if ((*r)->anchor_wildcard) {
2485		f->parent = &(*r)->anchor->children;
2486		if ((f->child = RB_MIN(pf_anchor_node, f->parent)) ==
2487		    NULL) {
2488			*r = NULL;
2489			return;
2490		}
2491		*rs = &f->child->ruleset;
2492	} else {
2493		f->parent = NULL;
2494		f->child = NULL;
2495		*rs = &(*r)->anchor->ruleset;
2496	}
2497	*r = TAILQ_FIRST((*rs)->rules[n].active.ptr);
2498}
2499
2500int
2501pf_step_out_of_anchor(int *depth, struct pf_ruleset **rs, int n,
2502    struct pf_rule **r, struct pf_rule **a, int *match)
2503{
2504	struct pf_anchor_stackframe	*f;
2505	int quick = 0;
2506
2507	PF_RULES_RASSERT();
2508
2509	do {
2510		if (*depth <= 0)
2511			break;
2512		f = V_pf_anchor_stack + *depth - 1;
2513		if (f->parent != NULL && f->child != NULL) {
2514			if (f->child->match ||
2515			    (match != NULL && *match)) {
2516				f->r->anchor->match = 1;
2517				if (match != NULL)
					*match = 0;
2518			}
2519			f->child = RB_NEXT(pf_anchor_node, f->parent, f->child);
2520			if (f->child != NULL) {
2521				*rs = &f->child->ruleset;
2522				*r = TAILQ_FIRST((*rs)->rules[n].active.ptr);
2523				if (*r == NULL)
2524					continue;
2525				else
2526					break;
2527			}
2528		}
2529		(*depth)--;
2530		if (*depth == 0 && a != NULL)
2531			*a = NULL;
2532		*rs = f->rs;
2533		if (f->r->anchor->match || (match != NULL && *match))
2534			quick = f->r->quick;
2535		*r = TAILQ_NEXT(f->r, entries);
2536	} while (*r == NULL);
2537
2538	return (quick);
2539}
2540
2541#ifdef INET6
2542void
2543pf_poolmask(struct pf_addr *naddr, struct pf_addr *raddr,
2544    struct pf_addr *rmask, struct pf_addr *saddr, sa_family_t af)
2545{
2546	switch (af) {
2547#ifdef INET
2548	case AF_INET:
2549		naddr->addr32[0] = (raddr->addr32[0] & rmask->addr32[0]) |
2550		((rmask->addr32[0] ^ 0xffffffff ) & saddr->addr32[0]);
2551		break;
2552#endif /* INET */
2553	case AF_INET6:
2554		naddr->addr32[0] = (raddr->addr32[0] & rmask->addr32[0]) |
2555		((rmask->addr32[0] ^ 0xffffffff ) & saddr->addr32[0]);
2556		naddr->addr32[1] = (raddr->addr32[1] & rmask->addr32[1]) |
2557		((rmask->addr32[1] ^ 0xffffffff ) & saddr->addr32[1]);
2558		naddr->addr32[2] = (raddr->addr32[2] & rmask->addr32[2]) |
2559		((rmask->addr32[2] ^ 0xffffffff ) & saddr->addr32[2]);
2560		naddr->addr32[3] = (raddr->addr32[3] & rmask->addr32[3]) |
2561		((rmask->addr32[3] ^ 0xffffffff ) & saddr->addr32[3]);
2562		break;
2563	}
2564}
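/*
 * Example: raddr 192.0.2.0 with rmask 255.255.255.0 and source
 * 10.1.2.3 yields 192.0.2.3 -- the pool network combined with the
 * host bits of the original source (e.g. for the bitmask pool option).
 */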
2565
2566void
2567pf_addr_inc(struct pf_addr *addr, sa_family_t af)
2568{
2569	switch (af) {
2570#ifdef INET
2571	case AF_INET:
2572		addr->addr32[0] = htonl(ntohl(addr->addr32[0]) + 1);
2573		break;
2574#endif /* INET */
2575	case AF_INET6:
2576		if (addr->addr32[3] == 0xffffffff) {
2577			addr->addr32[3] = 0;
2578			if (addr->addr32[2] == 0xffffffff) {
2579				addr->addr32[2] = 0;
2580				if (addr->addr32[1] == 0xffffffff) {
2581					addr->addr32[1] = 0;
2582					addr->addr32[0] =
2583					    htonl(ntohl(addr->addr32[0]) + 1);
2584				} else
2585					addr->addr32[1] =
2586					    htonl(ntohl(addr->addr32[1]) + 1);
2587			} else
2588				addr->addr32[2] =
2589				    htonl(ntohl(addr->addr32[2]) + 1);
2590		} else
2591			addr->addr32[3] =
2592			    htonl(ntohl(addr->addr32[3]) + 1);
2593		break;
2594	}
2595}
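/*
 * Example: incrementing 2001:db8::ffff:ffff:ffff:ffff carries through
 * the three all-ones words and yields 2001:db8:0:1:: -- a plain
 * 128-bit increment done 32 bits at a time in network byte order.
 */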
2596#endif /* INET6 */
2597
2598int
2599pf_socket_lookup(int direction, struct pf_pdesc *pd, struct mbuf *m)
2600{
2601	struct pf_addr		*saddr, *daddr;
2602	u_int16_t		 sport, dport;
2603	struct inpcbinfo	*pi;
2604	struct inpcb		*inp;
2605
2606	pd->lookup.uid = UID_MAX;
2607	pd->lookup.gid = GID_MAX;
2608
2609	switch (pd->proto) {
2610	case IPPROTO_TCP:
2611		if (pd->hdr.tcp == NULL)
2612			return (-1);
2613		sport = pd->hdr.tcp->th_sport;
2614		dport = pd->hdr.tcp->th_dport;
2615		pi = &V_tcbinfo;
2616		break;
2617	case IPPROTO_UDP:
2618		if (pd->hdr.udp == NULL)
2619			return (-1);
2620		sport = pd->hdr.udp->uh_sport;
2621		dport = pd->hdr.udp->uh_dport;
2622		pi = &V_udbinfo;
2623		break;
2624	default:
2625		return (-1);
2626	}
2627	if (direction == PF_IN) {
2628		saddr = pd->src;
2629		daddr = pd->dst;
2630	} else {
2631		u_int16_t	p;
2632
2633		p = sport;
2634		sport = dport;
2635		dport = p;
2636		saddr = pd->dst;
2637		daddr = pd->src;
2638	}
2639	switch (pd->af) {
2640#ifdef INET
2641	case AF_INET:
2642		inp = in_pcblookup_mbuf(pi, saddr->v4, sport, daddr->v4,
2643		    dport, INPLOOKUP_RLOCKPCB, NULL, m);
2644		if (inp == NULL) {
2645			inp = in_pcblookup_mbuf(pi, saddr->v4, sport,
2646			   daddr->v4, dport, INPLOOKUP_WILDCARD |
2647			   INPLOOKUP_RLOCKPCB, NULL, m);
2648			if (inp == NULL)
2649				return (-1);
2650		}
2651		break;
2652#endif /* INET */
2653#ifdef INET6
2654	case AF_INET6:
2655		inp = in6_pcblookup_mbuf(pi, &saddr->v6, sport, &daddr->v6,
2656		    dport, INPLOOKUP_RLOCKPCB, NULL, m);
2657		if (inp == NULL) {
2658			inp = in6_pcblookup_mbuf(pi, &saddr->v6, sport,
2659			    &daddr->v6, dport, INPLOOKUP_WILDCARD |
2660			    INPLOOKUP_RLOCKPCB, NULL, m);
2661			if (inp == NULL)
2662				return (-1);
2663		}
2664		break;
2665#endif /* INET6 */
2666
2667	default:
2668		return (-1);
2669	}
2670	INP_RLOCK_ASSERT(inp);
2671	pd->lookup.uid = inp->inp_cred->cr_uid;
2672	pd->lookup.gid = inp->inp_cred->cr_groups[0];
2673	INP_RUNLOCK(inp);
2674
2675	return (1);
2676}
2677
2678static u_int8_t
2679pf_get_wscale(struct mbuf *m, int off, u_int16_t th_off, sa_family_t af)
2680{
2681	int		 hlen;
2682	u_int8_t	 hdr[60];
2683	u_int8_t	*opt, optlen;
2684	u_int8_t	 wscale = 0;
2685
2686	hlen = th_off << 2;		/* hlen <= sizeof(hdr) */
2687	if (hlen <= sizeof(struct tcphdr))
2688		return (0);
2689	if (!pf_pull_hdr(m, off, hdr, hlen, NULL, NULL, af))
2690		return (0);
2691	opt = hdr + sizeof(struct tcphdr);
2692	hlen -= sizeof(struct tcphdr);
2693	while (hlen >= 3) {
2694		switch (*opt) {
2695		case TCPOPT_EOL:
2696		case TCPOPT_NOP:
2697			++opt;
2698			--hlen;
2699			break;
2700		case TCPOPT_WINDOW:
2701			wscale = opt[2];
2702			if (wscale > TCP_MAX_WINSHIFT)
2703				wscale = TCP_MAX_WINSHIFT;
2704			wscale |= PF_WSCALE_FLAG;
2705			/* FALLTHROUGH */
2706		default:
2707			optlen = opt[1];
2708			if (optlen < 2)
2709				optlen = 2;
2710			hlen -= optlen;
2711			opt += optlen;
2712			break;
2713		}
2714	}
2715	return (wscale);
2716}
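/*
 * The result carries PF_WSCALE_FLAG or'ed in so that a genuine shift
 * of zero (a "wscale 0" option was present) can be told apart from
 * "no window-scale option seen" (plain 0).
 */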
2717
2718static u_int16_t
2719pf_get_mss(struct mbuf *m, int off, u_int16_t th_off, sa_family_t af)
2720{
2721	int		 hlen;
2722	u_int8_t	 hdr[60];
2723	u_int8_t	*opt, optlen;
2724	u_int16_t	 mss = V_tcp_mssdflt;
2725
2726	hlen = th_off << 2;	/* hlen <= sizeof(hdr) */
2727	if (hlen <= sizeof(struct tcphdr))
2728		return (0);
2729	if (!pf_pull_hdr(m, off, hdr, hlen, NULL, NULL, af))
2730		return (0);
2731	opt = hdr + sizeof(struct tcphdr);
2732	hlen -= sizeof(struct tcphdr);
2733	while (hlen >= TCPOLEN_MAXSEG) {
2734		switch (*opt) {
2735		case TCPOPT_EOL:
2736		case TCPOPT_NOP:
2737			++opt;
2738			--hlen;
2739			break;
2740		case TCPOPT_MAXSEG:
2741			bcopy((caddr_t)(opt + 2), (caddr_t)&mss, 2);
2742			NTOHS(mss);
2743			/* FALLTHROUGH */
2744		default:
2745			optlen = opt[1];
2746			if (optlen < 2)
2747				optlen = 2;
2748			hlen -= optlen;
2749			opt += optlen;
2750			break;
2751		}
2752	}
2753	return (mss);
2754}
2755
2756static u_int16_t
2757pf_calc_mss(struct pf_addr *addr, sa_family_t af, int rtableid, u_int16_t offer)
2758{
2759#ifdef INET
2760	struct sockaddr_in	*dst;
2761	struct route		 ro;
2762#endif /* INET */
2763#ifdef INET6
2764	struct sockaddr_in6	*dst6;
2765	struct route_in6	 ro6;
2766#endif /* INET6 */
2767	struct rtentry		*rt = NULL;
2768	int			 hlen = 0;
2769	u_int16_t		 mss = V_tcp_mssdflt;
2770
2771	switch (af) {
2772#ifdef INET
2773	case AF_INET:
2774		hlen = sizeof(struct ip);
2775		bzero(&ro, sizeof(ro));
2776		dst = (struct sockaddr_in *)&ro.ro_dst;
2777		dst->sin_family = AF_INET;
2778		dst->sin_len = sizeof(*dst);
2779		dst->sin_addr = addr->v4;
2780		in_rtalloc_ign(&ro, 0, rtableid);
2781		rt = ro.ro_rt;
2782		break;
2783#endif /* INET */
2784#ifdef INET6
2785	case AF_INET6:
2786		hlen = sizeof(struct ip6_hdr);
2787		bzero(&ro6, sizeof(ro6));
2788		dst6 = (struct sockaddr_in6 *)&ro6.ro_dst;
2789		dst6->sin6_family = AF_INET6;
2790		dst6->sin6_len = sizeof(*dst6);
2791		dst6->sin6_addr = addr->v6;
2792		in6_rtalloc_ign(&ro6, 0, rtableid);
2793		rt = ro6.ro_rt;
2794		break;
2795#endif /* INET6 */
2796	}
2797
2798	if (rt && rt->rt_ifp) {
2799		mss = rt->rt_ifp->if_mtu - hlen - sizeof(struct tcphdr);
2800		mss = max(V_tcp_mssdflt, mss);
2801		RTFREE(rt);
2802	}
2803	mss = min(mss, offer);
2804	mss = max(mss, 64);		/* sanity - at least max opt space */
2805	return (mss);
2806}
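/*
 * Example: a route out an MTU-1500 interface with an IPv4 header gives
 * 1500 - 20 - 20 = 1460; an offered MSS of 1380 then wins via min(),
 * and the floor of 64 keeps a bogus offer from producing a useless MSS.
 */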
2807
2808static void
2809pf_set_rt_ifp(struct pf_state *s, struct pf_addr *saddr)
2810{
2811	struct pf_rule *r = s->rule.ptr;
2812	struct pf_src_node *sn = NULL;
2813
2814	s->rt_kif = NULL;
2815	if (!r->rt || r->rt == PF_FASTROUTE)
2816		return;
2817	switch (s->key[PF_SK_WIRE]->af) {
2818#ifdef INET
2819	case AF_INET:
2820		pf_map_addr(AF_INET, r, saddr, &s->rt_addr, NULL, &sn);
2821		s->rt_kif = r->rpool.cur->kif;
2822		break;
2823#endif /* INET */
2824#ifdef INET6
2825	case AF_INET6:
2826		pf_map_addr(AF_INET6, r, saddr, &s->rt_addr, NULL, &sn);
2827		s->rt_kif = r->rpool.cur->kif;
2828		break;
2829#endif /* INET6 */
2830	}
2831}
2832
2833static u_int32_t
2834pf_tcp_iss(struct pf_pdesc *pd)
2835{
2836	MD5_CTX ctx;
2837	u_int32_t digest[4];
2838
2839	if (V_pf_tcp_secret_init == 0) {
2840		read_random(&V_pf_tcp_secret, sizeof(V_pf_tcp_secret));
2841		MD5Init(&V_pf_tcp_secret_ctx);
2842		MD5Update(&V_pf_tcp_secret_ctx, V_pf_tcp_secret,
2843		    sizeof(V_pf_tcp_secret));
2844		V_pf_tcp_secret_init = 1;
2845	}
2846
2847	ctx = V_pf_tcp_secret_ctx;
2848
2849	MD5Update(&ctx, (char *)&pd->hdr.tcp->th_sport, sizeof(u_short));
2850	MD5Update(&ctx, (char *)&pd->hdr.tcp->th_dport, sizeof(u_short));
2851	if (pd->af == AF_INET6) {
2852		MD5Update(&ctx, (char *)&pd->src->v6, sizeof(struct in6_addr));
2853		MD5Update(&ctx, (char *)&pd->dst->v6, sizeof(struct in6_addr));
2854	} else {
2855		MD5Update(&ctx, (char *)&pd->src->v4, sizeof(struct in_addr));
2856		MD5Update(&ctx, (char *)&pd->dst->v4, sizeof(struct in_addr));
2857	}
2858	MD5Final((u_char *)digest, &ctx);
2859	V_pf_tcp_iss_off += 4096;
2860#define	ISN_RANDOM_INCREMENT (4096 - 1)
2861	return (digest[0] + (arc4random() & ISN_RANDOM_INCREMENT) +
2862	    V_pf_tcp_iss_off);
2863#undef	ISN_RANDOM_INCREMENT
2864}
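/*
 * This follows the RFC 1948 idea: an MD5 hash of the connection
 * 4-tuple under a lazily initialized random secret, plus an offset
 * that advances by 4096 per generated ISS and a random component below
 * 4096, giving ISNs that are hard to predict yet spaced far enough
 * apart across quick connection reuse.
 */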
2865
2866static int
2867pf_test_rule(struct pf_rule **rm, struct pf_state **sm, int direction,
2868    struct pfi_kif *kif, struct mbuf *m, int off, struct pf_pdesc *pd,
2869    struct pf_rule **am, struct pf_ruleset **rsm, struct inpcb *inp)
2870{
2871	struct pf_rule		*nr = NULL;
2872	struct pf_addr		* const saddr = pd->src;
2873	struct pf_addr		* const daddr = pd->dst;
2874	sa_family_t		 af = pd->af;
2875	struct pf_rule		*r, *a = NULL;
2876	struct pf_ruleset	*ruleset = NULL;
2877	struct pf_src_node	*nsn = NULL;
2878	struct tcphdr		*th = pd->hdr.tcp;
2879	struct pf_state_key	*sk = NULL, *nk = NULL;
2880	u_short			 reason;
2881	int			 rewrite = 0, hdrlen = 0;
2882	int			 tag = -1, rtableid = -1;
2883	int			 asd = 0;
2884	int			 match = 0;
2885	int			 state_icmp = 0;
2886	u_int16_t		 sport = 0, dport = 0;
2887	u_int16_t		 bproto_sum = 0, bip_sum = 0;
2888	u_int8_t		 icmptype = 0, icmpcode = 0;
2889
2890	PF_RULES_RASSERT();
2891
2892	if (inp != NULL) {
2893		INP_LOCK_ASSERT(inp);
2894		pd->lookup.uid = inp->inp_cred->cr_uid;
2895		pd->lookup.gid = inp->inp_cred->cr_groups[0];
2896		pd->lookup.done = 1;
2897	}
2898
2899	switch (pd->proto) {
2900	case IPPROTO_TCP:
2901		sport = th->th_sport;
2902		dport = th->th_dport;
2903		hdrlen = sizeof(*th);
2904		break;
2905	case IPPROTO_UDP:
2906		sport = pd->hdr.udp->uh_sport;
2907		dport = pd->hdr.udp->uh_dport;
2908		hdrlen = sizeof(*pd->hdr.udp);
2909		break;
2910#ifdef INET
2911	case IPPROTO_ICMP:
2912		if (pd->af != AF_INET)
2913			break;
2914		sport = dport = pd->hdr.icmp->icmp_id;
2915		hdrlen = sizeof(*pd->hdr.icmp);
2916		icmptype = pd->hdr.icmp->icmp_type;
2917		icmpcode = pd->hdr.icmp->icmp_code;
2918
2919		if (icmptype == ICMP_UNREACH ||
2920		    icmptype == ICMP_SOURCEQUENCH ||
2921		    icmptype == ICMP_REDIRECT ||
2922		    icmptype == ICMP_TIMXCEED ||
2923		    icmptype == ICMP_PARAMPROB)
2924			state_icmp++;
2925		break;
2926#endif /* INET */
2927#ifdef INET6
2928	case IPPROTO_ICMPV6:
2929		if (af != AF_INET6)
2930			break;
2931		sport = dport = pd->hdr.icmp6->icmp6_id;
2932		hdrlen = sizeof(*pd->hdr.icmp6);
2933		icmptype = pd->hdr.icmp6->icmp6_type;
2934		icmpcode = pd->hdr.icmp6->icmp6_code;
2935
2936		if (icmptype == ICMP6_DST_UNREACH ||
2937		    icmptype == ICMP6_PACKET_TOO_BIG ||
2938		    icmptype == ICMP6_TIME_EXCEEDED ||
2939		    icmptype == ICMP6_PARAM_PROB)
2940			state_icmp++;
2941		break;
2942#endif /* INET6 */
2943	default:
2944		sport = dport = hdrlen = 0;
2945		break;
2946	}
2947
2948	r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr);
2949
2950	/* check packet for BINAT/NAT/RDR */
2951	if ((nr = pf_get_translation(pd, m, off, direction, kif, &nsn, &sk,
2952	    &nk, saddr, daddr, sport, dport)) != NULL) {
2953		KASSERT(sk != NULL, ("%s: null sk", __func__));
2954		KASSERT(nk != NULL, ("%s: null nk", __func__));
2955
2956		if (pd->ip_sum)
2957			bip_sum = *pd->ip_sum;
2958
2959		switch (pd->proto) {
2960		case IPPROTO_TCP:
2961			bproto_sum = th->th_sum;
2962			pd->proto_sum = &th->th_sum;
2963
2964			if (PF_ANEQ(saddr, &nk->addr[pd->sidx], af) ||
2965			    nk->port[pd->sidx] != sport) {
2966				pf_change_ap(saddr, &th->th_sport, pd->ip_sum,
2967				    &th->th_sum, &nk->addr[pd->sidx],
2968				    nk->port[pd->sidx], 0, af);
2969				pd->sport = &th->th_sport;
2970				sport = th->th_sport;
2971			}
2972
2973			if (PF_ANEQ(daddr, &nk->addr[pd->didx], af) ||
2974			    nk->port[pd->didx] != dport) {
2975				pf_change_ap(daddr, &th->th_dport, pd->ip_sum,
2976				    &th->th_sum, &nk->addr[pd->didx],
2977				    nk->port[pd->didx], 0, af);
2978				dport = th->th_dport;
2979				pd->dport = &th->th_dport;
2980			}
2981			rewrite++;
2982			break;
2983		case IPPROTO_UDP:
2984			bproto_sum = pd->hdr.udp->uh_sum;
2985			pd->proto_sum = &pd->hdr.udp->uh_sum;
2986
2987			if (PF_ANEQ(saddr, &nk->addr[pd->sidx], af) ||
2988			    nk->port[pd->sidx] != sport) {
2989				pf_change_ap(saddr, &pd->hdr.udp->uh_sport,
2990				    pd->ip_sum, &pd->hdr.udp->uh_sum,
2991				    &nk->addr[pd->sidx],
2992				    nk->port[pd->sidx], 1, af);
2993				sport = pd->hdr.udp->uh_sport;
2994				pd->sport = &pd->hdr.udp->uh_sport;
2995			}
2996
2997			if (PF_ANEQ(daddr, &nk->addr[pd->didx], af) ||
2998			    nk->port[pd->didx] != dport) {
2999				pf_change_ap(daddr, &pd->hdr.udp->uh_dport,
3000				    pd->ip_sum, &pd->hdr.udp->uh_sum,
3001				    &nk->addr[pd->didx],
3002				    nk->port[pd->didx], 1, af);
3003				dport = pd->hdr.udp->uh_dport;
3004				pd->dport = &pd->hdr.udp->uh_dport;
3005			}
3006			rewrite++;
3007			break;
3008#ifdef INET
3009		case IPPROTO_ICMP:
3010			nk->port[0] = nk->port[1];
3011			if (PF_ANEQ(saddr, &nk->addr[pd->sidx], AF_INET))
3012				pf_change_a(&saddr->v4.s_addr, pd->ip_sum,
3013				    nk->addr[pd->sidx].v4.s_addr, 0);
3014
3015			if (PF_ANEQ(daddr, &nk->addr[pd->didx], AF_INET))
3016				pf_change_a(&daddr->v4.s_addr, pd->ip_sum,
3017				    nk->addr[pd->didx].v4.s_addr, 0);
3018
3019			if (nk->port[1] != pd->hdr.icmp->icmp_id) {
3020				pd->hdr.icmp->icmp_cksum = pf_cksum_fixup(
3021				    pd->hdr.icmp->icmp_cksum, sport,
3022				    nk->port[1], 0);
3023				pd->hdr.icmp->icmp_id = nk->port[1];
3024				pd->sport = &pd->hdr.icmp->icmp_id;
3025			}
3026			m_copyback(m, off, ICMP_MINLEN, (caddr_t)pd->hdr.icmp);
3027			break;
3028#endif /* INET */
3029#ifdef INET6
3030		case IPPROTO_ICMPV6:
3031			nk->port[0] = nk->port[1];
3032			if (PF_ANEQ(saddr, &nk->addr[pd->sidx], AF_INET6))
3033				pf_change_a6(saddr, &pd->hdr.icmp6->icmp6_cksum,
3034				    &nk->addr[pd->sidx], 0);
3035
3036			if (PF_ANEQ(daddr, &nk->addr[pd->didx], AF_INET6))
3037				pf_change_a6(daddr, &pd->hdr.icmp6->icmp6_cksum,
3038				    &nk->addr[pd->didx], 0);
3039			rewrite++;
3040			break;
3041#endif /* INET6 */
3042		default:
3043			switch (af) {
3044#ifdef INET
3045			case AF_INET:
3046				if (PF_ANEQ(saddr,
3047				    &nk->addr[pd->sidx], AF_INET))
3048					pf_change_a(&saddr->v4.s_addr,
3049					    pd->ip_sum,
3050					    nk->addr[pd->sidx].v4.s_addr, 0);
3051
3052				if (PF_ANEQ(daddr,
3053				    &nk->addr[pd->didx], AF_INET))
3054					pf_change_a(&daddr->v4.s_addr,
3055					    pd->ip_sum,
3056					    nk->addr[pd->didx].v4.s_addr, 0);
3057				break;
3058#endif /* INET */
3059#ifdef INET6
3060			case AF_INET6:
3061				if (PF_ANEQ(saddr,
3062				    &nk->addr[pd->sidx], AF_INET6))
3063					PF_ACPY(saddr, &nk->addr[pd->sidx], af);
3064
3065				if (PF_ANEQ(daddr,
3066				    &nk->addr[pd->didx], AF_INET6))
3067					PF_ACPY(daddr, &nk->addr[pd->didx], af);
3068				break;
3069#endif /* INET6 */
3070			}
3071			break;
3072		}
3073		if (nr->natpass)
3074			r = NULL;
3075		pd->nat_rule = nr;
3076	}
3077
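	/*
	 * Rule evaluation: the skip steps computed at ruleset load time
	 * (see PF_SET_SKIP_STEPS above) let a failed interface, AF or
	 * port test jump over the whole run of consecutive rules that
	 * would fail the same way, instead of stepping rule by rule.
	 */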
3078	while (r != NULL) {
3079		r->evaluations++;
3080		if (pfi_kif_match(r->kif, kif) == r->ifnot)
3081			r = r->skip[PF_SKIP_IFP].ptr;
3082		else if (r->direction && r->direction != direction)
3083			r = r->skip[PF_SKIP_DIR].ptr;
3084		else if (r->af && r->af != af)
3085			r = r->skip[PF_SKIP_AF].ptr;
3086		else if (r->proto && r->proto != pd->proto)
3087			r = r->skip[PF_SKIP_PROTO].ptr;
3088		else if (PF_MISMATCHAW(&r->src.addr, saddr, af,
3089		    r->src.neg, kif, M_GETFIB(m)))
3090			r = r->skip[PF_SKIP_SRC_ADDR].ptr;
3091		/* tcp/udp only. port_op always 0 in other cases */
3092		else if (r->src.port_op && !pf_match_port(r->src.port_op,
3093		    r->src.port[0], r->src.port[1], sport))
3094			r = r->skip[PF_SKIP_SRC_PORT].ptr;
3095		else if (PF_MISMATCHAW(&r->dst.addr, daddr, af,
3096		    r->dst.neg, NULL, M_GETFIB(m)))
3097			r = r->skip[PF_SKIP_DST_ADDR].ptr;
3098		/* tcp/udp only. port_op always 0 in other cases */
3099		else if (r->dst.port_op && !pf_match_port(r->dst.port_op,
3100		    r->dst.port[0], r->dst.port[1], dport))
3101			r = r->skip[PF_SKIP_DST_PORT].ptr;
3102		/* icmp only. type always 0 in other cases */
3103		else if (r->type && r->type != icmptype + 1)
3104			r = TAILQ_NEXT(r, entries);
3105		/* icmp only. code always 0 in other cases */
3106		else if (r->code && r->code != icmpcode + 1)
3107			r = TAILQ_NEXT(r, entries);
3108		else if (r->tos && !(r->tos == pd->tos))
3109			r = TAILQ_NEXT(r, entries);
3110		else if (r->rule_flag & PFRULE_FRAGMENT)
3111			r = TAILQ_NEXT(r, entries);
3112		else if (pd->proto == IPPROTO_TCP &&
3113		    (r->flagset & th->th_flags) != r->flags)
3114			r = TAILQ_NEXT(r, entries);
3115		/* tcp/udp only. uid.op always 0 in other cases */
3116		else if (r->uid.op && (pd->lookup.done || (pd->lookup.done =
3117		    pf_socket_lookup(direction, pd, m), 1)) &&
3118		    !pf_match_uid(r->uid.op, r->uid.uid[0], r->uid.uid[1],
3119		    pd->lookup.uid))
3120			r = TAILQ_NEXT(r, entries);
3121		/* tcp/udp only. gid.op always 0 in other cases */
3122		else if (r->gid.op && (pd->lookup.done || (pd->lookup.done =
3123		    pf_socket_lookup(direction, pd, m), 1)) &&
3124		    !pf_match_gid(r->gid.op, r->gid.gid[0], r->gid.gid[1],
3125		    pd->lookup.gid))
3126			r = TAILQ_NEXT(r, entries);
3127		else if (r->prob &&
3128		    r->prob <= arc4random())
3129			r = TAILQ_NEXT(r, entries);
3130		else if (r->match_tag && !pf_match_tag(m, r, &tag,
3131		    pd->pf_mtag ? pd->pf_mtag->tag : 0))
3132			r = TAILQ_NEXT(r, entries);
3133		else if (r->os_fingerprint != PF_OSFP_ANY &&
3134		    (pd->proto != IPPROTO_TCP || !pf_osfp_match(
3135		    pf_osfp_fingerprint(pd, m, off, th),
3136		    r->os_fingerprint)))
3137			r = TAILQ_NEXT(r, entries);
3138		else {
3139			if (r->tag)
3140				tag = r->tag;
3141			if (r->rtableid >= 0)
3142				rtableid = r->rtableid;
3143			if (r->anchor == NULL) {
3144				match = 1;
3145				*rm = r;
3146				*am = a;
3147				*rsm = ruleset;
3148				if ((*rm)->quick)
3149					break;
3150				r = TAILQ_NEXT(r, entries);
3151			} else
3152				pf_step_into_anchor(&asd, &ruleset,
3153				    PF_RULESET_FILTER, &r, &a, &match);
3154		}
3155		if (r == NULL && pf_step_out_of_anchor(&asd, &ruleset,
3156		    PF_RULESET_FILTER, &r, &a, &match))
3157			break;
3158	}
3159	r = *rm;
3160	a = *am;
3161	ruleset = *rsm;
3162
3163	REASON_SET(&reason, PFRES_MATCH);
3164
3165	if (r->log || (nr != NULL && nr->log)) {
3166		if (rewrite)
3167			m_copyback(m, off, hdrlen, pd->hdr.any);
3168		PFLOG_PACKET(kif, m, af, direction, reason, r->log ? r : nr, a,
3169		    ruleset, pd, 1);
3170	}
3171
3172	if ((r->action == PF_DROP) &&
3173	    ((r->rule_flag & PFRULE_RETURNRST) ||
3174	    (r->rule_flag & PFRULE_RETURNICMP) ||
3175	    (r->rule_flag & PFRULE_RETURN))) {
3176		/* undo NAT changes, if they have taken place */
3177		if (nr != NULL) {
3178			PF_ACPY(saddr, &sk->addr[pd->sidx], af);
3179			PF_ACPY(daddr, &sk->addr[pd->didx], af);
3180			if (pd->sport)
3181				*pd->sport = sk->port[pd->sidx];
3182			if (pd->dport)
3183				*pd->dport = sk->port[pd->didx];
3184			if (pd->proto_sum)
3185				*pd->proto_sum = bproto_sum;
3186			if (pd->ip_sum)
3187				*pd->ip_sum = bip_sum;
3188			m_copyback(m, off, hdrlen, pd->hdr.any);
3189		}
3190		if (pd->proto == IPPROTO_TCP &&
3191		    ((r->rule_flag & PFRULE_RETURNRST) ||
3192		    (r->rule_flag & PFRULE_RETURN)) &&
3193		    !(th->th_flags & TH_RST)) {
3194			u_int32_t	 ack = ntohl(th->th_seq) + pd->p_len;
3195			int		 len = 0;
3196#ifdef INET
3197			struct ip	*h4;
3198#endif
3199#ifdef INET6
3200			struct ip6_hdr	*h6;
3201#endif
3202
3203			switch (af) {
3204#ifdef INET
3205			case AF_INET:
3206				h4 = mtod(m, struct ip *);
3207				len = ntohs(h4->ip_len) - off;
3208				break;
3209#endif
3210#ifdef INET6
3211			case AF_INET6:
3212				h6 = mtod(m, struct ip6_hdr *);
3213				len = ntohs(h6->ip6_plen) - (off - sizeof(*h6));
3214				break;
3215#endif
3216			}
3217
3218			if (pf_check_proto_cksum(m, off, len, IPPROTO_TCP, af))
3219				REASON_SET(&reason, PFRES_PROTCKSUM);
3220			else {
3221				if (th->th_flags & TH_SYN)
3222					ack++;
3223				if (th->th_flags & TH_FIN)
3224					ack++;
3225				pf_send_tcp(m, r, af, pd->dst,
3226				    pd->src, th->th_dport, th->th_sport,
3227				    ntohl(th->th_ack), ack, TH_RST|TH_ACK, 0, 0,
3228				    r->return_ttl, 1, 0, kif->pfik_ifp);
3229			}
3230		} else if (pd->proto != IPPROTO_ICMP && af == AF_INET &&
3231		    r->return_icmp)
3232			pf_send_icmp(m, r->return_icmp >> 8,
3233			    r->return_icmp & 255, af, r);
3234		else if (pd->proto != IPPROTO_ICMPV6 && af == AF_INET6 &&
3235		    r->return_icmp6)
3236			pf_send_icmp(m, r->return_icmp6 >> 8,
3237			    r->return_icmp6 & 255, af, r);
3238	}
3239
3240	if (r->action == PF_DROP)
3241		goto cleanup;
3242
3243	if (tag > 0 && pf_tag_packet(m, pd, tag)) {
3244		REASON_SET(&reason, PFRES_MEMORY);
3245		goto cleanup;
3246	}
3247	if (rtableid >= 0)
3248		M_SETFIB(m, rtableid);
3249
3250	if (!state_icmp && (r->keep_state || nr != NULL ||
3251	    (pd->flags & PFDESC_TCP_NORM))) {
3252		int action;
3253		action = pf_create_state(r, nr, a, pd, nsn, nk, sk, m, off,
3254		    sport, dport, &rewrite, kif, sm, tag, bproto_sum, bip_sum,
3255		    hdrlen);
3256		if (action != PF_PASS)
3257			return (action);
3258	} else {
3259		if (sk != NULL)
3260			uma_zfree(V_pf_state_key_z, sk);
3261		if (nk != NULL)
3262			uma_zfree(V_pf_state_key_z, nk);
3263	}
3264
3265	/* copy back packet headers if we performed NAT operations */
3266	if (rewrite)
3267		m_copyback(m, off, hdrlen, pd->hdr.any);
3268
3269	if (*sm != NULL && !((*sm)->state_flags & PFSTATE_NOSYNC) &&
3270	    direction == PF_OUT &&
3271	    pfsync_defer_ptr != NULL && pfsync_defer_ptr(*sm, m))
3272		/*
3273		 * We want the state created, but we don't
3274		 * want to send this in case a partner
3275		 * firewall has to know about it to allow
3276		 * replies through it.
3277		 */
3278		return (PF_DEFER);
3279
3280	return (PF_PASS);
3281
3282cleanup:
3283	if (sk != NULL)
3284		uma_zfree(V_pf_state_key_z, sk);
3285	if (nk != NULL)
3286		uma_zfree(V_pf_state_key_z, nk);
3287	return (PF_DROP);
3288}
3289
3290static int
3291pf_create_state(struct pf_rule *r, struct pf_rule *nr, struct pf_rule *a,
3292    struct pf_pdesc *pd, struct pf_src_node *nsn, struct pf_state_key *nk,
3293    struct pf_state_key *sk, struct mbuf *m, int off, u_int16_t sport,
3294    u_int16_t dport, int *rewrite, struct pfi_kif *kif, struct pf_state **sm,
3295    int tag, u_int16_t bproto_sum, u_int16_t bip_sum, int hdrlen)
3296{
3297	struct pf_state		*s = NULL;
3298	struct pf_src_node	*sn = NULL;
3299	struct tcphdr		*th = pd->hdr.tcp;
3300	u_int16_t		 mss = V_tcp_mssdflt;
3301	u_short			 reason;
3302
3303	/* check maximums */
3304	if (r->max_states && (r->states_cur >= r->max_states)) {
3305		V_pf_status.lcounters[LCNT_STATES]++;
3306		REASON_SET(&reason, PFRES_MAXSTATES);
3307		return (PF_DROP);
3308	}
3309	/* src node for filter rule */
3310	if ((r->rule_flag & PFRULE_SRCTRACK ||
3311	    r->rpool.opts & PF_POOL_STICKYADDR) &&
3312	    pf_insert_src_node(&sn, r, pd->src, pd->af) != 0) {
3313		REASON_SET(&reason, PFRES_SRCLIMIT);
3314		goto csfailed;
3315	}
3316	/* src node for translation rule */
3317	if (nr != NULL && (nr->rpool.opts & PF_POOL_STICKYADDR) &&
3318	    pf_insert_src_node(&nsn, nr, &sk->addr[pd->sidx], pd->af)) {
3319		REASON_SET(&reason, PFRES_SRCLIMIT);
3320		goto csfailed;
3321	}
3322	s = uma_zalloc(V_pf_state_z, M_NOWAIT | M_ZERO);
3323	if (s == NULL) {
3324		REASON_SET(&reason, PFRES_MEMORY);
3325		goto csfailed;
3326	}
3327	s->rule.ptr = r;
3328	s->nat_rule.ptr = nr;
3329	s->anchor.ptr = a;
3330	STATE_INC_COUNTERS(s);
3331	if (r->allow_opts)
3332		s->state_flags |= PFSTATE_ALLOWOPTS;
3333	if (r->rule_flag & PFRULE_STATESLOPPY)
3334		s->state_flags |= PFSTATE_SLOPPY;
3335	s->log = r->log & PF_LOG_ALL;
3336	s->sync_state = PFSYNC_S_NONE;
3337	if (nr != NULL)
3338		s->log |= nr->log & PF_LOG_ALL;
3339	switch (pd->proto) {
3340	case IPPROTO_TCP:
3341		s->src.seqlo = ntohl(th->th_seq);
3342		s->src.seqhi = s->src.seqlo + pd->p_len + 1;
3343		if ((th->th_flags & (TH_SYN|TH_ACK)) == TH_SYN &&
3344		    r->keep_state == PF_STATE_MODULATE) {
3345			/* Generate sequence number modulator */
3346			if ((s->src.seqdiff = pf_tcp_iss(pd) - s->src.seqlo) ==
3347			    0)
3348				s->src.seqdiff = 1;
3349			pf_change_a(&th->th_seq, &th->th_sum,
3350			    htonl(s->src.seqlo + s->src.seqdiff), 0);
3351			*rewrite = 1;
3352		} else
3353			s->src.seqdiff = 0;
3354		if (th->th_flags & TH_SYN) {
3355			s->src.seqhi++;
3356			s->src.wscale = pf_get_wscale(m, off,
3357			    th->th_off, pd->af);
3358		}
3359		s->src.max_win = MAX(ntohs(th->th_win), 1);
3360		if (s->src.wscale & PF_WSCALE_MASK) {
3361			/* Remove scale factor from initial window */
3362			int win = s->src.max_win;
3363			win += 1 << (s->src.wscale & PF_WSCALE_MASK);
3364			s->src.max_win = (win - 1) >>
3365			    (s->src.wscale & PF_WSCALE_MASK);
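			/*
			 * i.e. max_win = ceil(win / 2^wscale); the window
			 * field in the SYN itself is never scaled (RFC 1323).
			 */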
3366		}
3367		if (th->th_flags & TH_FIN)
3368			s->src.seqhi++;
3369		s->dst.seqhi = 1;
3370		s->dst.max_win = 1;
3371		s->src.state = TCPS_SYN_SENT;
3372		s->dst.state = TCPS_CLOSED;
3373		s->timeout = PFTM_TCP_FIRST_PACKET;
3374		break;
3375	case IPPROTO_UDP:
3376		s->src.state = PFUDPS_SINGLE;
3377		s->dst.state = PFUDPS_NO_TRAFFIC;
3378		s->timeout = PFTM_UDP_FIRST_PACKET;
3379		break;
3380	case IPPROTO_ICMP:
3381#ifdef INET6
3382	case IPPROTO_ICMPV6:
3383#endif
3384		s->timeout = PFTM_ICMP_FIRST_PACKET;
3385		break;
3386	default:
3387		s->src.state = PFOTHERS_SINGLE;
3388		s->dst.state = PFOTHERS_NO_TRAFFIC;
3389		s->timeout = PFTM_OTHER_FIRST_PACKET;
3390	}
3391
3392	s->creation = time_uptime;
3393	s->expire = time_uptime;
3394
3395	if (sn != NULL) {
3396		s->src_node = sn;
3397		s->src_node->states++;
3398	}
3399	if (nsn != NULL) {
3400		/* XXX We only modify one side for now. */
3401		PF_ACPY(&nsn->raddr, &nk->addr[1], pd->af);
3402		s->nat_src_node = nsn;
3403		s->nat_src_node->states++;
3404	}
3405	if (pd->proto == IPPROTO_TCP) {
3406		if ((pd->flags & PFDESC_TCP_NORM) && pf_normalize_tcp_init(m,
3407		    off, pd, th, &s->src, &s->dst)) {
3408			REASON_SET(&reason, PFRES_MEMORY);
3409			pf_src_tree_remove_state(s);
3410			STATE_DEC_COUNTERS(s);
3411			uma_zfree(V_pf_state_z, s);
3412			return (PF_DROP);
3413		}
3414		if ((pd->flags & PFDESC_TCP_NORM) && s->src.scrub &&
3415		    pf_normalize_tcp_stateful(m, off, pd, &reason, th, s,
3416		    &s->src, &s->dst, rewrite)) {
3417			/* This really shouldn't happen!!! */
3418			DPFPRINTF(PF_DEBUG_URGENT,
3419			    ("pf_normalize_tcp_stateful failed on first pkt\n"));
3420			pf_normalize_tcp_cleanup(s);
3421			pf_src_tree_remove_state(s);
3422			STATE_DEC_COUNTERS(s);
3423			uma_zfree(V_pf_state_z, s);
3424			return (PF_DROP);
3425		}
3426	}
3427	s->direction = pd->dir;
3428
3429	/*
3430	 * sk/nk could already have been set up by pf_get_translation().
3431	 */
3432	if (nr == NULL) {
3433		KASSERT((sk == NULL && nk == NULL), ("%s: nr %p sk %p, nk %p",
3434		    __func__, nr, sk, nk));
3435		sk = pf_state_key_setup(pd, pd->src, pd->dst, sport, dport);
3436		if (sk == NULL)
3437			goto csfailed;
3438		nk = sk;
3439	} else
3440		KASSERT((sk != NULL && nk != NULL), ("%s: nr %p sk %p, nk %p",
3441		    __func__, nr, sk, nk));
3442
3443	/* Swap sk/nk for PF_OUT. */
3444	if (pf_state_insert(BOUND_IFACE(r, kif),
3445	    (pd->dir == PF_IN) ? sk : nk,
3446	    (pd->dir == PF_IN) ? nk : sk, s)) {
3447		if (pd->proto == IPPROTO_TCP)
3448			pf_normalize_tcp_cleanup(s);
3449		REASON_SET(&reason, PFRES_STATEINS);
3450		pf_src_tree_remove_state(s);
3451		STATE_DEC_COUNTERS(s);
3452		uma_zfree(V_pf_state_z, s);
3453		return (PF_DROP);
3454	} else
3455		*sm = s;
3456
3457	pf_set_rt_ifp(s, pd->src);	/* needs s->state_key set */
3458	if (tag > 0)
3459		s->tag = tag;
3460	if (pd->proto == IPPROTO_TCP && (th->th_flags & (TH_SYN|TH_ACK)) ==
3461	    TH_SYN && r->keep_state == PF_STATE_SYNPROXY) {
3462		s->src.state = PF_TCPS_PROXY_SRC;
3463		/* undo NAT changes, if they have taken place */
3464		if (nr != NULL) {
3465			struct pf_state_key *skt = s->key[PF_SK_WIRE];
3466			if (pd->dir == PF_OUT)
3467				skt = s->key[PF_SK_STACK];
3468			PF_ACPY(pd->src, &skt->addr[pd->sidx], pd->af);
3469			PF_ACPY(pd->dst, &skt->addr[pd->didx], pd->af);
3470			if (pd->sport)
3471				*pd->sport = skt->port[pd->sidx];
3472			if (pd->dport)
3473				*pd->dport = skt->port[pd->didx];
3474			if (pd->proto_sum)
3475				*pd->proto_sum = bproto_sum;
3476			if (pd->ip_sum)
3477				*pd->ip_sum = bip_sum;
3478			m_copyback(m, off, hdrlen, pd->hdr.any);
3479		}
3480		s->src.seqhi = htonl(arc4random());
3481		/* Find mss option */
3482		int rtid = M_GETFIB(m);
3483		mss = pf_get_mss(m, off, th->th_off, pd->af);
3484		mss = pf_calc_mss(pd->src, pd->af, rtid, mss);
3485		mss = pf_calc_mss(pd->dst, pd->af, rtid, mss);
3486		s->src.mss = mss;
3487		pf_send_tcp(NULL, r, pd->af, pd->dst, pd->src, th->th_dport,
3488		    th->th_sport, s->src.seqhi, ntohl(th->th_seq) + 1,
3489		    TH_SYN|TH_ACK, 0, s->src.mss, 0, 1, 0, NULL);
3490		REASON_SET(&reason, PFRES_SYNPROXY);
3491		return (PF_SYNPROXY_DROP);
3492	}
3493
3494	return (PF_PASS);
3495
3496csfailed:
3497	if (sk != NULL)
3498		uma_zfree(V_pf_state_key_z, sk);
3499	if (nk != NULL)
3500		uma_zfree(V_pf_state_key_z, nk);
3501
3502	if (sn != NULL && sn->states == 0 && sn->expire == 0) {
3503		pf_remove_src_node(sn);
3504		V_pf_status.scounters[SCNT_SRC_NODE_REMOVALS]++;
3505		V_pf_status.src_nodes--;
3506		uma_zfree(V_pf_sources_z, sn);
3507	}
3508	if (nsn != sn && nsn != NULL && nsn->states == 0 && nsn->expire == 0) {
3509		pf_remove_src_node(nsn);
3510		V_pf_status.scounters[SCNT_SRC_NODE_REMOVALS]++;
3511		V_pf_status.src_nodes--;
3512		uma_zfree(V_pf_sources_z, nsn);
3513	}
3514	return (PF_DROP);
3515}
3516
3517static int
3518pf_test_fragment(struct pf_rule **rm, int direction, struct pfi_kif *kif,
3519    struct mbuf *m, void *h, struct pf_pdesc *pd, struct pf_rule **am,
3520    struct pf_ruleset **rsm)
3521{
3522	struct pf_rule		*r, *a = NULL;
3523	struct pf_ruleset	*ruleset = NULL;
3524	sa_family_t		 af = pd->af;
3525	u_short			 reason;
3526	int			 tag = -1;
3527	int			 asd = 0;
3528	int			 match = 0;
3529
3530	PF_RULES_RASSERT();
3531
3532	r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr);
3533	while (r != NULL) {
3534		r->evaluations++;
3535		if (pfi_kif_match(r->kif, kif) == r->ifnot)
3536			r = r->skip[PF_SKIP_IFP].ptr;
3537		else if (r->direction && r->direction != direction)
3538			r = r->skip[PF_SKIP_DIR].ptr;
3539		else if (r->af && r->af != af)
3540			r = r->skip[PF_SKIP_AF].ptr;
3541		else if (r->proto && r->proto != pd->proto)
3542			r = r->skip[PF_SKIP_PROTO].ptr;
3543		else if (PF_MISMATCHAW(&r->src.addr, pd->src, af,
3544		    r->src.neg, kif, M_GETFIB(m)))
3545			r = r->skip[PF_SKIP_SRC_ADDR].ptr;
3546		else if (PF_MISMATCHAW(&r->dst.addr, pd->dst, af,
3547		    r->dst.neg, NULL, M_GETFIB(m)))
3548			r = r->skip[PF_SKIP_DST_ADDR].ptr;
3549		else if (r->tos && !(r->tos == pd->tos))
3550			r = TAILQ_NEXT(r, entries);
3551		else if (r->os_fingerprint != PF_OSFP_ANY)
3552			r = TAILQ_NEXT(r, entries);
3553		else if (pd->proto == IPPROTO_UDP &&
3554		    (r->src.port_op || r->dst.port_op))
3555			r = TAILQ_NEXT(r, entries);
3556		else if (pd->proto == IPPROTO_TCP &&
3557		    (r->src.port_op || r->dst.port_op || r->flagset))
3558			r = TAILQ_NEXT(r, entries);
3559		else if ((pd->proto == IPPROTO_ICMP ||
3560		    pd->proto == IPPROTO_ICMPV6) &&
3561		    (r->type || r->code))
3562			r = TAILQ_NEXT(r, entries);
3563		else if (r->prob && r->prob <=
3564		    (arc4random() % (UINT_MAX - 1) + 1))
3565			r = TAILQ_NEXT(r, entries);
3566		else if (r->match_tag && !pf_match_tag(m, r, &tag,
3567		    pd->pf_mtag ? pd->pf_mtag->tag : 0))
3568			r = TAILQ_NEXT(r, entries);
3569		else {
3570			if (r->anchor == NULL) {
3571				match = 1;
3572				*rm = r;
3573				*am = a;
3574				*rsm = ruleset;
3575				if ((*rm)->quick)
3576					break;
3577				r = TAILQ_NEXT(r, entries);
3578			} else
3579				pf_step_into_anchor(&asd, &ruleset,
3580				    PF_RULESET_FILTER, &r, &a, &match);
3581		}
3582		if (r == NULL && pf_step_out_of_anchor(&asd, &ruleset,
3583		    PF_RULESET_FILTER, &r, &a, &match))
3584			break;
3585	}
3586	r = *rm;
3587	a = *am;
3588	ruleset = *rsm;
3589
3590	REASON_SET(&reason, PFRES_MATCH);
3591
3592	if (r->log)
3593		PFLOG_PACKET(kif, m, af, direction, reason, r, a, ruleset, pd,
3594		    1);
3595
3596	if (r->action != PF_PASS)
3597		return (PF_DROP);
3598
3599	if (tag > 0 && pf_tag_packet(m, pd, tag)) {
3600		REASON_SET(&reason, PFRES_MEMORY);
3601		return (PF_DROP);
3602	}
3603
3604	return (PF_PASS);
3605}
3606
3607static int
3608pf_tcp_track_full(struct pf_state_peer *src, struct pf_state_peer *dst,
3609	struct pf_state **state, struct pfi_kif *kif, struct mbuf *m, int off,
3610	struct pf_pdesc *pd, u_short *reason, int *copyback)
3611{
3612	struct tcphdr		*th = pd->hdr.tcp;
3613	u_int16_t		 win = ntohs(th->th_win);
3614	u_int32_t		 ack, end, seq, orig_seq;
3615	u_int8_t		 sws, dws;
3616	int			 ackskew;
3617
3618	if (src->wscale && dst->wscale && !(th->th_flags & TH_SYN)) {
3619		sws = src->wscale & PF_WSCALE_MASK;
3620		dws = dst->wscale & PF_WSCALE_MASK;
3621	} else
3622		sws = dws = 0;
3623
3624	/*
3625	 * Sequence tracking algorithm from Guido van Rooij's paper:
3626	 *   http://www.madison-gurkha.com/publications/tcp_filtering/
3627	 *	tcp_filtering.ps
3628	 */
3629
3630	orig_seq = seq = ntohl(th->th_seq);
3631	if (src->seqlo == 0) {
3632		/* First packet from this end. Set its state */
3633
3634		if ((pd->flags & PFDESC_TCP_NORM || dst->scrub) &&
3635		    src->scrub == NULL) {
3636			if (pf_normalize_tcp_init(m, off, pd, th, src, dst)) {
3637				REASON_SET(reason, PFRES_MEMORY);
3638				return (PF_DROP);
3639			}
3640		}
3641
3642		/* Deferred generation of sequence number modulator */
3643		if (dst->seqdiff && !src->seqdiff) {
3644			/* use random iss for the TCP server */
3645			while ((src->seqdiff = arc4random() - seq) == 0)
3646				;
3647			ack = ntohl(th->th_ack) - dst->seqdiff;
3648			pf_change_a(&th->th_seq, &th->th_sum, htonl(seq +
3649			    src->seqdiff), 0);
3650			pf_change_a(&th->th_ack, &th->th_sum, htonl(ack), 0);
3651			*copyback = 1;
3652		} else {
3653			ack = ntohl(th->th_ack);
3654		}
3655
3656		end = seq + pd->p_len;
3657		if (th->th_flags & TH_SYN) {
3658			end++;
3659			if (dst->wscale & PF_WSCALE_FLAG) {
3660				src->wscale = pf_get_wscale(m, off, th->th_off,
3661				    pd->af);
3662				if (src->wscale & PF_WSCALE_FLAG) {
3663					/* Remove scale factor from initial
3664					 * window */
3665					sws = src->wscale & PF_WSCALE_MASK;
3666					win = ((u_int32_t)win + (1 << sws) - 1)
3667					    >> sws;
3668					dws = dst->wscale & PF_WSCALE_MASK;
3669				} else {
3670					/* fixup other window */
3671					dst->max_win <<= dst->wscale &
3672					    PF_WSCALE_MASK;
3673					/* in case of a retrans SYN|ACK */
3674					dst->wscale = 0;
3675				}
3676			}
3677		}
3678		if (th->th_flags & TH_FIN)
3679			end++;
3680
3681		src->seqlo = seq;
3682		if (src->state < TCPS_SYN_SENT)
3683			src->state = TCPS_SYN_SENT;
3684
3685		/*
3686		 * May need to slide the window (seqhi may have been set by
3687		 * the crappy stack check or if we picked up the connection
3688		 * after establishment)
3689		 */
3690		if (src->seqhi == 1 ||
3691		    SEQ_GEQ(end + MAX(1, dst->max_win << dws), src->seqhi))
3692			src->seqhi = end + MAX(1, dst->max_win << dws);
3693		if (win > src->max_win)
3694			src->max_win = win;
3695
3696	} else {
3697		ack = ntohl(th->th_ack) - dst->seqdiff;
3698		if (src->seqdiff) {
3699			/* Modulate sequence numbers */
3700			pf_change_a(&th->th_seq, &th->th_sum, htonl(seq +
3701			    src->seqdiff), 0);
3702			pf_change_a(&th->th_ack, &th->th_sum, htonl(ack), 0);
3703			*copyback = 1;
3704		}
3705		end = seq + pd->p_len;
3706		if (th->th_flags & TH_SYN)
3707			end++;
3708		if (th->th_flags & TH_FIN)
3709			end++;
3710	}
3711
3712	if ((th->th_flags & TH_ACK) == 0) {
3713		/* Let it pass through the ack skew check */
3714		ack = dst->seqlo;
3715	} else if ((ack == 0 &&
3716	    (th->th_flags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) ||
3717	    /* broken tcp stacks do not set ack */
3718	    (dst->state < TCPS_SYN_SENT)) {
3719		/*
3720		 * Many stacks (ours included) will set the ACK number in an
3721		 * FIN|ACK if the SYN times out -- no sequence to ACK.
3722		 */
3723		ack = dst->seqlo;
3724	}
3725
3726	if (seq == end) {
3727		/* Ease sequencing restrictions on no data packets */
3728		seq = src->seqlo;
3729		end = seq;
3730	}
3731
3732	ackskew = dst->seqlo - ack;
3733
3734
3735	/*
3736	 * Need to demodulate the sequence numbers in any TCP SACK options
3737	 * (Selective ACK). We could optionally validate the SACK values
3738	 * against the current ACK window, either forwards or backwards, but
3739	 * I'm not confident that SACK has been implemented properly
3740	 * everywhere. It wouldn't surprise me if several stacks accidentally
3741	 * SACK too far backwards of previously ACKed data. There really aren't
3742	 * any security implications of bad SACKing unless the target stack
3743	 * doesn't validate the option length correctly. Someone trying to
3744	 * spoof into a TCP connection won't bother blindly sending SACK
3745	 * options anyway.
3746	 */
3747	if (dst->seqdiff && (th->th_off << 2) > sizeof(struct tcphdr)) {
3748		if (pf_modulate_sack(m, off, pd, th, dst))
3749			*copyback = 1;
3750	}
3751
3752
3753#define	MAXACKWINDOW (0xffff + 1500)	/* 1500 is an arbitrary fudge factor */
3754	if (SEQ_GEQ(src->seqhi, end) &&
3755	    /* Last octet inside other's window space */
3756	    SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) &&
3757	    /* Retrans: not more than one window back */
3758	    (ackskew >= -MAXACKWINDOW) &&
3759	    /* Acking not more than one reassembled fragment backwards */
3760	    (ackskew <= (MAXACKWINDOW << sws)) &&
3761	    /* Acking not more than one window forward */
3762	    ((th->th_flags & TH_RST) == 0 || orig_seq == src->seqlo ||
3763	    (orig_seq == src->seqlo + 1) || (orig_seq + 1 == src->seqlo) ||
3764	    (pd->flags & PFDESC_IP_REAS) == 0)) {
3765	    /* Require an exact/+1 sequence match on resets when possible */
3766
3767		if (dst->scrub || src->scrub) {
3768			if (pf_normalize_tcp_stateful(m, off, pd, reason, th,
3769			    *state, src, dst, copyback))
3770				return (PF_DROP);
3771		}
3772
3773		/* update max window */
3774		if (src->max_win < win)
3775			src->max_win = win;
3776		/* synchronize sequencing */
3777		if (SEQ_GT(end, src->seqlo))
3778			src->seqlo = end;
3779		/* slide the window of what the other end can send */
3780		if (SEQ_GEQ(ack + (win << sws), dst->seqhi))
3781			dst->seqhi = ack + MAX((win << sws), 1);
3782
3783
3784		/* update states */
3785		if (th->th_flags & TH_SYN)
3786			if (src->state < TCPS_SYN_SENT)
3787				src->state = TCPS_SYN_SENT;
3788		if (th->th_flags & TH_FIN)
3789			if (src->state < TCPS_CLOSING)
3790				src->state = TCPS_CLOSING;
3791		if (th->th_flags & TH_ACK) {
3792			if (dst->state == TCPS_SYN_SENT) {
3793				dst->state = TCPS_ESTABLISHED;
3794				if (src->state == TCPS_ESTABLISHED &&
3795				    (*state)->src_node != NULL &&
3796				    pf_src_connlimit(state)) {
3797					REASON_SET(reason, PFRES_SRCLIMIT);
3798					return (PF_DROP);
3799				}
3800			} else if (dst->state == TCPS_CLOSING)
3801				dst->state = TCPS_FIN_WAIT_2;
3802		}
3803		if (th->th_flags & TH_RST)
3804			src->state = dst->state = TCPS_TIME_WAIT;
3805
3806		/* update expire time */
3807		(*state)->expire = time_uptime;
3808		if (src->state >= TCPS_FIN_WAIT_2 &&
3809		    dst->state >= TCPS_FIN_WAIT_2)
3810			(*state)->timeout = PFTM_TCP_CLOSED;
3811		else if (src->state >= TCPS_CLOSING &&
3812		    dst->state >= TCPS_CLOSING)
3813			(*state)->timeout = PFTM_TCP_FIN_WAIT;
3814		else if (src->state < TCPS_ESTABLISHED ||
3815		    dst->state < TCPS_ESTABLISHED)
3816			(*state)->timeout = PFTM_TCP_OPENING;
3817		else if (src->state >= TCPS_CLOSING ||
3818		    dst->state >= TCPS_CLOSING)
3819			(*state)->timeout = PFTM_TCP_CLOSING;
3820		else
3821			(*state)->timeout = PFTM_TCP_ESTABLISHED;
3822
3823		/* Fall through to PASS packet */
3824
3825	} else if ((dst->state < TCPS_SYN_SENT ||
3826		dst->state >= TCPS_FIN_WAIT_2 ||
3827		src->state >= TCPS_FIN_WAIT_2) &&
3828	    SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) &&
3829	    /* Within a window forward of the originating packet */
3830	    SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW)) {
3831	    /* Within a window backward of the originating packet */
3832
3833		/*
3834		 * This currently handles three situations:
3835		 *  1) Stupid stacks will shotgun SYNs before their peer
3836		 *     replies.
3837		 *  2) When PF catches an already established stream (the
3838		 *     firewall rebooted, the state table was flushed, routes
3839		 *     changed...)
3840		 *  3) Packets get funky immediately after the connection
3841		 *     closes (this should catch Solaris spurious ACK|FINs
3842		 *     that web servers like to spew after a close)
3843		 *
3844		 * This must be a little more careful than the above code
3845		 * since packet floods will also be caught here. We don't
3846		 * update the TTL here to mitigate the damage of a packet
3847		 * flood and so the same code can handle awkward establishment
3848		 * and a loosened connection close.
3849		 * In the establishment case, a correct peer response will
3850		 * validate the connection, go through the normal state code
3851		 * and keep updating the state TTL.
3852		 */
3853
3854		if (V_pf_status.debug >= PF_DEBUG_MISC) {
3855			printf("pf: loose state match: ");
3856			pf_print_state(*state);
3857			pf_print_flags(th->th_flags);
3858			printf(" seq=%u (%u) ack=%u len=%u ackskew=%d "
3859			    "pkts=%llu:%llu dir=%s,%s\n", seq, orig_seq, ack,
3860			    pd->p_len, ackskew, (unsigned long long)(*state)->packets[0],
3861			    (unsigned long long)(*state)->packets[1],
3862			    pd->dir == PF_IN ? "in" : "out",
3863			    pd->dir == (*state)->direction ? "fwd" : "rev");
3864		}
3865
3866		if (dst->scrub || src->scrub) {
3867			if (pf_normalize_tcp_stateful(m, off, pd, reason, th,
3868			    *state, src, dst, copyback))
3869				return (PF_DROP);
3870		}
3871
3872		/* update max window */
3873		if (src->max_win < win)
3874			src->max_win = win;
3875		/* synchronize sequencing */
3876		if (SEQ_GT(end, src->seqlo))
3877			src->seqlo = end;
3878		/* slide the window of what the other end can send */
3879		if (SEQ_GEQ(ack + (win << sws), dst->seqhi))
3880			dst->seqhi = ack + MAX((win << sws), 1);
3881
3882		/*
3883		 * Cannot set dst->seqhi here since this could be a shotgunned
3884		 * SYN and not an already established connection.
3885		 */
3886
3887		if (th->th_flags & TH_FIN)
3888			if (src->state < TCPS_CLOSING)
3889				src->state = TCPS_CLOSING;
3890		if (th->th_flags & TH_RST)
3891			src->state = dst->state = TCPS_TIME_WAIT;
3892
3893		/* Fall through to PASS packet */
3894
3895	} else {
3896		if ((*state)->dst.state == TCPS_SYN_SENT &&
3897		    (*state)->src.state == TCPS_SYN_SENT) {
3898			/* Send RST for state mismatches during handshake */
3899			if (!(th->th_flags & TH_RST))
3900				pf_send_tcp(NULL, (*state)->rule.ptr, pd->af,
3901				    pd->dst, pd->src, th->th_dport,
3902				    th->th_sport, ntohl(th->th_ack), 0,
3903				    TH_RST, 0, 0,
3904				    (*state)->rule.ptr->return_ttl, 1, 0,
3905				    kif->pfik_ifp);
3906			src->seqlo = 0;
3907			src->seqhi = 1;
3908			src->max_win = 1;
3909		} else if (V_pf_status.debug >= PF_DEBUG_MISC) {
3910			printf("pf: BAD state: ");
3911			pf_print_state(*state);
3912			pf_print_flags(th->th_flags);
3913			printf(" seq=%u (%u) ack=%u len=%u ackskew=%d "
3914			    "pkts=%llu:%llu dir=%s,%s\n",
3915			    seq, orig_seq, ack, pd->p_len, ackskew,
3916			    (unsigned long long)(*state)->packets[0],
3917			    (unsigned long long)(*state)->packets[1],
3918			    pd->dir == PF_IN ? "in" : "out",
3919			    pd->dir == (*state)->direction ? "fwd" : "rev");
3920			printf("pf: State failure on: %c %c %c %c | %c %c\n",
3921			    SEQ_GEQ(src->seqhi, end) ? ' ' : '1',
3922			    SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) ?
3923			    ' ': '2',
3924			    (ackskew >= -MAXACKWINDOW) ? ' ' : '3',
3925			    (ackskew <= (MAXACKWINDOW << sws)) ? ' ' : '4',
3926			    SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) ?' ' :'5',
3927			    SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW) ?' ' :'6');
3928		}
3929		REASON_SET(reason, PFRES_BADSTATE);
3930		return (PF_DROP);
3931	}
3932
3933	return (PF_PASS);
3934}
3935
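/*
 * Sloppy state tracking: skip the sequence-window checks of
 * pf_tcp_track_full() and follow only the coarse TCP flag transitions.
 * This is used for states created by rules with the "sloppy" option,
 * e.g. when only one half of an asymmetrically routed connection is
 * visible.
 */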
3936static int
3937pf_tcp_track_sloppy(struct pf_state_peer *src, struct pf_state_peer *dst,
3938	struct pf_state **state, struct pf_pdesc *pd, u_short *reason)
3939{
3940	struct tcphdr		*th = pd->hdr.tcp;
3941
3942	if (th->th_flags & TH_SYN)
3943		if (src->state < TCPS_SYN_SENT)
3944			src->state = TCPS_SYN_SENT;
3945	if (th->th_flags & TH_FIN)
3946		if (src->state < TCPS_CLOSING)
3947			src->state = TCPS_CLOSING;
3948	if (th->th_flags & TH_ACK) {
3949		if (dst->state == TCPS_SYN_SENT) {
3950			dst->state = TCPS_ESTABLISHED;
3951			if (src->state == TCPS_ESTABLISHED &&
3952			    (*state)->src_node != NULL &&
3953			    pf_src_connlimit(state)) {
3954				REASON_SET(reason, PFRES_SRCLIMIT);
3955				return (PF_DROP);
3956			}
3957		} else if (dst->state == TCPS_CLOSING) {
3958			dst->state = TCPS_FIN_WAIT_2;
3959		} else if (src->state == TCPS_SYN_SENT &&
3960		    dst->state < TCPS_SYN_SENT) {
3961			/*
3962			 * Handle a special sloppy case where we only see one
3963			 * half of the connection. If there is an ACK after
3964			 * the initial SYN without ever seeing a packet from
3965			 * the destination, set the connection to established.
3966			 */
3967			dst->state = src->state = TCPS_ESTABLISHED;
3968			if ((*state)->src_node != NULL &&
3969			    pf_src_connlimit(state)) {
3970				REASON_SET(reason, PFRES_SRCLIMIT);
3971				return (PF_DROP);
3972			}
3973		} else if (src->state == TCPS_CLOSING &&
3974		    dst->state == TCPS_ESTABLISHED &&
3975		    dst->seqlo == 0) {
3976			/*
3977			 * Handle the closing of half connections where we
3978			 * don't see the full bidirectional FIN/ACK+ACK
3979			 * handshake.
3980			 */
3981			dst->state = TCPS_CLOSING;
3982		}
3983	}
3984	if (th->th_flags & TH_RST)
3985		src->state = dst->state = TCPS_TIME_WAIT;
3986
3987	/* update expire time */
3988	(*state)->expire = time_uptime;
3989	if (src->state >= TCPS_FIN_WAIT_2 &&
3990	    dst->state >= TCPS_FIN_WAIT_2)
3991		(*state)->timeout = PFTM_TCP_CLOSED;
3992	else if (src->state >= TCPS_CLOSING &&
3993	    dst->state >= TCPS_CLOSING)
3994		(*state)->timeout = PFTM_TCP_FIN_WAIT;
3995	else if (src->state < TCPS_ESTABLISHED ||
3996	    dst->state < TCPS_ESTABLISHED)
3997		(*state)->timeout = PFTM_TCP_OPENING;
3998	else if (src->state >= TCPS_CLOSING ||
3999	    dst->state >= TCPS_CLOSING)
4000		(*state)->timeout = PFTM_TCP_CLOSING;
4001	else
4002		(*state)->timeout = PFTM_TCP_ESTABLISHED;
4003
4004	return (PF_PASS);
4005}
4006
4007static int
4008pf_test_state_tcp(struct pf_state **state, int direction, struct pfi_kif *kif,
4009    struct mbuf *m, int off, void *h, struct pf_pdesc *pd,
4010    u_short *reason)
4011{
4012	struct pf_state_key_cmp	 key;
4013	struct tcphdr		*th = pd->hdr.tcp;
4014	int			 copyback = 0;
4015	struct pf_state_peer	*src, *dst;
4016	struct pf_state_key	*sk;
4017
4018	bzero(&key, sizeof(key));
4019	key.af = pd->af;
4020	key.proto = IPPROTO_TCP;
4021	if (direction == PF_IN)	{	/* wire side, straight */
4022		PF_ACPY(&key.addr[0], pd->src, key.af);
4023		PF_ACPY(&key.addr[1], pd->dst, key.af);
4024		key.port[0] = th->th_sport;
4025		key.port[1] = th->th_dport;
4026	} else {			/* stack side, reverse */
4027		PF_ACPY(&key.addr[1], pd->src, key.af);
4028		PF_ACPY(&key.addr[0], pd->dst, key.af);
4029		key.port[1] = th->th_sport;
4030		key.port[0] = th->th_dport;
4031	}
4032
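	/* STATE_LOOKUP() returns PF_DROP from this function on a miss. */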
4033	STATE_LOOKUP(kif, &key, direction, *state, pd);
4034
4035	if (direction == (*state)->direction) {
4036		src = &(*state)->src;
4037		dst = &(*state)->dst;
4038	} else {
4039		src = &(*state)->dst;
4040		dst = &(*state)->src;
4041	}
4042
4043	sk = (*state)->key[pd->didx];
4044
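	/*
	 * synproxy: pf completes the three-way handshake with the client
	 * by itself (PF_TCPS_PROXY_SRC) before opening the connection to
	 * the real destination (PF_TCPS_PROXY_DST), then translates
	 * between the two half-connections via src/dst seqdiff.
	 */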
4045	if ((*state)->src.state == PF_TCPS_PROXY_SRC) {
4046		if (direction != (*state)->direction) {
4047			REASON_SET(reason, PFRES_SYNPROXY);
4048			return (PF_SYNPROXY_DROP);
4049		}
4050		if (th->th_flags & TH_SYN) {
4051			if (ntohl(th->th_seq) != (*state)->src.seqlo) {
4052				REASON_SET(reason, PFRES_SYNPROXY);
4053				return (PF_DROP);
4054			}
4055			pf_send_tcp(NULL, (*state)->rule.ptr, pd->af, pd->dst,
4056			    pd->src, th->th_dport, th->th_sport,
4057			    (*state)->src.seqhi, ntohl(th->th_seq) + 1,
4058			    TH_SYN|TH_ACK, 0, (*state)->src.mss, 0, 1, 0, NULL);
4059			REASON_SET(reason, PFRES_SYNPROXY);
4060			return (PF_SYNPROXY_DROP);
4061		} else if (!(th->th_flags & TH_ACK) ||
4062		    (ntohl(th->th_ack) != (*state)->src.seqhi + 1) ||
4063		    (ntohl(th->th_seq) != (*state)->src.seqlo + 1)) {
4064			REASON_SET(reason, PFRES_SYNPROXY);
4065			return (PF_DROP);
4066		} else if ((*state)->src_node != NULL &&
4067		    pf_src_connlimit(state)) {
4068			REASON_SET(reason, PFRES_SRCLIMIT);
4069			return (PF_DROP);
4070		} else
4071			(*state)->src.state = PF_TCPS_PROXY_DST;
4072	}
4073	if ((*state)->src.state == PF_TCPS_PROXY_DST) {
4074		if (direction == (*state)->direction) {
4075			if (((th->th_flags & (TH_SYN|TH_ACK)) != TH_ACK) ||
4076			    (ntohl(th->th_ack) != (*state)->src.seqhi + 1) ||
4077			    (ntohl(th->th_seq) != (*state)->src.seqlo + 1)) {
4078				REASON_SET(reason, PFRES_SYNPROXY);
4079				return (PF_DROP);
4080			}
4081			(*state)->src.max_win = MAX(ntohs(th->th_win), 1);
4082			if ((*state)->dst.seqhi == 1)
4083				(*state)->dst.seqhi = htonl(arc4random());
4084			pf_send_tcp(NULL, (*state)->rule.ptr, pd->af,
4085			    &sk->addr[pd->sidx], &sk->addr[pd->didx],
4086			    sk->port[pd->sidx], sk->port[pd->didx],
4087			    (*state)->dst.seqhi, 0, TH_SYN, 0,
4088			    (*state)->src.mss, 0, 0, (*state)->tag, NULL);
4089			REASON_SET(reason, PFRES_SYNPROXY);
4090			return (PF_SYNPROXY_DROP);
4091		} else if (((th->th_flags & (TH_SYN|TH_ACK)) !=
4092		    (TH_SYN|TH_ACK)) ||
4093		    (ntohl(th->th_ack) != (*state)->dst.seqhi + 1)) {
4094			REASON_SET(reason, PFRES_SYNPROXY);
4095			return (PF_DROP);
4096		} else {
4097			(*state)->dst.max_win = MAX(ntohs(th->th_win), 1);
4098			(*state)->dst.seqlo = ntohl(th->th_seq);
4099			pf_send_tcp(NULL, (*state)->rule.ptr, pd->af, pd->dst,
4100			    pd->src, th->th_dport, th->th_sport,
4101			    ntohl(th->th_ack), ntohl(th->th_seq) + 1,
4102			    TH_ACK, (*state)->src.max_win, 0, 0, 0,
4103			    (*state)->tag, NULL);
4104			pf_send_tcp(NULL, (*state)->rule.ptr, pd->af,
4105			    &sk->addr[pd->sidx], &sk->addr[pd->didx],
4106			    sk->port[pd->sidx], sk->port[pd->didx],
4107			    (*state)->src.seqhi + 1, (*state)->src.seqlo + 1,
4108			    TH_ACK, (*state)->dst.max_win, 0, 0, 1, 0, NULL);
4109			(*state)->src.seqdiff = (*state)->dst.seqhi -
4110			    (*state)->src.seqlo;
4111			(*state)->dst.seqdiff = (*state)->src.seqhi -
4112			    (*state)->dst.seqlo;
4113			(*state)->src.seqhi = (*state)->src.seqlo +
4114			    (*state)->dst.max_win;
4115			(*state)->dst.seqhi = (*state)->dst.seqlo +
4116			    (*state)->src.max_win;
4117			(*state)->src.wscale = (*state)->dst.wscale = 0;
4118			(*state)->src.state = (*state)->dst.state =
4119			    TCPS_ESTABLISHED;
4120			REASON_SET(reason, PFRES_SYNPROXY);
4121			return (PF_SYNPROXY_DROP);
4122		}
4123	}
4124
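	/*
	 * A fresh SYN on a fully closed state indicates that the
	 * endpoints are reusing the connection: drop this packet and
	 * unlink the old state, so that a retransmitted SYN can
	 * establish a new state.
	 */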
4125	if (((th->th_flags & (TH_SYN|TH_ACK)) == TH_SYN) &&
4126	    dst->state >= TCPS_FIN_WAIT_2 &&
4127	    src->state >= TCPS_FIN_WAIT_2) {
4128		if (V_pf_status.debug >= PF_DEBUG_MISC) {
4129			printf("pf: state reuse ");
4130			pf_print_state(*state);
4131			pf_print_flags(th->th_flags);
4132			printf("\n");
4133		}
4134		/* XXX make sure it's the same direction ?? */
4135		(*state)->src.state = (*state)->dst.state = TCPS_CLOSED;
4136		pf_unlink_state(*state, PF_ENTER_LOCKED);
4137		*state = NULL;
4138		return (PF_DROP);
4139	}
4140
4141	if ((*state)->state_flags & PFSTATE_SLOPPY) {
4142		if (pf_tcp_track_sloppy(src, dst, state, pd, reason) == PF_DROP)
4143			return (PF_DROP);
4144	} else {
4145		if (pf_tcp_track_full(src, dst, state, kif, m, off, pd, reason,
4146		    &copyback) == PF_DROP)
4147			return (PF_DROP);
4148	}
4149
4150	/* translate source/destination address, if necessary */
4151	if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
4152		struct pf_state_key *nk = (*state)->key[pd->didx];
4153
4154		if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], pd->af) ||
4155		    nk->port[pd->sidx] != th->th_sport)
4156			pf_change_ap(pd->src, &th->th_sport, pd->ip_sum,
4157			    &th->th_sum, &nk->addr[pd->sidx],
4158			    nk->port[pd->sidx], 0, pd->af);
4159
4160		if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], pd->af) ||
4161		    nk->port[pd->didx] != th->th_dport)
4162			pf_change_ap(pd->dst, &th->th_dport, pd->ip_sum,
4163			    &th->th_sum, &nk->addr[pd->didx],
4164			    nk->port[pd->didx], 0, pd->af);
4165		copyback = 1;
4166	}
4167
4168	/* Copyback sequence modulation or stateful scrub changes if needed */
4169	if (copyback)
4170		m_copyback(m, off, sizeof(*th), (caddr_t)th);
4171
4172	return (PF_PASS);
4173}
4174
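/*
 * UDP state tracking: a pseudo-connection keyed on the address/port
 * pair.  A peer is PFUDPS_SINGLE once it has sent traffic and
 * PFUDPS_MULTIPLE once the other side has answered; unanswered flows
 * age out with the shorter PFTM_UDP_SINGLE timeout.
 */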
4175static int
4176pf_test_state_udp(struct pf_state **state, int direction, struct pfi_kif *kif,
4177    struct mbuf *m, int off, void *h, struct pf_pdesc *pd)
4178{
4179	struct pf_state_peer	*src, *dst;
4180	struct pf_state_key_cmp	 key;
4181	struct udphdr		*uh = pd->hdr.udp;
4182
4183	bzero(&key, sizeof(key));
4184	key.af = pd->af;
4185	key.proto = IPPROTO_UDP;
4186	if (direction == PF_IN)	{	/* wire side, straight */
4187		PF_ACPY(&key.addr[0], pd->src, key.af);
4188		PF_ACPY(&key.addr[1], pd->dst, key.af);
4189		key.port[0] = uh->uh_sport;
4190		key.port[1] = uh->uh_dport;
4191	} else {			/* stack side, reverse */
4192		PF_ACPY(&key.addr[1], pd->src, key.af);
4193		PF_ACPY(&key.addr[0], pd->dst, key.af);
4194		key.port[1] = uh->uh_sport;
4195		key.port[0] = uh->uh_dport;
4196	}
4197
4198	STATE_LOOKUP(kif, &key, direction, *state, pd);
4199
4200	if (direction == (*state)->direction) {
4201		src = &(*state)->src;
4202		dst = &(*state)->dst;
4203	} else {
4204		src = &(*state)->dst;
4205		dst = &(*state)->src;
4206	}
4207
4208	/* update states */
4209	if (src->state < PFUDPS_SINGLE)
4210		src->state = PFUDPS_SINGLE;
4211	if (dst->state == PFUDPS_SINGLE)
4212		dst->state = PFUDPS_MULTIPLE;
4213
4214	/* update expire time */
4215	(*state)->expire = time_uptime;
4216	if (src->state == PFUDPS_MULTIPLE && dst->state == PFUDPS_MULTIPLE)
4217		(*state)->timeout = PFTM_UDP_MULTIPLE;
4218	else
4219		(*state)->timeout = PFTM_UDP_SINGLE;
4220
4221	/* translate source/destination address, if necessary */
4222	if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
4223		struct pf_state_key *nk = (*state)->key[pd->didx];
4224
4225		if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], pd->af) ||
4226		    nk->port[pd->sidx] != uh->uh_sport)
4227			pf_change_ap(pd->src, &uh->uh_sport, pd->ip_sum,
4228			    &uh->uh_sum, &nk->addr[pd->sidx],
4229			    nk->port[pd->sidx], 1, pd->af);
4230
4231		if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], pd->af) ||
4232		    nk->port[pd->didx] != uh->uh_dport)
4233			pf_change_ap(pd->dst, &uh->uh_dport, pd->ip_sum,
4234			    &uh->uh_sum, &nk->addr[pd->didx],
4235			    nk->port[pd->didx], 1, pd->af);
4236		m_copyback(m, off, sizeof(*uh), (caddr_t)uh);
4237	}
4238
4239	return (PF_PASS);
4240}
4241
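/*
 * ICMP state tracking distinguishes queries/replies, which are matched
 * against ICMP states keyed on the icmp id, from error messages, which
 * quote the packet that triggered them; for the latter the embedded
 * TCP/UDP/ICMP header is pulled out and matched against the state of
 * the original connection.
 */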
4242static int
4243pf_test_state_icmp(struct pf_state **state, int direction, struct pfi_kif *kif,
4244    struct mbuf *m, int off, void *h, struct pf_pdesc *pd, u_short *reason)
4245{
4246	struct pf_addr  *saddr = pd->src, *daddr = pd->dst;
4247	u_int16_t	 icmpid = 0, *icmpsum;
4248	u_int8_t	 icmptype;
4249	int		 state_icmp = 0;
4250	struct pf_state_key_cmp key;
4251
4252	bzero(&key, sizeof(key));
4253	switch (pd->proto) {
4254#ifdef INET
4255	case IPPROTO_ICMP:
4256		icmptype = pd->hdr.icmp->icmp_type;
4257		icmpid = pd->hdr.icmp->icmp_id;
4258		icmpsum = &pd->hdr.icmp->icmp_cksum;
4259
4260		if (icmptype == ICMP_UNREACH ||
4261		    icmptype == ICMP_SOURCEQUENCH ||
4262		    icmptype == ICMP_REDIRECT ||
4263		    icmptype == ICMP_TIMXCEED ||
4264		    icmptype == ICMP_PARAMPROB)
4265			state_icmp++;
4266		break;
4267#endif /* INET */
4268#ifdef INET6
4269	case IPPROTO_ICMPV6:
4270		icmptype = pd->hdr.icmp6->icmp6_type;
4271		icmpid = pd->hdr.icmp6->icmp6_id;
4272		icmpsum = &pd->hdr.icmp6->icmp6_cksum;
4273
4274		if (icmptype == ICMP6_DST_UNREACH ||
4275		    icmptype == ICMP6_PACKET_TOO_BIG ||
4276		    icmptype == ICMP6_TIME_EXCEEDED ||
4277		    icmptype == ICMP6_PARAM_PROB)
4278			state_icmp++;
4279		break;
4280#endif /* INET6 */
4281	}
4282
4283	if (!state_icmp) {
4284
4285		/*
4286		 * ICMP query/reply message not related to a TCP/UDP packet.
4287		 * Search for an ICMP state.
4288		 */
4289		key.af = pd->af;
4290		key.proto = pd->proto;
4291		key.port[0] = key.port[1] = icmpid;
4292		if (direction == PF_IN)	{	/* wire side, straight */
4293			PF_ACPY(&key.addr[0], pd->src, key.af);
4294			PF_ACPY(&key.addr[1], pd->dst, key.af);
4295		} else {			/* stack side, reverse */
4296			PF_ACPY(&key.addr[1], pd->src, key.af);
4297			PF_ACPY(&key.addr[0], pd->dst, key.af);
4298		}
4299
4300		STATE_LOOKUP(kif, &key, direction, *state, pd);
4301
4302		(*state)->expire = time_uptime;
4303		(*state)->timeout = PFTM_ICMP_ERROR_REPLY;
4304
4305		/* translate source/destination address, if necessary */
4306		if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
4307			struct pf_state_key *nk = (*state)->key[pd->didx];
4308
4309			switch (pd->af) {
4310#ifdef INET
4311			case AF_INET:
4312				if (PF_ANEQ(pd->src,
4313				    &nk->addr[pd->sidx], AF_INET))
4314					pf_change_a(&saddr->v4.s_addr,
4315					    pd->ip_sum,
4316					    nk->addr[pd->sidx].v4.s_addr, 0);
4317
4318				if (PF_ANEQ(pd->dst, &nk->addr[pd->didx],
4319				    AF_INET))
4320					pf_change_a(&daddr->v4.s_addr,
4321					    pd->ip_sum,
4322					    nk->addr[pd->didx].v4.s_addr, 0);
4323
4324				if (nk->port[0] !=
4325				    pd->hdr.icmp->icmp_id) {
4326					pd->hdr.icmp->icmp_cksum =
4327					    pf_cksum_fixup(
4328					    pd->hdr.icmp->icmp_cksum, icmpid,
4329					    nk->port[pd->sidx], 0);
4330					pd->hdr.icmp->icmp_id =
4331					    nk->port[pd->sidx];
4332				}
4333
4334				m_copyback(m, off, ICMP_MINLEN,
4335				    (caddr_t )pd->hdr.icmp);
4336				break;
4337#endif /* INET */
4338#ifdef INET6
4339			case AF_INET6:
4340				if (PF_ANEQ(pd->src,
4341				    &nk->addr[pd->sidx], AF_INET6))
4342					pf_change_a6(saddr,
4343					    &pd->hdr.icmp6->icmp6_cksum,
4344					    &nk->addr[pd->sidx], 0);
4345
4346				if (PF_ANEQ(pd->dst,
4347				    &nk->addr[pd->didx], AF_INET6))
4348					pf_change_a6(daddr,
4349					    &pd->hdr.icmp6->icmp6_cksum,
4350					    &nk->addr[pd->didx], 0);
4351
4352				m_copyback(m, off, sizeof(struct icmp6_hdr),
4353				    (caddr_t )pd->hdr.icmp6);
4354				break;
4355#endif /* INET6 */
4356			}
4357		}
4358		return (PF_PASS);
4359
4360	} else {
4361		/*
4362		 * ICMP error message in response to a TCP/UDP packet.
4363		 * Extract the inner TCP/UDP header and search for that state.
4364		 */
4365
4366		struct pf_pdesc	pd2;
4367		bzero(&pd2, sizeof pd2);
4368#ifdef INET
4369		struct ip	h2;
4370#endif /* INET */
4371#ifdef INET6
4372		struct ip6_hdr	h2_6;
4373		int		terminal = 0;
4374#endif /* INET6 */
4375		int		ipoff2 = 0;
4376		int		off2 = 0;
4377
4378		pd2.af = pd->af;
4379		/* Payload packet is from the opposite direction. */
4380		pd2.sidx = (direction == PF_IN) ? 1 : 0;
4381		pd2.didx = (direction == PF_IN) ? 0 : 1;
4382		switch (pd->af) {
4383#ifdef INET
4384		case AF_INET:
4385			/* offset of h2 in mbuf chain */
4386			ipoff2 = off + ICMP_MINLEN;
4387
4388			if (!pf_pull_hdr(m, ipoff2, &h2, sizeof(h2),
4389			    NULL, reason, pd2.af)) {
4390				DPFPRINTF(PF_DEBUG_MISC,
4391				    ("pf: ICMP error message too short "
4392				    "(ip)\n"));
4393				return (PF_DROP);
4394			}
4395			/*
4396			 * ICMP error messages don't refer to non-first
4397			 * fragments
4398			 */
4399			if (h2.ip_off & htons(IP_OFFMASK)) {
4400				REASON_SET(reason, PFRES_FRAG);
4401				return (PF_DROP);
4402			}
4403
4404			/* offset of protocol header that follows h2 */
4405			off2 = ipoff2 + (h2.ip_hl << 2);
4406
4407			pd2.proto = h2.ip_p;
4408			pd2.src = (struct pf_addr *)&h2.ip_src;
4409			pd2.dst = (struct pf_addr *)&h2.ip_dst;
4410			pd2.ip_sum = &h2.ip_sum;
4411			break;
4412#endif /* INET */
4413#ifdef INET6
4414		case AF_INET6:
4415			ipoff2 = off + sizeof(struct icmp6_hdr);
4416
4417			if (!pf_pull_hdr(m, ipoff2, &h2_6, sizeof(h2_6),
4418			    NULL, reason, pd2.af)) {
4419				DPFPRINTF(PF_DEBUG_MISC,
4420				    ("pf: ICMP error message too short "
4421				    "(ip6)\n"));
4422				return (PF_DROP);
4423			}
4424			pd2.proto = h2_6.ip6_nxt;
4425			pd2.src = (struct pf_addr *)&h2_6.ip6_src;
4426			pd2.dst = (struct pf_addr *)&h2_6.ip6_dst;
4427			pd2.ip_sum = NULL;
4428			off2 = ipoff2 + sizeof(h2_6);
4429			do {
4430				switch (pd2.proto) {
4431				case IPPROTO_FRAGMENT:
4432					/*
4433					 * ICMPv6 error messages don't refer
4434					 * to non-first fragments
4435					 */
4436					REASON_SET(reason, PFRES_FRAG);
4437					return (PF_DROP);
4438				case IPPROTO_AH:
4439				case IPPROTO_HOPOPTS:
4440				case IPPROTO_ROUTING:
4441				case IPPROTO_DSTOPTS: {
4442					/* get next header and header length */
4443					struct ip6_ext opt6;
4444
4445					if (!pf_pull_hdr(m, off2, &opt6,
4446					    sizeof(opt6), NULL, reason,
4447					    pd2.af)) {
4448						DPFPRINTF(PF_DEBUG_MISC,
4449						    ("pf: ICMPv6 short opt\n"));
4450						return (PF_DROP);
4451					}
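					/*
					 * AH encodes its length in 32-bit
					 * words minus two; the other
					 * extension headers use 64-bit
					 * units minus one.
					 */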
4452					if (pd2.proto == IPPROTO_AH)
4453						off2 += (opt6.ip6e_len + 2) * 4;
4454					else
4455						off2 += (opt6.ip6e_len + 1) * 8;
4456					pd2.proto = opt6.ip6e_nxt;
4457					/* go to the next header */
4458					break;
4459				}
4460				default:
4461					terminal++;
4462					break;
4463				}
4464			} while (!terminal);
4465			break;
4466#endif /* INET6 */
4467		}
4468
4469		switch (pd2.proto) {
4470		case IPPROTO_TCP: {
4471			struct tcphdr		 th;
4472			u_int32_t		 seq;
4473			struct pf_state_peer	*src, *dst;
4474			u_int8_t		 dws;
4475			int			 copyback = 0;
4476
4477			/*
4478			 * Only the first 8 bytes of the TCP header are
4479			 * guaranteed to be present.  Don't access any TCP header
4480			 * fields after th_seq; an ackskew test is not possible.
4481			 */
4482			if (!pf_pull_hdr(m, off2, &th, 8, NULL, reason,
4483			    pd2.af)) {
4484				DPFPRINTF(PF_DEBUG_MISC,
4485				    ("pf: ICMP error message too short "
4486				    "(tcp)\n"));
4487				return (PF_DROP);
4488			}
4489
4490			key.af = pd2.af;
4491			key.proto = IPPROTO_TCP;
4492			PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
4493			PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
4494			key.port[pd2.sidx] = th.th_sport;
4495			key.port[pd2.didx] = th.th_dport;
4496
4497			STATE_LOOKUP(kif, &key, direction, *state, pd);
4498
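			/*
			 * The quoted packet travelled in the opposite
			 * direction to the ICMP error carrying it, so the
			 * peer roles are deliberately swapped here compared
			 * with pf_test_state_tcp().
			 */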
4499			if (direction == (*state)->direction) {
4500				src = &(*state)->dst;
4501				dst = &(*state)->src;
4502			} else {
4503				src = &(*state)->src;
4504				dst = &(*state)->dst;
4505			}
4506
4507			if (src->wscale && dst->wscale)
4508				dws = dst->wscale & PF_WSCALE_MASK;
4509			else
4510				dws = 0;
4511
4512			/* Demodulate sequence number */
4513			seq = ntohl(th.th_seq) - src->seqdiff;
4514			if (src->seqdiff) {
4515				pf_change_a(&th.th_seq, icmpsum,
4516				    htonl(seq), 0);
4517				copyback = 1;
4518			}
4519
4520			if (!((*state)->state_flags & PFSTATE_SLOPPY) &&
4521			    (!SEQ_GEQ(src->seqhi, seq) ||
4522			    !SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)))) {
4523				if (V_pf_status.debug >= PF_DEBUG_MISC) {
4524					printf("pf: BAD ICMP %d:%d ",
4525					    icmptype, pd->hdr.icmp->icmp_code);
4526					pf_print_host(pd->src, 0, pd->af);
4527					printf(" -> ");
4528					pf_print_host(pd->dst, 0, pd->af);
4529					printf(" state: ");
4530					pf_print_state(*state);
4531					printf(" seq=%u\n", seq);
4532				}
4533				REASON_SET(reason, PFRES_BADSTATE);
4534				return (PF_DROP);
4535			} else {
4536				if (V_pf_status.debug >= PF_DEBUG_MISC) {
4537					printf("pf: OK ICMP %d:%d ",
4538					    icmptype, pd->hdr.icmp->icmp_code);
4539					pf_print_host(pd->src, 0, pd->af);
4540					printf(" -> ");
4541					pf_print_host(pd->dst, 0, pd->af);
4542					printf(" state: ");
4543					pf_print_state(*state);
4544					printf(" seq=%u\n", seq);
4545				}
4546			}
4547
4548			/* translate source/destination address, if necessary */
4549			if ((*state)->key[PF_SK_WIRE] !=
4550			    (*state)->key[PF_SK_STACK]) {
4551				struct pf_state_key *nk =
4552				    (*state)->key[pd->didx];
4553
4554				if (PF_ANEQ(pd2.src,
4555				    &nk->addr[pd2.sidx], pd2.af) ||
4556				    nk->port[pd2.sidx] != th.th_sport)
4557					pf_change_icmp(pd2.src, &th.th_sport,
4558					    daddr, &nk->addr[pd2.sidx],
4559					    nk->port[pd2.sidx], NULL,
4560					    pd2.ip_sum, icmpsum,
4561					    pd->ip_sum, 0, pd2.af);
4562
4563				if (PF_ANEQ(pd2.dst,
4564				    &nk->addr[pd2.didx], pd2.af) ||
4565				    nk->port[pd2.didx] != th.th_dport)
4566					pf_change_icmp(pd2.dst, &th.th_dport,
4567					    NULL, /* XXX Inbound NAT? */
4568					    &nk->addr[pd2.didx],
4569					    nk->port[pd2.didx], NULL,
4570					    pd2.ip_sum, icmpsum,
4571					    pd->ip_sum, 0, pd2.af);
4572				copyback = 1;
4573			}
4574
4575			if (copyback) {
4576				switch (pd2.af) {
4577#ifdef INET
4578				case AF_INET:
4579					m_copyback(m, off, ICMP_MINLEN,
4580					    (caddr_t )pd->hdr.icmp);
4581					m_copyback(m, ipoff2, sizeof(h2),
4582					    (caddr_t )&h2);
4583					break;
4584#endif /* INET */
4585#ifdef INET6
4586				case AF_INET6:
4587					m_copyback(m, off,
4588					    sizeof(struct icmp6_hdr),
4589					    (caddr_t )pd->hdr.icmp6);
4590					m_copyback(m, ipoff2, sizeof(h2_6),
4591					    (caddr_t )&h2_6);
4592					break;
4593#endif /* INET6 */
4594				}
4595				m_copyback(m, off2, 8, (caddr_t)&th);
4596			}
4597
4598			return (PF_PASS);
4599			break;
4600		}
4601		case IPPROTO_UDP: {
4602			struct udphdr		uh;
4603
4604			if (!pf_pull_hdr(m, off2, &uh, sizeof(uh),
4605			    NULL, reason, pd2.af)) {
4606				DPFPRINTF(PF_DEBUG_MISC,
4607				    ("pf: ICMP error message too short "
4608				    "(udp)\n"));
4609				return (PF_DROP);
4610			}
4611
4612			key.af = pd2.af;
4613			key.proto = IPPROTO_UDP;
4614			PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
4615			PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
4616			key.port[pd2.sidx] = uh.uh_sport;
4617			key.port[pd2.didx] = uh.uh_dport;
4618
4619			STATE_LOOKUP(kif, &key, direction, *state, pd);
4620
4621			/* translate source/destination address, if necessary */
4622			if ((*state)->key[PF_SK_WIRE] !=
4623			    (*state)->key[PF_SK_STACK]) {
4624				struct pf_state_key *nk =
4625				    (*state)->key[pd->didx];
4626
4627				if (PF_ANEQ(pd2.src,
4628				    &nk->addr[pd2.sidx], pd2.af) ||
4629				    nk->port[pd2.sidx] != uh.uh_sport)
4630					pf_change_icmp(pd2.src, &uh.uh_sport,
4631					    daddr, &nk->addr[pd2.sidx],
4632					    nk->port[pd2.sidx], &uh.uh_sum,
4633					    pd2.ip_sum, icmpsum,
4634					    pd->ip_sum, 1, pd2.af);
4635
4636				if (PF_ANEQ(pd2.dst,
4637				    &nk->addr[pd2.didx], pd2.af) ||
4638				    nk->port[pd2.didx] != uh.uh_dport)
4639					pf_change_icmp(pd2.dst, &uh.uh_dport,
4640					    NULL, /* XXX Inbound NAT? */
4641					    &nk->addr[pd2.didx],
4642					    nk->port[pd2.didx], &uh.uh_sum,
4643					    pd2.ip_sum, icmpsum,
4644					    pd->ip_sum, 1, pd2.af);
4645
4646				switch (pd2.af) {
4647#ifdef INET
4648				case AF_INET:
4649					m_copyback(m, off, ICMP_MINLEN,
4650					    (caddr_t )pd->hdr.icmp);
4651					m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2);
4652					break;
4653#endif /* INET */
4654#ifdef INET6
4655				case AF_INET6:
4656					m_copyback(m, off,
4657					    sizeof(struct icmp6_hdr),
4658					    (caddr_t )pd->hdr.icmp6);
4659					m_copyback(m, ipoff2, sizeof(h2_6),
4660					    (caddr_t )&h2_6);
4661					break;
4662#endif /* INET6 */
4663				}
4664				m_copyback(m, off2, sizeof(uh), (caddr_t)&uh);
4665			}
4666			return (PF_PASS);
4667			break;
4668		}
4669#ifdef INET
4670		case IPPROTO_ICMP: {
4671			struct icmp		iih;
4672
4673			if (!pf_pull_hdr(m, off2, &iih, ICMP_MINLEN,
4674			    NULL, reason, pd2.af)) {
4675				DPFPRINTF(PF_DEBUG_MISC,
4676				    ("pf: ICMP error message too short "
4677				    "(icmp)\n"));
4678				return (PF_DROP);
4679			}
4680
4681			key.af = pd2.af;
4682			key.proto = IPPROTO_ICMP;
4683			PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
4684			PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
4685			key.port[0] = key.port[1] = iih.icmp_id;
4686
4687			STATE_LOOKUP(kif, &key, direction, *state, pd);
4688
4689			/* translate source/destination address, if necessary */
4690			if ((*state)->key[PF_SK_WIRE] !=
4691			    (*state)->key[PF_SK_STACK]) {
4692				struct pf_state_key *nk =
4693				    (*state)->key[pd->didx];
4694
4695				if (PF_ANEQ(pd2.src,
4696				    &nk->addr[pd2.sidx], pd2.af) ||
4697				    nk->port[pd2.sidx] != iih.icmp_id)
4698					pf_change_icmp(pd2.src, &iih.icmp_id,
4699					    daddr, &nk->addr[pd2.sidx],
4700					    nk->port[pd2.sidx], NULL,
4701					    pd2.ip_sum, icmpsum,
4702					    pd->ip_sum, 0, AF_INET);
4703
4704				if (PF_ANEQ(pd2.dst,
4705				    &nk->addr[pd2.didx], pd2.af) ||
4706				    nk->port[pd2.didx] != iih.icmp_id)
4707					pf_change_icmp(pd2.dst, &iih.icmp_id,
4708					    NULL, /* XXX Inbound NAT? */
4709					    &nk->addr[pd2.didx],
4710					    nk->port[pd2.didx], NULL,
4711					    pd2.ip_sum, icmpsum,
4712					    pd->ip_sum, 0, AF_INET);
4713
4714				m_copyback(m, off, ICMP_MINLEN, (caddr_t)pd->hdr.icmp);
4715				m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2);
4716				m_copyback(m, off2, ICMP_MINLEN, (caddr_t)&iih);
4717			}
4718			return (PF_PASS);
4719			break;
4720		}
4721#endif /* INET */
4722#ifdef INET6
4723		case IPPROTO_ICMPV6: {
4724			struct icmp6_hdr	iih;
4725
4726			if (!pf_pull_hdr(m, off2, &iih,
4727			    sizeof(struct icmp6_hdr), NULL, reason, pd2.af)) {
4728				DPFPRINTF(PF_DEBUG_MISC,
4729				    ("pf: ICMP error message too short "
4730				    "(icmp6)\n"));
4731				return (PF_DROP);
4732			}
4733
4734			key.af = pd2.af;
4735			key.proto = IPPROTO_ICMPV6;
4736			PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
4737			PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
4738			key.port[0] = key.port[1] = iih.icmp6_id;
4739
4740			STATE_LOOKUP(kif, &key, direction, *state, pd);
4741
4742			/* translate source/destination address, if necessary */
4743			if ((*state)->key[PF_SK_WIRE] !=
4744			    (*state)->key[PF_SK_STACK]) {
4745				struct pf_state_key *nk =
4746				    (*state)->key[pd->didx];
4747
4748				if (PF_ANEQ(pd2.src,
4749				    &nk->addr[pd2.sidx], pd2.af) ||
4750				    nk->port[pd2.sidx] != iih.icmp6_id)
4751					pf_change_icmp(pd2.src, &iih.icmp6_id,
4752					    daddr, &nk->addr[pd2.sidx],
4753					    nk->port[pd2.sidx], NULL,
4754					    pd2.ip_sum, icmpsum,
4755					    pd->ip_sum, 0, AF_INET6);
4756
4757				if (PF_ANEQ(pd2.dst,
4758				    &nk->addr[pd2.didx], pd2.af) ||
4759				    nk->port[pd2.didx] != iih.icmp6_id)
4760					pf_change_icmp(pd2.dst, &iih.icmp6_id,
4761					    NULL, /* XXX Inbound NAT? */
4762					    &nk->addr[pd2.didx],
4763					    nk->port[pd2.didx], NULL,
4764					    pd2.ip_sum, icmpsum,
4765					    pd->ip_sum, 0, AF_INET6);
4766
4767				m_copyback(m, off, sizeof(struct icmp6_hdr),
4768				    (caddr_t)pd->hdr.icmp6);
4769				m_copyback(m, ipoff2, sizeof(h2_6), (caddr_t)&h2_6);
4770				m_copyback(m, off2, sizeof(struct icmp6_hdr),
4771				    (caddr_t)&iih);
4772			}
4773			return (PF_PASS);
4774			break;
4775		}
4776#endif /* INET6 */
4777		default: {
4778			key.af = pd2.af;
4779			key.proto = pd2.proto;
4780			PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
4781			PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
4782			key.port[0] = key.port[1] = 0;
4783
4784			STATE_LOOKUP(kif, &key, direction, *state, pd);
4785
4786			/* translate source/destination address, if necessary */
4787			if ((*state)->key[PF_SK_WIRE] !=
4788			    (*state)->key[PF_SK_STACK]) {
4789				struct pf_state_key *nk =
4790				    (*state)->key[pd->didx];
4791
4792				if (PF_ANEQ(pd2.src,
4793				    &nk->addr[pd2.sidx], pd2.af))
4794					pf_change_icmp(pd2.src, NULL, daddr,
4795					    &nk->addr[pd2.sidx], 0, NULL,
4796					    pd2.ip_sum, icmpsum,
4797					    pd->ip_sum, 0, pd2.af);
4798
4799				if (PF_ANEQ(pd2.dst,
4800				    &nk->addr[pd2.didx], pd2.af))
4801					pf_change_icmp(pd2.dst, NULL,
4802					    NULL, /* XXX Inbound NAT? */
4803					    &nk->addr[pd2.didx], 0, NULL,
4804					    pd2.ip_sum, icmpsum,
4805					    pd->ip_sum, 0, pd2.af);
4806
4807				switch (pd2.af) {
4808#ifdef INET
4809				case AF_INET:
4810					m_copyback(m, off, ICMP_MINLEN,
4811					    (caddr_t)pd->hdr.icmp);
4812					m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2);
4813					break;
4814#endif /* INET */
4815#ifdef INET6
4816				case AF_INET6:
4817					m_copyback(m, off,
4818					    sizeof(struct icmp6_hdr),
4819					    (caddr_t )pd->hdr.icmp6);
4820					m_copyback(m, ipoff2, sizeof(h2_6),
4821					    (caddr_t )&h2_6);
4822					break;
4823#endif /* INET6 */
4824				}
4825			}
4826			return (PF_PASS);
4827			break;
4828		}
4829		}
4830	}
4831}
4832
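/*
 * State tracking for protocols other than TCP, UDP and ICMP: states are
 * keyed on the address pair alone and aged with the PFOTHERS_SINGLE/
 * PFOTHERS_MULTIPLE timeouts, analogous to the UDP case above.
 */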
4833static int
4834pf_test_state_other(struct pf_state **state, int direction, struct pfi_kif *kif,
4835    struct mbuf *m, struct pf_pdesc *pd)
4836{
4837	struct pf_state_peer	*src, *dst;
4838	struct pf_state_key_cmp	 key;
4839
4840	bzero(&key, sizeof(key));
4841	key.af = pd->af;
4842	key.proto = pd->proto;
4843	if (direction == PF_IN)	{
4844		PF_ACPY(&key.addr[0], pd->src, key.af);
4845		PF_ACPY(&key.addr[1], pd->dst, key.af);
4846		key.port[0] = key.port[1] = 0;
4847	} else {
4848		PF_ACPY(&key.addr[1], pd->src, key.af);
4849		PF_ACPY(&key.addr[0], pd->dst, key.af);
4850		key.port[1] = key.port[0] = 0;
4851	}
4852
4853	STATE_LOOKUP(kif, &key, direction, *state, pd);
4854
4855	if (direction == (*state)->direction) {
4856		src = &(*state)->src;
4857		dst = &(*state)->dst;
4858	} else {
4859		src = &(*state)->dst;
4860		dst = &(*state)->src;
4861	}
4862
4863	/* update states */
4864	if (src->state < PFOTHERS_SINGLE)
4865		src->state = PFOTHERS_SINGLE;
4866	if (dst->state == PFOTHERS_SINGLE)
4867		dst->state = PFOTHERS_MULTIPLE;
4868
4869	/* update expire time */
4870	(*state)->expire = time_uptime;
4871	if (src->state == PFOTHERS_MULTIPLE && dst->state == PFOTHERS_MULTIPLE)
4872		(*state)->timeout = PFTM_OTHER_MULTIPLE;
4873	else
4874		(*state)->timeout = PFTM_OTHER_SINGLE;
4875
4876	/* translate source/destination address, if necessary */
4877	if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
4878		struct pf_state_key *nk = (*state)->key[pd->didx];
4879
4880		KASSERT(nk, ("%s: nk is null", __func__));
4881		KASSERT(pd, ("%s: pd is null", __func__));
4882		KASSERT(pd->src, ("%s: pd->src is null", __func__));
4883		KASSERT(pd->dst, ("%s: pd->dst is null", __func__));
4884		switch (pd->af) {
4885#ifdef INET
4886		case AF_INET:
4887			if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], AF_INET))
4888				pf_change_a(&pd->src->v4.s_addr,
4889				    pd->ip_sum,
4890				    nk->addr[pd->sidx].v4.s_addr,
4891				    0);
4892
4893
4894			if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], AF_INET))
4895				pf_change_a(&pd->dst->v4.s_addr,
4896				    pd->ip_sum,
4897				    nk->addr[pd->didx].v4.s_addr,
4898				    0);
4899
4900			break;
4901#endif /* INET */
4902#ifdef INET6
4903		case AF_INET6:
4904			if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], AF_INET6))
4905				PF_ACPY(pd->src, &nk->addr[pd->sidx], pd->af);
4906
4907			if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], AF_INET6))
4908				PF_ACPY(pd->dst, &nk->addr[pd->didx], pd->af);
4909#endif /* INET6 */
4910		}
4911	}
4912	return (PF_PASS);
4913}
4914
4915/*
4916 * ipoff and off are measured from the start of the mbuf chain.
4917 * h must be at "ipoff" on the mbuf chain.
4918 */
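/*
 * If the packet is itself a non-first fragment, the header cannot be
 * pulled: NULL is returned and the action is set to PF_PASS when the
 * fragment starts beyond the requested bytes, or to PF_DROP with
 * PFRES_FRAG otherwise.
 */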
4919void *
4920pf_pull_hdr(struct mbuf *m, int off, void *p, int len,
4921    u_short *actionp, u_short *reasonp, sa_family_t af)
4922{
4923	switch (af) {
4924#ifdef INET
4925	case AF_INET: {
4926		struct ip	*h = mtod(m, struct ip *);
4927		u_int16_t	 fragoff = (ntohs(h->ip_off) & IP_OFFMASK) << 3;
4928
4929		if (fragoff) {
4930			if (fragoff >= len)
4931				ACTION_SET(actionp, PF_PASS);
4932			else {
4933				ACTION_SET(actionp, PF_DROP);
4934				REASON_SET(reasonp, PFRES_FRAG);
4935			}
4936			return (NULL);
4937		}
4938		if (m->m_pkthdr.len < off + len ||
4939		    ntohs(h->ip_len) < off + len) {
4940			ACTION_SET(actionp, PF_DROP);
4941			REASON_SET(reasonp, PFRES_SHORT);
4942			return (NULL);
4943		}
4944		break;
4945	}
4946#endif /* INET */
4947#ifdef INET6
4948	case AF_INET6: {
4949		struct ip6_hdr	*h = mtod(m, struct ip6_hdr *);
4950
4951		if (m->m_pkthdr.len < off + len ||
4952		    (ntohs(h->ip6_plen) + sizeof(struct ip6_hdr)) <
4953		    (unsigned)(off + len)) {
4954			ACTION_SET(actionp, PF_DROP);
4955			REASON_SET(reasonp, PFRES_SHORT);
4956			return (NULL);
4957		}
4958		break;
4959	}
4960#endif /* INET6 */
4961	}
4962	m_copydata(m, off, len, p);
4963	return (p);
4964}
4965
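/*
 * Returns 1 if addr is routable and, when an input interface is given,
 * if the route back to addr leaves via that interface (a uRPF check);
 * returns 0 otherwise.
 */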
4966int
4967pf_routable(struct pf_addr *addr, sa_family_t af, struct pfi_kif *kif,
4968    int rtableid)
4969{
4970#ifdef RADIX_MPATH
4971	struct radix_node_head	*rnh;
4972#endif
4973	struct sockaddr_in	*dst;
4974	int			 ret = 1;
4975	int			 check_mpath;
4976#ifdef INET6
4977	struct sockaddr_in6	*dst6;
4978	struct route_in6	 ro;
4979#else
4980	struct route		 ro;
4981#endif
4982	struct radix_node	*rn;
4983	struct rtentry		*rt;
4984	struct ifnet		*ifp;
4985
4986	check_mpath = 0;
4987#ifdef RADIX_MPATH
4988	/* XXX: stick to table 0 for now */
4989	rnh = rt_tables_get_rnh(0, af);
4990	if (rnh != NULL && rn_mpath_capable(rnh))
4991		check_mpath = 1;
4992#endif
4993	bzero(&ro, sizeof(ro));
4994	switch (af) {
4995	case AF_INET:
4996		dst = satosin(&ro.ro_dst);
4997		dst->sin_family = AF_INET;
4998		dst->sin_len = sizeof(*dst);
4999		dst->sin_addr = addr->v4;
5000		break;
5001#ifdef INET6
5002	case AF_INET6:
5003		/*
5004		 * Skip check for addresses with embedded interface scope,
5005		 * as they would always match anyway.
5006		 */
5007		if (IN6_IS_SCOPE_EMBED(&addr->v6))
5008			goto out;
5009		dst6 = (struct sockaddr_in6 *)&ro.ro_dst;
5010		dst6->sin6_family = AF_INET6;
5011		dst6->sin6_len = sizeof(*dst6);
5012		dst6->sin6_addr = addr->v6;
5013		break;
5014#endif /* INET6 */
5015	default:
5016		return (0);
5017	}
5018
5019	/* Skip checks for ipsec interfaces */
5020	if (kif != NULL && kif->pfik_ifp->if_type == IFT_ENC)
5021		goto out;
5022
5023	switch (af) {
5024#ifdef INET6
5025	case AF_INET6:
5026		in6_rtalloc_ign(&ro, 0, rtableid);
5027		break;
5028#endif
5029#ifdef INET
5030	case AF_INET:
5031		in_rtalloc_ign((struct route *)&ro, 0, rtableid);
5032		break;
5033#endif
5034	default:
5035		rtalloc_ign((struct route *)&ro, 0);	/* No/default FIB. */
5036		break;
5037	}
5038
5039	if (ro.ro_rt != NULL) {
5040		/* No interface given, this is a no-route check */
5041		if (kif == NULL)
5042			goto out;
5043
5044		if (kif->pfik_ifp == NULL) {
5045			ret = 0;
5046			goto out;
5047		}
5048
5049		/* Perform uRPF check if passed input interface */
5050		ret = 0;
5051		rn = (struct radix_node *)ro.ro_rt;
5052		do {
5053			rt = (struct rtentry *)rn;
5054			ifp = rt->rt_ifp;
5055
5056			if (kif->pfik_ifp == ifp)
5057				ret = 1;
5058#ifdef RADIX_MPATH
5059			rn = rn_mpath_next(rn);
5060#endif
5061		} while (check_mpath == 1 && rn != NULL && ret == 0);
5062	} else
5063		ret = 0;
5064out:
5065	if (ro.ro_rt != NULL)
5066		RTFREE(ro.ro_rt);
5067	return (ret);
5068}
5069
5070#ifdef INET
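/*
 * pf_route() implements the route-to/reply-to/dup-to/fastroute rule
 * options for IPv4: the packet is taken out of the normal forwarding
 * path and sent via the interface and gateway chosen from the rule's
 * address pool (or cached in the state's rt_addr/rt_kif); fastroute
 * does a plain routing lookup instead.
 */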
5071static void
5072pf_route(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp,
5073    struct pf_state *s, struct pf_pdesc *pd)
5074{
5075	struct mbuf		*m0, *m1;
5076	struct sockaddr_in	dst;
5077	struct ip		*ip;
5078	struct ifnet		*ifp = NULL;
5079	struct pf_addr		 naddr;
5080	struct pf_src_node	*sn = NULL;
5081	int			 error = 0;
5082	int sw_csum;
5083
5084	KASSERT(m && *m && r && oifp, ("%s: invalid parameters", __func__));
5085	KASSERT(dir == PF_IN || dir == PF_OUT, ("%s: invalid direction",
5086	    __func__));
5087
5088	if ((pd->pf_mtag == NULL &&
5089	    ((pd->pf_mtag = pf_get_mtag(*m)) == NULL)) ||
5090	    pd->pf_mtag->routed++ > 3) {
5091		m0 = *m;
5092		*m = NULL;
5093		goto bad_locked;
5094	}
5095
5096	if (r->rt == PF_DUPTO) {
5097		if ((m0 = m_dup(*m, M_NOWAIT)) == NULL) {
5098			if (s)
5099				PF_STATE_UNLOCK(s);
5100			return;
5101		}
5102	} else {
5103		if ((r->rt == PF_REPLYTO) == (r->direction == dir)) {
5104			if (s)
5105				PF_STATE_UNLOCK(s);
5106			return;
5107		}
5108		m0 = *m;
5109	}
5110
5111	ip = mtod(m0, struct ip *);
5112
5113	bzero(&dst, sizeof(dst));
5114	dst.sin_family = AF_INET;
5115	dst.sin_len = sizeof(dst);
5116	dst.sin_addr = ip->ip_dst;
5117
5118	if (r->rt == PF_FASTROUTE) {
5119		struct rtentry *rt;
5120
5121		if (s)
5122			PF_STATE_UNLOCK(s);
5123		rt = rtalloc1_fib(sintosa(&dst), 0, 0, M_GETFIB(m0));
5124		if (rt == NULL) {
5126			KMOD_IPSTAT_INC(ips_noroute);
5127			error = EHOSTUNREACH;
5128			goto bad;
5129		}
5130
5131		ifp = rt->rt_ifp;
5132		rt->rt_rmx.rmx_pksent++;
5133
5134		if (rt->rt_flags & RTF_GATEWAY)
5135			bcopy(satosin(rt->rt_gateway), &dst, sizeof(dst));
5136		RTFREE_LOCKED(rt);
5137	} else {
5138		if (TAILQ_EMPTY(&r->rpool.list)) {
5139			DPFPRINTF(PF_DEBUG_URGENT,
5140			    ("%s: TAILQ_EMPTY(&r->rpool.list)\n", __func__));
5141			goto bad_locked;
5142		}
5143		if (s == NULL) {
5144			pf_map_addr(AF_INET, r, (struct pf_addr *)&ip->ip_src,
5145			    &naddr, NULL, &sn);
5146			if (!PF_AZERO(&naddr, AF_INET))
5147				dst.sin_addr.s_addr = naddr.v4.s_addr;
5148			ifp = r->rpool.cur->kif ?
5149			    r->rpool.cur->kif->pfik_ifp : NULL;
5150		} else {
5151			if (!PF_AZERO(&s->rt_addr, AF_INET))
5152				dst.sin_addr.s_addr =
5153				    s->rt_addr.v4.s_addr;
5154			ifp = s->rt_kif ? s->rt_kif->pfik_ifp : NULL;
5155			PF_STATE_UNLOCK(s);
5156		}
5157	}
5158	if (ifp == NULL)
5159		goto bad;
5160
5161	if (oifp != ifp) {
5162		if (pf_test(PF_OUT, ifp, &m0, NULL) != PF_PASS)
5163			goto bad;
5164		else if (m0 == NULL)
5165			goto done;
5166		if (m0->m_len < sizeof(struct ip)) {
5167			DPFPRINTF(PF_DEBUG_URGENT,
5168			    ("%s: m0->m_len < sizeof(struct ip)\n", __func__));
5169			goto bad;
5170		}
5171		ip = mtod(m0, struct ip *);
5172	}
5173
5174	if (ifp->if_flags & IFF_LOOPBACK)
5175		m0->m_flags |= M_SKIP_FIREWALL;
5176
5177	/* Back to host byte order. */
5178	ip->ip_len = ntohs(ip->ip_len);
5179	ip->ip_off = ntohs(ip->ip_off);
5180
5181	/* Copied from FreeBSD 10.0-CURRENT ip_output. */
5182	m0->m_pkthdr.csum_flags |= CSUM_IP;
5183	sw_csum = m0->m_pkthdr.csum_flags & ~ifp->if_hwassist;
5184	if (sw_csum & CSUM_DELAY_DATA) {
5185		in_delayed_cksum(m0);
5186		sw_csum &= ~CSUM_DELAY_DATA;
5187	}
5188#ifdef SCTP
5189	if (sw_csum & CSUM_SCTP) {
5190		sctp_delayed_cksum(m0, (uint32_t)(ip->ip_hl << 2));
5191		sw_csum &= ~CSUM_SCTP;
5192	}
5193#endif
5194	m0->m_pkthdr.csum_flags &= ifp->if_hwassist;
5195
5196	/*
5197	 * If the packet is small enough for the interface, or the interface
5198	 * will take care of fragmentation for us, we can just send directly.
5199	 */
5200	if (ip->ip_len <= ifp->if_mtu ||
5201	    (m0->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0 ||
5202	    ((ip->ip_off & IP_DF) == 0 && (ifp->if_hwassist & CSUM_FRAGMENT))) {
5203		ip->ip_len = htons(ip->ip_len);
5204		ip->ip_off = htons(ip->ip_off);
5205		ip->ip_sum = 0;
5206		if (sw_csum & CSUM_DELAY_IP)
5207			ip->ip_sum = in_cksum(m0, ip->ip_hl << 2);
5208		m0->m_flags &= ~(M_PROTOFLAGS);
5209		error = (*ifp->if_output)(ifp, m0, sintosa(&dst), NULL);
5210		goto done;
5211	}
5212
5213	/* Balk when the DF bit is set or the interface doesn't support TSO. */
5214	if ((ip->ip_off & IP_DF) || (m0->m_pkthdr.csum_flags & CSUM_TSO)) {
5215		error = EMSGSIZE;
5216		KMOD_IPSTAT_INC(ips_cantfrag);
5217		if (r->rt != PF_DUPTO) {
5218			icmp_error(m0, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG, 0,
5219			    ifp->if_mtu);
5220			goto done;
5221		} else
5222			goto bad;
5223	}
5224
5225	error = ip_fragment(ip, &m0, ifp->if_mtu, ifp->if_hwassist, sw_csum);
5226	if (error)
5227		goto bad;
5228
5229	for (; m0; m0 = m1) {
5230		m1 = m0->m_nextpkt;
5231		m0->m_nextpkt = NULL;
5232		if (error == 0) {
5233			m0->m_flags &= ~(M_PROTOFLAGS);
5234			error = (*ifp->if_output)(ifp, m0, sintosa(&dst), NULL);
5235		} else
5236			m_freem(m0);
5237	}
5238
5239	if (error == 0)
5240		KMOD_IPSTAT_INC(ips_fragmented);
5241
5242done:
5243	if (r->rt != PF_DUPTO)
5244		*m = NULL;
5245	return;
5246
5247bad_locked:
5248	if (s)
5249		PF_STATE_UNLOCK(s);
5250bad:
5251	m_freem(m0);
5252	goto done;
5253}
5254#endif /* INET */
5255
5256#ifdef INET6
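/*
 * IPv6 counterpart of pf_route().  No fragmentation is done here;
 * oversized packets are answered with an ICMP6_PACKET_TOO_BIG error
 * instead.
 */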
5257static void
5258pf_route6(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp,
5259    struct pf_state *s, struct pf_pdesc *pd)
5260{
5261	struct mbuf		*m0;
5262	struct sockaddr_in6	dst;
5263	struct ip6_hdr		*ip6;
5264	struct ifnet		*ifp = NULL;
5265	struct pf_addr		 naddr;
5266	struct pf_src_node	*sn = NULL;
5267
5268	KASSERT(m && *m && r && oifp, ("%s: invalid parameters", __func__));
5269	KASSERT(dir == PF_IN || dir == PF_OUT, ("%s: invalid direction",
5270	    __func__));
5271
5272	if ((pd->pf_mtag == NULL &&
5273	    ((pd->pf_mtag = pf_get_mtag(*m)) == NULL)) ||
5274	    pd->pf_mtag->routed++ > 3) {
5275		m0 = *m;
5276		*m = NULL;
5277		goto bad_locked;
5278	}
5279
5280	if (r->rt == PF_DUPTO) {
5281		if ((m0 = m_dup(*m, M_NOWAIT)) == NULL) {
5282			if (s)
5283				PF_STATE_UNLOCK(s);
5284			return;
5285		}
5286	} else {
5287		if ((r->rt == PF_REPLYTO) == (r->direction == dir)) {
5288			if (s)
5289				PF_STATE_UNLOCK(s);
5290			return;
5291		}
5292		m0 = *m;
5293	}
5294
5295	ip6 = mtod(m0, struct ip6_hdr *);
5296
5297	bzero(&dst, sizeof(dst));
5298	dst.sin6_family = AF_INET6;
5299	dst.sin6_len = sizeof(dst);
5300	dst.sin6_addr = ip6->ip6_dst;
5301
5302	/* Cheat. XXX why only in the v6 case??? */
5303	if (r->rt == PF_FASTROUTE) {
5304		if (s)
5305			PF_STATE_UNLOCK(s);
5306		m0->m_flags |= M_SKIP_FIREWALL;
5307		ip6_output(m0, NULL, NULL, 0, NULL, NULL, NULL);
5308		return;
5309	}
5310
5311	if (TAILQ_EMPTY(&r->rpool.list)) {
5312		DPFPRINTF(PF_DEBUG_URGENT,
5313		    ("%s: TAILQ_EMPTY(&r->rpool.list)\n", __func__));
5314		goto bad_locked;
5315	}
5316	if (s == NULL) {
5317		pf_map_addr(AF_INET6, r, (struct pf_addr *)&ip6->ip6_src,
5318		    &naddr, NULL, &sn);
5319		if (!PF_AZERO(&naddr, AF_INET6))
5320			PF_ACPY((struct pf_addr *)&dst.sin6_addr,
5321			    &naddr, AF_INET6);
5322		ifp = r->rpool.cur->kif ? r->rpool.cur->kif->pfik_ifp : NULL;
5323	} else {
5324		if (!PF_AZERO(&s->rt_addr, AF_INET6))
5325			PF_ACPY((struct pf_addr *)&dst.sin6_addr,
5326			    &s->rt_addr, AF_INET6);
5327		ifp = s->rt_kif ? s->rt_kif->pfik_ifp : NULL;
5328	}
5329
5330	if (s)
5331		PF_STATE_UNLOCK(s);
5332
5333	if (ifp == NULL)
5334		goto bad;
5335
5336	if (oifp != ifp) {
5337		if (pf_test6(PF_OUT, ifp, &m0, NULL) != PF_PASS)
5338			goto bad;
5339		else if (m0 == NULL)
5340			goto done;
5341		if (m0->m_len < sizeof(struct ip6_hdr)) {
5342			DPFPRINTF(PF_DEBUG_URGENT,
5343			    ("%s: m0->m_len < sizeof(struct ip6_hdr)\n",
5344			    __func__));
5345			goto bad;
5346		}
5347		ip6 = mtod(m0, struct ip6_hdr *);
5348	}
5349
5350	if (ifp->if_flags & IFF_LOOPBACK)
5351		m0->m_flags |= M_SKIP_FIREWALL;
5352
5353	/*
5354	 * If the packet is too large for the outgoing interface,
5355	 * send back an icmp6 error.
5356	 */
5357	if (IN6_IS_SCOPE_EMBED(&dst.sin6_addr))
5358		dst.sin6_addr.s6_addr16[1] = htons(ifp->if_index);
5359	if ((u_long)m0->m_pkthdr.len <= ifp->if_mtu)
5360		nd6_output(ifp, ifp, m0, &dst, NULL);
5361	else {
5362		in6_ifstat_inc(ifp, ifs6_in_toobig);
5363		if (r->rt != PF_DUPTO)
5364			icmp6_error(m0, ICMP6_PACKET_TOO_BIG, 0, ifp->if_mtu);
5365		else
5366			goto bad;
5367	}
5368
5369done:
5370	if (r->rt != PF_DUPTO)
5371		*m = NULL;
5372	return;
5373
5374bad_locked:
5375	if (s)
5376		PF_STATE_UNLOCK(s);
5377bad:
5378	m_freem(m0);
5379	goto done;
5380}
5381#endif /* INET6 */
5382
5383/*
5384 * FreeBSD supports cksum offloads for the following drivers.
5385 *  em(4), fxp(4), ixgb(4), lge(4), ndis(4), nge(4), re(4),
5386 *   ti(4), txp(4), xl(4)
5387 *
5388 * CSUM_DATA_VALID | CSUM_PSEUDO_HDR :
5389 *  the network driver computed the cksum including the pseudo header;
5390 *  csum_data still needs to be verified
5391 * CSUM_DATA_VALID :
5392 *  the network driver computed the cksum, but the pseudo header cksum
5393 *  still needs to be added to the partial csum_data (i.e. no H/W support
5394 *  for the pseudo header, for instance hme(4), sk(4) and possibly gem(4))
5395 *
5396 * After validating the packet's cksum, set both the CSUM_DATA_VALID and
5397 * CSUM_PSEUDO_HDR flags in order to avoid recomputing the cksum in the
5398 * upper TCP/UDP layer.
5399 * Also, set csum_data to 0xffff to force cksum validation.
5400 */
5401static int
5402pf_check_proto_cksum(struct mbuf *m, int off, int len, u_int8_t p, sa_family_t af)
5403{
5404	u_int16_t sum = 0;
5405	int hw_assist = 0;
5406	struct ip *ip;
5407
5408	if (off < sizeof(struct ip) || len < sizeof(struct udphdr))
5409		return (1);
5410	if (m->m_pkthdr.len < off + len)
5411		return (1);
5412
5413	switch (p) {
5414	case IPPROTO_TCP:
5415		if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
5416			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) {
5417				sum = m->m_pkthdr.csum_data;
5418			} else {
5419				ip = mtod(m, struct ip *);
5420				sum = in_pseudo(ip->ip_src.s_addr,
5421				    ip->ip_dst.s_addr, htonl((u_short)len +
5422				    m->m_pkthdr.csum_data + IPPROTO_TCP));
5423			}
5424			sum ^= 0xffff;
5425			++hw_assist;
5426		}
5427		break;
5428	case IPPROTO_UDP:
5429		if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
5430			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) {
5431				sum = m->m_pkthdr.csum_data;
5432			} else {
5433				ip = mtod(m, struct ip *);
5434				sum = in_pseudo(ip->ip_src.s_addr,
5435				    ip->ip_dst.s_addr, htonl((u_short)len +
5436				    m->m_pkthdr.csum_data + IPPROTO_UDP));
5437			}
5438			sum ^= 0xffff;
5439			++hw_assist;
5440		}
5441		break;
5442	case IPPROTO_ICMP:
5443#ifdef INET6
5444	case IPPROTO_ICMPV6:
5445#endif /* INET6 */
5446		break;
5447	default:
5448		return (1);
5449	}
5450
5451	if (!hw_assist) {
5452		switch (af) {
5453		case AF_INET:
5454			if (p == IPPROTO_ICMP) {
5455				if (m->m_len < off)
5456					return (1);
5457				m->m_data += off;
5458				m->m_len -= off;
5459				sum = in_cksum(m, len);
5460				m->m_data -= off;
5461				m->m_len += off;
5462			} else {
5463				if (m->m_len < sizeof(struct ip))
5464					return (1);
5465				sum = in4_cksum(m, p, off, len);
5466			}
5467			break;
5468#ifdef INET6
5469		case AF_INET6:
5470			if (m->m_len < sizeof(struct ip6_hdr))
5471				return (1);
5472			sum = in6_cksum(m, p, off, len);
5473			break;
5474#endif /* INET6 */
5475		default:
5476			return (1);
5477		}
5478	}
5479	if (sum) {
5480		switch (p) {
5481		case IPPROTO_TCP:
5482		    {
5483			KMOD_TCPSTAT_INC(tcps_rcvbadsum);
5484			break;
5485		    }
5486		case IPPROTO_UDP:
5487		    {
5488			KMOD_UDPSTAT_INC(udps_badsum);
5489			break;
5490		    }
5491#ifdef INET
5492		case IPPROTO_ICMP:
5493		    {
5494			KMOD_ICMPSTAT_INC(icps_checksum);
5495			break;
5496		    }
5497#endif
5498#ifdef INET6
5499		case IPPROTO_ICMPV6:
5500		    {
5501			KMOD_ICMP6STAT_INC(icp6s_checksum);
5502			break;
5503		    }
5504#endif /* INET6 */
5505		}
5506		return (1);
5507	} else {
5508		if (p == IPPROTO_TCP || p == IPPROTO_UDP) {
5509			m->m_pkthdr.csum_flags |=
5510			    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
5511			m->m_pkthdr.csum_data = 0xffff;
5512		}
5513	}
5514	return (0);
5515}
5516
5517
5518#ifdef INET
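/*
 * pf_test() is the IPv4 entry point, called for every inbound and
 * outbound packet (via the pfil(9) hooks): it normalizes the packet,
 * finds or creates a matching state, applies translations and returns
 * PF_PASS or PF_DROP, possibly consuming the mbuf (e.g. for diverted
 * packets).
 */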
5519int
5520pf_test(int dir, struct ifnet *ifp, struct mbuf **m0, struct inpcb *inp)
5521{
5522	struct pfi_kif		*kif;
5523	u_short			 action, reason = 0, log = 0;
5524	struct mbuf		*m = *m0;
5525	struct ip		*h = NULL;
5526	struct m_tag		*ipfwtag;
5527	struct pf_rule		*a = NULL, *r = &V_pf_default_rule, *tr, *nr;
5528	struct pf_state		*s = NULL;
5529	struct pf_ruleset	*ruleset = NULL;
5530	struct pf_pdesc		 pd;
5531	int			 off, dirndx, pqid = 0;
5532
5533	M_ASSERTPKTHDR(m);
5534
5535	if (!V_pf_status.running)
5536		return (PF_PASS);
5537
5538	memset(&pd, 0, sizeof(pd));
5539
5540	kif = (struct pfi_kif *)ifp->if_pf_kif;
5541
5542	if (kif == NULL) {
5543		DPFPRINTF(PF_DEBUG_URGENT,
5544		    ("pf_test: kif == NULL, if_xname %s\n", ifp->if_xname));
5545		return (PF_DROP);
5546	}
5547	if (kif->pfik_flags & PFI_IFLAG_SKIP)
5548		return (PF_PASS);
5549
5550	if (m->m_flags & M_SKIP_FIREWALL)
5551		return (PF_PASS);
5552
5553	if (m->m_pkthdr.len < (int)sizeof(struct ip)) {
5554		action = PF_DROP;
5555		REASON_SET(&reason, PFRES_SHORT);
5556		log = 1;
5557		goto done;
5558	}
5559
5560	pd.pf_mtag = pf_find_mtag(m);
5561
5562	PF_RULES_RLOCK();
5563
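	/*
	 * Packets coming back from an ipfw divert(4) socket carry an
	 * MTAG_IPFW_RULE tag: strip it and mark them PF_PACKET_LOOPED so
	 * they are not diverted a second time below.
	 */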
5564	if (ip_divert_ptr != NULL &&
5565	    ((ipfwtag = m_tag_locate(m, MTAG_IPFW_RULE, 0, NULL)) != NULL)) {
5566		struct ipfw_rule_ref *rr = (struct ipfw_rule_ref *)(ipfwtag+1);
5567		if (rr->info & IPFW_IS_DIVERT && rr->rulenum == 0) {
5568			if (pd.pf_mtag == NULL &&
5569			    ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) {
5570				action = PF_DROP;
5571				goto done;
5572			}
5573			pd.pf_mtag->flags |= PF_PACKET_LOOPED;
5574			m_tag_delete(m, ipfwtag);
5575		}
5576		if (pd.pf_mtag && pd.pf_mtag->flags & PF_FASTFWD_OURS_PRESENT) {
5577			m->m_flags |= M_FASTFWD_OURS;
5578			pd.pf_mtag->flags &= ~PF_FASTFWD_OURS_PRESENT;
5579		}
5580	} else if (pf_normalize_ip(m0, dir, kif, &reason, &pd) != PF_PASS) {
5581		/* We do IP header normalization and packet reassembly here */
5582		action = PF_DROP;
5583		goto done;
5584	}
5585	m = *m0;	/* pf_normalize messes with m0 */
5586	h = mtod(m, struct ip *);
5587
5588	off = h->ip_hl << 2;
5589	if (off < (int)sizeof(struct ip)) {
5590		action = PF_DROP;
5591		REASON_SET(&reason, PFRES_SHORT);
5592		log = 1;
5593		goto done;
5594	}
5595
5596	pd.src = (struct pf_addr *)&h->ip_src;
5597	pd.dst = (struct pf_addr *)&h->ip_dst;
5598	pd.sport = pd.dport = NULL;
5599	pd.ip_sum = &h->ip_sum;
5600	pd.proto_sum = NULL;
5601	pd.proto = h->ip_p;
5602	pd.dir = dir;
5603	pd.sidx = (dir == PF_IN) ? 0 : 1;
5604	pd.didx = (dir == PF_IN) ? 1 : 0;
5605	pd.af = AF_INET;
5606	pd.tos = h->ip_tos;
5607	pd.tot_len = ntohs(h->ip_len);
5608
5609	/* handle fragments that didn't get reassembled by normalization */
5610	if (h->ip_off & htons(IP_MF | IP_OFFMASK)) {
5611		action = pf_test_fragment(&r, dir, kif, m, h,
5612		    &pd, &a, &ruleset);
5613		goto done;
5614	}
5615
5616	switch (h->ip_p) {
5617
5618	case IPPROTO_TCP: {
5619		struct tcphdr	th;
5620
5621		pd.hdr.tcp = &th;
5622		if (!pf_pull_hdr(m, off, &th, sizeof(th),
5623		    &action, &reason, AF_INET)) {
5624			log = action != PF_PASS;
5625			goto done;
5626		}
5627		pd.p_len = pd.tot_len - off - (th.th_off << 2);
5628		if ((th.th_flags & TH_ACK) && pd.p_len == 0)
5629			pqid = 1;
5630		action = pf_normalize_tcp(dir, kif, m, 0, off, h, &pd);
5631		if (action == PF_DROP)
5632			goto done;
5633		action = pf_test_state_tcp(&s, dir, kif, m, off, h, &pd,
5634		    &reason);
5635		if (action == PF_PASS) {
5636			if (pfsync_update_state_ptr != NULL)
5637				pfsync_update_state_ptr(s);
5638			r = s->rule.ptr;
5639			a = s->anchor.ptr;
5640			log = s->log;
5641		} else if (s == NULL)
5642			action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
5643			    &a, &ruleset, inp);
5644		break;
5645	}
5646
5647	case IPPROTO_UDP: {
5648		struct udphdr	uh;
5649
5650		pd.hdr.udp = &uh;
5651		if (!pf_pull_hdr(m, off, &uh, sizeof(uh),
5652		    &action, &reason, AF_INET)) {
5653			log = action != PF_PASS;
5654			goto done;
5655		}
5656		if (uh.uh_dport == 0 ||
5657		    ntohs(uh.uh_ulen) > m->m_pkthdr.len - off ||
5658		    ntohs(uh.uh_ulen) < sizeof(struct udphdr)) {
5659			action = PF_DROP;
5660			REASON_SET(&reason, PFRES_SHORT);
5661			goto done;
5662		}
5663		action = pf_test_state_udp(&s, dir, kif, m, off, h, &pd);
5664		if (action == PF_PASS) {
5665			if (pfsync_update_state_ptr != NULL)
5666				pfsync_update_state_ptr(s);
5667			r = s->rule.ptr;
5668			a = s->anchor.ptr;
5669			log = s->log;
5670		} else if (s == NULL)
5671			action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
5672			    &a, &ruleset, inp);
5673		break;
5674	}
5675
5676	case IPPROTO_ICMP: {
5677		struct icmp	ih;
5678
5679		pd.hdr.icmp = &ih;
5680		if (!pf_pull_hdr(m, off, &ih, ICMP_MINLEN,
5681		    &action, &reason, AF_INET)) {
5682			log = action != PF_PASS;
5683			goto done;
5684		}
5685		action = pf_test_state_icmp(&s, dir, kif, m, off, h, &pd,
5686		    &reason);
5687		if (action == PF_PASS) {
5688			if (pfsync_update_state_ptr != NULL)
5689				pfsync_update_state_ptr(s);
5690			r = s->rule.ptr;
5691			a = s->anchor.ptr;
5692			log = s->log;
5693		} else if (s == NULL)
5694			action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
5695			    &a, &ruleset, inp);
5696		break;
5697	}
5698
5699#ifdef INET6
5700	case IPPROTO_ICMPV6: {
5701		action = PF_DROP;
5702		DPFPRINTF(PF_DEBUG_MISC,
5703		    ("pf: dropping IPv4 packet with ICMPv6 payload\n"));
5704		goto done;
5705	}
5706#endif
5707
5708	default:
5709		action = pf_test_state_other(&s, dir, kif, m, &pd);
5710		if (action == PF_PASS) {
5711			if (pfsync_update_state_ptr != NULL)
5712				pfsync_update_state_ptr(s);
5713			r = s->rule.ptr;
5714			a = s->anchor.ptr;
5715			log = s->log;
5716		} else if (s == NULL)
5717			action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
5718			    &a, &ruleset, inp);
5719		break;
5720	}
5721
5722done:
5723	PF_RULES_RUNLOCK();
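	/*
	 * ip_hl > 5 means the 20-byte minimum header is followed by IP
	 * options; drop unless a matching rule or state explicitly allows
	 * them.
	 */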
5724	if (action == PF_PASS && h->ip_hl > 5 &&
5725	    !((s && s->state_flags & PFSTATE_ALLOWOPTS) || r->allow_opts)) {
5726		action = PF_DROP;
5727		REASON_SET(&reason, PFRES_IPOPTIONS);
5728		log = 1;
5729		DPFPRINTF(PF_DEBUG_MISC,
5730		    ("pf: dropping packet with ip options\n"));
5731	}
5732
5733	if (s && s->tag > 0 && pf_tag_packet(m, &pd, s->tag)) {
5734		action = PF_DROP;
5735		REASON_SET(&reason, PFRES_MEMORY);
5736	}
5737	if (r->rtableid >= 0)
5738		M_SETFIB(m, r->rtableid);
5739
5740#ifdef ALTQ
5741	if (action == PF_PASS && r->qid) {
5742		if (pd.pf_mtag == NULL &&
5743		    ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) {
5744			action = PF_DROP;
5745			REASON_SET(&reason, PFRES_MEMORY);
		} else {
			if (pqid || (pd.tos & IPTOS_LOWDELAY))
				pd.pf_mtag->qid = r->pqid;
			else
				pd.pf_mtag->qid = r->qid;
			/* Add hints for ECN. */
			pd.pf_mtag->hdr = h;
		}
	}
5755#endif /* ALTQ */
5756
	/*
	 * Connections redirected to loopback should not match sockets
	 * bound specifically to loopback due to security implications,
	 * see tcp_input() and in_pcblookup_listen().  The address test
	 * below matches any 127/8 destination: shifting the host-order
	 * address right by IN_CLASSA_NSHIFT (24) isolates the leading
	 * octet, which is compared against IN_LOOPBACKNET (127).
	 */
5762	if (dir == PF_IN && action == PF_PASS && (pd.proto == IPPROTO_TCP ||
5763	    pd.proto == IPPROTO_UDP) && s != NULL && s->nat_rule.ptr != NULL &&
5764	    (s->nat_rule.ptr->action == PF_RDR ||
5765	    s->nat_rule.ptr->action == PF_BINAT) &&
5766	    (ntohl(pd.dst->v4.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)
5767		m->m_flags |= M_SKIP_FIREWALL;
5768
5769	if (action == PF_PASS && r->divert.port && ip_divert_ptr != NULL &&
5770	    !PACKET_LOOPED(&pd)) {
5771
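		/*
		 * Hand the packet to divert(4): the tag's "info" field
		 * carries the divert port, and "rulenum" is reused to
		 * record the direction so the packet can be recognized if
		 * it is re-injected.
		 */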
5772		ipfwtag = m_tag_alloc(MTAG_IPFW_RULE, 0,
5773		    sizeof(struct ipfw_rule_ref), M_NOWAIT | M_ZERO);
5774		if (ipfwtag != NULL) {
5775			((struct ipfw_rule_ref *)(ipfwtag+1))->info =
5776			    ntohs(r->divert.port);
5777			((struct ipfw_rule_ref *)(ipfwtag+1))->rulenum = dir;
5778
5779			if (s)
5780				PF_STATE_UNLOCK(s);
5781
5782			m_tag_prepend(m, ipfwtag);
5783			if (m->m_flags & M_FASTFWD_OURS) {
5784				if (pd.pf_mtag == NULL &&
5785				    ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) {
5786					action = PF_DROP;
5787					REASON_SET(&reason, PFRES_MEMORY);
5788					log = 1;
5789					DPFPRINTF(PF_DEBUG_MISC,
5790					    ("pf: failed to allocate tag\n"));
				} else {
					pd.pf_mtag->flags |=
					    PF_FASTFWD_OURS_PRESENT;
					m->m_flags &= ~M_FASTFWD_OURS;
				}
			}
			ip_divert_ptr(*m0, dir == PF_IN ? DIR_IN : DIR_OUT);
5796			*m0 = NULL;
5797
5798			return (action);
5799		} else {
5800			/* XXX: ipfw has the same behaviour! */
5801			action = PF_DROP;
5802			REASON_SET(&reason, PFRES_MEMORY);
5803			log = 1;
5804			DPFPRINTF(PF_DEBUG_MISC,
5805			    ("pf: failed to allocate divert tag\n"));
5806		}
5807	}
5808
5809	if (log) {
5810		struct pf_rule *lr;
5811
5812		if (s != NULL && s->nat_rule.ptr != NULL &&
5813		    s->nat_rule.ptr->log & PF_LOG_ALL)
5814			lr = s->nat_rule.ptr;
5815		else
5816			lr = r;
5817		PFLOG_PACKET(kif, m, AF_INET, dir, reason, lr, a, ruleset, &pd,
5818		    (s == NULL));
5819	}
5820
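	/*
	 * Interface counters are indexed [af][out][dropped]: row 0 is
	 * IPv4 (pf_test6() uses row 1), then direction (0 in, 1 out),
	 * then whether the packet was passed (0) or not (1).
	 */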
5821	kif->pfik_bytes[0][dir == PF_OUT][action != PF_PASS] += pd.tot_len;
5822	kif->pfik_packets[0][dir == PF_OUT][action != PF_PASS]++;
5823
5824	if (action == PF_PASS || r->action == PF_DROP) {
5825		dirndx = (dir == PF_OUT);
5826		r->packets[dirndx]++;
5827		r->bytes[dirndx] += pd.tot_len;
5828		if (a != NULL) {
5829			a->packets[dirndx]++;
5830			a->bytes[dirndx] += pd.tot_len;
5831		}
5832		if (s != NULL) {
5833			if (s->nat_rule.ptr != NULL) {
5834				s->nat_rule.ptr->packets[dirndx]++;
5835				s->nat_rule.ptr->bytes[dirndx] += pd.tot_len;
5836			}
5837			if (s->src_node != NULL) {
5838				s->src_node->packets[dirndx]++;
5839				s->src_node->bytes[dirndx] += pd.tot_len;
5840			}
5841			if (s->nat_src_node != NULL) {
5842				s->nat_src_node->packets[dirndx]++;
5843				s->nat_src_node->bytes[dirndx] += pd.tot_len;
5844			}
5845			dirndx = (dir == s->direction) ? 0 : 1;
5846			s->packets[dirndx]++;
5847			s->bytes[dirndx] += pd.tot_len;
5848		}
5849		tr = r;
5850		nr = (s != NULL) ? s->nat_rule.ptr : pd.nat_rule;
5851		if (nr != NULL && r == &V_pf_default_rule)
5852			tr = nr;
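		/*
		 * With a state, table stats are charged to the address
		 * stored in the state key rather than the current header,
		 * presumably so address translation does not skew the
		 * accounting.
		 */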
5853		if (tr->src.addr.type == PF_ADDR_TABLE)
5854			pfr_update_stats(tr->src.addr.p.tbl,
5855			    (s == NULL) ? pd.src :
5856			    &s->key[(s->direction == PF_IN)]->
5857				addr[(s->direction == PF_OUT)],
5858			    pd.af, pd.tot_len, dir == PF_OUT,
5859			    r->action == PF_PASS, tr->src.neg);
5860		if (tr->dst.addr.type == PF_ADDR_TABLE)
5861			pfr_update_stats(tr->dst.addr.p.tbl,
5862			    (s == NULL) ? pd.dst :
5863			    &s->key[(s->direction == PF_IN)]->
5864				addr[(s->direction == PF_IN)],
5865			    pd.af, pd.tot_len, dir == PF_OUT,
5866			    r->action == PF_PASS, tr->dst.neg);
5867	}
5868
5869	switch (action) {
5870	case PF_SYNPROXY_DROP:
5871		m_freem(*m0);
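		/* FALLTHROUGH */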
5872	case PF_DEFER:
5873		*m0 = NULL;
5874		action = PF_PASS;
5875		break;
5876	default:
5877		/* pf_route() returns unlocked. */
5878		if (r->rt) {
5879			pf_route(m0, r, dir, kif->pfik_ifp, s, &pd);
5880			return (action);
5881		}
5882		break;
5883	}
5884	if (s)
5885		PF_STATE_UNLOCK(s);
5886
5887	return (action);
5888}
5889#endif /* INET */
5890
5891#ifdef INET6
5892int
5893pf_test6(int dir, struct ifnet *ifp, struct mbuf **m0, struct inpcb *inp)
5894{
5895	struct pfi_kif		*kif;
5896	u_short			 action, reason = 0, log = 0;
5897	struct mbuf		*m = *m0, *n = NULL;
5898	struct ip6_hdr		*h = NULL;
5899	struct pf_rule		*a = NULL, *r = &V_pf_default_rule, *tr, *nr;
5900	struct pf_state		*s = NULL;
5901	struct pf_ruleset	*ruleset = NULL;
5902	struct pf_pdesc		 pd;
5903	int			 off, terminal = 0, dirndx, rh_cnt = 0;
5904
5905	M_ASSERTPKTHDR(m);
5906
5907	if (!V_pf_status.running)
5908		return (PF_PASS);
5909
5910	memset(&pd, 0, sizeof(pd));
5911	pd.pf_mtag = pf_find_mtag(m);
5912
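	/* Packets that pf itself generated are tagged and not filtered again. */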
5913	if (pd.pf_mtag && pd.pf_mtag->flags & PF_TAG_GENERATED)
5914		return (PF_PASS);
5915
5916	kif = (struct pfi_kif *)ifp->if_pf_kif;
5917	if (kif == NULL) {
5918		DPFPRINTF(PF_DEBUG_URGENT,
5919		    ("pf_test6: kif == NULL, if_xname %s\n", ifp->if_xname));
5920		return (PF_DROP);
5921	}
5922	if (kif->pfik_flags & PFI_IFLAG_SKIP)
5923		return (PF_PASS);
5924
5925	if (m->m_pkthdr.len < (int)sizeof(*h)) {
5926		action = PF_DROP;
5927		REASON_SET(&reason, PFRES_SHORT);
5928		log = 1;
5929		goto done;
5930	}
5931
5932	PF_RULES_RLOCK();
5933
5934	/* We do IP header normalization and packet reassembly here */
5935	if (pf_normalize_ip6(m0, dir, kif, &reason, &pd) != PF_PASS) {
5936		action = PF_DROP;
5937		goto done;
5938	}
5939	m = *m0;	/* pf_normalize messes with m0 */
5940	h = mtod(m, struct ip6_hdr *);
5941
5942#if 1
	/*
	 * We do not support jumbograms yet.  If we kept going, the zero
	 * ip6_plen would do something bad, so drop the packet for now.
	 * (For a 16-bit field, htons(x) == 0 exactly when x == 0, so the
	 * byte order of the test below does not matter.)
	 */
5947	if (htons(h->ip6_plen) == 0) {
5948		action = PF_DROP;
5949		REASON_SET(&reason, PFRES_NORM);	/*XXX*/
5950		goto done;
5951	}
5952#endif
5953
5954	pd.src = (struct pf_addr *)&h->ip6_src;
5955	pd.dst = (struct pf_addr *)&h->ip6_dst;
5956	pd.sport = pd.dport = NULL;
5957	pd.ip_sum = NULL;
5958	pd.proto_sum = NULL;
5959	pd.dir = dir;
5960	pd.sidx = (dir == PF_IN) ? 0 : 1;
5961	pd.didx = (dir == PF_IN) ? 1 : 0;
5962	pd.af = AF_INET6;
5963	pd.tos = 0;
5964	pd.tot_len = ntohs(h->ip6_plen) + sizeof(struct ip6_hdr);
5965
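	/*
	 * off starts just past the fixed IPv6 header; walk the chain of
	 * extension headers until a terminal (transport) protocol is found.
	 */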
5966	off = ((caddr_t)h - m->m_data) + sizeof(struct ip6_hdr);
5967	pd.proto = h->ip6_nxt;
5968	do {
5969		switch (pd.proto) {
5970		case IPPROTO_FRAGMENT:
5971			action = pf_test_fragment(&r, dir, kif, m, h,
5972			    &pd, &a, &ruleset);
5973			if (action == PF_DROP)
5974				REASON_SET(&reason, PFRES_FRAG);
5975			goto done;
5976		case IPPROTO_ROUTING: {
5977			struct ip6_rthdr rthdr;
5978
5979			if (rh_cnt++) {
5980				DPFPRINTF(PF_DEBUG_MISC,
5981				    ("pf: IPv6 more than one rthdr\n"));
5982				action = PF_DROP;
5983				REASON_SET(&reason, PFRES_IPOPTIONS);
5984				log = 1;
5985				goto done;
5986			}
5987			if (!pf_pull_hdr(m, off, &rthdr, sizeof(rthdr), NULL,
5988			    &reason, pd.af)) {
5989				DPFPRINTF(PF_DEBUG_MISC,
5990				    ("pf: IPv6 short rthdr\n"));
5991				action = PF_DROP;
5992				REASON_SET(&reason, PFRES_SHORT);
5993				log = 1;
5994				goto done;
5995			}
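			/*
			 * Type 0 routing headers are deprecated (RFC 5095)
			 * since they enable amplification attacks, so they
			 * are always dropped.
			 */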
5996			if (rthdr.ip6r_type == IPV6_RTHDR_TYPE_0) {
5997				DPFPRINTF(PF_DEBUG_MISC,
5998				    ("pf: IPv6 rthdr0\n"));
5999				action = PF_DROP;
6000				REASON_SET(&reason, PFRES_IPOPTIONS);
6001				log = 1;
6002				goto done;
6003			}
6004			/* FALLTHROUGH */
6005		}
6006		case IPPROTO_AH:
6007		case IPPROTO_HOPOPTS:
6008		case IPPROTO_DSTOPTS: {
6009			/* get next header and header length */
6010			struct ip6_ext	opt6;
6011
6012			if (!pf_pull_hdr(m, off, &opt6, sizeof(opt6),
6013			    NULL, &reason, pd.af)) {
6014				DPFPRINTF(PF_DEBUG_MISC,
6015				    ("pf: IPv6 short opt\n"));
6016				action = PF_DROP;
6017				log = 1;
6018				goto done;
6019			}
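			/*
			 * AH encodes ip6e_len in 32-bit words, not counting
			 * the first two; the other extension headers use
			 * 8-octet units, not counting the first one.
			 */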
6020			if (pd.proto == IPPROTO_AH)
6021				off += (opt6.ip6e_len + 2) * 4;
6022			else
6023				off += (opt6.ip6e_len + 1) * 8;
6024			pd.proto = opt6.ip6e_nxt;
6025			/* goto the next header */
			/* go to the next header */
6027		}
6028		default:
6029			terminal++;
6030			break;
6031		}
6032	} while (!terminal);
6033
6034	/* if there's no routing header, use unmodified mbuf for checksumming */
6035	if (!n)
6036		n = m;
6037
6038	switch (pd.proto) {
6039
6040	case IPPROTO_TCP: {
6041		struct tcphdr	th;
6042
6043		pd.hdr.tcp = &th;
6044		if (!pf_pull_hdr(m, off, &th, sizeof(th),
6045		    &action, &reason, AF_INET6)) {
6046			log = action != PF_PASS;
6047			goto done;
6048		}
6049		pd.p_len = pd.tot_len - off - (th.th_off << 2);
6050		action = pf_normalize_tcp(dir, kif, m, 0, off, h, &pd);
6051		if (action == PF_DROP)
6052			goto done;
6053		action = pf_test_state_tcp(&s, dir, kif, m, off, h, &pd,
6054		    &reason);
6055		if (action == PF_PASS) {
6056			if (pfsync_update_state_ptr != NULL)
6057				pfsync_update_state_ptr(s);
6058			r = s->rule.ptr;
6059			a = s->anchor.ptr;
6060			log = s->log;
6061		} else if (s == NULL)
6062			action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
6063			    &a, &ruleset, inp);
6064		break;
6065	}
6066
6067	case IPPROTO_UDP: {
6068		struct udphdr	uh;
6069
6070		pd.hdr.udp = &uh;
6071		if (!pf_pull_hdr(m, off, &uh, sizeof(uh),
6072		    &action, &reason, AF_INET6)) {
6073			log = action != PF_PASS;
6074			goto done;
6075		}
6076		if (uh.uh_dport == 0 ||
6077		    ntohs(uh.uh_ulen) > m->m_pkthdr.len - off ||
6078		    ntohs(uh.uh_ulen) < sizeof(struct udphdr)) {
6079			action = PF_DROP;
6080			REASON_SET(&reason, PFRES_SHORT);
6081			goto done;
6082		}
6083		action = pf_test_state_udp(&s, dir, kif, m, off, h, &pd);
6084		if (action == PF_PASS) {
6085			if (pfsync_update_state_ptr != NULL)
6086				pfsync_update_state_ptr(s);
6087			r = s->rule.ptr;
6088			a = s->anchor.ptr;
6089			log = s->log;
6090		} else if (s == NULL)
6091			action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
6092			    &a, &ruleset, inp);
6093		break;
6094	}
6095
6096	case IPPROTO_ICMP: {
6097		action = PF_DROP;
6098		DPFPRINTF(PF_DEBUG_MISC,
6099		    ("pf: dropping IPv6 packet with ICMPv4 payload\n"));
6100		goto done;
6101	}
6102
6103	case IPPROTO_ICMPV6: {
6104		struct icmp6_hdr	ih;
6105
6106		pd.hdr.icmp6 = &ih;
6107		if (!pf_pull_hdr(m, off, &ih, sizeof(ih),
6108		    &action, &reason, AF_INET6)) {
6109			log = action != PF_PASS;
6110			goto done;
6111		}
6112		action = pf_test_state_icmp(&s, dir, kif,
6113		    m, off, h, &pd, &reason);
6114		if (action == PF_PASS) {
6115			if (pfsync_update_state_ptr != NULL)
6116				pfsync_update_state_ptr(s);
6117			r = s->rule.ptr;
6118			a = s->anchor.ptr;
6119			log = s->log;
6120		} else if (s == NULL)
6121			action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
6122			    &a, &ruleset, inp);
6123		break;
6124	}
6125
6126	default:
6127		action = pf_test_state_other(&s, dir, kif, m, &pd);
6128		if (action == PF_PASS) {
6129			if (pfsync_update_state_ptr != NULL)
6130				pfsync_update_state_ptr(s);
6131			r = s->rule.ptr;
6132			a = s->anchor.ptr;
6133			log = s->log;
6134		} else if (s == NULL)
6135			action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
6136			    &a, &ruleset, inp);
6137		break;
6138	}
6139
6140done:
6141	PF_RULES_RUNLOCK();
6142	if (n != m) {
6143		m_freem(n);
6144		n = NULL;
6145	}
6146
6147	/* handle dangerous IPv6 extension headers. */
6148	if (action == PF_PASS && rh_cnt &&
6149	    !((s && s->state_flags & PFSTATE_ALLOWOPTS) || r->allow_opts)) {
6150		action = PF_DROP;
6151		REASON_SET(&reason, PFRES_IPOPTIONS);
6152		log = 1;
6153		DPFPRINTF(PF_DEBUG_MISC,
6154		    ("pf: dropping packet with dangerous v6 headers\n"));
6155	}
6156
6157	if (s && s->tag > 0 && pf_tag_packet(m, &pd, s->tag)) {
6158		action = PF_DROP;
6159		REASON_SET(&reason, PFRES_MEMORY);
6160	}
6161	if (r->rtableid >= 0)
6162		M_SETFIB(m, r->rtableid);
6163
6164#ifdef ALTQ
6165	if (action == PF_PASS && r->qid) {
6166		if (pd.pf_mtag == NULL &&
6167		    ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) {
6168			action = PF_DROP;
6169			REASON_SET(&reason, PFRES_MEMORY);
		} else {
			if (pd.tos & IPTOS_LOWDELAY)
				pd.pf_mtag->qid = r->pqid;
			else
				pd.pf_mtag->qid = r->qid;
			/* Add hints for ECN. */
			pd.pf_mtag->hdr = h;
		}
	}
6178#endif /* ALTQ */
6179
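	/* See the corresponding loopback-redirection comment in pf_test(). */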
6180	if (dir == PF_IN && action == PF_PASS && (pd.proto == IPPROTO_TCP ||
6181	    pd.proto == IPPROTO_UDP) && s != NULL && s->nat_rule.ptr != NULL &&
6182	    (s->nat_rule.ptr->action == PF_RDR ||
6183	    s->nat_rule.ptr->action == PF_BINAT) &&
6184	    IN6_IS_ADDR_LOOPBACK(&pd.dst->v6))
6185		m->m_flags |= M_SKIP_FIREWALL;
6186
6187	/* XXX: Anybody working on it?! */
6188	if (r->divert.port)
6189		printf("pf: divert(9) is not supported for IPv6\n");
6190
6191	if (log) {
6192		struct pf_rule *lr;
6193
6194		if (s != NULL && s->nat_rule.ptr != NULL &&
6195		    s->nat_rule.ptr->log & PF_LOG_ALL)
6196			lr = s->nat_rule.ptr;
6197		else
6198			lr = r;
6199		PFLOG_PACKET(kif, m, AF_INET6, dir, reason, lr, a, ruleset,
6200		    &pd, (s == NULL));
6201	}
6202
6203	kif->pfik_bytes[1][dir == PF_OUT][action != PF_PASS] += pd.tot_len;
6204	kif->pfik_packets[1][dir == PF_OUT][action != PF_PASS]++;
6205
6206	if (action == PF_PASS || r->action == PF_DROP) {
6207		dirndx = (dir == PF_OUT);
6208		r->packets[dirndx]++;
6209		r->bytes[dirndx] += pd.tot_len;
6210		if (a != NULL) {
6211			a->packets[dirndx]++;
6212			a->bytes[dirndx] += pd.tot_len;
6213		}
6214		if (s != NULL) {
6215			if (s->nat_rule.ptr != NULL) {
6216				s->nat_rule.ptr->packets[dirndx]++;
6217				s->nat_rule.ptr->bytes[dirndx] += pd.tot_len;
6218			}
6219			if (s->src_node != NULL) {
6220				s->src_node->packets[dirndx]++;
6221				s->src_node->bytes[dirndx] += pd.tot_len;
6222			}
6223			if (s->nat_src_node != NULL) {
6224				s->nat_src_node->packets[dirndx]++;
6225				s->nat_src_node->bytes[dirndx] += pd.tot_len;
6226			}
6227			dirndx = (dir == s->direction) ? 0 : 1;
6228			s->packets[dirndx]++;
6229			s->bytes[dirndx] += pd.tot_len;
6230		}
6231		tr = r;
6232		nr = (s != NULL) ? s->nat_rule.ptr : pd.nat_rule;
6233		if (nr != NULL && r == &V_pf_default_rule)
6234			tr = nr;
6235		if (tr->src.addr.type == PF_ADDR_TABLE)
6236			pfr_update_stats(tr->src.addr.p.tbl,
6237			    (s == NULL) ? pd.src :
6238			    &s->key[(s->direction == PF_IN)]->addr[0],
6239			    pd.af, pd.tot_len, dir == PF_OUT,
6240			    r->action == PF_PASS, tr->src.neg);
6241		if (tr->dst.addr.type == PF_ADDR_TABLE)
6242			pfr_update_stats(tr->dst.addr.p.tbl,
6243			    (s == NULL) ? pd.dst :
6244			    &s->key[(s->direction == PF_IN)]->addr[1],
6245			    pd.af, pd.tot_len, dir == PF_OUT,
6246			    r->action == PF_PASS, tr->dst.neg);
6247	}
6248
6249	switch (action) {
6250	case PF_SYNPROXY_DROP:
6251		m_freem(*m0);
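		/* FALLTHROUGH */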
6252	case PF_DEFER:
6253		*m0 = NULL;
6254		action = PF_PASS;
6255		break;
6256	default:
6257		/* pf_route6() returns unlocked. */
6258		if (r->rt) {
6259			pf_route6(m0, r, dir, kif->pfik_ifp, s, &pd);
6260			return (action);
6261		}
6262		break;
6263	}
6264
6265	if (s)
6266		PF_STATE_UNLOCK(s);
6267
6268	return (action);
6269}
6270#endif /* INET6 */
6271