/*-
 * Copyright (c) 2001 Daniel Hartmeier
 * Copyright (c) 2002 - 2008 Henning Brauer
 * Copyright (c) 2012 Gleb Smirnoff <glebius@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *    - Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    - Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following
 *      disclaimer in the documentation and/or other materials provided
 *      with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Effort sponsored in part by the Defense Advanced Research Projects
 * Agency (DARPA) and Air Force Research Laboratory, Air Force
 * Materiel Command, USAF, under agreement number F30602-01-2-0537.
 *
 *	$OpenBSD: pf.c,v 1.634 2009/02/27 12:37:45 henning Exp $
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/netpfil/pf/pf.c 244769 2012-12-28 09:19:49Z glebius $");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_bpf.h"
#include "opt_pf.h"

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/endian.h>
#include <sys/hash.h>
#include <sys/interrupt.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/limits.h>
#include <sys/mbuf.h>
#include <sys/md5.h>
#include <sys/random.h>
#include <sys/refcount.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/taskqueue.h>
#include <sys/ucred.h>

#include <net/if.h>
#include <net/if_types.h>
#include <net/route.h>
#include <net/radix_mpath.h>
#include <net/vnet.h>

#include <net/pfvar.h>
#include <net/pf_mtag.h>
#include <net/if_pflog.h>
#include <net/if_pfsync.h>

#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/ip_fw.h>
#include <netinet/ip_icmp.h>
#include <netinet/icmp_var.h>
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/udp.h>
#include <netinet/udp_var.h>

#include <netpfil/ipfw/ip_fw_private.h> /* XXX: only for DIR_IN/DIR_OUT */

#ifdef INET6
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet6/nd6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_pcb.h>
#endif /* INET6 */

#include <machine/in_cksum.h>
#include <security/mac/mac_framework.h>

#define	DPFPRINTF(n, x)	if (V_pf_status.debug >= (n)) printf x

/*
 * Global variables
 */

/* state tables */
VNET_DEFINE(struct pf_altqqueue,	 pf_altqs[2]);
VNET_DEFINE(struct pf_palist,		 pf_pabuf);
VNET_DEFINE(struct pf_altqqueue *,	 pf_altqs_active);
VNET_DEFINE(struct pf_altqqueue *,	 pf_altqs_inactive);
VNET_DEFINE(struct pf_status,		 pf_status);

VNET_DEFINE(u_int32_t,			 ticket_altqs_active);
VNET_DEFINE(u_int32_t,			 ticket_altqs_inactive);
VNET_DEFINE(int,			 altqs_inactive_open);
VNET_DEFINE(u_int32_t,			 ticket_pabuf);

VNET_DEFINE(MD5_CTX,			 pf_tcp_secret_ctx);
#define	V_pf_tcp_secret_ctx		 VNET(pf_tcp_secret_ctx)
VNET_DEFINE(u_char,			 pf_tcp_secret[16]);
#define	V_pf_tcp_secret			 VNET(pf_tcp_secret)
VNET_DEFINE(int,			 pf_tcp_secret_init);
#define	V_pf_tcp_secret_init		 VNET(pf_tcp_secret_init)
VNET_DEFINE(int,			 pf_tcp_iss_off);
#define	V_pf_tcp_iss_off		 VNET(pf_tcp_iss_off)

/*
 * Queue for pf_intr() sends.
 */
static MALLOC_DEFINE(M_PFTEMP, "pf_temp", "pf(4) temporary allocations");
struct pf_send_entry {
	STAILQ_ENTRY(pf_send_entry)	pfse_next;
	struct mbuf			*pfse_m;
	enum {
		PFSE_IP,
		PFSE_IP6,
		PFSE_ICMP,
		PFSE_ICMP6,
	}				pfse_type;
	union {
		struct route		ro;
		struct {
			int		type;
			int		code;
			int		mtu;
		} icmpopts;
	} u;
#define	pfse_ro		u.ro
#define	pfse_icmp_type	u.icmpopts.type
#define	pfse_icmp_code	u.icmpopts.code
#define	pfse_icmp_mtu	u.icmpopts.mtu
};

STAILQ_HEAD(pf_send_head, pf_send_entry);
static VNET_DEFINE(struct pf_send_head, pf_sendqueue);
#define	V_pf_sendqueue	VNET(pf_sendqueue)

static struct mtx pf_sendqueue_mtx;
#define	PF_SENDQ_LOCK()		mtx_lock(&pf_sendqueue_mtx)
#define	PF_SENDQ_UNLOCK()	mtx_unlock(&pf_sendqueue_mtx)

/*
 * Queue for pf_overload_task() tasks.
 */
struct pf_overload_entry {
	SLIST_ENTRY(pf_overload_entry)	next;
	struct pf_addr  		addr;
	sa_family_t			af;
	uint8_t				dir;
	struct pf_rule  		*rule;
};

SLIST_HEAD(pf_overload_head, pf_overload_entry);
static VNET_DEFINE(struct pf_overload_head, pf_overloadqueue);
#define V_pf_overloadqueue	VNET(pf_overloadqueue)
static VNET_DEFINE(struct task, pf_overloadtask);
#define	V_pf_overloadtask	VNET(pf_overloadtask)

static struct mtx pf_overloadqueue_mtx;
#define	PF_OVERLOADQ_LOCK()	mtx_lock(&pf_overloadqueue_mtx)
#define	PF_OVERLOADQ_UNLOCK()	mtx_unlock(&pf_overloadqueue_mtx)

VNET_DEFINE(struct pf_rulequeue, pf_unlinked_rules);
struct mtx pf_unlnkdrules_mtx;

static VNET_DEFINE(uma_zone_t,	pf_sources_z);
#define	V_pf_sources_z	VNET(pf_sources_z)
static VNET_DEFINE(uma_zone_t,	pf_mtag_z);
#define	V_pf_mtag_z	VNET(pf_mtag_z)
VNET_DEFINE(uma_zone_t,	 pf_state_z);
VNET_DEFINE(uma_zone_t,	 pf_state_key_z);

VNET_DEFINE(uint64_t, pf_stateid[MAXCPU]);
#define	PFID_CPUBITS	8
#define	PFID_CPUSHIFT	(sizeof(uint64_t) * NBBY - PFID_CPUBITS)
#define	PFID_CPUMASK	((uint64_t)((1 << PFID_CPUBITS) - 1) << PFID_CPUSHIFT)
#define	PFID_MAXID	(~PFID_CPUMASK)
CTASSERT((1 << PFID_CPUBITS) > MAXCPU);
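/*
 * A state ID is a 64-bit value: the top PFID_CPUBITS bits carry the
 * number of the CPU that allocated the ID, the low bits come from a
 * per-CPU counter (see pf_state_insert()), so IDs can be handed out
 * without interlocking between CPUs.  For example, with
 * PFID_CPUBITS == 8, CPU 2 issuing counter value 5 produces the ID
 * (2ULL << PFID_CPUSHIFT) | 5.  The CTASSERT above checks that the
 * CPU field is wide enough to hold MAXCPU.
 */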

static void		 pf_src_tree_remove_state(struct pf_state *);
static void		 pf_init_threshold(struct pf_threshold *, u_int32_t,
			    u_int32_t);
static void		 pf_add_threshold(struct pf_threshold *);
static int		 pf_check_threshold(struct pf_threshold *);

static void		 pf_change_ap(struct pf_addr *, u_int16_t *,
			    u_int16_t *, u_int16_t *, struct pf_addr *,
			    u_int16_t, u_int8_t, sa_family_t);
static int		 pf_modulate_sack(struct mbuf *, int, struct pf_pdesc *,
			    struct tcphdr *, struct pf_state_peer *);
static void		 pf_change_icmp(struct pf_addr *, u_int16_t *,
			    struct pf_addr *, struct pf_addr *, u_int16_t,
			    u_int16_t *, u_int16_t *, u_int16_t *,
			    u_int16_t *, u_int8_t, sa_family_t);
static void		 pf_send_tcp(struct mbuf *,
			    const struct pf_rule *, sa_family_t,
			    const struct pf_addr *, const struct pf_addr *,
			    u_int16_t, u_int16_t, u_int32_t, u_int32_t,
			    u_int8_t, u_int16_t, u_int16_t, u_int8_t, int,
			    u_int16_t, struct ifnet *);
static void		 pf_send_icmp(struct mbuf *, u_int8_t, u_int8_t,
			    sa_family_t, struct pf_rule *);
static void		 pf_detach_state(struct pf_state *);
static int		 pf_state_key_attach(struct pf_state_key *,
			    struct pf_state_key *, struct pf_state *);
static void		 pf_state_key_detach(struct pf_state *, int);
static int		 pf_state_key_ctor(void *, int, void *, int);
static u_int32_t	 pf_tcp_iss(struct pf_pdesc *);
static int		 pf_test_rule(struct pf_rule **, struct pf_state **,
			    int, struct pfi_kif *, struct mbuf *, int,
			    struct pf_pdesc *, struct pf_rule **,
			    struct pf_ruleset **, struct inpcb *);
static int		 pf_create_state(struct pf_rule *, struct pf_rule *,
			    struct pf_rule *, struct pf_pdesc *,
			    struct pf_src_node *, struct pf_state_key *,
			    struct pf_state_key *, struct mbuf *, int,
			    u_int16_t, u_int16_t, int *, struct pfi_kif *,
			    struct pf_state **, int, u_int16_t, u_int16_t,
			    int);
static int		 pf_test_fragment(struct pf_rule **, int,
			    struct pfi_kif *, struct mbuf *, void *,
			    struct pf_pdesc *, struct pf_rule **,
			    struct pf_ruleset **);
static int		 pf_tcp_track_full(struct pf_state_peer *,
			    struct pf_state_peer *, struct pf_state **,
			    struct pfi_kif *, struct mbuf *, int,
			    struct pf_pdesc *, u_short *, int *);
static int		 pf_tcp_track_sloppy(struct pf_state_peer *,
			    struct pf_state_peer *, struct pf_state **,
			    struct pf_pdesc *, u_short *);
static int		 pf_test_state_tcp(struct pf_state **, int,
			    struct pfi_kif *, struct mbuf *, int,
			    void *, struct pf_pdesc *, u_short *);
static int		 pf_test_state_udp(struct pf_state **, int,
			    struct pfi_kif *, struct mbuf *, int,
			    void *, struct pf_pdesc *);
static int		 pf_test_state_icmp(struct pf_state **, int,
			    struct pfi_kif *, struct mbuf *, int,
			    void *, struct pf_pdesc *, u_short *);
static int		 pf_test_state_other(struct pf_state **, int,
			    struct pfi_kif *, struct mbuf *, struct pf_pdesc *);
static u_int8_t		 pf_get_wscale(struct mbuf *, int, u_int16_t,
			    sa_family_t);
static u_int16_t	 pf_get_mss(struct mbuf *, int, u_int16_t,
			    sa_family_t);
static u_int16_t	 pf_calc_mss(struct pf_addr *, sa_family_t,
				int, u_int16_t);
static void		 pf_set_rt_ifp(struct pf_state *,
			    struct pf_addr *);
static int		 pf_check_proto_cksum(struct mbuf *, int, int,
			    u_int8_t, sa_family_t);
static void		 pf_print_state_parts(struct pf_state *,
			    struct pf_state_key *, struct pf_state_key *);
static int		 pf_addr_wrap_neq(struct pf_addr_wrap *,
			    struct pf_addr_wrap *);
static struct pf_state	*pf_find_state(struct pfi_kif *,
			    struct pf_state_key_cmp *, u_int);
static int		 pf_src_connlimit(struct pf_state **);
static void		 pf_overload_task(void *c, int pending);
static int		 pf_insert_src_node(struct pf_src_node **,
			    struct pf_rule *, struct pf_addr *, sa_family_t);
static u_int		 pf_purge_expired_states(u_int, int);
static void		 pf_purge_unlinked_rules(void);
static int		 pf_mtag_init(void *, int, int);
static void		 pf_mtag_free(struct m_tag *);
#ifdef INET
static void		 pf_route(struct mbuf **, struct pf_rule *, int,
			    struct ifnet *, struct pf_state *,
			    struct pf_pdesc *);
#endif /* INET */
#ifdef INET6
static void		 pf_change_a6(struct pf_addr *, u_int16_t *,
			    struct pf_addr *, u_int8_t);
static void		 pf_route6(struct mbuf **, struct pf_rule *, int,
			    struct ifnet *, struct pf_state *,
			    struct pf_pdesc *);
#endif /* INET6 */

int in4_cksum(struct mbuf *m, u_int8_t nxt, int off, int len);

VNET_DECLARE(int, pf_end_threads);

VNET_DEFINE(struct pf_limit, pf_limits[PF_LIMIT_MAX]);

#define	PACKET_LOOPED(pd)	((pd)->pf_mtag &&			\
				 (pd)->pf_mtag->flags & PF_PACKET_LOOPED)

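/*
 * STATE_LOOKUP() is shorthand for the pf_test_state_*() functions: it
 * looks up the state for the current packet and makes the caller return
 * early when the packet needs no further processing here; PF_DROP if no
 * state exists or the state is queued for purging, PF_PASS if the
 * packet has already looped through pf once, or if the state is
 * route-to/reply-to bound and will be tested again on its rt_kif.
 */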
#define	STATE_LOOKUP(i, k, d, s, pd)					\
	do {								\
		(s) = pf_find_state((i), (k), (d));			\
		if ((s) == NULL || (s)->timeout == PFTM_PURGE)		\
			return (PF_DROP);				\
		if (PACKET_LOOPED(pd))					\
			return (PF_PASS);				\
		if ((d) == PF_OUT &&					\
		    (((s)->rule.ptr->rt == PF_ROUTETO &&		\
		    (s)->rule.ptr->direction == PF_OUT) ||		\
		    ((s)->rule.ptr->rt == PF_REPLYTO &&			\
		    (s)->rule.ptr->direction == PF_IN)) &&		\
		    (s)->rt_kif != NULL &&				\
		    (s)->rt_kif != (i))					\
			return (PF_PASS);				\
	} while (0)

#define	BOUND_IFACE(r, k) \
	((r)->rule_flag & PFRULE_IFBOUND) ? (k) : V_pfi_all

#define	STATE_INC_COUNTERS(s)				\
	do {						\
		s->rule.ptr->states_cur++;		\
		s->rule.ptr->states_tot++;		\
		if (s->anchor.ptr != NULL) {		\
			s->anchor.ptr->states_cur++;	\
			s->anchor.ptr->states_tot++;	\
		}					\
		if (s->nat_rule.ptr != NULL) {		\
			s->nat_rule.ptr->states_cur++;	\
			s->nat_rule.ptr->states_tot++;	\
		}					\
	} while (0)

#define	STATE_DEC_COUNTERS(s)				\
	do {						\
		if (s->nat_rule.ptr != NULL)		\
			s->nat_rule.ptr->states_cur--;	\
		if (s->anchor.ptr != NULL)		\
			s->anchor.ptr->states_cur--;	\
		s->rule.ptr->states_cur--;		\
	} while (0)

static MALLOC_DEFINE(M_PFHASH, "pf_hash", "pf(4) hash header structures");
VNET_DEFINE(struct pf_keyhash *, pf_keyhash);
VNET_DEFINE(struct pf_idhash *, pf_idhash);
VNET_DEFINE(u_long, pf_hashmask);
VNET_DEFINE(struct pf_srchash *, pf_srchash);
VNET_DEFINE(u_long, pf_srchashmask);

SYSCTL_NODE(_net, OID_AUTO, pf, CTLFLAG_RW, 0, "pf(4)");

VNET_DEFINE(u_long, pf_hashsize);
#define	V_pf_hashsize	VNET(pf_hashsize)
SYSCTL_VNET_UINT(_net_pf, OID_AUTO, states_hashsize, CTLFLAG_RDTUN,
    &VNET_NAME(pf_hashsize), 0, "Size of pf(4) states hashtable");

VNET_DEFINE(u_long, pf_srchashsize);
#define	V_pf_srchashsize	VNET(pf_srchashsize)
SYSCTL_VNET_UINT(_net_pf, OID_AUTO, source_nodes_hashsize, CTLFLAG_RDTUN,
    &VNET_NAME(pf_srchashsize), 0, "Size of pf(4) source nodes hashtable");

VNET_DEFINE(void *, pf_swi_cookie);

VNET_DEFINE(uint32_t, pf_hashseed);
#define	V_pf_hashseed	VNET(pf_hashseed)

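/*
 * Both hash functions below use jenkins_hash32() keyed with a random
 * seed picked at initialization time, so colliding flows cannot be
 * precomputed by an attacker.  The result is folded into a table index
 * by masking with (table size - 1), which is why the table sizes must
 * be powers of two.
 */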
static __inline uint32_t
pf_hashkey(struct pf_state_key *sk)
{
	uint32_t h;

	h = jenkins_hash32((uint32_t *)sk,
	    sizeof(struct pf_state_key_cmp)/sizeof(uint32_t),
	    V_pf_hashseed);

	return (h & V_pf_hashmask);
}

static __inline uint32_t
pf_hashsrc(struct pf_addr *addr, sa_family_t af)
{
	uint32_t h;

	switch (af) {
	case AF_INET:
		h = jenkins_hash32((uint32_t *)&addr->v4,
		    sizeof(addr->v4)/sizeof(uint32_t), V_pf_hashseed);
		break;
	case AF_INET6:
		h = jenkins_hash32((uint32_t *)&addr->v6,
		    sizeof(addr->v6)/sizeof(uint32_t), V_pf_hashseed);
		break;
	default:
		panic("%s: unknown address family %u", __func__, af);
	}

	return (h & V_pf_srchashmask);
}

#ifdef INET6
void
pf_addrcpy(struct pf_addr *dst, struct pf_addr *src, sa_family_t af)
{
	switch (af) {
#ifdef INET
	case AF_INET:
		dst->addr32[0] = src->addr32[0];
		break;
#endif /* INET */
	case AF_INET6:
		dst->addr32[0] = src->addr32[0];
		dst->addr32[1] = src->addr32[1];
		dst->addr32[2] = src->addr32[2];
		dst->addr32[3] = src->addr32[3];
		break;
	}
}
#endif /* INET6 */

static void
pf_init_threshold(struct pf_threshold *threshold,
    u_int32_t limit, u_int32_t seconds)
{
	threshold->limit = limit * PF_THRESHOLD_MULT;
	threshold->seconds = seconds;
	threshold->count = 0;
	threshold->last = time_uptime;
}

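/*
 * The threshold is a fixed-point rate estimator: each event adds
 * PF_THRESHOLD_MULT to the counter, and before that the counter decays
 * linearly over the configured interval.  E.g. for a rule with
 * "max-src-conn-rate 10/5", limit is 10 * PF_THRESHOLD_MULT, and a
 * counter of value C updated diff seconds after the previous update
 * first decays to C - C * diff / 5.
 */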
static void
pf_add_threshold(struct pf_threshold *threshold)
{
	u_int32_t t = time_uptime, diff = t - threshold->last;

	if (diff >= threshold->seconds)
		threshold->count = 0;
	else
		threshold->count -= threshold->count * diff /
		    threshold->seconds;
	threshold->count += PF_THRESHOLD_MULT;
	threshold->last = t;
}

static int
pf_check_threshold(struct pf_threshold *threshold)
{
	return (threshold->count > threshold->limit);
}

static int
pf_src_connlimit(struct pf_state **state)
{
	struct pf_overload_entry *pfoe;
	int bad = 0;

	PF_STATE_LOCK_ASSERT(*state);

	(*state)->src_node->conn++;
	(*state)->src.tcp_est = 1;
	pf_add_threshold(&(*state)->src_node->conn_rate);

	if ((*state)->rule.ptr->max_src_conn &&
	    (*state)->rule.ptr->max_src_conn <
	    (*state)->src_node->conn) {
		V_pf_status.lcounters[LCNT_SRCCONN]++;
		bad++;
	}

	if ((*state)->rule.ptr->max_src_conn_rate.limit &&
	    pf_check_threshold(&(*state)->src_node->conn_rate)) {
		V_pf_status.lcounters[LCNT_SRCCONNRATE]++;
		bad++;
	}

	if (!bad)
		return (0);

	/* Kill this state. */
	(*state)->timeout = PFTM_PURGE;
	(*state)->src.state = (*state)->dst.state = TCPS_CLOSED;

	if ((*state)->rule.ptr->overload_tbl == NULL)
		return (1);

	/* Schedule overloading and flushing task. */
	pfoe = malloc(sizeof(*pfoe), M_PFTEMP, M_NOWAIT);
	if (pfoe == NULL)
		return (1);	/* too bad :( */

	bcopy(&(*state)->src_node->addr, &pfoe->addr, sizeof(pfoe->addr));
	pfoe->af = (*state)->key[PF_SK_WIRE]->af;
	pfoe->rule = (*state)->rule.ptr;
	pfoe->dir = (*state)->direction;
	PF_OVERLOADQ_LOCK();
	SLIST_INSERT_HEAD(&V_pf_overloadqueue, pfoe, next);
	PF_OVERLOADQ_UNLOCK();
	taskqueue_enqueue(taskqueue_swi, &V_pf_overloadtask);

	return (1);
}

static void
pf_overload_task(void *c, int pending)
{
	struct pf_overload_head queue;
	struct pfr_addr p;
	struct pf_overload_entry *pfoe, *pfoe1;
	uint32_t killed = 0;

	PF_OVERLOADQ_LOCK();
	queue = *(struct pf_overload_head *)c;
	SLIST_INIT((struct pf_overload_head *)c);
	PF_OVERLOADQ_UNLOCK();

	bzero(&p, sizeof(p));
	SLIST_FOREACH(pfoe, &queue, next) {
		V_pf_status.lcounters[LCNT_OVERLOAD_TABLE]++;
		if (V_pf_status.debug >= PF_DEBUG_MISC) {
			printf("%s: blocking address ", __func__);
			pf_print_host(&pfoe->addr, 0, pfoe->af);
			printf("\n");
		}

		p.pfra_af = pfoe->af;
		switch (pfoe->af) {
#ifdef INET
		case AF_INET:
			p.pfra_net = 32;
			p.pfra_ip4addr = pfoe->addr.v4;
			break;
#endif
#ifdef INET6
		case AF_INET6:
			p.pfra_net = 128;
			p.pfra_ip6addr = pfoe->addr.v6;
			break;
#endif
		}

		PF_RULES_WLOCK();
		pfr_insert_kentry(pfoe->rule->overload_tbl, &p, time_second);
		PF_RULES_WUNLOCK();
	}

	/*
	 * Remove those entries that don't need flushing.
	 */
	SLIST_FOREACH_SAFE(pfoe, &queue, next, pfoe1)
		if (pfoe->rule->flush == 0) {
			SLIST_REMOVE(&queue, pfoe, pf_overload_entry, next);
			free(pfoe, M_PFTEMP);
		} else
			V_pf_status.lcounters[LCNT_OVERLOAD_FLUSH]++;

	/* If nothing to flush, return. */
	if (SLIST_EMPTY(&queue))
		return;

	for (int i = 0; i <= V_pf_hashmask; i++) {
		struct pf_idhash *ih = &V_pf_idhash[i];
		struct pf_state_key *sk;
		struct pf_state *s;

		PF_HASHROW_LOCK(ih);
		LIST_FOREACH(s, &ih->states, entry) {
		    sk = s->key[PF_SK_WIRE];
		    SLIST_FOREACH(pfoe, &queue, next)
			if (sk->af == pfoe->af &&
			    ((pfoe->rule->flush & PF_FLUSH_GLOBAL) ||
			    pfoe->rule == s->rule.ptr) &&
			    ((pfoe->dir == PF_OUT &&
			    PF_AEQ(&pfoe->addr, &sk->addr[1], sk->af)) ||
			    (pfoe->dir == PF_IN &&
			    PF_AEQ(&pfoe->addr, &sk->addr[0], sk->af)))) {
				s->timeout = PFTM_PURGE;
				s->src.state = s->dst.state = TCPS_CLOSED;
				killed++;
			}
		}
		PF_HASHROW_UNLOCK(ih);
	}
	SLIST_FOREACH_SAFE(pfoe, &queue, next, pfoe1)
		free(pfoe, M_PFTEMP);
	if (V_pf_status.debug >= PF_DEBUG_MISC)
		printf("%s: %u states killed\n", __func__, killed);
}

/*
 * May return with the hash row locked on failure (node not found),
 * so that the caller can consistently allocate and insert a new one.
 */
struct pf_src_node *
pf_find_src_node(struct pf_addr *src, struct pf_rule *rule, sa_family_t af,
	int returnlocked)
{
	struct pf_srchash *sh;
	struct pf_src_node *n;

	V_pf_status.scounters[SCNT_SRC_NODE_SEARCH]++;

	sh = &V_pf_srchash[pf_hashsrc(src, af)];
	PF_HASHROW_LOCK(sh);
	LIST_FOREACH(n, &sh->nodes, entry)
		if (n->rule.ptr == rule && n->af == af &&
		    ((af == AF_INET && n->addr.v4.s_addr == src->v4.s_addr) ||
		    (af == AF_INET6 && bcmp(&n->addr, src, sizeof(*src)) == 0)))
			break;
	if (n != NULL || returnlocked == 0)
		PF_HASHROW_UNLOCK(sh);

	return (n);
}

static int
pf_insert_src_node(struct pf_src_node **sn, struct pf_rule *rule,
    struct pf_addr *src, sa_family_t af)
{

	KASSERT((rule->rule_flag & PFRULE_RULESRCTRACK ||
	    rule->rpool.opts & PF_POOL_STICKYADDR),
	    ("%s for non-tracking rule %p", __func__, rule));

	if (*sn == NULL)
		*sn = pf_find_src_node(src, rule, af, 1);

	if (*sn == NULL) {
		struct pf_srchash *sh = &V_pf_srchash[pf_hashsrc(src, af)];

		PF_HASHROW_ASSERT(sh);

		if (!rule->max_src_nodes ||
		    rule->src_nodes < rule->max_src_nodes)
			(*sn) = uma_zalloc(V_pf_sources_z, M_NOWAIT | M_ZERO);
		else
			V_pf_status.lcounters[LCNT_SRCNODES]++;
		if ((*sn) == NULL) {
			PF_HASHROW_UNLOCK(sh);
			return (-1);
		}

		pf_init_threshold(&(*sn)->conn_rate,
		    rule->max_src_conn_rate.limit,
		    rule->max_src_conn_rate.seconds);

		(*sn)->af = af;
		(*sn)->rule.ptr = rule;
		PF_ACPY(&(*sn)->addr, src, af);
		LIST_INSERT_HEAD(&sh->nodes, *sn, entry);
		(*sn)->creation = time_uptime;
		(*sn)->ruletype = rule->action;
		if ((*sn)->rule.ptr != NULL)
			(*sn)->rule.ptr->src_nodes++;
		PF_HASHROW_UNLOCK(sh);
		V_pf_status.scounters[SCNT_SRC_NODE_INSERT]++;
		V_pf_status.src_nodes++;
	} else {
		if (rule->max_src_states &&
		    (*sn)->states >= rule->max_src_states) {
			V_pf_status.lcounters[LCNT_SRCSTATES]++;
			return (-1);
		}
	}
	return (0);
}

static void
pf_remove_src_node(struct pf_src_node *src)
{
	struct pf_srchash *sh;

	sh = &V_pf_srchash[pf_hashsrc(&src->addr, src->af)];
	PF_HASHROW_LOCK(sh);
	LIST_REMOVE(src, entry);
	PF_HASHROW_UNLOCK(sh);

	V_pf_status.scounters[SCNT_SRC_NODE_REMOVALS]++;
	V_pf_status.src_nodes--;

	uma_zfree(V_pf_sources_z, src);
}

/* Data storage structures initialization. */
void
pf_initialize()
{
	struct pf_keyhash	*kh;
	struct pf_idhash	*ih;
	struct pf_srchash	*sh;
	u_int i;

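	/*
	 * Both hash table sizes may be overridden from loader tunables,
	 * but must remain powers of two, since lookups index the tables
	 * with (hash & mask) where mask = size - 1; other values fall
	 * back to the defaults.
	 */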
	TUNABLE_ULONG_FETCH("net.pf.states_hashsize", &V_pf_hashsize);
	if (V_pf_hashsize == 0 || !powerof2(V_pf_hashsize))
		V_pf_hashsize = PF_HASHSIZ;
	TUNABLE_ULONG_FETCH("net.pf.source_nodes_hashsize", &V_pf_srchashsize);
	if (V_pf_srchashsize == 0 || !powerof2(V_pf_srchashsize))
		V_pf_srchashsize = PF_HASHSIZ / 4;

	V_pf_hashseed = arc4random();

	/* States and state keys storage. */
	V_pf_state_z = uma_zcreate("pf states", sizeof(struct pf_state),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	V_pf_limits[PF_LIMIT_STATES].zone = V_pf_state_z;
	uma_zone_set_max(V_pf_state_z, PFSTATE_HIWAT);
	uma_zone_set_warning(V_pf_state_z, "PF states limit reached");

	V_pf_state_key_z = uma_zcreate("pf state keys",
	    sizeof(struct pf_state_key), pf_state_key_ctor, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, 0);
	V_pf_keyhash = malloc(V_pf_hashsize * sizeof(struct pf_keyhash),
	    M_PFHASH, M_WAITOK | M_ZERO);
	V_pf_idhash = malloc(V_pf_hashsize * sizeof(struct pf_idhash),
	    M_PFHASH, M_WAITOK | M_ZERO);
	V_pf_hashmask = V_pf_hashsize - 1;
	for (i = 0, kh = V_pf_keyhash, ih = V_pf_idhash; i <= V_pf_hashmask;
	    i++, kh++, ih++) {
		mtx_init(&kh->lock, "pf_keyhash", NULL, MTX_DEF);
		mtx_init(&ih->lock, "pf_idhash", NULL, MTX_DEF);
	}

	/* Source nodes. */
	V_pf_sources_z = uma_zcreate("pf source nodes",
	    sizeof(struct pf_src_node), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
	    0);
	V_pf_limits[PF_LIMIT_SRC_NODES].zone = V_pf_sources_z;
	uma_zone_set_max(V_pf_sources_z, PFSNODE_HIWAT);
	uma_zone_set_warning(V_pf_sources_z, "PF source nodes limit reached");
	V_pf_srchash = malloc(V_pf_srchashsize * sizeof(struct pf_srchash),
	    M_PFHASH, M_WAITOK | M_ZERO);
	V_pf_srchashmask = V_pf_srchashsize - 1;
	for (i = 0, sh = V_pf_srchash; i <= V_pf_srchashmask; i++, sh++)
		mtx_init(&sh->lock, "pf_srchash", NULL, MTX_DEF);

	/* ALTQ */
	TAILQ_INIT(&V_pf_altqs[0]);
	TAILQ_INIT(&V_pf_altqs[1]);
	TAILQ_INIT(&V_pf_pabuf);
	V_pf_altqs_active = &V_pf_altqs[0];
	V_pf_altqs_inactive = &V_pf_altqs[1];

	/* Mbuf tags */
	V_pf_mtag_z = uma_zcreate("pf mtags", sizeof(struct m_tag) +
	    sizeof(struct pf_mtag), NULL, NULL, pf_mtag_init, NULL,
	    UMA_ALIGN_PTR, 0);

	/* Send & overload+flush queues. */
	STAILQ_INIT(&V_pf_sendqueue);
	SLIST_INIT(&V_pf_overloadqueue);
	TASK_INIT(&V_pf_overloadtask, 0, pf_overload_task, &V_pf_overloadqueue);
	mtx_init(&pf_sendqueue_mtx, "pf send queue", NULL, MTX_DEF);
	mtx_init(&pf_overloadqueue_mtx, "pf overload/flush queue", NULL,
	    MTX_DEF);

	/* Unlinked, but may be referenced rules. */
	TAILQ_INIT(&V_pf_unlinked_rules);
	mtx_init(&pf_unlnkdrules_mtx, "pf unlinked rules", NULL, MTX_DEF);
}

void
pf_cleanup()
{
	struct pf_keyhash	*kh;
	struct pf_idhash	*ih;
	struct pf_srchash	*sh;
	struct pf_send_entry	*pfse, *next;
	u_int i;

	for (i = 0, kh = V_pf_keyhash, ih = V_pf_idhash; i <= V_pf_hashmask;
	    i++, kh++, ih++) {
		KASSERT(LIST_EMPTY(&kh->keys), ("%s: key hash not empty",
		    __func__));
		KASSERT(LIST_EMPTY(&ih->states), ("%s: id hash not empty",
		    __func__));
		mtx_destroy(&kh->lock);
		mtx_destroy(&ih->lock);
	}
	free(V_pf_keyhash, M_PFHASH);
	free(V_pf_idhash, M_PFHASH);

	for (i = 0, sh = V_pf_srchash; i <= V_pf_srchashmask; i++, sh++) {
		KASSERT(LIST_EMPTY(&sh->nodes),
		    ("%s: source node hash not empty", __func__));
		mtx_destroy(&sh->lock);
	}
	free(V_pf_srchash, M_PFHASH);

	STAILQ_FOREACH_SAFE(pfse, &V_pf_sendqueue, pfse_next, next) {
		m_freem(pfse->pfse_m);
		free(pfse, M_PFTEMP);
	}

	mtx_destroy(&pf_sendqueue_mtx);
	mtx_destroy(&pf_overloadqueue_mtx);
	mtx_destroy(&pf_unlnkdrules_mtx);

	uma_zdestroy(V_pf_mtag_z);
	uma_zdestroy(V_pf_sources_z);
	uma_zdestroy(V_pf_state_z);
	uma_zdestroy(V_pf_state_key_z);
}

static int
pf_mtag_init(void *mem, int size, int how)
{
	struct m_tag *t;

	t = (struct m_tag *)mem;
	t->m_tag_cookie = MTAG_ABI_COMPAT;
	t->m_tag_id = PACKET_TAG_PF;
	t->m_tag_len = sizeof(struct pf_mtag);
	t->m_tag_free = pf_mtag_free;

	return (0);
}

static void
pf_mtag_free(struct m_tag *t)
{

	uma_zfree(V_pf_mtag_z, t);
}

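/*
 * The pf(4) mbuf tag comes from its own UMA zone: a single allocation
 * holds the generic m_tag header immediately followed by struct
 * pf_mtag, which is why pf_get_mtag() returns (mtag + 1), cast to
 * struct pf_mtag.
 */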
struct pf_mtag *
pf_get_mtag(struct mbuf *m)
{
	struct m_tag *mtag;

	if ((mtag = m_tag_find(m, PACKET_TAG_PF, NULL)) != NULL)
		return ((struct pf_mtag *)(mtag + 1));

	mtag = uma_zalloc(V_pf_mtag_z, M_NOWAIT);
	if (mtag == NULL)
		return (NULL);
	bzero(mtag + 1, sizeof(struct pf_mtag));
	m_tag_prepend(m, mtag);

	return ((struct pf_mtag *)(mtag + 1));
}

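/*
 * Link a state to its wire and stack keys in the key hash.  If an
 * identical key already exists, the state joins that key, unless a
 * state with the same kif and direction is already attached: such a
 * conflicting TCP state that is already closing is scheduled for
 * removal and replaced, anything else makes the attach fail as a
 * collision.
 */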
static int
pf_state_key_attach(struct pf_state_key *skw, struct pf_state_key *sks,
    struct pf_state *s)
{
	struct pf_keyhash	*kh;
	struct pf_state_key	*sk, *cur;
	struct pf_state		*si, *olds = NULL;
	int idx;

	KASSERT(s->refs == 0, ("%s: state not pristine", __func__));
	KASSERT(s->key[PF_SK_WIRE] == NULL, ("%s: state has key", __func__));
	KASSERT(s->key[PF_SK_STACK] == NULL, ("%s: state has key", __func__));

	/*
	 * First run: start with wire key.
	 */
	sk = skw;
	idx = PF_SK_WIRE;

keyattach:
	kh = &V_pf_keyhash[pf_hashkey(sk)];

	PF_HASHROW_LOCK(kh);
	LIST_FOREACH(cur, &kh->keys, entry)
		if (bcmp(cur, sk, sizeof(struct pf_state_key_cmp)) == 0)
			break;

	if (cur != NULL) {
		/*
		 * Key exists. Check for a conflicting state (same kif
		 * and direction); if none, add the state to the key.
		 */
		TAILQ_FOREACH(si, &cur->states[idx], key_list[idx]) {
			struct pf_idhash *ih = &V_pf_idhash[PF_IDHASH(si)];

			PF_HASHROW_LOCK(ih);
			if (si->kif == s->kif &&
			    si->direction == s->direction) {
				if (sk->proto == IPPROTO_TCP &&
				    si->src.state >= TCPS_FIN_WAIT_2 &&
				    si->dst.state >= TCPS_FIN_WAIT_2) {
					si->src.state = si->dst.state =
					    TCPS_CLOSED;
					/* Unlink later or cur can go away. */
					pf_ref_state(si);
					olds = si;
				} else {
					if (V_pf_status.debug >= PF_DEBUG_MISC) {
						printf("pf: %s key attach "
						    "failed on %s: ",
						    (idx == PF_SK_WIRE) ?
						    "wire" : "stack",
						    s->kif->pfik_name);
						pf_print_state_parts(s,
						    (idx == PF_SK_WIRE) ?
						    sk : NULL,
						    (idx == PF_SK_STACK) ?
						    sk : NULL);
						printf(", existing: ");
						pf_print_state_parts(si,
						    (idx == PF_SK_WIRE) ?
						    sk : NULL,
						    (idx == PF_SK_STACK) ?
						    sk : NULL);
						printf("\n");
					}
					PF_HASHROW_UNLOCK(ih);
					PF_HASHROW_UNLOCK(kh);
					uma_zfree(V_pf_state_key_z, sk);
					if (idx == PF_SK_STACK)
						pf_detach_state(s);
					return (-1);	/* collision! */
				}
			}
			PF_HASHROW_UNLOCK(ih);
		}
		uma_zfree(V_pf_state_key_z, sk);
		s->key[idx] = cur;
	} else {
		LIST_INSERT_HEAD(&kh->keys, sk, entry);
		s->key[idx] = sk;
	}

stateattach:
	/* List is sorted, if-bound states before floating. */
	if (s->kif == V_pfi_all)
		TAILQ_INSERT_TAIL(&s->key[idx]->states[idx], s, key_list[idx]);
	else
		TAILQ_INSERT_HEAD(&s->key[idx]->states[idx], s, key_list[idx]);

	/*
	 * Attach done. Now decide whether, and how, a second
	 * key should be attached.
	 */
	if (sks == skw) {
		s->key[PF_SK_STACK] = s->key[PF_SK_WIRE];
		idx = PF_SK_STACK;
		sks = NULL;
		goto stateattach;
	} else if (sks != NULL) {
		PF_HASHROW_UNLOCK(kh);
		if (olds) {
			pf_unlink_state(olds, 0);
			pf_release_state(olds);
			olds = NULL;
		}
		/*
		 * Continue attaching with stack key.
		 */
		sk = sks;
		idx = PF_SK_STACK;
		sks = NULL;
		goto keyattach;
	} else
		PF_HASHROW_UNLOCK(kh);

	if (olds) {
		pf_unlink_state(olds, 0);
		pf_release_state(olds);
	}

	KASSERT(s->key[PF_SK_WIRE] != NULL && s->key[PF_SK_STACK] != NULL,
	    ("%s failure", __func__));

	return (0);
}

static void
pf_detach_state(struct pf_state *s)
{
	struct pf_state_key *sks = s->key[PF_SK_STACK];
	struct pf_keyhash *kh;

	if (sks != NULL) {
		kh = &V_pf_keyhash[pf_hashkey(sks)];
		PF_HASHROW_LOCK(kh);
		if (s->key[PF_SK_STACK] != NULL)
			pf_state_key_detach(s, PF_SK_STACK);
		/*
		 * If both point to the same key, then we are done.
		 */
		if (sks == s->key[PF_SK_WIRE]) {
			pf_state_key_detach(s, PF_SK_WIRE);
			PF_HASHROW_UNLOCK(kh);
			return;
		}
		PF_HASHROW_UNLOCK(kh);
	}

	if (s->key[PF_SK_WIRE] != NULL) {
		kh = &V_pf_keyhash[pf_hashkey(s->key[PF_SK_WIRE])];
		PF_HASHROW_LOCK(kh);
		if (s->key[PF_SK_WIRE] != NULL)
			pf_state_key_detach(s, PF_SK_WIRE);
		PF_HASHROW_UNLOCK(kh);
	}
}

static void
pf_state_key_detach(struct pf_state *s, int idx)
{
	struct pf_state_key *sk = s->key[idx];
#ifdef INVARIANTS
	struct pf_keyhash *kh = &V_pf_keyhash[pf_hashkey(sk)];

	PF_HASHROW_ASSERT(kh);
#endif
	TAILQ_REMOVE(&sk->states[idx], s, key_list[idx]);
	s->key[idx] = NULL;

	if (TAILQ_EMPTY(&sk->states[0]) && TAILQ_EMPTY(&sk->states[1])) {
		LIST_REMOVE(sk, entry);
		uma_zfree(V_pf_state_key_z, sk);
	}
}

static int
pf_state_key_ctor(void *mem, int size, void *arg, int flags)
{
	struct pf_state_key *sk = mem;

	bzero(sk, sizeof(struct pf_state_key_cmp));
	TAILQ_INIT(&sk->states[PF_SK_WIRE]);
	TAILQ_INIT(&sk->states[PF_SK_STACK]);

	return (0);
}

struct pf_state_key *
pf_state_key_setup(struct pf_pdesc *pd, struct pf_addr *saddr,
	struct pf_addr *daddr, u_int16_t sport, u_int16_t dport)
{
	struct pf_state_key *sk;

	sk = uma_zalloc(V_pf_state_key_z, M_NOWAIT);
	if (sk == NULL)
		return (NULL);

	PF_ACPY(&sk->addr[pd->sidx], saddr, pd->af);
	PF_ACPY(&sk->addr[pd->didx], daddr, pd->af);
	sk->port[pd->sidx] = sport;
	sk->port[pd->didx] = dport;
	sk->proto = pd->proto;
	sk->af = pd->af;

	return (sk);
}

struct pf_state_key *
pf_state_key_clone(struct pf_state_key *orig)
{
	struct pf_state_key *sk;

	sk = uma_zalloc(V_pf_state_key_z, M_NOWAIT);
	if (sk == NULL)
		return (NULL);

	bcopy(orig, sk, sizeof(struct pf_state_key_cmp));

	return (sk);
}

int
pf_state_insert(struct pfi_kif *kif, struct pf_state_key *skw,
    struct pf_state_key *sks, struct pf_state *s)
{
	struct pf_idhash *ih;
	struct pf_state *cur;

	KASSERT(TAILQ_EMPTY(&sks->states[0]) && TAILQ_EMPTY(&sks->states[1]),
	    ("%s: sks not pristine", __func__));
	KASSERT(TAILQ_EMPTY(&skw->states[0]) && TAILQ_EMPTY(&skw->states[1]),
	    ("%s: skw not pristine", __func__));
	KASSERT(s->refs == 0, ("%s: state not pristine", __func__));

	s->kif = kif;

	if (s->id == 0 && s->creatorid == 0) {
		/* XXX: should be atomic, but probability of collision low */
		if ((s->id = V_pf_stateid[curcpu]++) == PFID_MAXID)
			V_pf_stateid[curcpu] = 1;
		s->id |= (uint64_t)curcpu << PFID_CPUSHIFT;
		s->id = htobe64(s->id);
		s->creatorid = V_pf_status.hostid;
	}

	if (pf_state_key_attach(skw, sks, s))
		return (-1);

	ih = &V_pf_idhash[PF_IDHASH(s)];
	PF_HASHROW_LOCK(ih);
	LIST_FOREACH(cur, &ih->states, entry)
		if (cur->id == s->id && cur->creatorid == s->creatorid)
			break;

	if (cur != NULL) {
		PF_HASHROW_UNLOCK(ih);
		if (V_pf_status.debug >= PF_DEBUG_MISC) {
			printf("pf: state insert failed: "
			    "id: %016llx creatorid: %08x",
			    (unsigned long long)be64toh(s->id),
			    ntohl(s->creatorid));
			printf("\n");
		}
		pf_detach_state(s);
		return (-1);
	}
	LIST_INSERT_HEAD(&ih->states, s, entry);
	/* One for keys, one for ID hash. */
	refcount_init(&s->refs, 2);

	V_pf_status.fcounters[FCNT_STATE_INSERT]++;
	if (pfsync_insert_state_ptr != NULL)
		pfsync_insert_state_ptr(s);

	/* Returns locked. */
	return (0);
}

/*
 * Find state by ID: returns with locked row on success.
 */
struct pf_state *
pf_find_state_byid(uint64_t id, uint32_t creatorid)
{
	struct pf_idhash *ih;
	struct pf_state *s;

	V_pf_status.fcounters[FCNT_STATE_SEARCH]++;

	ih = &V_pf_idhash[(be64toh(id) % (V_pf_hashmask + 1))];

	PF_HASHROW_LOCK(ih);
	LIST_FOREACH(s, &ih->states, entry)
		if (s->id == id && s->creatorid == creatorid)
			break;

	if (s == NULL)
		PF_HASHROW_UNLOCK(ih);

	return (s);
}

/*
 * Find state by key.
 * Returns with ID hash slot locked on success.
 */
static struct pf_state *
pf_find_state(struct pfi_kif *kif, struct pf_state_key_cmp *key, u_int dir)
{
	struct pf_keyhash	*kh;
	struct pf_state_key	*sk;
	struct pf_state		*s;
	int idx;

	V_pf_status.fcounters[FCNT_STATE_SEARCH]++;

	kh = &V_pf_keyhash[pf_hashkey((struct pf_state_key *)key)];

	PF_HASHROW_LOCK(kh);
	LIST_FOREACH(sk, &kh->keys, entry)
		if (bcmp(sk, key, sizeof(struct pf_state_key_cmp)) == 0)
			break;
	if (sk == NULL) {
		PF_HASHROW_UNLOCK(kh);
		return (NULL);
	}

	idx = (dir == PF_IN ? PF_SK_WIRE : PF_SK_STACK);

	/* List is sorted, if-bound states before floating ones. */
	TAILQ_FOREACH(s, &sk->states[idx], key_list[idx])
		if (s->kif == V_pfi_all || s->kif == kif) {
			PF_STATE_LOCK(s);
			PF_HASHROW_UNLOCK(kh);
			if (s->timeout == PFTM_UNLINKED) {
				/*
				 * State is being processed
				 * by pf_unlink_state() in
				 * another thread.
				 */
				PF_STATE_UNLOCK(s);
				return (NULL);
			}
			return (s);
		}
	PF_HASHROW_UNLOCK(kh);

	return (NULL);
}

struct pf_state *
pf_find_state_all(struct pf_state_key_cmp *key, u_int dir, int *more)
{
	struct pf_keyhash	*kh;
	struct pf_state_key	*sk;
	struct pf_state		*s, *ret = NULL;
	int			 idx, inout = 0;

	V_pf_status.fcounters[FCNT_STATE_SEARCH]++;

	kh = &V_pf_keyhash[pf_hashkey((struct pf_state_key *)key)];

	PF_HASHROW_LOCK(kh);
	LIST_FOREACH(sk, &kh->keys, entry)
		if (bcmp(sk, key, sizeof(struct pf_state_key_cmp)) == 0)
			break;
	if (sk == NULL) {
		PF_HASHROW_UNLOCK(kh);
		return (NULL);
	}
	switch (dir) {
	case PF_IN:
		idx = PF_SK_WIRE;
		break;
	case PF_OUT:
		idx = PF_SK_STACK;
		break;
	case PF_INOUT:
		idx = PF_SK_WIRE;
		inout = 1;
		break;
	default:
		panic("%s: dir %u", __func__, dir);
	}
second_run:
	TAILQ_FOREACH(s, &sk->states[idx], key_list[idx]) {
		if (more == NULL) {
			PF_HASHROW_UNLOCK(kh);
			return (s);
		}

		if (ret)
			(*more)++;
		else
			ret = s;
	}
	if (inout == 1) {
		inout = 0;
		idx = PF_SK_STACK;
		goto second_run;
	}
	PF_HASHROW_UNLOCK(kh);

	return (ret);
}

/* END state table stuff */

static void
pf_send(struct pf_send_entry *pfse)
{

	PF_SENDQ_LOCK();
	STAILQ_INSERT_TAIL(&V_pf_sendqueue, pfse, pfse_next);
	PF_SENDQ_UNLOCK();
	swi_sched(V_pf_swi_cookie, 0);
}

void
pf_intr(void *v)
{
	struct pf_send_head queue;
	struct pf_send_entry *pfse, *next;

	CURVNET_SET((struct vnet *)v);

	PF_SENDQ_LOCK();
	queue = V_pf_sendqueue;
	STAILQ_INIT(&V_pf_sendqueue);
	PF_SENDQ_UNLOCK();

	STAILQ_FOREACH_SAFE(pfse, &queue, pfse_next, next) {
		switch (pfse->pfse_type) {
#ifdef INET
		case PFSE_IP:
			ip_output(pfse->pfse_m, NULL, NULL, 0, NULL, NULL);
			break;
		case PFSE_ICMP:
			icmp_error(pfse->pfse_m, pfse->pfse_icmp_type,
			    pfse->pfse_icmp_code, 0, pfse->pfse_icmp_mtu);
			break;
#endif /* INET */
#ifdef INET6
		case PFSE_IP6:
			ip6_output(pfse->pfse_m, NULL, NULL, 0, NULL, NULL,
			    NULL);
			break;
		case PFSE_ICMP6:
			icmp6_error(pfse->pfse_m, pfse->pfse_icmp_type,
			    pfse->pfse_icmp_code, pfse->pfse_icmp_mtu);
			break;
#endif /* INET6 */
		default:
			panic("%s: unknown type", __func__);
		}
		free(pfse, M_PFTEMP);
	}
	CURVNET_RESTORE();
}

void
pf_purge_thread(void *v)
{
	u_int idx = 0;

	CURVNET_SET((struct vnet *)v);

	for (;;) {
		PF_RULES_RLOCK();
		rw_sleep(pf_purge_thread, &pf_rules_lock, 0, "pftm", hz / 10);

		if (V_pf_end_threads) {
			/*
			 * To clean up all kifs and rules we need
			 * two runs: the first one clears reference
			 * flags, so that pf_purge_expired_states()
			 * no longer raises them, and the second
			 * run frees.
			 */
			PF_RULES_RUNLOCK();
			pf_purge_unlinked_rules();
			pfi_kif_purge();

			/*
			 * Now purge everything.
			 */
			pf_purge_expired_states(0, V_pf_hashmask);
			pf_purge_expired_fragments();
			pf_purge_expired_src_nodes();

			/*
			 * Now all kifs & rules should be unreferenced,
			 * thus should be successfully freed.
			 */
			pf_purge_unlinked_rules();
			pfi_kif_purge();

			/*
			 * Announce success and exit.
			 */
			PF_RULES_RLOCK();
			V_pf_end_threads++;
			PF_RULES_RUNLOCK();
			wakeup(pf_purge_thread);
			kproc_exit(0);
		}
		PF_RULES_RUNLOCK();

		/* Process 1/interval fraction of the state table every run. */
		idx = pf_purge_expired_states(idx, V_pf_hashmask /
			    (V_pf_default_rule.timeout[PFTM_INTERVAL] * 10));

		/* Purge other expired types every PFTM_INTERVAL seconds. */
		if (idx == 0) {
			/*
			 * Order is important:
			 * - states and src nodes reference rules
			 * - states and rules reference kifs
			 */
			pf_purge_expired_fragments();
			pf_purge_expired_src_nodes();
			pf_purge_unlinked_rules();
			pfi_kif_purge();
		}
	}
	/* not reached */
	CURVNET_RESTORE();
}

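/*
 * With adaptive timeouts configured, a state's timeout shrinks linearly
 * as the number of states grows from the adaptive.start threshold
 * towards adaptive.end: the effective timeout is
 *
 *	timeout * (end - states) / (end - start)
 *
 * and once states reaches end, the state counts as already expired.
 */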
u_int32_t
pf_state_expires(const struct pf_state *state)
{
	u_int32_t	timeout;
	u_int32_t	start;
	u_int32_t	end;
	u_int32_t	states;

	/* handle all PFTM_* > PFTM_MAX here */
	if (state->timeout == PFTM_PURGE)
		return (time_uptime);
	if (state->timeout == PFTM_UNTIL_PACKET)
		return (0);
	KASSERT(state->timeout != PFTM_UNLINKED,
	    ("pf_state_expires: timeout == PFTM_UNLINKED"));
	KASSERT((state->timeout < PFTM_MAX),
	    ("pf_state_expires: timeout > PFTM_MAX"));
	timeout = state->rule.ptr->timeout[state->timeout];
	if (!timeout)
		timeout = V_pf_default_rule.timeout[state->timeout];
	start = state->rule.ptr->timeout[PFTM_ADAPTIVE_START];
	if (start) {
		end = state->rule.ptr->timeout[PFTM_ADAPTIVE_END];
		states = state->rule.ptr->states_cur;	/* XXXGL */
	} else {
		start = V_pf_default_rule.timeout[PFTM_ADAPTIVE_START];
		end = V_pf_default_rule.timeout[PFTM_ADAPTIVE_END];
		states = V_pf_status.states;
	}
	if (end && states > start && start < end) {
		if (states < end)
			return (state->expire + timeout * (end - states) /
			    (end - start));
		else
			return (time_uptime);
	}
	return (state->expire + timeout);
}

void
pf_purge_expired_src_nodes()
{
	struct pf_srchash	*sh;
	struct pf_src_node	*cur, *next;
	int i;

	for (i = 0, sh = V_pf_srchash; i <= V_pf_srchashmask; i++, sh++) {
	    PF_HASHROW_LOCK(sh);
	    LIST_FOREACH_SAFE(cur, &sh->nodes, entry, next)
		if (cur->states <= 0 && cur->expire <= time_uptime) {
			if (cur->rule.ptr != NULL)
				cur->rule.ptr->src_nodes--;
			LIST_REMOVE(cur, entry);
			V_pf_status.scounters[SCNT_SRC_NODE_REMOVALS]++;
			V_pf_status.src_nodes--;
			uma_zfree(V_pf_sources_z, cur);
		} else if (cur->rule.ptr != NULL)
			cur->rule.ptr->rule_flag |= PFRULE_REFS;
	    PF_HASHROW_UNLOCK(sh);
	}
}

static void
pf_src_tree_remove_state(struct pf_state *s)
{
	u_int32_t timeout;

	if (s->src_node != NULL) {
		if (s->src.tcp_est)
			--s->src_node->conn;
		if (--s->src_node->states <= 0) {
			timeout = s->rule.ptr->timeout[PFTM_SRC_NODE];
			if (!timeout)
				timeout =
				    V_pf_default_rule.timeout[PFTM_SRC_NODE];
			s->src_node->expire = time_uptime + timeout;
		}
	}
	if (s->nat_src_node != s->src_node && s->nat_src_node != NULL) {
		if (--s->nat_src_node->states <= 0) {
			timeout = s->rule.ptr->timeout[PFTM_SRC_NODE];
			if (!timeout)
				timeout =
				    V_pf_default_rule.timeout[PFTM_SRC_NODE];
			s->nat_src_node->expire = time_uptime + timeout;
		}
	}
	s->src_node = s->nat_src_node = NULL;
}

/*
 * Unlink and potentially free a state. May be called with the
 * ID hash row locked, but always returns unlocked, since it
 * needs to go through key hash locking.
 */
int
pf_unlink_state(struct pf_state *s, u_int flags)
{
	struct pf_idhash *ih = &V_pf_idhash[PF_IDHASH(s)];

	if ((flags & PF_ENTER_LOCKED) == 0)
		PF_HASHROW_LOCK(ih);
	else
		PF_HASHROW_ASSERT(ih);

	if (s->timeout == PFTM_UNLINKED) {
		/*
		 * State is being processed
		 * by pf_unlink_state() in
		 * another thread.
		 */
		PF_HASHROW_UNLOCK(ih);
		return (0);	/* XXXGL: undefined actually */
	}

	if (s->src.state == PF_TCPS_PROXY_DST) {
		/* XXX wire key the right one? */
		pf_send_tcp(NULL, s->rule.ptr, s->key[PF_SK_WIRE]->af,
		    &s->key[PF_SK_WIRE]->addr[1],
		    &s->key[PF_SK_WIRE]->addr[0],
		    s->key[PF_SK_WIRE]->port[1],
		    s->key[PF_SK_WIRE]->port[0],
		    s->src.seqhi, s->src.seqlo + 1,
		    TH_RST|TH_ACK, 0, 0, 0, 1, s->tag, NULL);
	}

	LIST_REMOVE(s, entry);
	pf_src_tree_remove_state(s);

	if (pfsync_delete_state_ptr != NULL)
		pfsync_delete_state_ptr(s);

	--s->rule.ptr->states_cur;
	if (s->nat_rule.ptr != NULL)
		--s->nat_rule.ptr->states_cur;
	if (s->anchor.ptr != NULL)
		--s->anchor.ptr->states_cur;

	s->timeout = PFTM_UNLINKED;

	PF_HASHROW_UNLOCK(ih);

	pf_detach_state(s);
	refcount_release(&s->refs);

	return (pf_release_state(s));
}

void
pf_free_state(struct pf_state *cur)
{

	KASSERT(cur->refs == 0, ("%s: %p has refs", __func__, cur));
	KASSERT(cur->timeout == PFTM_UNLINKED, ("%s: timeout %u", __func__,
	    cur->timeout));

	pf_normalize_tcp_cleanup(cur);
	uma_zfree(V_pf_state_z, cur);
	V_pf_status.fcounters[FCNT_STATE_REMOVALS]++;
}

/*
 * Called only from pf_purge_thread(), thus serialized.
 */
static u_int
pf_purge_expired_states(u_int i, int maxcheck)
{
	struct pf_idhash *ih;
	struct pf_state *s;

	V_pf_status.states = uma_zone_get_cur(V_pf_state_z);

	/*
	 * Go through hash and unlink states that expire now.
	 */
	while (maxcheck > 0) {

		ih = &V_pf_idhash[i];
relock:
		PF_HASHROW_LOCK(ih);
		LIST_FOREACH(s, &ih->states, entry) {
			if (pf_state_expires(s) <= time_uptime) {
				V_pf_status.states -=
				    pf_unlink_state(s, PF_ENTER_LOCKED);
				goto relock;
			}
			s->rule.ptr->rule_flag |= PFRULE_REFS;
			if (s->nat_rule.ptr != NULL)
				s->nat_rule.ptr->rule_flag |= PFRULE_REFS;
			if (s->anchor.ptr != NULL)
				s->anchor.ptr->rule_flag |= PFRULE_REFS;
			s->kif->pfik_flags |= PFI_IFLAG_REFS;
			if (s->rt_kif)
				s->rt_kif->pfik_flags |= PFI_IFLAG_REFS;
		}
		PF_HASHROW_UNLOCK(ih);

		/* Return when we hit end of hash. */
		if (++i > V_pf_hashmask) {
			V_pf_status.states = uma_zone_get_cur(V_pf_state_z);
			return (0);
		}

		maxcheck--;
	}

	V_pf_status.states = uma_zone_get_cur(V_pf_state_z);

	return (i);
}

static void
pf_purge_unlinked_rules()
{
	struct pf_rulequeue tmpq;
	struct pf_rule *r, *r1;

	/*
	 * If an overload task is pending, we'd better skip
	 * purging this time: there is a tiny chance that the
	 * task still references an already unlinked rule.
	 */
	PF_OVERLOADQ_LOCK();
	if (!SLIST_EMPTY(&V_pf_overloadqueue)) {
		PF_OVERLOADQ_UNLOCK();
		return;
	}
	PF_OVERLOADQ_UNLOCK();

	/*
	 * Do naive mark-and-sweep garbage collecting of old rules.
	 * Reference flag is raised by pf_purge_expired_states()
	 * and pf_purge_expired_src_nodes().
	 *
	 * To avoid LOR between PF_UNLNKDRULES_LOCK/PF_RULES_WLOCK,
	 * use a temporary queue.
	 */
	TAILQ_INIT(&tmpq);
	PF_UNLNKDRULES_LOCK();
	TAILQ_FOREACH_SAFE(r, &V_pf_unlinked_rules, entries, r1) {
		if (!(r->rule_flag & PFRULE_REFS)) {
			TAILQ_REMOVE(&V_pf_unlinked_rules, r, entries);
			TAILQ_INSERT_TAIL(&tmpq, r, entries);
		} else
			r->rule_flag &= ~PFRULE_REFS;
	}
	PF_UNLNKDRULES_UNLOCK();

	if (!TAILQ_EMPTY(&tmpq)) {
		PF_RULES_WLOCK();
		TAILQ_FOREACH_SAFE(r, &tmpq, entries, r1) {
			TAILQ_REMOVE(&tmpq, r, entries);
			pf_free_rule(r);
		}
		PF_RULES_WUNLOCK();
	}
}

void
pf_print_host(struct pf_addr *addr, u_int16_t p, sa_family_t af)
{
	switch (af) {
#ifdef INET
	case AF_INET: {
		u_int32_t a = ntohl(addr->addr32[0]);
		printf("%u.%u.%u.%u", (a>>24)&255, (a>>16)&255,
		    (a>>8)&255, a&255);
		if (p) {
			p = ntohs(p);
			printf(":%u", p);
		}
		break;
	}
#endif /* INET */
#ifdef INET6
	case AF_INET6: {
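		/*
		 * Print the address in "::"-compressed notation: find
		 * the longest run of zero 16-bit groups, print the
		 * other groups in hex and collapse that run into "::".
		 */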
		u_int16_t b;
		u_int8_t i, curstart, curend, maxstart, maxend;
		curstart = curend = maxstart = maxend = 255;
		for (i = 0; i < 8; i++) {
			if (!addr->addr16[i]) {
				if (curstart == 255)
					curstart = i;
				curend = i;
			} else {
				if ((curend - curstart) >
				    (maxend - maxstart)) {
					maxstart = curstart;
					maxend = curend;
				}
				curstart = curend = 255;
			}
		}
		if ((curend - curstart) >
		    (maxend - maxstart)) {
			maxstart = curstart;
			maxend = curend;
		}
		for (i = 0; i < 8; i++) {
			if (i >= maxstart && i <= maxend) {
				if (i == 0)
					printf(":");
				if (i == maxend)
					printf(":");
			} else {
				b = ntohs(addr->addr16[i]);
				printf("%x", b);
				if (i < 7)
					printf(":");
			}
		}
		if (p) {
			p = ntohs(p);
			printf("[%u]", p);
		}
		break;
	}
#endif /* INET6 */
	}
}

void
pf_print_state(struct pf_state *s)
{
	pf_print_state_parts(s, NULL, NULL);
}

static void
pf_print_state_parts(struct pf_state *s,
    struct pf_state_key *skwp, struct pf_state_key *sksp)
{
	struct pf_state_key *skw, *sks;
	u_int8_t proto, dir;

	/* Do our best to fill these, but they're skipped if NULL */
	skw = skwp ? skwp : (s ? s->key[PF_SK_WIRE] : NULL);
	sks = sksp ? sksp : (s ? s->key[PF_SK_STACK] : NULL);
	proto = skw ? skw->proto : (sks ? sks->proto : 0);
	dir = s ? s->direction : 0;

	switch (proto) {
	case IPPROTO_IPV4:
		printf("IPv4");
		break;
	case IPPROTO_IPV6:
		printf("IPv6");
		break;
	case IPPROTO_TCP:
		printf("TCP");
		break;
	case IPPROTO_UDP:
		printf("UDP");
		break;
	case IPPROTO_ICMP:
		printf("ICMP");
		break;
	case IPPROTO_ICMPV6:
		printf("ICMPv6");
		break;
	default:
		printf("%u", skw->proto);
		break;
	}
	switch (dir) {
	case PF_IN:
		printf(" in");
		break;
	case PF_OUT:
		printf(" out");
		break;
	}
	if (skw) {
		printf(" wire: ");
		pf_print_host(&skw->addr[0], skw->port[0], skw->af);
		printf(" ");
		pf_print_host(&skw->addr[1], skw->port[1], skw->af);
	}
	if (sks) {
		printf(" stack: ");
		if (sks != skw) {
			pf_print_host(&sks->addr[0], sks->port[0], sks->af);
			printf(" ");
			pf_print_host(&sks->addr[1], sks->port[1], sks->af);
		} else
			printf("-");
	}
	if (s) {
		if (proto == IPPROTO_TCP) {
			printf(" [lo=%u high=%u win=%u modulator=%u",
			    s->src.seqlo, s->src.seqhi,
			    s->src.max_win, s->src.seqdiff);
			if (s->src.wscale && s->dst.wscale)
				printf(" wscale=%u",
				    s->src.wscale & PF_WSCALE_MASK);
			printf("]");
			printf(" [lo=%u high=%u win=%u modulator=%u",
			    s->dst.seqlo, s->dst.seqhi,
			    s->dst.max_win, s->dst.seqdiff);
			if (s->src.wscale && s->dst.wscale)
				printf(" wscale=%u",
				s->dst.wscale & PF_WSCALE_MASK);
			printf("]");
		}
		printf(" %u:%u", s->src.state, s->dst.state);
	}
}

void
pf_print_flags(u_int8_t f)
{
	if (f)
		printf(" ");
	if (f & TH_FIN)
		printf("F");
	if (f & TH_SYN)
		printf("S");
	if (f & TH_RST)
		printf("R");
	if (f & TH_PUSH)
		printf("P");
	if (f & TH_ACK)
		printf("A");
	if (f & TH_URG)
		printf("U");
	if (f & TH_ECE)
		printf("E");
	if (f & TH_CWR)
		printf("W");
}

#define	PF_SET_SKIP_STEPS(i)					\
	do {							\
		while (head[i] != cur) {			\
			head[i]->skip[i].ptr = cur;		\
			head[i] = TAILQ_NEXT(head[i], entries);	\
		}						\
	} while (0)

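/*
 * Skip steps let rule evaluation jump over consecutive rules that would
 * fail to match for the same reason: for each criterion (interface,
 * direction, address family, protocol, addresses, ports) a rule stores
 * a pointer to the next rule that differs in that criterion.
 * PF_SET_SKIP_STEPS(i) above rewrites those pointers for all rules
 * queued up in head[i] to point at the current rule.
 */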
void
pf_calc_skip_steps(struct pf_rulequeue *rules)
{
	struct pf_rule *cur, *prev, *head[PF_SKIP_COUNT];
	int i;

	cur = TAILQ_FIRST(rules);
	prev = cur;
	for (i = 0; i < PF_SKIP_COUNT; ++i)
		head[i] = cur;
	while (cur != NULL) {

		if (cur->kif != prev->kif || cur->ifnot != prev->ifnot)
			PF_SET_SKIP_STEPS(PF_SKIP_IFP);
		if (cur->direction != prev->direction)
			PF_SET_SKIP_STEPS(PF_SKIP_DIR);
		if (cur->af != prev->af)
			PF_SET_SKIP_STEPS(PF_SKIP_AF);
		if (cur->proto != prev->proto)
			PF_SET_SKIP_STEPS(PF_SKIP_PROTO);
		if (cur->src.neg != prev->src.neg ||
		    pf_addr_wrap_neq(&cur->src.addr, &prev->src.addr))
			PF_SET_SKIP_STEPS(PF_SKIP_SRC_ADDR);
		if (cur->src.port[0] != prev->src.port[0] ||
		    cur->src.port[1] != prev->src.port[1] ||
		    cur->src.port_op != prev->src.port_op)
			PF_SET_SKIP_STEPS(PF_SKIP_SRC_PORT);
		if (cur->dst.neg != prev->dst.neg ||
		    pf_addr_wrap_neq(&cur->dst.addr, &prev->dst.addr))
			PF_SET_SKIP_STEPS(PF_SKIP_DST_ADDR);
		if (cur->dst.port[0] != prev->dst.port[0] ||
		    cur->dst.port[1] != prev->dst.port[1] ||
		    cur->dst.port_op != prev->dst.port_op)
			PF_SET_SKIP_STEPS(PF_SKIP_DST_PORT);

		prev = cur;
		cur = TAILQ_NEXT(cur, entries);
	}
	for (i = 0; i < PF_SKIP_COUNT; ++i)
		PF_SET_SKIP_STEPS(i);
}

static int
pf_addr_wrap_neq(struct pf_addr_wrap *aw1, struct pf_addr_wrap *aw2)
{
	if (aw1->type != aw2->type)
		return (1);
	switch (aw1->type) {
	case PF_ADDR_ADDRMASK:
	case PF_ADDR_RANGE:
		if (PF_ANEQ(&aw1->v.a.addr, &aw2->v.a.addr, 0))
			return (1);
		if (PF_ANEQ(&aw1->v.a.mask, &aw2->v.a.mask, 0))
			return (1);
		return (0);
	case PF_ADDR_DYNIFTL:
		return (aw1->p.dyn->pfid_kt != aw2->p.dyn->pfid_kt);
	case PF_ADDR_NOROUTE:
	case PF_ADDR_URPFFAILED:
		return (0);
	case PF_ADDR_TABLE:
		return (aw1->p.tbl != aw2->p.tbl);
	default:
		printf("invalid address type: %d\n", aw1->type);
		return (1);
	}
}

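/*
 * Incremental internet checksum update in the spirit of RFC 1624:
 * replacing 16-bit word 'old' with 'new' under checksum 'cksum' gives
 * cksum + old - new, folded back into 16 bits.  The udp flag handles
 * the UDP special case: a zero checksum means "no checksum" and must
 * stay zero, while a computed result of zero must be sent as 0xFFFF.
 */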
u_int16_t
pf_cksum_fixup(u_int16_t cksum, u_int16_t old, u_int16_t new, u_int8_t udp)
{
	u_int32_t	l;

	if (udp && !cksum)
		return (0x0000);
	l = cksum + old - new;
	l = (l >> 16) + (l & 65535);
	l = l & 65535;
	if (udp && !l)
		return (0xFFFF);
	return (l);
}

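/*
 * Rewrite an address/port pair in place and patch both the IP header
 * checksum (*ic, IPv4 only) and the TCP/UDP checksum (*pc) with
 * pf_cksum_fixup(), one 16-bit word at a time, so the packet never
 * needs a full re-checksum.
 */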
1898static void
1899pf_change_ap(struct pf_addr *a, u_int16_t *p, u_int16_t *ic, u_int16_t *pc,
1900    struct pf_addr *an, u_int16_t pn, u_int8_t u, sa_family_t af)
1901{
1902	struct pf_addr	ao;
1903	u_int16_t	po = *p;
1904
1905	PF_ACPY(&ao, a, af);
1906	PF_ACPY(a, an, af);
1907
1908	*p = pn;
1909
1910	switch (af) {
1911#ifdef INET
1912	case AF_INET:
1913		*ic = pf_cksum_fixup(pf_cksum_fixup(*ic,
1914		    ao.addr16[0], an->addr16[0], 0),
1915		    ao.addr16[1], an->addr16[1], 0);
1916		*p = pn;
1917		*pc = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(*pc,
1918		    ao.addr16[0], an->addr16[0], u),
1919		    ao.addr16[1], an->addr16[1], u),
1920		    po, pn, u);
1921		break;
1922#endif /* INET */
1923#ifdef INET6
1924	case AF_INET6:
1925		*pc = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
1926		    pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
1927		    pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(*pc,
1928		    ao.addr16[0], an->addr16[0], u),
1929		    ao.addr16[1], an->addr16[1], u),
1930		    ao.addr16[2], an->addr16[2], u),
1931		    ao.addr16[3], an->addr16[3], u),
1932		    ao.addr16[4], an->addr16[4], u),
1933		    ao.addr16[5], an->addr16[5], u),
1934		    ao.addr16[6], an->addr16[6], u),
1935		    ao.addr16[7], an->addr16[7], u),
1936		    po, pn, u);
1937		break;
1938#endif /* INET6 */
1939	}
1940}
1941
1943/* Changes a u_int32_t.  Uses a void * so there are no alignment restrictions. */
1944void
1945pf_change_a(void *a, u_int16_t *c, u_int32_t an, u_int8_t u)
1946{
1947	u_int32_t	ao;
1948
1949	memcpy(&ao, a, sizeof(ao));
1950	memcpy(a, &an, sizeof(u_int32_t));
1951	*c = pf_cksum_fixup(pf_cksum_fixup(*c, ao / 65536, an / 65536, u),
1952	    ao % 65536, an % 65536, u);
1953}
1954
1955#ifdef INET6
1956static void
1957pf_change_a6(struct pf_addr *a, u_int16_t *c, struct pf_addr *an, u_int8_t u)
1958{
1959	struct pf_addr	ao;
1960
1961	PF_ACPY(&ao, a, AF_INET6);
1962	PF_ACPY(a, an, AF_INET6);
1963
1964	*c = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
1965	    pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
1966	    pf_cksum_fixup(pf_cksum_fixup(*c,
1967	    ao.addr16[0], an->addr16[0], u),
1968	    ao.addr16[1], an->addr16[1], u),
1969	    ao.addr16[2], an->addr16[2], u),
1970	    ao.addr16[3], an->addr16[3], u),
1971	    ao.addr16[4], an->addr16[4], u),
1972	    ao.addr16[5], an->addr16[5], u),
1973	    ao.addr16[6], an->addr16[6], u),
1974	    ao.addr16[7], an->addr16[7], u);
1975}
1976#endif /* INET6 */
1977
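/*
 * Rewrite addresses embedded in an ICMP error message: the quoted
 * inner address 'ia' (and port 'ip', if any) becomes 'na'/'np', and
 * the outer address 'oa', when given, is rewritten to 'na' as well.
 * Every checksum covering a changed field (inner protocol *pc, inner
 * IP header *h2c, ICMP *ic, outer IP header *hc) is fixed up
 * incrementally.
 */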
1978static void
1979pf_change_icmp(struct pf_addr *ia, u_int16_t *ip, struct pf_addr *oa,
1980    struct pf_addr *na, u_int16_t np, u_int16_t *pc, u_int16_t *h2c,
1981    u_int16_t *ic, u_int16_t *hc, u_int8_t u, sa_family_t af)
1982{
1983	struct pf_addr	oia, ooa;
1984
1985	PF_ACPY(&oia, ia, af);
1986	if (oa)
1987		PF_ACPY(&ooa, oa, af);
1988
1989	/* Change inner protocol port, fix inner protocol checksum. */
1990	if (ip != NULL) {
1991		u_int16_t	oip = *ip;
1992		u_int32_t	opc;
1993
1994		if (pc != NULL)
1995			opc = *pc;
1996		*ip = np;
1997		if (pc != NULL)
1998			*pc = pf_cksum_fixup(*pc, oip, *ip, u);
1999		*ic = pf_cksum_fixup(*ic, oip, *ip, 0);
2000		if (pc != NULL)
2001			*ic = pf_cksum_fixup(*ic, opc, *pc, 0);
2002	}
2003	/* Change inner ip address, fix inner ip and icmp checksums. */
2004	PF_ACPY(ia, na, af);
2005	switch (af) {
2006#ifdef INET
2007	case AF_INET: {
2008		u_int32_t	 oh2c = *h2c;
2009
2010		*h2c = pf_cksum_fixup(pf_cksum_fixup(*h2c,
2011		    oia.addr16[0], ia->addr16[0], 0),
2012		    oia.addr16[1], ia->addr16[1], 0);
2013		*ic = pf_cksum_fixup(pf_cksum_fixup(*ic,
2014		    oia.addr16[0], ia->addr16[0], 0),
2015		    oia.addr16[1], ia->addr16[1], 0);
2016		*ic = pf_cksum_fixup(*ic, oh2c, *h2c, 0);
2017		break;
2018	}
2019#endif /* INET */
2020#ifdef INET6
2021	case AF_INET6:
2022		*ic = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
2023		    pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
2024		    pf_cksum_fixup(pf_cksum_fixup(*ic,
2025		    oia.addr16[0], ia->addr16[0], u),
2026		    oia.addr16[1], ia->addr16[1], u),
2027		    oia.addr16[2], ia->addr16[2], u),
2028		    oia.addr16[3], ia->addr16[3], u),
2029		    oia.addr16[4], ia->addr16[4], u),
2030		    oia.addr16[5], ia->addr16[5], u),
2031		    oia.addr16[6], ia->addr16[6], u),
2032		    oia.addr16[7], ia->addr16[7], u);
2033		break;
2034#endif /* INET6 */
2035	}
2036	/* Outer ip address, fix outer ip or icmpv6 checksum, if necessary. */
2037	if (oa) {
2038		PF_ACPY(oa, na, af);
2039		switch (af) {
2040#ifdef INET
2041		case AF_INET:
2042			*hc = pf_cksum_fixup(pf_cksum_fixup(*hc,
2043			    ooa.addr16[0], oa->addr16[0], 0),
2044			    ooa.addr16[1], oa->addr16[1], 0);
2045			break;
2046#endif /* INET */
2047#ifdef INET6
2048		case AF_INET6:
2049			*ic = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
2050			    pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
2051			    pf_cksum_fixup(pf_cksum_fixup(*ic,
2052			    ooa.addr16[0], oa->addr16[0], u),
2053			    ooa.addr16[1], oa->addr16[1], u),
2054			    ooa.addr16[2], oa->addr16[2], u),
2055			    ooa.addr16[3], oa->addr16[3], u),
2056			    ooa.addr16[4], oa->addr16[4], u),
2057			    ooa.addr16[5], oa->addr16[5], u),
2058			    ooa.addr16[6], oa->addr16[6], u),
2059			    ooa.addr16[7], oa->addr16[7], u);
2060			break;
2061#endif /* INET6 */
2062		}
2063	}
2064}
2065
2067/*
2068 * Need to modulate the sequence numbers in the TCP SACK option
2069 * (credits to Krzysztof Pfaff for report and patch)
2070 */
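/*
 * A SACK option is a TLV: kind (TCPOPT_SACK), a length byte, then one
 * or more pairs of 32-bit left/right sequence edges.  Each edge below
 * is shifted back by the peer's seqdiff, with the TCP checksum fixed
 * up through pf_change_a().
 */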
2071static int
2072pf_modulate_sack(struct mbuf *m, int off, struct pf_pdesc *pd,
2073    struct tcphdr *th, struct pf_state_peer *dst)
2074{
2075	int hlen = (th->th_off << 2) - sizeof(*th), thoptlen = hlen;
2076	u_int8_t opts[TCP_MAXOLEN], *opt = opts;
2077	int copyback = 0, i, olen;
2078	struct sackblk sack;
2079
2080#define	TCPOLEN_SACKLEN	(TCPOLEN_SACK + 2)
2081	if (hlen < TCPOLEN_SACKLEN ||
2082	    !pf_pull_hdr(m, off + sizeof(*th), opts, hlen, NULL, NULL, pd->af))
2083		return (0);
2084
2085	while (hlen >= TCPOLEN_SACKLEN) {
2086		olen = opt[1];
2087		switch (*opt) {
2088		case TCPOPT_EOL:	/* FALLTHROUGH */
2089		case TCPOPT_NOP:
2090			opt++;
2091			hlen--;
2092			break;
2093		case TCPOPT_SACK:
2094			if (olen > hlen)
2095				olen = hlen;
2096			if (olen >= TCPOLEN_SACKLEN) {
2097				for (i = 2; i + TCPOLEN_SACK <= olen;
2098				    i += TCPOLEN_SACK) {
2099					memcpy(&sack, &opt[i], sizeof(sack));
2100					pf_change_a(&sack.start, &th->th_sum,
2101					    htonl(ntohl(sack.start) -
2102					    dst->seqdiff), 0);
2103					pf_change_a(&sack.end, &th->th_sum,
2104					    htonl(ntohl(sack.end) -
2105					    dst->seqdiff), 0);
2106					memcpy(&opt[i], &sack, sizeof(sack));
2107				}
2108				copyback = 1;
2109			}
2110			/* FALLTHROUGH */
2111		default:
2112			if (olen < 2)
2113				olen = 2;
2114			hlen -= olen;
2115			opt += olen;
2116		}
2117	}
2118
2119	if (copyback)
2120		m_copyback(m, off + sizeof(*th), thoptlen, (caddr_t)opts);
2121	return (copyback);
2122}
2123
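/*
 * Build a TCP segment from scratch, optionally carrying an MSS option,
 * and queue it through pf_send() for transmission outside the packet
 * processing path.  Used to return RSTs and to answer for the synproxy
 * handshake.
 */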
2124static void
2125pf_send_tcp(struct mbuf *replyto, const struct pf_rule *r, sa_family_t af,
2126    const struct pf_addr *saddr, const struct pf_addr *daddr,
2127    u_int16_t sport, u_int16_t dport, u_int32_t seq, u_int32_t ack,
2128    u_int8_t flags, u_int16_t win, u_int16_t mss, u_int8_t ttl, int tag,
2129    u_int16_t rtag, struct ifnet *ifp)
2130{
2131	struct pf_send_entry *pfse;
2132	struct mbuf	*m;
2133	int		 len, tlen;
2134#ifdef INET
2135	struct ip	*h = NULL;
2136#endif /* INET */
2137#ifdef INET6
2138	struct ip6_hdr	*h6 = NULL;
2139#endif /* INET6 */
2140	struct tcphdr	*th;
2141	char		*opt;
2142	struct pf_mtag  *pf_mtag;
2143
2144	len = 0;
2145	th = NULL;
2146
2147	/* maximum segment size tcp option */
2148	tlen = sizeof(struct tcphdr);
2149	if (mss)
2150		tlen += 4;
2151
2152	switch (af) {
2153#ifdef INET
2154	case AF_INET:
2155		len = sizeof(struct ip) + tlen;
2156		break;
2157#endif /* INET */
2158#ifdef INET6
2159	case AF_INET6:
2160		len = sizeof(struct ip6_hdr) + tlen;
2161		break;
2162#endif /* INET6 */
2163	default:
2164		panic("%s: unsupported af %d", __func__, af);
2165	}
2166
2167	/* Allocate outgoing queue entry, mbuf and mbuf tag. */
2168	pfse = malloc(sizeof(*pfse), M_PFTEMP, M_NOWAIT);
2169	if (pfse == NULL)
2170		return;
2171	m = m_gethdr(M_NOWAIT, MT_HEADER);
2172	if (m == NULL) {
2173		free(pfse, M_PFTEMP);
2174		return;
2175	}
2176#ifdef MAC
2177	mac_netinet_firewall_send(m);
2178#endif
2179	if ((pf_mtag = pf_get_mtag(m)) == NULL) {
2180		free(pfse, M_PFTEMP);
2181		m_freem(m);
2182		return;
2183	}
2184	if (tag)
2185		m->m_flags |= M_SKIP_FIREWALL;
2186	pf_mtag->tag = rtag;
2187
2188	if (r != NULL && r->rtableid >= 0)
2189		M_SETFIB(m, r->rtableid);
2190
2191#ifdef ALTQ
2192	if (r != NULL && r->qid) {
2193		pf_mtag->qid = r->qid;
2194
2195		/* add hints for ecn */
2196		pf_mtag->hdr = mtod(m, struct ip *);
2197	}
2198#endif /* ALTQ */
2199	m->m_data += max_linkhdr;
2200	m->m_pkthdr.len = m->m_len = len;
2201	m->m_pkthdr.rcvif = NULL;
2202	bzero(m->m_data, len);
2203	switch (af) {
2204#ifdef INET
2205	case AF_INET:
2206		h = mtod(m, struct ip *);
2207
2208		/* IP header fields included in the TCP checksum */
2209		h->ip_p = IPPROTO_TCP;
2210		h->ip_len = htons(tlen);
2211		h->ip_src.s_addr = saddr->v4.s_addr;
2212		h->ip_dst.s_addr = daddr->v4.s_addr;
2213
2214		th = (struct tcphdr *)((caddr_t)h + sizeof(struct ip));
2215		break;
2216#endif /* INET */
2217#ifdef INET6
2218	case AF_INET6:
2219		h6 = mtod(m, struct ip6_hdr *);
2220
2221		/* IP header fields included in the TCP checksum */
2222		h6->ip6_nxt = IPPROTO_TCP;
2223		h6->ip6_plen = htons(tlen);
2224		memcpy(&h6->ip6_src, &saddr->v6, sizeof(struct in6_addr));
2225		memcpy(&h6->ip6_dst, &daddr->v6, sizeof(struct in6_addr));
2226
2227		th = (struct tcphdr *)((caddr_t)h6 + sizeof(struct ip6_hdr));
2228		break;
2229#endif /* INET6 */
2230	}
2231
2232	/* TCP header */
2233	th->th_sport = sport;
2234	th->th_dport = dport;
2235	th->th_seq = htonl(seq);
2236	th->th_ack = htonl(ack);
2237	th->th_off = tlen >> 2;
2238	th->th_flags = flags;
2239	th->th_win = htons(win);
2240
2241	if (mss) {
2242		opt = (char *)(th + 1);
2243		opt[0] = TCPOPT_MAXSEG;
2244		opt[1] = 4;
2245		HTONS(mss);
2246		bcopy((caddr_t)&mss, (caddr_t)(opt + 2), 2);
2247	}
2248
2249	switch (af) {
2250#ifdef INET
2251	case AF_INET:
2252		/* TCP checksum */
2253		th->th_sum = in_cksum(m, len);
2254
2255		/* Finish the IP header */
2256		h->ip_v = 4;
2257		h->ip_hl = sizeof(*h) >> 2;
2258		h->ip_tos = IPTOS_LOWDELAY;
2259		h->ip_off = htons(V_path_mtu_discovery ? IP_DF : 0);
2260		h->ip_len = htons(len);
2261		h->ip_ttl = ttl ? ttl : V_ip_defttl;
2262		h->ip_sum = 0;
2263
2264		pfse->pfse_type = PFSE_IP;
2265		break;
2266#endif /* INET */
2267#ifdef INET6
2268	case AF_INET6:
2269		/* TCP checksum */
2270		th->th_sum = in6_cksum(m, IPPROTO_TCP,
2271		    sizeof(struct ip6_hdr), tlen);
2272
2273		h6->ip6_vfc |= IPV6_VERSION;
2274		h6->ip6_hlim = IPV6_DEFHLIM;
2275
2276		pfse->pfse_type = PFSE_IP6;
2277		break;
2278#endif /* INET6 */
2279	}
2280	pfse->pfse_m = m;
2281	pf_send(pfse);
2282}
2283
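/*
 * Queue an ICMP (or ICMPv6) error of the given type/code in reply to
 * the packet in 'm', again deferring transmission via pf_send().
 */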
2284static void
2285pf_send_icmp(struct mbuf *m, u_int8_t type, u_int8_t code, sa_family_t af,
2286    struct pf_rule *r)
2287{
2288	struct pf_send_entry *pfse;
2289	struct mbuf *m0;
2290	struct pf_mtag *pf_mtag;
2291
2292	/* Allocate outgoing queue entry, mbuf and mbuf tag. */
2293	pfse = malloc(sizeof(*pfse), M_PFTEMP, M_NOWAIT);
2294	if (pfse == NULL)
2295		return;
2296
2297	if ((m0 = m_copypacket(m, M_NOWAIT)) == NULL) {
2298		free(pfse, M_PFTEMP);
2299		return;
2300	}
2301
2302	if ((pf_mtag = pf_get_mtag(m0)) == NULL) {
2303		free(pfse, M_PFTEMP);
2304		return;
2305	}
2306	/* XXX: revisit */
2307	m0->m_flags |= M_SKIP_FIREWALL;
2308
2309	if (r->rtableid >= 0)
2310		M_SETFIB(m0, r->rtableid);
2311
2312#ifdef ALTQ
2313	if (r->qid) {
2314		pf_mtag->qid = r->qid;
2315		/* add hints for ecn */
2316		pf_mtag->hdr = mtod(m0, struct ip *);
2317	}
2318#endif /* ALTQ */
2319
2320	switch (af) {
2321#ifdef INET
2322	case AF_INET:
2323		pfse->pfse_type = PFSE_ICMP;
2324		break;
2325#endif /* INET */
2326#ifdef INET6
2327	case AF_INET6:
2328		pfse->pfse_type = PFSE_ICMP6;
2329		break;
2330#endif /* INET6 */
2331	}
2332	pfse->pfse_m = m0;
2333	pfse->pfse_icmp_type = type;
2334	pfse->pfse_icmp_code = code;
2335	pf_send(pfse);
2336}
2337
2338/*
2339 * Compare addresses a and b under mask m.  With n == 0, return 1 if the
2340 * masked addresses are equal and 0 otherwise.  With n != 0 the sense is
2341 * inverted: return 1 if they differ.
2342 */
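/*
 * For example, with af == AF_INET, n == 0 and m == 255.255.255.0,
 * a = 192.0.2.17 matches b = 192.0.2.200 (same /24 network), whereas
 * a = 192.0.3.17 does not.
 */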
2343int
2344pf_match_addr(u_int8_t n, struct pf_addr *a, struct pf_addr *m,
2345    struct pf_addr *b, sa_family_t af)
2346{
2347	int	match = 0;
2348
2349	switch (af) {
2350#ifdef INET
2351	case AF_INET:
2352		if ((a->addr32[0] & m->addr32[0]) ==
2353		    (b->addr32[0] & m->addr32[0]))
2354			match++;
2355		break;
2356#endif /* INET */
2357#ifdef INET6
2358	case AF_INET6:
2359		if (((a->addr32[0] & m->addr32[0]) ==
2360		     (b->addr32[0] & m->addr32[0])) &&
2361		    ((a->addr32[1] & m->addr32[1]) ==
2362		     (b->addr32[1] & m->addr32[1])) &&
2363		    ((a->addr32[2] & m->addr32[2]) ==
2364		     (b->addr32[2] & m->addr32[2])) &&
2365		    ((a->addr32[3] & m->addr32[3]) ==
2366		     (b->addr32[3] & m->addr32[3])))
2367			match++;
2368		break;
2369#endif /* INET6 */
2370	}
2371	if (match) {
2372		if (n)
2373			return (0);
2374		else
2375			return (1);
2376	} else {
2377		if (n)
2378			return (1);
2379		else
2380			return (0);
2381	}
2382}
2383
2384/*
2385 * Return 1 if b <= a <= e, otherwise return 0.
2386 */
2387int
2388pf_match_addr_range(struct pf_addr *b, struct pf_addr *e,
2389    struct pf_addr *a, sa_family_t af)
2390{
2391	switch (af) {
2392#ifdef INET
2393	case AF_INET:
2394		if ((a->addr32[0] < b->addr32[0]) ||
2395		    (a->addr32[0] > e->addr32[0]))
2396			return (0);
2397		break;
2398#endif /* INET */
2399#ifdef INET6
2400	case AF_INET6: {
2401		int	i;
2402
2403		/* check a >= b */
2404		for (i = 0; i < 4; ++i)
2405			if (a->addr32[i] > b->addr32[i])
2406				break;
2407			else if (a->addr32[i] < b->addr32[i])
2408				return (0);
2409		/* check a <= e */
2410		for (i = 0; i < 4; ++i)
2411			if (a->addr32[i] < e->addr32[i])
2412				break;
2413			else if (a->addr32[i] > e->addr32[i])
2414				return (0);
2415		break;
2416	}
2417#endif /* INET6 */
2418	}
2419	return (1);
2420}
2421
2422static int
2423pf_match(u_int8_t op, u_int32_t a1, u_int32_t a2, u_int32_t p)
2424{
2425	switch (op) {
2426	case PF_OP_IRG:
2427		return ((p > a1) && (p < a2));
2428	case PF_OP_XRG:
2429		return ((p < a1) || (p > a2));
2430	case PF_OP_RRG:
2431		return ((p >= a1) && (p <= a2));
2432	case PF_OP_EQ:
2433		return (p == a1);
2434	case PF_OP_NE:
2435		return (p != a1);
2436	case PF_OP_LT:
2437		return (p < a1);
2438	case PF_OP_LE:
2439		return (p <= a1);
2440	case PF_OP_GT:
2441		return (p > a1);
2442	case PF_OP_GE:
2443		return (p >= a1);
2444	}
2445	return (0); /* never reached */
2446}
2447
2448int
2449pf_match_port(u_int8_t op, u_int16_t a1, u_int16_t a2, u_int16_t p)
2450{
2451	NTOHS(a1);
2452	NTOHS(a2);
2453	NTOHS(p);
2454	return (pf_match(op, a1, a2, p));
2455}
2456
2457static int
2458pf_match_uid(u_int8_t op, uid_t a1, uid_t a2, uid_t u)
2459{
2460	if (u == UID_MAX && op != PF_OP_EQ && op != PF_OP_NE)
2461		return (0);
2462	return (pf_match(op, a1, a2, u));
2463}
2464
2465static int
2466pf_match_gid(u_int8_t op, gid_t a1, gid_t a2, gid_t g)
2467{
2468	if (g == GID_MAX && op != PF_OP_EQ && op != PF_OP_NE)
2469		return (0);
2470	return (pf_match(op, a1, a2, g));
2471}
2472
2473int
2474pf_match_tag(struct mbuf *m, struct pf_rule *r, int *tag, int mtag)
2475{
2476	if (*tag == -1)
2477		*tag = mtag;
2478
2479	return ((!r->match_tag_not && r->match_tag == *tag) ||
2480	    (r->match_tag_not && r->match_tag != *tag));
2481}
2482
2483int
2484pf_tag_packet(struct mbuf *m, struct pf_pdesc *pd, int tag)
2485{
2486
2487	KASSERT(tag > 0, ("%s: tag %d", __func__, tag));
2488
2489	if (pd->pf_mtag == NULL && ((pd->pf_mtag = pf_get_mtag(m)) == NULL))
2490		return (ENOMEM);
2491
2492	pd->pf_mtag->tag = tag;
2493
2494	return (0);
2495}
2496
2497#define	PF_ANCHOR_STACKSIZE	32
2498struct pf_anchor_stackframe {
2499	struct pf_ruleset	*rs;
2500	struct pf_rule		*r;	/* XXX: + match bit */
2501	struct pf_anchor	*child;
2502};
2503
2504/*
2505 * XXX: We rely on malloc(9) returning pointer-aligned addresses.
2506 */
2507#define	PF_ANCHORSTACK_MATCH	0x00000001
2508#define	PF_ANCHORSTACK_MASK	(PF_ANCHORSTACK_MATCH)
2509
2510#define	PF_ANCHOR_MATCH(f)	((uintptr_t)(f)->r & PF_ANCHORSTACK_MATCH)
2511#define	PF_ANCHOR_RULE(f)	(struct pf_rule *)			\
2512				((uintptr_t)(f)->r & ~PF_ANCHORSTACK_MASK)
2513#define	PF_ANCHOR_SET_MATCH(f)	do { (f)->r = (void *) 			\
2514				((uintptr_t)(f)->r | PF_ANCHORSTACK_MATCH);  \
2515} while (0)
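/*
 * Rule pointers returned by malloc(9) are at least word aligned, so
 * their least significant bit is always clear and can carry the
 * per-frame "match" flag; PF_ANCHOR_RULE() masks it off before the
 * pointer is dereferenced.
 */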
2516
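/*
 * Descend into an anchor: push the current ruleset and rule onto the
 * caller-provided stack and restart evaluation at the first rule of
 * the child ruleset, or of the first child in tree order for wildcard
 * ("*") anchors.
 */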
2517void
2518pf_step_into_anchor(struct pf_anchor_stackframe *stack, int *depth,
2519    struct pf_ruleset **rs, int n, struct pf_rule **r, struct pf_rule **a,
2520    int *match)
2521{
2522	struct pf_anchor_stackframe	*f;
2523
2524	PF_RULES_RASSERT();
2525
2526	if (match)
2527		*match = 0;
2528	if (*depth >= PF_ANCHOR_STACKSIZE) {
2529		printf("%s: anchor stack overflow on %s\n",
2530		    __func__, (*r)->anchor->name);
2531		*r = TAILQ_NEXT(*r, entries);
2532		return;
2533	} else if (*depth == 0 && a != NULL)
2534		*a = *r;
2535	f = stack + (*depth)++;
2536	f->rs = *rs;
2537	f->r = *r;
2538	if ((*r)->anchor_wildcard) {
2539		struct pf_anchor_node *parent = &(*r)->anchor->children;
2540
2541		if ((f->child = RB_MIN(pf_anchor_node, parent)) == NULL) {
2542			*r = NULL;
2543			return;
2544		}
2545		*rs = &f->child->ruleset;
2546	} else {
2547		f->child = NULL;
2548		*rs = &(*r)->anchor->ruleset;
2549	}
2550	*r = TAILQ_FIRST((*rs)->rules[n].active.ptr);
2551}
2552
2553int
2554pf_step_out_of_anchor(struct pf_anchor_stackframe *stack, int *depth,
2555    struct pf_ruleset **rs, int n, struct pf_rule **r, struct pf_rule **a,
2556    int *match)
2557{
2558	struct pf_anchor_stackframe	*f;
2559	struct pf_rule *fr;
2560	int quick = 0;
2561
2562	PF_RULES_RASSERT();
2563
2564	do {
2565		if (*depth <= 0)
2566			break;
2567		f = stack + *depth - 1;
2568		fr = PF_ANCHOR_RULE(f);
2569		if (f->child != NULL) {
2570			struct pf_anchor_node *parent;
2571
2572			/*
2573			 * This block traverses a wildcard anchor.
2575			 */
2576			parent = &fr->anchor->children;
2577			if (match != NULL && *match) {
2578				/*
2579				 * If any rule within "*" matched, then
2580				 * "foo/ *" matched as well; mark the
2581				 * frame appropriately.
2582				 */
2583				PF_ANCHOR_SET_MATCH(f);
2584				*match = 0;
2585			}
2586			f->child = RB_NEXT(pf_anchor_node, parent, f->child);
2587			if (f->child != NULL) {
2588				*rs = &f->child->ruleset;
2589				*r = TAILQ_FIRST((*rs)->rules[n].active.ptr);
2590				if (*r == NULL)
2591					continue;
2592				else
2593					break;
2594			}
2595		}
2596		(*depth)--;
2597		if (*depth == 0 && a != NULL)
2598			*a = NULL;
2599		*rs = f->rs;
2600		if (PF_ANCHOR_MATCH(f) || (match != NULL && *match))
2601			quick = fr->quick;
2602		*r = TAILQ_NEXT(fr, entries);
2603	} while (*r == NULL);
2604
2605	return (quick);
2606}
2607
2608#ifdef INET6
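/*
 * Combine a pool address with the packet's source address under the
 * pool mask: naddr = (raddr & rmask) | (saddr & ~rmask).
 */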
2609void
2610pf_poolmask(struct pf_addr *naddr, struct pf_addr *raddr,
2611    struct pf_addr *rmask, struct pf_addr *saddr, sa_family_t af)
2612{
2613	switch (af) {
2614#ifdef INET
2615	case AF_INET:
2616		naddr->addr32[0] = (raddr->addr32[0] & rmask->addr32[0]) |
2617		((rmask->addr32[0] ^ 0xffffffff ) & saddr->addr32[0]);
2618		break;
2619#endif /* INET */
2620	case AF_INET6:
2621		naddr->addr32[0] = (raddr->addr32[0] & rmask->addr32[0]) |
2622		((rmask->addr32[0] ^ 0xffffffff ) & saddr->addr32[0]);
2623		naddr->addr32[1] = (raddr->addr32[1] & rmask->addr32[1]) |
2624		((rmask->addr32[1] ^ 0xffffffff ) & saddr->addr32[1]);
2625		naddr->addr32[2] = (raddr->addr32[2] & rmask->addr32[2]) |
2626		((rmask->addr32[2] ^ 0xffffffff ) & saddr->addr32[2]);
2627		naddr->addr32[3] = (raddr->addr32[3] & rmask->addr32[3]) |
2628		((rmask->addr32[3] ^ 0xffffffff ) & saddr->addr32[3]);
2629		break;
2630	}
2631}
2632
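/*
 * Increment an address by one, treating it as an unsigned integer in
 * network byte order and rippling the carry across its 32-bit words.
 */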
2633void
2634pf_addr_inc(struct pf_addr *addr, sa_family_t af)
2635{
2636	switch (af) {
2637#ifdef INET
2638	case AF_INET:
2639		addr->addr32[0] = htonl(ntohl(addr->addr32[0]) + 1);
2640		break;
2641#endif /* INET */
2642	case AF_INET6:
2643		if (addr->addr32[3] == 0xffffffff) {
2644			addr->addr32[3] = 0;
2645			if (addr->addr32[2] == 0xffffffff) {
2646				addr->addr32[2] = 0;
2647				if (addr->addr32[1] == 0xffffffff) {
2648					addr->addr32[1] = 0;
2649					addr->addr32[0] =
2650					    htonl(ntohl(addr->addr32[0]) + 1);
2651				} else
2652					addr->addr32[1] =
2653					    htonl(ntohl(addr->addr32[1]) + 1);
2654			} else
2655				addr->addr32[2] =
2656				    htonl(ntohl(addr->addr32[2]) + 1);
2657		} else
2658			addr->addr32[3] =
2659			    htonl(ntohl(addr->addr32[3]) + 1);
2660		break;
2661	}
2662}
2663#endif /* INET6 */
2664
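/*
 * Find the local PCB terminating this connection and record the owning
 * socket's credentials in pd->lookup, for matching rules with "user"
 * or "group" criteria.  Returns 1 on success, -1 if no PCB matches or
 * the protocol is not TCP/UDP.
 */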
2665int
2666pf_socket_lookup(int direction, struct pf_pdesc *pd, struct mbuf *m)
2667{
2668	struct pf_addr		*saddr, *daddr;
2669	u_int16_t		 sport, dport;
2670	struct inpcbinfo	*pi;
2671	struct inpcb		*inp;
2672
2673	pd->lookup.uid = UID_MAX;
2674	pd->lookup.gid = GID_MAX;
2675
2676	switch (pd->proto) {
2677	case IPPROTO_TCP:
2678		if (pd->hdr.tcp == NULL)
2679			return (-1);
2680		sport = pd->hdr.tcp->th_sport;
2681		dport = pd->hdr.tcp->th_dport;
2682		pi = &V_tcbinfo;
2683		break;
2684	case IPPROTO_UDP:
2685		if (pd->hdr.udp == NULL)
2686			return (-1);
2687		sport = pd->hdr.udp->uh_sport;
2688		dport = pd->hdr.udp->uh_dport;
2689		pi = &V_udbinfo;
2690		break;
2691	default:
2692		return (-1);
2693	}
2694	if (direction == PF_IN) {
2695		saddr = pd->src;
2696		daddr = pd->dst;
2697	} else {
2698		u_int16_t	p;
2699
2700		p = sport;
2701		sport = dport;
2702		dport = p;
2703		saddr = pd->dst;
2704		daddr = pd->src;
2705	}
2706	switch (pd->af) {
2707#ifdef INET
2708	case AF_INET:
2709		inp = in_pcblookup_mbuf(pi, saddr->v4, sport, daddr->v4,
2710		    dport, INPLOOKUP_RLOCKPCB, NULL, m);
2711		if (inp == NULL) {
2712			inp = in_pcblookup_mbuf(pi, saddr->v4, sport,
2713			   daddr->v4, dport, INPLOOKUP_WILDCARD |
2714			   INPLOOKUP_RLOCKPCB, NULL, m);
2715			if (inp == NULL)
2716				return (-1);
2717		}
2718		break;
2719#endif /* INET */
2720#ifdef INET6
2721	case AF_INET6:
2722		inp = in6_pcblookup_mbuf(pi, &saddr->v6, sport, &daddr->v6,
2723		    dport, INPLOOKUP_RLOCKPCB, NULL, m);
2724		if (inp == NULL) {
2725			inp = in6_pcblookup_mbuf(pi, &saddr->v6, sport,
2726			    &daddr->v6, dport, INPLOOKUP_WILDCARD |
2727			    INPLOOKUP_RLOCKPCB, NULL, m);
2728			if (inp == NULL)
2729				return (-1);
2730		}
2731		break;
2732#endif /* INET6 */
2733
2734	default:
2735		return (-1);
2736	}
2737	INP_RLOCK_ASSERT(inp);
2738	pd->lookup.uid = inp->inp_cred->cr_uid;
2739	pd->lookup.gid = inp->inp_cred->cr_groups[0];
2740	INP_RUNLOCK(inp);
2741
2742	return (1);
2743}
2744
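/*
 * Walk the TCP options and return the advertised window scale, capped
 * at TCP_MAX_WINSHIFT and tagged with PF_WSCALE_FLAG when the option
 * is present; 0 if there are no options or the header pullup fails.
 */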
2745static u_int8_t
2746pf_get_wscale(struct mbuf *m, int off, u_int16_t th_off, sa_family_t af)
2747{
2748	int		 hlen;
2749	u_int8_t	 hdr[60];
2750	u_int8_t	*opt, optlen;
2751	u_int8_t	 wscale = 0;
2752
2753	hlen = th_off << 2;		/* hlen <= sizeof(hdr) */
2754	if (hlen <= sizeof(struct tcphdr))
2755		return (0);
2756	if (!pf_pull_hdr(m, off, hdr, hlen, NULL, NULL, af))
2757		return (0);
2758	opt = hdr + sizeof(struct tcphdr);
2759	hlen -= sizeof(struct tcphdr);
2760	while (hlen >= 3) {
2761		switch (*opt) {
2762		case TCPOPT_EOL:
2763		case TCPOPT_NOP:
2764			++opt;
2765			--hlen;
2766			break;
2767		case TCPOPT_WINDOW:
2768			wscale = opt[2];
2769			if (wscale > TCP_MAX_WINSHIFT)
2770				wscale = TCP_MAX_WINSHIFT;
2771			wscale |= PF_WSCALE_FLAG;
2772			/* FALLTHROUGH */
2773		default:
2774			optlen = opt[1];
2775			if (optlen < 2)
2776				optlen = 2;
2777			hlen -= optlen;
2778			opt += optlen;
2779			break;
2780		}
2781	}
2782	return (wscale);
2783}
2784
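/*
 * Extract the MSS option from a TCP header.  Returns 0 when there are
 * no options to parse, and the system default (V_tcp_mssdflt) when
 * options are present but MSS is not among them.
 */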
2785static u_int16_t
2786pf_get_mss(struct mbuf *m, int off, u_int16_t th_off, sa_family_t af)
2787{
2788	int		 hlen;
2789	u_int8_t	 hdr[60];
2790	u_int8_t	*opt, optlen;
2791	u_int16_t	 mss = V_tcp_mssdflt;
2792
2793	hlen = th_off << 2;	/* hlen <= sizeof(hdr) */
2794	if (hlen <= sizeof(struct tcphdr))
2795		return (0);
2796	if (!pf_pull_hdr(m, off, hdr, hlen, NULL, NULL, af))
2797		return (0);
2798	opt = hdr + sizeof(struct tcphdr);
2799	hlen -= sizeof(struct tcphdr);
2800	while (hlen >= TCPOLEN_MAXSEG) {
2801		switch (*opt) {
2802		case TCPOPT_EOL:
2803		case TCPOPT_NOP:
2804			++opt;
2805			--hlen;
2806			break;
2807		case TCPOPT_MAXSEG:
2808			bcopy((caddr_t)(opt + 2), (caddr_t)&mss, 2);
2809			NTOHS(mss);
2810			/* FALLTHROUGH */
2811		default:
2812			optlen = opt[1];
2813			if (optlen < 2)
2814				optlen = 2;
2815			hlen -= optlen;
2816			opt += optlen;
2817			break;
2818		}
2819	}
2820	return (mss);
2821}
2822
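/*
 * Compute the MSS to advertise towards 'addr': look up the route, take
 * the interface MTU minus IP and TCP header sizes, and clamp the
 * result to the peer's offer, with a 64-byte floor for option space.
 */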
2823static u_int16_t
2824pf_calc_mss(struct pf_addr *addr, sa_family_t af, int rtableid, u_int16_t offer)
2825{
2826#ifdef INET
2827	struct sockaddr_in	*dst;
2828	struct route		 ro;
2829#endif /* INET */
2830#ifdef INET6
2831	struct sockaddr_in6	*dst6;
2832	struct route_in6	 ro6;
2833#endif /* INET6 */
2834	struct rtentry		*rt = NULL;
2835	int			 hlen = 0;
2836	u_int16_t		 mss = V_tcp_mssdflt;
2837
2838	switch (af) {
2839#ifdef INET
2840	case AF_INET:
2841		hlen = sizeof(struct ip);
2842		bzero(&ro, sizeof(ro));
2843		dst = (struct sockaddr_in *)&ro.ro_dst;
2844		dst->sin_family = AF_INET;
2845		dst->sin_len = sizeof(*dst);
2846		dst->sin_addr = addr->v4;
2847		in_rtalloc_ign(&ro, 0, rtableid);
2848		rt = ro.ro_rt;
2849		break;
2850#endif /* INET */
2851#ifdef INET6
2852	case AF_INET6:
2853		hlen = sizeof(struct ip6_hdr);
2854		bzero(&ro6, sizeof(ro6));
2855		dst6 = (struct sockaddr_in6 *)&ro6.ro_dst;
2856		dst6->sin6_family = AF_INET6;
2857		dst6->sin6_len = sizeof(*dst6);
2858		dst6->sin6_addr = addr->v6;
2859		in6_rtalloc_ign(&ro6, 0, rtableid);
2860		rt = ro6.ro_rt;
2861		break;
2862#endif /* INET6 */
2863	}
2864
2865	if (rt && rt->rt_ifp) {
2866		mss = rt->rt_ifp->if_mtu - hlen - sizeof(struct tcphdr);
2867		mss = max(V_tcp_mssdflt, mss);
2868		RTFREE(rt);
2869	}
2870	mss = min(mss, offer);
2871	mss = max(mss, 64);		/* sanity - at least max opt space */
2872	return (mss);
2873}
2874
2875static void
2876pf_set_rt_ifp(struct pf_state *s, struct pf_addr *saddr)
2877{
2878	struct pf_rule *r = s->rule.ptr;
2879	struct pf_src_node *sn = NULL;
2880
2881	s->rt_kif = NULL;
2882	if (!r->rt || r->rt == PF_FASTROUTE)
2883		return;
2884	switch (s->key[PF_SK_WIRE]->af) {
2885#ifdef INET
2886	case AF_INET:
2887		pf_map_addr(AF_INET, r, saddr, &s->rt_addr, NULL, &sn);
2888		s->rt_kif = r->rpool.cur->kif;
2889		break;
2890#endif /* INET */
2891#ifdef INET6
2892	case AF_INET6:
2893		pf_map_addr(AF_INET6, r, saddr, &s->rt_addr, NULL, &sn);
2894		s->rt_kif = r->rpool.cur->kif;
2895		break;
2896#endif /* INET6 */
2897	}
2898}
2899
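/*
 * Generate an initial sequence number for synproxy, RFC 1948 style:
 * MD5 over a lazily initialized random secret and the connection
 * 4-tuple, plus a monotonically advancing, randomly perturbed offset.
 */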
2900static u_int32_t
2901pf_tcp_iss(struct pf_pdesc *pd)
2902{
2903	MD5_CTX ctx;
2904	u_int32_t digest[4];
2905
2906	if (V_pf_tcp_secret_init == 0) {
2907		read_random(&V_pf_tcp_secret, sizeof(V_pf_tcp_secret));
2908		MD5Init(&V_pf_tcp_secret_ctx);
2909		MD5Update(&V_pf_tcp_secret_ctx, V_pf_tcp_secret,
2910		    sizeof(V_pf_tcp_secret));
2911		V_pf_tcp_secret_init = 1;
2912	}
2913
2914	ctx = V_pf_tcp_secret_ctx;
2915
2916	MD5Update(&ctx, (char *)&pd->hdr.tcp->th_sport, sizeof(u_short));
2917	MD5Update(&ctx, (char *)&pd->hdr.tcp->th_dport, sizeof(u_short));
2918	if (pd->af == AF_INET6) {
2919		MD5Update(&ctx, (char *)&pd->src->v6, sizeof(struct in6_addr));
2920		MD5Update(&ctx, (char *)&pd->dst->v6, sizeof(struct in6_addr));
2921	} else {
2922		MD5Update(&ctx, (char *)&pd->src->v4, sizeof(struct in_addr));
2923		MD5Update(&ctx, (char *)&pd->dst->v4, sizeof(struct in_addr));
2924	}
2925	MD5Final((u_char *)digest, &ctx);
2926	V_pf_tcp_iss_off += 4096;
2927#define	ISN_RANDOM_INCREMENT (4096 - 1)
2928	return (digest[0] + (arc4random() & ISN_RANDOM_INCREMENT) +
2929	    V_pf_tcp_iss_off);
2930#undef	ISN_RANDOM_INCREMENT
2931}
2932
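/*
 * The main filtering function: apply any matching translation rule,
 * then walk the active filter ruleset (using the precomputed skip
 * steps to leap over rules that cannot match) and, if the winning pass
 * rule keeps state, hand off to pf_create_state().
 */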
2933static int
2934pf_test_rule(struct pf_rule **rm, struct pf_state **sm, int direction,
2935    struct pfi_kif *kif, struct mbuf *m, int off, struct pf_pdesc *pd,
2936    struct pf_rule **am, struct pf_ruleset **rsm, struct inpcb *inp)
2937{
2938	struct pf_rule		*nr = NULL;
2939	struct pf_addr		* const saddr = pd->src;
2940	struct pf_addr		* const daddr = pd->dst;
2941	sa_family_t		 af = pd->af;
2942	struct pf_rule		*r, *a = NULL;
2943	struct pf_ruleset	*ruleset = NULL;
2944	struct pf_src_node	*nsn = NULL;
2945	struct tcphdr		*th = pd->hdr.tcp;
2946	struct pf_state_key	*sk = NULL, *nk = NULL;
2947	u_short			 reason;
2948	int			 rewrite = 0, hdrlen = 0;
2949	int			 tag = -1, rtableid = -1;
2950	int			 asd = 0;
2951	int			 match = 0;
2952	int			 state_icmp = 0;
2953	u_int16_t		 sport = 0, dport = 0;
2954	u_int16_t		 bproto_sum = 0, bip_sum = 0;
2955	u_int8_t		 icmptype = 0, icmpcode = 0;
2956	struct pf_anchor_stackframe	anchor_stack[PF_ANCHOR_STACKSIZE];
2957
2958	PF_RULES_RASSERT();
2959
2960	if (inp != NULL) {
2961		INP_LOCK_ASSERT(inp);
2962		pd->lookup.uid = inp->inp_cred->cr_uid;
2963		pd->lookup.gid = inp->inp_cred->cr_groups[0];
2964		pd->lookup.done = 1;
2965	}
2966
2967	switch (pd->proto) {
2968	case IPPROTO_TCP:
2969		sport = th->th_sport;
2970		dport = th->th_dport;
2971		hdrlen = sizeof(*th);
2972		break;
2973	case IPPROTO_UDP:
2974		sport = pd->hdr.udp->uh_sport;
2975		dport = pd->hdr.udp->uh_dport;
2976		hdrlen = sizeof(*pd->hdr.udp);
2977		break;
2978#ifdef INET
2979	case IPPROTO_ICMP:
2980		if (pd->af != AF_INET)
2981			break;
2982		sport = dport = pd->hdr.icmp->icmp_id;
2983		hdrlen = sizeof(*pd->hdr.icmp);
2984		icmptype = pd->hdr.icmp->icmp_type;
2985		icmpcode = pd->hdr.icmp->icmp_code;
2986
2987		if (icmptype == ICMP_UNREACH ||
2988		    icmptype == ICMP_SOURCEQUENCH ||
2989		    icmptype == ICMP_REDIRECT ||
2990		    icmptype == ICMP_TIMXCEED ||
2991		    icmptype == ICMP_PARAMPROB)
2992			state_icmp++;
2993		break;
2994#endif /* INET */
2995#ifdef INET6
2996	case IPPROTO_ICMPV6:
2997		if (af != AF_INET6)
2998			break;
2999		sport = dport = pd->hdr.icmp6->icmp6_id;
3000		hdrlen = sizeof(*pd->hdr.icmp6);
3001		icmptype = pd->hdr.icmp6->icmp6_type;
3002		icmpcode = pd->hdr.icmp6->icmp6_code;
3003
3004		if (icmptype == ICMP6_DST_UNREACH ||
3005		    icmptype == ICMP6_PACKET_TOO_BIG ||
3006		    icmptype == ICMP6_TIME_EXCEEDED ||
3007		    icmptype == ICMP6_PARAM_PROB)
3008			state_icmp++;
3009		break;
3010#endif /* INET6 */
3011	default:
3012		sport = dport = hdrlen = 0;
3013		break;
3014	}
3015
3016	r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr);
3017
3018	/* check packet for BINAT/NAT/RDR */
3019	if ((nr = pf_get_translation(pd, m, off, direction, kif, &nsn, &sk,
3020	    &nk, saddr, daddr, sport, dport, anchor_stack)) != NULL) {
3021		KASSERT(sk != NULL, ("%s: null sk", __func__));
3022		KASSERT(nk != NULL, ("%s: null nk", __func__));
3023
3024		if (pd->ip_sum)
3025			bip_sum = *pd->ip_sum;
3026
3027		switch (pd->proto) {
3028		case IPPROTO_TCP:
3029			bproto_sum = th->th_sum;
3030			pd->proto_sum = &th->th_sum;
3031
3032			if (PF_ANEQ(saddr, &nk->addr[pd->sidx], af) ||
3033			    nk->port[pd->sidx] != sport) {
3034				pf_change_ap(saddr, &th->th_sport, pd->ip_sum,
3035				    &th->th_sum, &nk->addr[pd->sidx],
3036				    nk->port[pd->sidx], 0, af);
3037				pd->sport = &th->th_sport;
3038				sport = th->th_sport;
3039			}
3040
3041			if (PF_ANEQ(daddr, &nk->addr[pd->didx], af) ||
3042			    nk->port[pd->didx] != dport) {
3043				pf_change_ap(daddr, &th->th_dport, pd->ip_sum,
3044				    &th->th_sum, &nk->addr[pd->didx],
3045				    nk->port[pd->didx], 0, af);
3046				dport = th->th_dport;
3047				pd->dport = &th->th_dport;
3048			}
3049			rewrite++;
3050			break;
3051		case IPPROTO_UDP:
3052			bproto_sum = pd->hdr.udp->uh_sum;
3053			pd->proto_sum = &pd->hdr.udp->uh_sum;
3054
3055			if (PF_ANEQ(saddr, &nk->addr[pd->sidx], af) ||
3056			    nk->port[pd->sidx] != sport) {
3057				pf_change_ap(saddr, &pd->hdr.udp->uh_sport,
3058				    pd->ip_sum, &pd->hdr.udp->uh_sum,
3059				    &nk->addr[pd->sidx],
3060				    nk->port[pd->sidx], 1, af);
3061				sport = pd->hdr.udp->uh_sport;
3062				pd->sport = &pd->hdr.udp->uh_sport;
3063			}
3064
3065			if (PF_ANEQ(daddr, &nk->addr[pd->didx], af) ||
3066			    nk->port[pd->didx] != dport) {
3067				pf_change_ap(daddr, &pd->hdr.udp->uh_dport,
3068				    pd->ip_sum, &pd->hdr.udp->uh_sum,
3069				    &nk->addr[pd->didx],
3070				    nk->port[pd->didx], 1, af);
3071				dport = pd->hdr.udp->uh_dport;
3072				pd->dport = &pd->hdr.udp->uh_dport;
3073			}
3074			rewrite++;
3075			break;
3076#ifdef INET
3077		case IPPROTO_ICMP:
3078			nk->port[0] = nk->port[1];
3079			if (PF_ANEQ(saddr, &nk->addr[pd->sidx], AF_INET))
3080				pf_change_a(&saddr->v4.s_addr, pd->ip_sum,
3081				    nk->addr[pd->sidx].v4.s_addr, 0);
3082
3083			if (PF_ANEQ(daddr, &nk->addr[pd->didx], AF_INET))
3084				pf_change_a(&daddr->v4.s_addr, pd->ip_sum,
3085				    nk->addr[pd->didx].v4.s_addr, 0);
3086
3087			if (nk->port[1] != pd->hdr.icmp->icmp_id) {
3088				pd->hdr.icmp->icmp_cksum = pf_cksum_fixup(
3089				    pd->hdr.icmp->icmp_cksum, sport,
3090				    nk->port[1], 0);
3091				pd->hdr.icmp->icmp_id = nk->port[1];
3092				pd->sport = &pd->hdr.icmp->icmp_id;
3093			}
3094			m_copyback(m, off, ICMP_MINLEN, (caddr_t)pd->hdr.icmp);
3095			break;
3096#endif /* INET */
3097#ifdef INET6
3098		case IPPROTO_ICMPV6:
3099			nk->port[0] = nk->port[1];
3100			if (PF_ANEQ(saddr, &nk->addr[pd->sidx], AF_INET6))
3101				pf_change_a6(saddr, &pd->hdr.icmp6->icmp6_cksum,
3102				    &nk->addr[pd->sidx], 0);
3103
3104			if (PF_ANEQ(daddr, &nk->addr[pd->didx], AF_INET6))
3105				pf_change_a6(daddr, &pd->hdr.icmp6->icmp6_cksum,
3106				    &nk->addr[pd->didx], 0);
3107			rewrite++;
3108			break;
3109#endif /* INET6 */
3110		default:
3111			switch (af) {
3112#ifdef INET
3113			case AF_INET:
3114				if (PF_ANEQ(saddr,
3115				    &nk->addr[pd->sidx], AF_INET))
3116					pf_change_a(&saddr->v4.s_addr,
3117					    pd->ip_sum,
3118					    nk->addr[pd->sidx].v4.s_addr, 0);
3119
3120				if (PF_ANEQ(daddr,
3121				    &nk->addr[pd->didx], AF_INET))
3122					pf_change_a(&daddr->v4.s_addr,
3123					    pd->ip_sum,
3124					    nk->addr[pd->didx].v4.s_addr, 0);
3125				break;
3126#endif /* INET */
3127#ifdef INET6
3128			case AF_INET6:
3129				if (PF_ANEQ(saddr,
3130				    &nk->addr[pd->sidx], AF_INET6))
3131					PF_ACPY(saddr, &nk->addr[pd->sidx], af);
3132
3133				if (PF_ANEQ(daddr,
3134				    &nk->addr[pd->didx], AF_INET6))
3135					PF_ACPY(daddr, &nk->addr[pd->didx], af);
3136				break;
3137#endif /* INET6 */
3138			}
3139			break;
3140		}
3141		if (nr->natpass)
3142			r = NULL;
3143		pd->nat_rule = nr;
3144	}
3145
3146	while (r != NULL) {
3147		r->evaluations++;
3148		if (pfi_kif_match(r->kif, kif) == r->ifnot)
3149			r = r->skip[PF_SKIP_IFP].ptr;
3150		else if (r->direction && r->direction != direction)
3151			r = r->skip[PF_SKIP_DIR].ptr;
3152		else if (r->af && r->af != af)
3153			r = r->skip[PF_SKIP_AF].ptr;
3154		else if (r->proto && r->proto != pd->proto)
3155			r = r->skip[PF_SKIP_PROTO].ptr;
3156		else if (PF_MISMATCHAW(&r->src.addr, saddr, af,
3157		    r->src.neg, kif, M_GETFIB(m)))
3158			r = r->skip[PF_SKIP_SRC_ADDR].ptr;
3159		/* tcp/udp only. port_op always 0 in other cases */
3160		else if (r->src.port_op && !pf_match_port(r->src.port_op,
3161		    r->src.port[0], r->src.port[1], sport))
3162			r = r->skip[PF_SKIP_SRC_PORT].ptr;
3163		else if (PF_MISMATCHAW(&r->dst.addr, daddr, af,
3164		    r->dst.neg, NULL, M_GETFIB(m)))
3165			r = r->skip[PF_SKIP_DST_ADDR].ptr;
3166		/* tcp/udp only. port_op always 0 in other cases */
3167		else if (r->dst.port_op && !pf_match_port(r->dst.port_op,
3168		    r->dst.port[0], r->dst.port[1], dport))
3169			r = r->skip[PF_SKIP_DST_PORT].ptr;
3170		/* icmp only. type always 0 in other cases */
3171		else if (r->type && r->type != icmptype + 1)
3172			r = TAILQ_NEXT(r, entries);
3173		/* icmp only. code always 0 in other cases */
3174		else if (r->code && r->code != icmpcode + 1)
3175			r = TAILQ_NEXT(r, entries);
3176		else if (r->tos && r->tos != pd->tos)
3177			r = TAILQ_NEXT(r, entries);
3178		else if (r->rule_flag & PFRULE_FRAGMENT)
3179			r = TAILQ_NEXT(r, entries);
3180		else if (pd->proto == IPPROTO_TCP &&
3181		    (r->flagset & th->th_flags) != r->flags)
3182			r = TAILQ_NEXT(r, entries);
3183		/* tcp/udp only. uid.op always 0 in other cases */
3184		else if (r->uid.op && (pd->lookup.done || (pd->lookup.done =
3185		    pf_socket_lookup(direction, pd, m), 1)) &&
3186		    !pf_match_uid(r->uid.op, r->uid.uid[0], r->uid.uid[1],
3187		    pd->lookup.uid))
3188			r = TAILQ_NEXT(r, entries);
3189		/* tcp/udp only. gid.op always 0 in other cases */
3190		else if (r->gid.op && (pd->lookup.done || (pd->lookup.done =
3191		    pf_socket_lookup(direction, pd, m), 1)) &&
3192		    !pf_match_gid(r->gid.op, r->gid.gid[0], r->gid.gid[1],
3193		    pd->lookup.gid))
3194			r = TAILQ_NEXT(r, entries);
3195		else if (r->prob &&
3196		    r->prob <= arc4random())
3197			r = TAILQ_NEXT(r, entries);
3198		else if (r->match_tag && !pf_match_tag(m, r, &tag,
3199		    pd->pf_mtag ? pd->pf_mtag->tag : 0))
3200			r = TAILQ_NEXT(r, entries);
3201		else if (r->os_fingerprint != PF_OSFP_ANY &&
3202		    (pd->proto != IPPROTO_TCP || !pf_osfp_match(
3203		    pf_osfp_fingerprint(pd, m, off, th),
3204		    r->os_fingerprint)))
3205			r = TAILQ_NEXT(r, entries);
3206		else {
3207			if (r->tag)
3208				tag = r->tag;
3209			if (r->rtableid >= 0)
3210				rtableid = r->rtableid;
3211			if (r->anchor == NULL) {
3212				match = 1;
3213				*rm = r;
3214				*am = a;
3215				*rsm = ruleset;
3216				if ((*rm)->quick)
3217					break;
3218				r = TAILQ_NEXT(r, entries);
3219			} else
3220				pf_step_into_anchor(anchor_stack, &asd,
3221				    &ruleset, PF_RULESET_FILTER, &r, &a,
3222				    &match);
3223		}
3224		if (r == NULL && pf_step_out_of_anchor(anchor_stack, &asd,
3225		    &ruleset, PF_RULESET_FILTER, &r, &a, &match))
3226			break;
3227	}
3228	r = *rm;
3229	a = *am;
3230	ruleset = *rsm;
3231
3232	REASON_SET(&reason, PFRES_MATCH);
3233
3234	if (r->log || (nr != NULL && nr->log)) {
3235		if (rewrite)
3236			m_copyback(m, off, hdrlen, pd->hdr.any);
3237		PFLOG_PACKET(kif, m, af, direction, reason, r->log ? r : nr, a,
3238		    ruleset, pd, 1);
3239	}
3240
3241	if ((r->action == PF_DROP) &&
3242	    ((r->rule_flag & PFRULE_RETURNRST) ||
3243	    (r->rule_flag & PFRULE_RETURNICMP) ||
3244	    (r->rule_flag & PFRULE_RETURN))) {
3245		/* undo NAT changes, if they have taken place */
3246		if (nr != NULL) {
3247			PF_ACPY(saddr, &sk->addr[pd->sidx], af);
3248			PF_ACPY(daddr, &sk->addr[pd->didx], af);
3249			if (pd->sport)
3250				*pd->sport = sk->port[pd->sidx];
3251			if (pd->dport)
3252				*pd->dport = sk->port[pd->didx];
3253			if (pd->proto_sum)
3254				*pd->proto_sum = bproto_sum;
3255			if (pd->ip_sum)
3256				*pd->ip_sum = bip_sum;
3257			m_copyback(m, off, hdrlen, pd->hdr.any);
3258		}
3259		if (pd->proto == IPPROTO_TCP &&
3260		    ((r->rule_flag & PFRULE_RETURNRST) ||
3261		    (r->rule_flag & PFRULE_RETURN)) &&
3262		    !(th->th_flags & TH_RST)) {
3263			u_int32_t	 ack = ntohl(th->th_seq) + pd->p_len;
3264			int		 len = 0;
3265#ifdef INET
3266			struct ip	*h4;
3267#endif
3268#ifdef INET6
3269			struct ip6_hdr	*h6;
3270#endif
3271
3272			switch (af) {
3273#ifdef INET
3274			case AF_INET:
3275				h4 = mtod(m, struct ip *);
3276				len = ntohs(h4->ip_len) - off;
3277				break;
3278#endif
3279#ifdef INET6
3280			case AF_INET6:
3281				h6 = mtod(m, struct ip6_hdr *);
3282				len = ntohs(h6->ip6_plen) - (off - sizeof(*h6));
3283				break;
3284#endif
3285			}
3286
3287			if (pf_check_proto_cksum(m, off, len, IPPROTO_TCP, af))
3288				REASON_SET(&reason, PFRES_PROTCKSUM);
3289			else {
3290				if (th->th_flags & TH_SYN)
3291					ack++;
3292				if (th->th_flags & TH_FIN)
3293					ack++;
3294				pf_send_tcp(m, r, af, pd->dst,
3295				    pd->src, th->th_dport, th->th_sport,
3296				    ntohl(th->th_ack), ack, TH_RST|TH_ACK, 0, 0,
3297				    r->return_ttl, 1, 0, kif->pfik_ifp);
3298			}
3299		} else if (pd->proto != IPPROTO_ICMP && af == AF_INET &&
3300		    r->return_icmp)
3301			pf_send_icmp(m, r->return_icmp >> 8,
3302			    r->return_icmp & 255, af, r);
3303		else if (pd->proto != IPPROTO_ICMPV6 && af == AF_INET6 &&
3304		    r->return_icmp6)
3305			pf_send_icmp(m, r->return_icmp6 >> 8,
3306			    r->return_icmp6 & 255, af, r);
3307	}
3308
3309	if (r->action == PF_DROP)
3310		goto cleanup;
3311
3312	if (tag > 0 && pf_tag_packet(m, pd, tag)) {
3313		REASON_SET(&reason, PFRES_MEMORY);
3314		goto cleanup;
3315	}
3316	if (rtableid >= 0)
3317		M_SETFIB(m, rtableid);
3318
3319	if (!state_icmp && (r->keep_state || nr != NULL ||
3320	    (pd->flags & PFDESC_TCP_NORM))) {
3321		int action;
3322		action = pf_create_state(r, nr, a, pd, nsn, nk, sk, m, off,
3323		    sport, dport, &rewrite, kif, sm, tag, bproto_sum, bip_sum,
3324		    hdrlen);
3325		if (action != PF_PASS)
3326			return (action);
3327	} else {
3328		if (sk != NULL)
3329			uma_zfree(V_pf_state_key_z, sk);
3330		if (nk != NULL)
3331			uma_zfree(V_pf_state_key_z, nk);
3332	}
3333
3334	/* copy back packet headers if we performed NAT operations */
3335	if (rewrite)
3336		m_copyback(m, off, hdrlen, pd->hdr.any);
3337
3338	if (*sm != NULL && !((*sm)->state_flags & PFSTATE_NOSYNC) &&
3339	    direction == PF_OUT &&
3340	    pfsync_defer_ptr != NULL && pfsync_defer_ptr(*sm, m))
3341		/*
3342		 * We want the state created, but we don't
3343		 * want to send the packet yet, in case a
3344		 * partner firewall has to know about it to
3345		 * allow replies through it.
3346		 */
3347		return (PF_DEFER);
3348
3349	return (PF_PASS);
3350
3351cleanup:
3352	if (sk != NULL)
3353		uma_zfree(V_pf_state_key_z, sk);
3354	if (nk != NULL)
3355		uma_zfree(V_pf_state_key_z, nk);
3356	return (PF_DROP);
3357}
3358
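/*
 * Instantiate state for a matched rule: enforce the rule's limits,
 * attach source tracking nodes, seed the TCP sequence windows, insert
 * the state keys and, for synproxy rules, answer the initial SYN on
 * behalf of the destination.
 */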
3359static int
3360pf_create_state(struct pf_rule *r, struct pf_rule *nr, struct pf_rule *a,
3361    struct pf_pdesc *pd, struct pf_src_node *nsn, struct pf_state_key *nk,
3362    struct pf_state_key *sk, struct mbuf *m, int off, u_int16_t sport,
3363    u_int16_t dport, int *rewrite, struct pfi_kif *kif, struct pf_state **sm,
3364    int tag, u_int16_t bproto_sum, u_int16_t bip_sum, int hdrlen)
3365{
3366	struct pf_state		*s = NULL;
3367	struct pf_src_node	*sn = NULL;
3368	struct tcphdr		*th = pd->hdr.tcp;
3369	u_int16_t		 mss = V_tcp_mssdflt;
3370	u_short			 reason;
3371
3372	/* check maximums */
3373	if (r->max_states && (r->states_cur >= r->max_states)) {
3374		V_pf_status.lcounters[LCNT_STATES]++;
3375		REASON_SET(&reason, PFRES_MAXSTATES);
3376		return (PF_DROP);
3377	}
3378	/* src node for filter rule */
3379	if ((r->rule_flag & PFRULE_SRCTRACK ||
3380	    r->rpool.opts & PF_POOL_STICKYADDR) &&
3381	    pf_insert_src_node(&sn, r, pd->src, pd->af) != 0) {
3382		REASON_SET(&reason, PFRES_SRCLIMIT);
3383		goto csfailed;
3384	}
3385	/* src node for translation rule */
3386	if (nr != NULL && (nr->rpool.opts & PF_POOL_STICKYADDR) &&
3387	    pf_insert_src_node(&nsn, nr, &sk->addr[pd->sidx], pd->af)) {
3388		REASON_SET(&reason, PFRES_SRCLIMIT);
3389		goto csfailed;
3390	}
3391	s = uma_zalloc(V_pf_state_z, M_NOWAIT | M_ZERO);
3392	if (s == NULL) {
3393		REASON_SET(&reason, PFRES_MEMORY);
3394		goto csfailed;
3395	}
3396	s->rule.ptr = r;
3397	s->nat_rule.ptr = nr;
3398	s->anchor.ptr = a;
3399	STATE_INC_COUNTERS(s);
3400	if (r->allow_opts)
3401		s->state_flags |= PFSTATE_ALLOWOPTS;
3402	if (r->rule_flag & PFRULE_STATESLOPPY)
3403		s->state_flags |= PFSTATE_SLOPPY;
3404	s->log = r->log & PF_LOG_ALL;
3405	s->sync_state = PFSYNC_S_NONE;
3406	if (nr != NULL)
3407		s->log |= nr->log & PF_LOG_ALL;
3408	switch (pd->proto) {
3409	case IPPROTO_TCP:
3410		s->src.seqlo = ntohl(th->th_seq);
3411		s->src.seqhi = s->src.seqlo + pd->p_len + 1;
3412		if ((th->th_flags & (TH_SYN|TH_ACK)) == TH_SYN &&
3413		    r->keep_state == PF_STATE_MODULATE) {
3414			/* Generate sequence number modulator */
3415			if ((s->src.seqdiff = pf_tcp_iss(pd) - s->src.seqlo) ==
3416			    0)
3417				s->src.seqdiff = 1;
3418			pf_change_a(&th->th_seq, &th->th_sum,
3419			    htonl(s->src.seqlo + s->src.seqdiff), 0);
3420			*rewrite = 1;
3421		} else
3422			s->src.seqdiff = 0;
3423		if (th->th_flags & TH_SYN) {
3424			s->src.seqhi++;
3425			s->src.wscale = pf_get_wscale(m, off,
3426			    th->th_off, pd->af);
3427		}
3428		s->src.max_win = MAX(ntohs(th->th_win), 1);
3429		if (s->src.wscale & PF_WSCALE_MASK) {
3430			/* Remove scale factor from initial window */
3431			int win = s->src.max_win;
3432			win += 1 << (s->src.wscale & PF_WSCALE_MASK);
3433			s->src.max_win = (win - 1) >>
3434			    (s->src.wscale & PF_WSCALE_MASK);
3435		}
3436		if (th->th_flags & TH_FIN)
3437			s->src.seqhi++;
3438		s->dst.seqhi = 1;
3439		s->dst.max_win = 1;
3440		s->src.state = TCPS_SYN_SENT;
3441		s->dst.state = TCPS_CLOSED;
3442		s->timeout = PFTM_TCP_FIRST_PACKET;
3443		break;
3444	case IPPROTO_UDP:
3445		s->src.state = PFUDPS_SINGLE;
3446		s->dst.state = PFUDPS_NO_TRAFFIC;
3447		s->timeout = PFTM_UDP_FIRST_PACKET;
3448		break;
3449	case IPPROTO_ICMP:
3450#ifdef INET6
3451	case IPPROTO_ICMPV6:
3452#endif
3453		s->timeout = PFTM_ICMP_FIRST_PACKET;
3454		break;
3455	default:
3456		s->src.state = PFOTHERS_SINGLE;
3457		s->dst.state = PFOTHERS_NO_TRAFFIC;
3458		s->timeout = PFTM_OTHER_FIRST_PACKET;
3459	}
3460
3461	s->creation = time_uptime;
3462	s->expire = time_uptime;
3463
3464	if (sn != NULL) {
3465		s->src_node = sn;
3466		s->src_node->states++;
3467	}
3468	if (nsn != NULL) {
3469		/* XXX We only modify one side for now. */
3470		PF_ACPY(&nsn->raddr, &nk->addr[1], pd->af);
3471		s->nat_src_node = nsn;
3472		s->nat_src_node->states++;
3473	}
3474	if (pd->proto == IPPROTO_TCP) {
3475		if ((pd->flags & PFDESC_TCP_NORM) && pf_normalize_tcp_init(m,
3476		    off, pd, th, &s->src, &s->dst)) {
3477			REASON_SET(&reason, PFRES_MEMORY);
3478			pf_src_tree_remove_state(s);
3479			STATE_DEC_COUNTERS(s);
3480			uma_zfree(V_pf_state_z, s);
3481			return (PF_DROP);
3482		}
3483		if ((pd->flags & PFDESC_TCP_NORM) && s->src.scrub &&
3484		    pf_normalize_tcp_stateful(m, off, pd, &reason, th, s,
3485		    &s->src, &s->dst, rewrite)) {
3486			/* This really shouldn't happen!!! */
3487			DPFPRINTF(PF_DEBUG_URGENT,
3488			    ("pf_normalize_tcp_stateful failed on first pkt\n"));
3489			pf_normalize_tcp_cleanup(s);
3490			pf_src_tree_remove_state(s);
3491			STATE_DEC_COUNTERS(s);
3492			uma_zfree(V_pf_state_z, s);
3493			return (PF_DROP);
3494		}
3495	}
3496	s->direction = pd->dir;
3497
3498	/*
3499	 * sk/nk may already have been set up by pf_get_translation().
3500	 */
3501	if (nr == NULL) {
3502		KASSERT((sk == NULL && nk == NULL), ("%s: nr %p sk %p, nk %p",
3503		    __func__, nr, sk, nk));
3504		sk = pf_state_key_setup(pd, pd->src, pd->dst, sport, dport);
3505		if (sk == NULL)
3506			goto csfailed;
3507		nk = sk;
3508	} else
3509		KASSERT((sk != NULL && nk != NULL), ("%s: nr %p sk %p, nk %p",
3510		    __func__, nr, sk, nk));
3511
3512	/* Swap sk/nk for PF_OUT. */
3513	if (pf_state_insert(BOUND_IFACE(r, kif),
3514	    (pd->dir == PF_IN) ? sk : nk,
3515	    (pd->dir == PF_IN) ? nk : sk, s)) {
3516		if (pd->proto == IPPROTO_TCP)
3517			pf_normalize_tcp_cleanup(s);
3518		REASON_SET(&reason, PFRES_STATEINS);
3519		pf_src_tree_remove_state(s);
3520		STATE_DEC_COUNTERS(s);
3521		uma_zfree(V_pf_state_z, s);
3522		return (PF_DROP);
3523	} else
3524		*sm = s;
3525
3526	pf_set_rt_ifp(s, pd->src);	/* needs s->state_key set */
3527	if (tag > 0)
3528		s->tag = tag;
3529	if (pd->proto == IPPROTO_TCP && (th->th_flags & (TH_SYN|TH_ACK)) ==
3530	    TH_SYN && r->keep_state == PF_STATE_SYNPROXY) {
3531		s->src.state = PF_TCPS_PROXY_SRC;
3532		/* undo NAT changes, if they have taken place */
3533		if (nr != NULL) {
3534			struct pf_state_key *skt = s->key[PF_SK_WIRE];
3535			if (pd->dir == PF_OUT)
3536				skt = s->key[PF_SK_STACK];
3537			PF_ACPY(pd->src, &skt->addr[pd->sidx], pd->af);
3538			PF_ACPY(pd->dst, &skt->addr[pd->didx], pd->af);
3539			if (pd->sport)
3540				*pd->sport = skt->port[pd->sidx];
3541			if (pd->dport)
3542				*pd->dport = skt->port[pd->didx];
3543			if (pd->proto_sum)
3544				*pd->proto_sum = bproto_sum;
3545			if (pd->ip_sum)
3546				*pd->ip_sum = bip_sum;
3547			m_copyback(m, off, hdrlen, pd->hdr.any);
3548		}
3549		s->src.seqhi = htonl(arc4random());
3550		/* Find mss option */
3551		int rtid = M_GETFIB(m);
3552		mss = pf_get_mss(m, off, th->th_off, pd->af);
3553		mss = pf_calc_mss(pd->src, pd->af, rtid, mss);
3554		mss = pf_calc_mss(pd->dst, pd->af, rtid, mss);
3555		s->src.mss = mss;
3556		pf_send_tcp(NULL, r, pd->af, pd->dst, pd->src, th->th_dport,
3557		    th->th_sport, s->src.seqhi, ntohl(th->th_seq) + 1,
3558		    TH_SYN|TH_ACK, 0, s->src.mss, 0, 1, 0, NULL);
3559		REASON_SET(&reason, PFRES_SYNPROXY);
3560		return (PF_SYNPROXY_DROP);
3561	}
3562
3563	return (PF_PASS);
3564
3565csfailed:
3566	if (sk != NULL)
3567		uma_zfree(V_pf_state_key_z, sk);
3568	if (nk != NULL)
3569		uma_zfree(V_pf_state_key_z, nk);
3570
3571	if (sn != NULL && sn->states == 0 && sn->expire == 0)
3572		pf_remove_src_node(sn);
3573
3574	if (nsn != sn && nsn != NULL && nsn->states == 0 && nsn->expire == 0)
3575		pf_remove_src_node(nsn);
3576
3577	return (PF_DROP);
3578}
3579
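/*
 * Rule matching for non-first fragments, which carry no transport
 * header: rules that depend on ports, TCP flags, ICMP types/codes or
 * OS fingerprints are stepped over outright.
 */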
3580static int
3581pf_test_fragment(struct pf_rule **rm, int direction, struct pfi_kif *kif,
3582    struct mbuf *m, void *h, struct pf_pdesc *pd, struct pf_rule **am,
3583    struct pf_ruleset **rsm)
3584{
3585	struct pf_rule		*r, *a = NULL;
3586	struct pf_ruleset	*ruleset = NULL;
3587	sa_family_t		 af = pd->af;
3588	u_short			 reason;
3589	int			 tag = -1;
3590	int			 asd = 0;
3591	int			 match = 0;
3592	struct pf_anchor_stackframe	anchor_stack[PF_ANCHOR_STACKSIZE];
3593
3594	PF_RULES_RASSERT();
3595
3596	r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr);
3597	while (r != NULL) {
3598		r->evaluations++;
3599		if (pfi_kif_match(r->kif, kif) == r->ifnot)
3600			r = r->skip[PF_SKIP_IFP].ptr;
3601		else if (r->direction && r->direction != direction)
3602			r = r->skip[PF_SKIP_DIR].ptr;
3603		else if (r->af && r->af != af)
3604			r = r->skip[PF_SKIP_AF].ptr;
3605		else if (r->proto && r->proto != pd->proto)
3606			r = r->skip[PF_SKIP_PROTO].ptr;
3607		else if (PF_MISMATCHAW(&r->src.addr, pd->src, af,
3608		    r->src.neg, kif, M_GETFIB(m)))
3609			r = r->skip[PF_SKIP_SRC_ADDR].ptr;
3610		else if (PF_MISMATCHAW(&r->dst.addr, pd->dst, af,
3611		    r->dst.neg, NULL, M_GETFIB(m)))
3612			r = r->skip[PF_SKIP_DST_ADDR].ptr;
3613		else if (r->tos && r->tos != pd->tos)
3614			r = TAILQ_NEXT(r, entries);
3615		else if (r->os_fingerprint != PF_OSFP_ANY)
3616			r = TAILQ_NEXT(r, entries);
3617		else if (pd->proto == IPPROTO_UDP &&
3618		    (r->src.port_op || r->dst.port_op))
3619			r = TAILQ_NEXT(r, entries);
3620		else if (pd->proto == IPPROTO_TCP &&
3621		    (r->src.port_op || r->dst.port_op || r->flagset))
3622			r = TAILQ_NEXT(r, entries);
3623		else if ((pd->proto == IPPROTO_ICMP ||
3624		    pd->proto == IPPROTO_ICMPV6) &&
3625		    (r->type || r->code))
3626			r = TAILQ_NEXT(r, entries);
3627		else if (r->prob && r->prob <=
3628		    (arc4random() % (UINT_MAX - 1) + 1))
3629			r = TAILQ_NEXT(r, entries);
3630		else if (r->match_tag && !pf_match_tag(m, r, &tag,
3631		    pd->pf_mtag ? pd->pf_mtag->tag : 0))
3632			r = TAILQ_NEXT(r, entries);
3633		else {
3634			if (r->anchor == NULL) {
3635				match = 1;
3636				*rm = r;
3637				*am = a;
3638				*rsm = ruleset;
3639				if ((*rm)->quick)
3640					break;
3641				r = TAILQ_NEXT(r, entries);
3642			} else
3643				pf_step_into_anchor(anchor_stack, &asd,
3644				    &ruleset, PF_RULESET_FILTER, &r, &a,
3645				    &match);
3646		}
3647		if (r == NULL && pf_step_out_of_anchor(anchor_stack, &asd,
3648		    &ruleset, PF_RULESET_FILTER, &r, &a, &match))
3649			break;
3650	}
3651	r = *rm;
3652	a = *am;
3653	ruleset = *rsm;
3654
3655	REASON_SET(&reason, PFRES_MATCH);
3656
3657	if (r->log)
3658		PFLOG_PACKET(kif, m, af, direction, reason, r, a, ruleset, pd,
3659		    1);
3660
3661	if (r->action != PF_PASS)
3662		return (PF_DROP);
3663
3664	if (tag > 0 && pf_tag_packet(m, pd, tag)) {
3665		REASON_SET(&reason, PFRES_MEMORY);
3666		return (PF_DROP);
3667	}
3668
3669	return (PF_PASS);
3670}
3671
3672static int
3673pf_tcp_track_full(struct pf_state_peer *src, struct pf_state_peer *dst,
3674	struct pf_state **state, struct pfi_kif *kif, struct mbuf *m, int off,
3675	struct pf_pdesc *pd, u_short *reason, int *copyback)
3676{
3677	struct tcphdr		*th = pd->hdr.tcp;
3678	u_int16_t		 win = ntohs(th->th_win);
3679	u_int32_t		 ack, end, seq, orig_seq;
3680	u_int8_t		 sws, dws;
3681	int			 ackskew;
3682
3683	if (src->wscale && dst->wscale && !(th->th_flags & TH_SYN)) {
3684		sws = src->wscale & PF_WSCALE_MASK;
3685		dws = dst->wscale & PF_WSCALE_MASK;
3686	} else
3687		sws = dws = 0;
3688
3689	/*
3690	 * Sequence tracking algorithm from Guido van Rooij's paper:
3691	 *   http://www.madison-gurkha.com/publications/tcp_filtering/
3692	 *	tcp_filtering.ps
3693	 */
3694
3695	orig_seq = seq = ntohl(th->th_seq);
3696	if (src->seqlo == 0) {
3697		/* First packet from this end. Set its state */
3698
3699		if ((pd->flags & PFDESC_TCP_NORM || dst->scrub) &&
3700		    src->scrub == NULL) {
3701			if (pf_normalize_tcp_init(m, off, pd, th, src, dst)) {
3702				REASON_SET(reason, PFRES_MEMORY);
3703				return (PF_DROP);
3704			}
3705		}
3706
3707		/* Deferred generation of sequence number modulator */
3708		if (dst->seqdiff && !src->seqdiff) {
3709			/* use random iss for the TCP server */
3710			while ((src->seqdiff = arc4random() - seq) == 0)
3711				;
3712			ack = ntohl(th->th_ack) - dst->seqdiff;
3713			pf_change_a(&th->th_seq, &th->th_sum, htonl(seq +
3714			    src->seqdiff), 0);
3715			pf_change_a(&th->th_ack, &th->th_sum, htonl(ack), 0);
3716			*copyback = 1;
3717		} else {
3718			ack = ntohl(th->th_ack);
3719		}
3720
3721		end = seq + pd->p_len;
3722		if (th->th_flags & TH_SYN) {
3723			end++;
3724			if (dst->wscale & PF_WSCALE_FLAG) {
3725				src->wscale = pf_get_wscale(m, off, th->th_off,
3726				    pd->af);
3727				if (src->wscale & PF_WSCALE_FLAG) {
3728					/* Remove scale factor from initial
3729					 * window */
3730					sws = src->wscale & PF_WSCALE_MASK;
3731					win = ((u_int32_t)win + (1 << sws) - 1)
3732					    >> sws;
3733					dws = dst->wscale & PF_WSCALE_MASK;
3734				} else {
3735					/* fixup other window */
3736					dst->max_win <<= dst->wscale &
3737					    PF_WSCALE_MASK;
3738					/* in case of a retrans SYN|ACK */
3739					dst->wscale = 0;
3740				}
3741			}
3742		}
3743		if (th->th_flags & TH_FIN)
3744			end++;
3745
3746		src->seqlo = seq;
3747		if (src->state < TCPS_SYN_SENT)
3748			src->state = TCPS_SYN_SENT;
3749
3750		/*
3751		 * May need to slide the window (seqhi may have been set by
3752		 * the crappy stack check or if we picked up the connection
3753		 * after establishment)
3754		 */
3755		if (src->seqhi == 1 ||
3756		    SEQ_GEQ(end + MAX(1, dst->max_win << dws), src->seqhi))
3757			src->seqhi = end + MAX(1, dst->max_win << dws);
3758		if (win > src->max_win)
3759			src->max_win = win;
3760
3761	} else {
3762		ack = ntohl(th->th_ack) - dst->seqdiff;
3763		if (src->seqdiff) {
3764			/* Modulate sequence numbers */
3765			pf_change_a(&th->th_seq, &th->th_sum, htonl(seq +
3766			    src->seqdiff), 0);
3767			pf_change_a(&th->th_ack, &th->th_sum, htonl(ack), 0);
3768			*copyback = 1;
3769		}
3770		end = seq + pd->p_len;
3771		if (th->th_flags & TH_SYN)
3772			end++;
3773		if (th->th_flags & TH_FIN)
3774			end++;
3775	}
3776
3777	if ((th->th_flags & TH_ACK) == 0) {
3778		/* Let it pass through the ack skew check */
3779		ack = dst->seqlo;
3780	} else if ((ack == 0 &&
3781	    (th->th_flags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) ||
3782	    /* broken tcp stacks do not set ack */
3783	    (dst->state < TCPS_SYN_SENT)) {
3784		/*
3785		 * Many stacks (ours included) will set the ACK number in an
3786		 * FIN|ACK if the SYN times out -- no sequence to ACK.
3787		 */
3788		ack = dst->seqlo;
3789	}
3790
3791	if (seq == end) {
3792		/* Ease sequencing restrictions on no data packets */
3793		seq = src->seqlo;
3794		end = seq;
3795	}
3796
3797	ackskew = dst->seqlo - ack;
3798
3800	/*
3801	 * Need to demodulate the sequence numbers in any TCP SACK options
3802	 * (Selective ACK). We could optionally validate the SACK values
3803	 * against the current ACK window, either forwards or backwards, but
3804	 * I'm not confident that SACK has been implemented properly
3805	 * everywhere. It wouldn't surprise me if several stacks accidentally
3806	 * SACK too far backwards of previously ACKed data. There really aren't
3807	 * any security implications of bad SACKing unless the target stack
3808	 * doesn't validate the option length correctly. Someone trying to
3809	 * spoof into a TCP connection won't bother blindly sending SACK
3810	 * options anyway.
3811	 */
3812	if (dst->seqdiff && (th->th_off << 2) > sizeof(struct tcphdr)) {
3813		if (pf_modulate_sack(m, off, pd, th, dst))
3814			*copyback = 1;
3815	}
3816
3818#define	MAXACKWINDOW (0xffff + 1500)	/* 1500 is an arbitrary fudge factor */
3819	if (SEQ_GEQ(src->seqhi, end) &&
3820	    /* Last octet inside other's window space */
3821	    SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) &&
3822	    /* Retrans: not more than one window back */
3823	    (ackskew >= -MAXACKWINDOW) &&
3824	    /* Acking not more than one reassembled fragment backwards */
3825	    (ackskew <= (MAXACKWINDOW << sws)) &&
3826	    /* Acking not more than one window forward */
3827	    ((th->th_flags & TH_RST) == 0 || orig_seq == src->seqlo ||
3828	    (orig_seq == src->seqlo + 1) || (orig_seq + 1 == src->seqlo) ||
3829	    (pd->flags & PFDESC_IP_REAS) == 0)) {
3830	    /* Require an exact/+1 sequence match on resets when possible */
3831
3832		if (dst->scrub || src->scrub) {
3833			if (pf_normalize_tcp_stateful(m, off, pd, reason, th,
3834			    *state, src, dst, copyback))
3835				return (PF_DROP);
3836		}
3837
3838		/* update max window */
3839		if (src->max_win < win)
3840			src->max_win = win;
3841		/* synchronize sequencing */
3842		if (SEQ_GT(end, src->seqlo))
3843			src->seqlo = end;
3844		/* slide the window of what the other end can send */
3845		if (SEQ_GEQ(ack + (win << sws), dst->seqhi))
3846			dst->seqhi = ack + MAX((win << sws), 1);
3847
3848
3849		/* update states */
3850		if (th->th_flags & TH_SYN)
3851			if (src->state < TCPS_SYN_SENT)
3852				src->state = TCPS_SYN_SENT;
3853		if (th->th_flags & TH_FIN)
3854			if (src->state < TCPS_CLOSING)
3855				src->state = TCPS_CLOSING;
3856		if (th->th_flags & TH_ACK) {
3857			if (dst->state == TCPS_SYN_SENT) {
3858				dst->state = TCPS_ESTABLISHED;
3859				if (src->state == TCPS_ESTABLISHED &&
3860				    (*state)->src_node != NULL &&
3861				    pf_src_connlimit(state)) {
3862					REASON_SET(reason, PFRES_SRCLIMIT);
3863					return (PF_DROP);
3864				}
3865			} else if (dst->state == TCPS_CLOSING)
3866				dst->state = TCPS_FIN_WAIT_2;
3867		}
3868		if (th->th_flags & TH_RST)
3869			src->state = dst->state = TCPS_TIME_WAIT;
3870
3871		/* update expire time */
3872		(*state)->expire = time_uptime;
3873		if (src->state >= TCPS_FIN_WAIT_2 &&
3874		    dst->state >= TCPS_FIN_WAIT_2)
3875			(*state)->timeout = PFTM_TCP_CLOSED;
3876		else if (src->state >= TCPS_CLOSING &&
3877		    dst->state >= TCPS_CLOSING)
3878			(*state)->timeout = PFTM_TCP_FIN_WAIT;
3879		else if (src->state < TCPS_ESTABLISHED ||
3880		    dst->state < TCPS_ESTABLISHED)
3881			(*state)->timeout = PFTM_TCP_OPENING;
3882		else if (src->state >= TCPS_CLOSING ||
3883		    dst->state >= TCPS_CLOSING)
3884			(*state)->timeout = PFTM_TCP_CLOSING;
3885		else
3886			(*state)->timeout = PFTM_TCP_ESTABLISHED;
3887
3888		/* Fall through to PASS packet */
3889
3890	} else if ((dst->state < TCPS_SYN_SENT ||
3891		dst->state >= TCPS_FIN_WAIT_2 ||
3892		src->state >= TCPS_FIN_WAIT_2) &&
3893	    SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) &&
3894	    /* Within a window forward of the originating packet */
3895	    SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW)) {
3896	    /* Within a window backward of the originating packet */
3897
3898		/*
3899		 * This currently handles three situations:
3900		 *  1) Stupid stacks will shotgun SYNs before their peer
3901		 *     replies.
3902		 *  2) When PF catches an already established stream (the
3903		 *     firewall rebooted, the state table was flushed, routes
3904		 *     changed...)
3905		 *  3) Packets get funky immediately after the connection
3906		 *     closes (this should catch Solaris spurious ACK|FINs
3907		 *     that web servers like to spew after a close)
3908		 *
3909		 * This must be a little more careful than the above code
3910		 * since packet floods will also be caught here. We don't
3911		 * update the TTL here to mitigate the damage of a packet
3912		 * flood and so the same code can handle awkward establishment
3913		 * and a loosened connection close.
3914		 * In the establishment case, a correct peer response will
3915		 * validate the connection, go through the normal state code
3916		 * and keep updating the state TTL.
3917		 */
3918
3919		if (V_pf_status.debug >= PF_DEBUG_MISC) {
3920			printf("pf: loose state match: ");
3921			pf_print_state(*state);
3922			pf_print_flags(th->th_flags);
3923			printf(" seq=%u (%u) ack=%u len=%u ackskew=%d "
3924			    "pkts=%llu:%llu dir=%s,%s\n", seq, orig_seq, ack,
3925			    pd->p_len, ackskew, (unsigned long long)(*state)->packets[0],
3926			    (unsigned long long)(*state)->packets[1],
3927			    pd->dir == PF_IN ? "in" : "out",
3928			    pd->dir == (*state)->direction ? "fwd" : "rev");
3929		}
3930
3931		if (dst->scrub || src->scrub) {
3932			if (pf_normalize_tcp_stateful(m, off, pd, reason, th,
3933			    *state, src, dst, copyback))
3934				return (PF_DROP);
3935		}
3936
3937		/* update max window */
3938		if (src->max_win < win)
3939			src->max_win = win;
3940		/* synchronize sequencing */
3941		if (SEQ_GT(end, src->seqlo))
3942			src->seqlo = end;
3943		/* slide the window of what the other end can send */
3944		if (SEQ_GEQ(ack + (win << sws), dst->seqhi))
3945			dst->seqhi = ack + MAX((win << sws), 1);
3946
3947		/*
3948		 * Cannot set dst->seqhi here since this could be a shotgunned
3949		 * SYN and not an already established connection.
3950		 */
3951
3952		if (th->th_flags & TH_FIN)
3953			if (src->state < TCPS_CLOSING)
3954				src->state = TCPS_CLOSING;
3955		if (th->th_flags & TH_RST)
3956			src->state = dst->state = TCPS_TIME_WAIT;
3957
3958		/* Fall through to PASS packet */
3959
3960	} else {
3961		if ((*state)->dst.state == TCPS_SYN_SENT &&
3962		    (*state)->src.state == TCPS_SYN_SENT) {
3963			/* Send RST for state mismatches during handshake */
3964			if (!(th->th_flags & TH_RST))
3965				pf_send_tcp(NULL, (*state)->rule.ptr, pd->af,
3966				    pd->dst, pd->src, th->th_dport,
3967				    th->th_sport, ntohl(th->th_ack), 0,
3968				    TH_RST, 0, 0,
3969				    (*state)->rule.ptr->return_ttl, 1, 0,
3970				    kif->pfik_ifp);
3971			src->seqlo = 0;
3972			src->seqhi = 1;
3973			src->max_win = 1;
3974		} else if (V_pf_status.debug >= PF_DEBUG_MISC) {
3975			printf("pf: BAD state: ");
3976			pf_print_state(*state);
3977			pf_print_flags(th->th_flags);
3978			printf(" seq=%u (%u) ack=%u len=%u ackskew=%d "
3979			    "pkts=%llu:%llu dir=%s,%s\n",
3980			    seq, orig_seq, ack, pd->p_len, ackskew,
3981			    (unsigned long long)(*state)->packets[0],
3982			    (unsigned long long)(*state)->packets[1],
3983			    pd->dir == PF_IN ? "in" : "out",
3984			    pd->dir == (*state)->direction ? "fwd" : "rev");
3985			printf("pf: State failure on: %c %c %c %c | %c %c\n",
3986			    SEQ_GEQ(src->seqhi, end) ? ' ' : '1',
3987			    SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) ?
3988			    ' ': '2',
3989			    (ackskew >= -MAXACKWINDOW) ? ' ' : '3',
3990			    (ackskew <= (MAXACKWINDOW << sws)) ? ' ' : '4',
3991			    SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) ?' ' :'5',
3992			    SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW) ?' ' :'6');
3993		}
3994		REASON_SET(reason, PFRES_BADSTATE);
3995		return (PF_DROP);
3996	}
3997
3998	return (PF_PASS);
3999}
4000
4001static int
4002pf_tcp_track_sloppy(struct pf_state_peer *src, struct pf_state_peer *dst,
4003	struct pf_state **state, struct pf_pdesc *pd, u_short *reason)
4004{
4005	struct tcphdr		*th = pd->hdr.tcp;
4006
4007	if (th->th_flags & TH_SYN)
4008		if (src->state < TCPS_SYN_SENT)
4009			src->state = TCPS_SYN_SENT;
4010	if (th->th_flags & TH_FIN)
4011		if (src->state < TCPS_CLOSING)
4012			src->state = TCPS_CLOSING;
4013	if (th->th_flags & TH_ACK) {
4014		if (dst->state == TCPS_SYN_SENT) {
4015			dst->state = TCPS_ESTABLISHED;
4016			if (src->state == TCPS_ESTABLISHED &&
4017			    (*state)->src_node != NULL &&
4018			    pf_src_connlimit(state)) {
4019				REASON_SET(reason, PFRES_SRCLIMIT);
4020				return (PF_DROP);
4021			}
4022		} else if (dst->state == TCPS_CLOSING) {
4023			dst->state = TCPS_FIN_WAIT_2;
4024		} else if (src->state == TCPS_SYN_SENT &&
4025		    dst->state < TCPS_SYN_SENT) {
4026			/*
4027			 * Handle a special sloppy case where we only see one
4028			 * half of the connection. If there is an ACK after
4029			 * the initial SYN without ever seeing a packet from
4030			 * the destination, set the connection to established.
4031			 */
4032			dst->state = src->state = TCPS_ESTABLISHED;
4033			if ((*state)->src_node != NULL &&
4034			    pf_src_connlimit(state)) {
4035				REASON_SET(reason, PFRES_SRCLIMIT);
4036				return (PF_DROP);
4037			}
4038		} else if (src->state == TCPS_CLOSING &&
4039		    dst->state == TCPS_ESTABLISHED &&
4040		    dst->seqlo == 0) {
4041			/*
4042			 * Handle the closing of half connections where we
4043			 * don't see the full bidirectional FIN/ACK+ACK
4044			 * handshake.
4045			 */
4046			dst->state = TCPS_CLOSING;
4047		}
4048	}
4049	if (th->th_flags & TH_RST)
4050		src->state = dst->state = TCPS_TIME_WAIT;
4051
4052	/* update expire time */
4053	(*state)->expire = time_uptime;
4054	if (src->state >= TCPS_FIN_WAIT_2 &&
4055	    dst->state >= TCPS_FIN_WAIT_2)
4056		(*state)->timeout = PFTM_TCP_CLOSED;
4057	else if (src->state >= TCPS_CLOSING &&
4058	    dst->state >= TCPS_CLOSING)
4059		(*state)->timeout = PFTM_TCP_FIN_WAIT;
4060	else if (src->state < TCPS_ESTABLISHED ||
4061	    dst->state < TCPS_ESTABLISHED)
4062		(*state)->timeout = PFTM_TCP_OPENING;
4063	else if (src->state >= TCPS_CLOSING ||
4064	    dst->state >= TCPS_CLOSING)
4065		(*state)->timeout = PFTM_TCP_CLOSING;
4066	else
4067		(*state)->timeout = PFTM_TCP_ESTABLISHED;
4068
4069	return (PF_PASS);
4070}
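/*
 * Sloppy tracking is selected per rule.  A hypothetical pf.conf rule that
 * would steer a connection through pf_tcp_track_sloppy() instead of the
 * full tracker, e.g. when routing is asymmetric and only one half of the
 * connection is visible to this firewall:
 *
 *	pass in proto tcp from any to any port 80 keep state (sloppy)
 */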
4071
4072static int
4073pf_test_state_tcp(struct pf_state **state, int direction, struct pfi_kif *kif,
4074    struct mbuf *m, int off, void *h, struct pf_pdesc *pd,
4075    u_short *reason)
4076{
4077	struct pf_state_key_cmp	 key;
4078	struct tcphdr		*th = pd->hdr.tcp;
4079	int			 copyback = 0;
4080	struct pf_state_peer	*src, *dst;
4081	struct pf_state_key	*sk;
4082
4083	bzero(&key, sizeof(key));
4084	key.af = pd->af;
4085	key.proto = IPPROTO_TCP;
4086	if (direction == PF_IN)	{	/* wire side, straight */
4087		PF_ACPY(&key.addr[0], pd->src, key.af);
4088		PF_ACPY(&key.addr[1], pd->dst, key.af);
4089		key.port[0] = th->th_sport;
4090		key.port[1] = th->th_dport;
4091	} else {			/* stack side, reverse */
4092		PF_ACPY(&key.addr[1], pd->src, key.af);
4093		PF_ACPY(&key.addr[0], pd->dst, key.af);
4094		key.port[1] = th->th_sport;
4095		key.port[0] = th->th_dport;
4096	}
4097
4098	STATE_LOOKUP(kif, &key, direction, *state, pd);
4099
4100	if (direction == (*state)->direction) {
4101		src = &(*state)->src;
4102		dst = &(*state)->dst;
4103	} else {
4104		src = &(*state)->dst;
4105		dst = &(*state)->src;
4106	}
4107
4108	sk = (*state)->key[pd->didx];
4109
4110	if ((*state)->src.state == PF_TCPS_PROXY_SRC) {
4111		if (direction != (*state)->direction) {
4112			REASON_SET(reason, PFRES_SYNPROXY);
4113			return (PF_SYNPROXY_DROP);
4114		}
4115		if (th->th_flags & TH_SYN) {
4116			if (ntohl(th->th_seq) != (*state)->src.seqlo) {
4117				REASON_SET(reason, PFRES_SYNPROXY);
4118				return (PF_DROP);
4119			}
4120			pf_send_tcp(NULL, (*state)->rule.ptr, pd->af, pd->dst,
4121			    pd->src, th->th_dport, th->th_sport,
4122			    (*state)->src.seqhi, ntohl(th->th_seq) + 1,
4123			    TH_SYN|TH_ACK, 0, (*state)->src.mss, 0, 1, 0, NULL);
4124			REASON_SET(reason, PFRES_SYNPROXY);
4125			return (PF_SYNPROXY_DROP);
4126		} else if (!(th->th_flags & TH_ACK) ||
4127		    (ntohl(th->th_ack) != (*state)->src.seqhi + 1) ||
4128		    (ntohl(th->th_seq) != (*state)->src.seqlo + 1)) {
4129			REASON_SET(reason, PFRES_SYNPROXY);
4130			return (PF_DROP);
4131		} else if ((*state)->src_node != NULL &&
4132		    pf_src_connlimit(state)) {
4133			REASON_SET(reason, PFRES_SRCLIMIT);
4134			return (PF_DROP);
4135		} else
4136			(*state)->src.state = PF_TCPS_PROXY_DST;
4137	}
4138	if ((*state)->src.state == PF_TCPS_PROXY_DST) {
4139		if (direction == (*state)->direction) {
4140			if (((th->th_flags & (TH_SYN|TH_ACK)) != TH_ACK) ||
4141			    (ntohl(th->th_ack) != (*state)->src.seqhi + 1) ||
4142			    (ntohl(th->th_seq) != (*state)->src.seqlo + 1)) {
4143				REASON_SET(reason, PFRES_SYNPROXY);
4144				return (PF_DROP);
4145			}
4146			(*state)->src.max_win = MAX(ntohs(th->th_win), 1);
4147			if ((*state)->dst.seqhi == 1)
4148				(*state)->dst.seqhi = htonl(arc4random());
4149			pf_send_tcp(NULL, (*state)->rule.ptr, pd->af,
4150			    &sk->addr[pd->sidx], &sk->addr[pd->didx],
4151			    sk->port[pd->sidx], sk->port[pd->didx],
4152			    (*state)->dst.seqhi, 0, TH_SYN, 0,
4153			    (*state)->src.mss, 0, 0, (*state)->tag, NULL);
4154			REASON_SET(reason, PFRES_SYNPROXY);
4155			return (PF_SYNPROXY_DROP);
4156		} else if (((th->th_flags & (TH_SYN|TH_ACK)) !=
4157		    (TH_SYN|TH_ACK)) ||
4158		    (ntohl(th->th_ack) != (*state)->dst.seqhi + 1)) {
4159			REASON_SET(reason, PFRES_SYNPROXY);
4160			return (PF_DROP);
4161		} else {
4162			(*state)->dst.max_win = MAX(ntohs(th->th_win), 1);
4163			(*state)->dst.seqlo = ntohl(th->th_seq);
4164			pf_send_tcp(NULL, (*state)->rule.ptr, pd->af, pd->dst,
4165			    pd->src, th->th_dport, th->th_sport,
4166			    ntohl(th->th_ack), ntohl(th->th_seq) + 1,
4167			    TH_ACK, (*state)->src.max_win, 0, 0, 0,
4168			    (*state)->tag, NULL);
4169			pf_send_tcp(NULL, (*state)->rule.ptr, pd->af,
4170			    &sk->addr[pd->sidx], &sk->addr[pd->didx],
4171			    sk->port[pd->sidx], sk->port[pd->didx],
4172			    (*state)->src.seqhi + 1, (*state)->src.seqlo + 1,
4173			    TH_ACK, (*state)->dst.max_win, 0, 0, 1, 0, NULL);
4174			(*state)->src.seqdiff = (*state)->dst.seqhi -
4175			    (*state)->src.seqlo;
4176			(*state)->dst.seqdiff = (*state)->src.seqhi -
4177			    (*state)->dst.seqlo;
4178			(*state)->src.seqhi = (*state)->src.seqlo +
4179			    (*state)->dst.max_win;
4180			(*state)->dst.seqhi = (*state)->dst.seqlo +
4181			    (*state)->src.max_win;
4182			(*state)->src.wscale = (*state)->dst.wscale = 0;
4183			(*state)->src.state = (*state)->dst.state =
4184			    TCPS_ESTABLISHED;
4185			REASON_SET(reason, PFRES_SYNPROXY);
4186			return (PF_SYNPROXY_DROP);
4187		}
4188	}
4189
4190	if (((th->th_flags & (TH_SYN|TH_ACK)) == TH_SYN) &&
4191	    dst->state >= TCPS_FIN_WAIT_2 &&
4192	    src->state >= TCPS_FIN_WAIT_2) {
4193		if (V_pf_status.debug >= PF_DEBUG_MISC) {
4194			printf("pf: state reuse ");
4195			pf_print_state(*state);
4196			pf_print_flags(th->th_flags);
4197			printf("\n");
4198		}
4199		/* XXX make sure it's the same direction ?? */
4200		(*state)->src.state = (*state)->dst.state = TCPS_CLOSED;
4201		pf_unlink_state(*state, PF_ENTER_LOCKED);
4202		*state = NULL;
4203		return (PF_DROP);
4204	}
4205
4206	if ((*state)->state_flags & PFSTATE_SLOPPY) {
4207		if (pf_tcp_track_sloppy(src, dst, state, pd, reason) == PF_DROP)
4208			return (PF_DROP);
4209	} else {
4210		if (pf_tcp_track_full(src, dst, state, kif, m, off, pd, reason,
4211		    &copyback) == PF_DROP)
4212			return (PF_DROP);
4213	}
4214
4215	/* translate source/destination address, if necessary */
4216	if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
4217		struct pf_state_key *nk = (*state)->key[pd->didx];
4218
4219		if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], pd->af) ||
4220		    nk->port[pd->sidx] != th->th_sport)
4221			pf_change_ap(pd->src, &th->th_sport, pd->ip_sum,
4222			    &th->th_sum, &nk->addr[pd->sidx],
4223			    nk->port[pd->sidx], 0, pd->af);
4224
4225		if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], pd->af) ||
4226		    nk->port[pd->didx] != th->th_dport)
4227			pf_change_ap(pd->dst, &th->th_dport, pd->ip_sum,
4228			    &th->th_sum, &nk->addr[pd->didx],
4229			    nk->port[pd->didx], 0, pd->af);
4230		copyback = 1;
4231	}
4232
4233	/* Copyback sequence modulation or stateful scrub changes if needed */
4234	if (copyback)
4235		m_copyback(m, off, sizeof(*th), (caddr_t)th);
4236
4237	return (PF_PASS);
4238}
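/*
 * The synproxy handling above, schematically: pf first completes the
 * handshake with the client using its own ISN (src.seqhi), then opens the
 * connection to the server with a second ISN (dst.seqhi); afterwards the
 * seqdiff values translate between the two sequence spaces.
 *
 *	client               pf                    server
 *	  SYN     ---------->
 *	          <----------  SYN|ACK
 *	  ACK     ---------->
 *	                       SYN      ---------->
 *	                                <----------  SYN|ACK
 *	                       ACK      ---------->
 */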
4239
4240static int
4241pf_test_state_udp(struct pf_state **state, int direction, struct pfi_kif *kif,
4242    struct mbuf *m, int off, void *h, struct pf_pdesc *pd)
4243{
4244	struct pf_state_peer	*src, *dst;
4245	struct pf_state_key_cmp	 key;
4246	struct udphdr		*uh = pd->hdr.udp;
4247
4248	bzero(&key, sizeof(key));
4249	key.af = pd->af;
4250	key.proto = IPPROTO_UDP;
4251	if (direction == PF_IN)	{	/* wire side, straight */
4252		PF_ACPY(&key.addr[0], pd->src, key.af);
4253		PF_ACPY(&key.addr[1], pd->dst, key.af);
4254		key.port[0] = uh->uh_sport;
4255		key.port[1] = uh->uh_dport;
4256	} else {			/* stack side, reverse */
4257		PF_ACPY(&key.addr[1], pd->src, key.af);
4258		PF_ACPY(&key.addr[0], pd->dst, key.af);
4259		key.port[1] = uh->uh_sport;
4260		key.port[0] = uh->uh_dport;
4261	}
4262
4263	STATE_LOOKUP(kif, &key, direction, *state, pd);
4264
4265	if (direction == (*state)->direction) {
4266		src = &(*state)->src;
4267		dst = &(*state)->dst;
4268	} else {
4269		src = &(*state)->dst;
4270		dst = &(*state)->src;
4271	}
4272
4273	/* update states */
4274	if (src->state < PFUDPS_SINGLE)
4275		src->state = PFUDPS_SINGLE;
4276	if (dst->state == PFUDPS_SINGLE)
4277		dst->state = PFUDPS_MULTIPLE;
4278
4279	/* update expire time */
4280	(*state)->expire = time_uptime;
4281	if (src->state == PFUDPS_MULTIPLE && dst->state == PFUDPS_MULTIPLE)
4282		(*state)->timeout = PFTM_UDP_MULTIPLE;
4283	else
4284		(*state)->timeout = PFTM_UDP_SINGLE;
4285
4286	/* translate source/destination address, if necessary */
4287	if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
4288		struct pf_state_key *nk = (*state)->key[pd->didx];
4289
4290		if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], pd->af) ||
4291		    nk->port[pd->sidx] != uh->uh_sport)
4292			pf_change_ap(pd->src, &uh->uh_sport, pd->ip_sum,
4293			    &uh->uh_sum, &nk->addr[pd->sidx],
4294			    nk->port[pd->sidx], 1, pd->af);
4295
4296		if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], pd->af) ||
4297		    nk->port[pd->didx] != uh->uh_dport)
4298			pf_change_ap(pd->dst, &uh->uh_dport, pd->ip_sum,
4299			    &uh->uh_sum, &nk->addr[pd->didx],
4300			    nk->port[pd->didx], 1, pd->af);
4301		m_copyback(m, off, sizeof(*uh), (caddr_t)uh);
4302	}
4303
4304	return (PF_PASS);
4305}
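/*
 * The UDP peer state machine above is deliberately simple: a peer moves
 * to PFUDPS_SINGLE once it has sent a packet and is promoted to
 * PFUDPS_MULTIPLE when the other side answers it, at which point the
 * longer PFTM_UDP_MULTIPLE timeout applies.
 */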
4306
4307static int
4308pf_test_state_icmp(struct pf_state **state, int direction, struct pfi_kif *kif,
4309    struct mbuf *m, int off, void *h, struct pf_pdesc *pd, u_short *reason)
4310{
4311	struct pf_addr  *saddr = pd->src, *daddr = pd->dst;
4312	u_int16_t	 icmpid = 0, *icmpsum;
4313	u_int8_t	 icmptype;
4314	int		 state_icmp = 0;
4315	struct pf_state_key_cmp key;
4316
4317	bzero(&key, sizeof(key));
4318	switch (pd->proto) {
4319#ifdef INET
4320	case IPPROTO_ICMP:
4321		icmptype = pd->hdr.icmp->icmp_type;
4322		icmpid = pd->hdr.icmp->icmp_id;
4323		icmpsum = &pd->hdr.icmp->icmp_cksum;
4324
4325		if (icmptype == ICMP_UNREACH ||
4326		    icmptype == ICMP_SOURCEQUENCH ||
4327		    icmptype == ICMP_REDIRECT ||
4328		    icmptype == ICMP_TIMXCEED ||
4329		    icmptype == ICMP_PARAMPROB)
4330			state_icmp++;
4331		break;
4332#endif /* INET */
4333#ifdef INET6
4334	case IPPROTO_ICMPV6:
4335		icmptype = pd->hdr.icmp6->icmp6_type;
4336		icmpid = pd->hdr.icmp6->icmp6_id;
4337		icmpsum = &pd->hdr.icmp6->icmp6_cksum;
4338
4339		if (icmptype == ICMP6_DST_UNREACH ||
4340		    icmptype == ICMP6_PACKET_TOO_BIG ||
4341		    icmptype == ICMP6_TIME_EXCEEDED ||
4342		    icmptype == ICMP6_PARAM_PROB)
4343			state_icmp++;
4344		break;
4345#endif /* INET6 */
4346	}
4347
4348	if (!state_icmp) {
4349
4350		/*
4351		 * ICMP query/reply message not related to a TCP/UDP packet.
4352		 * Search for an ICMP state.
4353		 */
4354		key.af = pd->af;
4355		key.proto = pd->proto;
4356		key.port[0] = key.port[1] = icmpid;
4357		if (direction == PF_IN)	{	/* wire side, straight */
4358			PF_ACPY(&key.addr[0], pd->src, key.af);
4359			PF_ACPY(&key.addr[1], pd->dst, key.af);
4360		} else {			/* stack side, reverse */
4361			PF_ACPY(&key.addr[1], pd->src, key.af);
4362			PF_ACPY(&key.addr[0], pd->dst, key.af);
4363		}
4364
4365		STATE_LOOKUP(kif, &key, direction, *state, pd);
4366
4367		(*state)->expire = time_uptime;
4368		(*state)->timeout = PFTM_ICMP_ERROR_REPLY;
4369
4370		/* translate source/destination address, if necessary */
4371		if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
4372			struct pf_state_key *nk = (*state)->key[pd->didx];
4373
4374			switch (pd->af) {
4375#ifdef INET
4376			case AF_INET:
4377				if (PF_ANEQ(pd->src,
4378				    &nk->addr[pd->sidx], AF_INET))
4379					pf_change_a(&saddr->v4.s_addr,
4380					    pd->ip_sum,
4381					    nk->addr[pd->sidx].v4.s_addr, 0);
4382
4383				if (PF_ANEQ(pd->dst, &nk->addr[pd->didx],
4384				    AF_INET))
4385					pf_change_a(&daddr->v4.s_addr,
4386					    pd->ip_sum,
4387					    nk->addr[pd->didx].v4.s_addr, 0);
4388
4389				if (nk->port[0] !=
4390				    pd->hdr.icmp->icmp_id) {
4391					pd->hdr.icmp->icmp_cksum =
4392					    pf_cksum_fixup(
4393					    pd->hdr.icmp->icmp_cksum, icmpid,
4394					    nk->port[pd->sidx], 0);
4395					pd->hdr.icmp->icmp_id =
4396					    nk->port[pd->sidx];
4397				}
4398
4399				m_copyback(m, off, ICMP_MINLEN,
4400				    (caddr_t )pd->hdr.icmp);
4401				break;
4402#endif /* INET */
4403#ifdef INET6
4404			case AF_INET6:
4405				if (PF_ANEQ(pd->src,
4406				    &nk->addr[pd->sidx], AF_INET6))
4407					pf_change_a6(saddr,
4408					    &pd->hdr.icmp6->icmp6_cksum,
4409					    &nk->addr[pd->sidx], 0);
4410
4411				if (PF_ANEQ(pd->dst,
4412				    &nk->addr[pd->didx], AF_INET6))
4413					pf_change_a6(daddr,
4414					    &pd->hdr.icmp6->icmp6_cksum,
4415					    &nk->addr[pd->didx], 0);
4416
4417				m_copyback(m, off, sizeof(struct icmp6_hdr),
4418				    (caddr_t )pd->hdr.icmp6);
4419				break;
4420#endif /* INET6 */
4421			}
4422		}
4423		return (PF_PASS);
4424
4425	} else {
4426		/*
4427		 * ICMP error message in response to a TCP/UDP packet.
4428		 * Extract the inner TCP/UDP header and search for that state.
4429		 */
4430
4431		struct pf_pdesc	pd2;
4432		bzero(&pd2, sizeof pd2);
4433#ifdef INET
4434		struct ip	h2;
4435#endif /* INET */
4436#ifdef INET6
4437		struct ip6_hdr	h2_6;
4438		int		terminal = 0;
4439#endif /* INET6 */
4440		int		ipoff2 = 0;
4441		int		off2 = 0;
4442
4443		pd2.af = pd->af;
4444		/* Payload packet is from the opposite direction. */
4445		pd2.sidx = (direction == PF_IN) ? 1 : 0;
4446		pd2.didx = (direction == PF_IN) ? 0 : 1;
4447		switch (pd->af) {
4448#ifdef INET
4449		case AF_INET:
4450			/* offset of h2 in mbuf chain */
4451			ipoff2 = off + ICMP_MINLEN;
4452
4453			if (!pf_pull_hdr(m, ipoff2, &h2, sizeof(h2),
4454			    NULL, reason, pd2.af)) {
4455				DPFPRINTF(PF_DEBUG_MISC,
4456				    ("pf: ICMP error message too short "
4457				    "(ip)\n"));
4458				return (PF_DROP);
4459			}
4460			/*
4461			 * ICMP error messages don't refer to non-first
4462			 * fragments
4463			 */
4464			if (h2.ip_off & htons(IP_OFFMASK)) {
4465				REASON_SET(reason, PFRES_FRAG);
4466				return (PF_DROP);
4467			}
4468
4469			/* offset of protocol header that follows h2 */
4470			off2 = ipoff2 + (h2.ip_hl << 2);
4471
4472			pd2.proto = h2.ip_p;
4473			pd2.src = (struct pf_addr *)&h2.ip_src;
4474			pd2.dst = (struct pf_addr *)&h2.ip_dst;
4475			pd2.ip_sum = &h2.ip_sum;
4476			break;
4477#endif /* INET */
4478#ifdef INET6
4479		case AF_INET6:
4480			ipoff2 = off + sizeof(struct icmp6_hdr);
4481
4482			if (!pf_pull_hdr(m, ipoff2, &h2_6, sizeof(h2_6),
4483			    NULL, reason, pd2.af)) {
4484				DPFPRINTF(PF_DEBUG_MISC,
4485				    ("pf: ICMP error message too short "
4486				    "(ip6)\n"));
4487				return (PF_DROP);
4488			}
4489			pd2.proto = h2_6.ip6_nxt;
4490			pd2.src = (struct pf_addr *)&h2_6.ip6_src;
4491			pd2.dst = (struct pf_addr *)&h2_6.ip6_dst;
4492			pd2.ip_sum = NULL;
4493			off2 = ipoff2 + sizeof(h2_6);
4494			do {
4495				switch (pd2.proto) {
4496				case IPPROTO_FRAGMENT:
4497					/*
4498					 * ICMPv6 error messages don't
4499					 * refer to non-first fragments
4500					 */
4501					REASON_SET(reason, PFRES_FRAG);
4502					return (PF_DROP);
4503				case IPPROTO_AH:
4504				case IPPROTO_HOPOPTS:
4505				case IPPROTO_ROUTING:
4506				case IPPROTO_DSTOPTS: {
4507					/* get next header and header length */
4508					struct ip6_ext opt6;
4509
4510					if (!pf_pull_hdr(m, off2, &opt6,
4511					    sizeof(opt6), NULL, reason,
4512					    pd2.af)) {
4513						DPFPRINTF(PF_DEBUG_MISC,
4514						    ("pf: ICMPv6 short opt\n"));
4515						return (PF_DROP);
4516					}
4517					if (pd2.proto == IPPROTO_AH)
4518						off2 += (opt6.ip6e_len + 2) * 4;
4519					else
4520						off2 += (opt6.ip6e_len + 1) * 8;
4521					pd2.proto = opt6.ip6e_nxt;
4522					/* go to the next header */
4523					break;
4524				}
4525				default:
4526					terminal++;
4527					break;
4528				}
4529			} while (!terminal);
4530			break;
4531#endif /* INET6 */
4532		}
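		/*
		 * Example of the index flip above: for an inbound ICMP error
		 * (direction == PF_IN) the quoted packet is one we sent, so
		 * pd2.src is our own source address and maps to key slot 1,
		 * while pd2.dst maps to slot 0.
		 */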
4533
4534		switch (pd2.proto) {
4535		case IPPROTO_TCP: {
4536			struct tcphdr		 th;
4537			u_int32_t		 seq;
4538			struct pf_state_peer	*src, *dst;
4539			u_int8_t		 dws;
4540			int			 copyback = 0;
4541
4542			/*
4543			 * Only the first 8 bytes of the TCP header are
4544			 * guaranteed to be present; don't access any fields
4545			 * after th_seq.  An ackskew test is not possible.
4546			 */
4547			if (!pf_pull_hdr(m, off2, &th, 8, NULL, reason,
4548			    pd2.af)) {
4549				DPFPRINTF(PF_DEBUG_MISC,
4550				    ("pf: ICMP error message too short "
4551				    "(tcp)\n"));
4552				return (PF_DROP);
4553			}
4554
4555			key.af = pd2.af;
4556			key.proto = IPPROTO_TCP;
4557			PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
4558			PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
4559			key.port[pd2.sidx] = th.th_sport;
4560			key.port[pd2.didx] = th.th_dport;
4561
4562			STATE_LOOKUP(kif, &key, direction, *state, pd);
4563
4564			if (direction == (*state)->direction) {
4565				src = &(*state)->dst;
4566				dst = &(*state)->src;
4567			} else {
4568				src = &(*state)->src;
4569				dst = &(*state)->dst;
4570			}
4571
4572			if (src->wscale && dst->wscale)
4573				dws = dst->wscale & PF_WSCALE_MASK;
4574			else
4575				dws = 0;
4576
4577			/* Demodulate sequence number */
4578			seq = ntohl(th.th_seq) - src->seqdiff;
4579			if (src->seqdiff) {
4580				pf_change_a(&th.th_seq, icmpsum,
4581				    htonl(seq), 0);
4582				copyback = 1;
4583			}
4584
4585			if (!((*state)->state_flags & PFSTATE_SLOPPY) &&
4586			    (!SEQ_GEQ(src->seqhi, seq) ||
4587			    !SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)))) {
4588				if (V_pf_status.debug >= PF_DEBUG_MISC) {
4589					printf("pf: BAD ICMP %d:%d ",
4590					    icmptype, pd->hdr.icmp->icmp_code);
4591					pf_print_host(pd->src, 0, pd->af);
4592					printf(" -> ");
4593					pf_print_host(pd->dst, 0, pd->af);
4594					printf(" state: ");
4595					pf_print_state(*state);
4596					printf(" seq=%u\n", seq);
4597				}
4598				REASON_SET(reason, PFRES_BADSTATE);
4599				return (PF_DROP);
4600			} else {
4601				if (V_pf_status.debug >= PF_DEBUG_MISC) {
4602					printf("pf: OK ICMP %d:%d ",
4603					    icmptype, pd->hdr.icmp->icmp_code);
4604					pf_print_host(pd->src, 0, pd->af);
4605					printf(" -> ");
4606					pf_print_host(pd->dst, 0, pd->af);
4607					printf(" state: ");
4608					pf_print_state(*state);
4609					printf(" seq=%u\n", seq);
4610				}
4611			}
4612
4613			/* translate source/destination address, if necessary */
4614			if ((*state)->key[PF_SK_WIRE] !=
4615			    (*state)->key[PF_SK_STACK]) {
4616				struct pf_state_key *nk =
4617				    (*state)->key[pd->didx];
4618
4619				if (PF_ANEQ(pd2.src,
4620				    &nk->addr[pd2.sidx], pd2.af) ||
4621				    nk->port[pd2.sidx] != th.th_sport)
4622					pf_change_icmp(pd2.src, &th.th_sport,
4623					    daddr, &nk->addr[pd2.sidx],
4624					    nk->port[pd2.sidx], NULL,
4625					    pd2.ip_sum, icmpsum,
4626					    pd->ip_sum, 0, pd2.af);
4627
4628				if (PF_ANEQ(pd2.dst,
4629				    &nk->addr[pd2.didx], pd2.af) ||
4630				    nk->port[pd2.didx] != th.th_dport)
4631					pf_change_icmp(pd2.dst, &th.th_dport,
4632					    NULL, /* XXX Inbound NAT? */
4633					    &nk->addr[pd2.didx],
4634					    nk->port[pd2.didx], NULL,
4635					    pd2.ip_sum, icmpsum,
4636					    pd->ip_sum, 0, pd2.af);
4637				copyback = 1;
4638			}
4639
4640			if (copyback) {
4641				switch (pd2.af) {
4642#ifdef INET
4643				case AF_INET:
4644					m_copyback(m, off, ICMP_MINLEN,
4645					    (caddr_t )pd->hdr.icmp);
4646					m_copyback(m, ipoff2, sizeof(h2),
4647					    (caddr_t )&h2);
4648					break;
4649#endif /* INET */
4650#ifdef INET6
4651				case AF_INET6:
4652					m_copyback(m, off,
4653					    sizeof(struct icmp6_hdr),
4654					    (caddr_t )pd->hdr.icmp6);
4655					m_copyback(m, ipoff2, sizeof(h2_6),
4656					    (caddr_t )&h2_6);
4657					break;
4658#endif /* INET6 */
4659				}
4660				m_copyback(m, off2, 8, (caddr_t)&th);
4661			}
4662
4663			return (PF_PASS);
4664			break;
4665		}
4666		case IPPROTO_UDP: {
4667			struct udphdr		uh;
4668
4669			if (!pf_pull_hdr(m, off2, &uh, sizeof(uh),
4670			    NULL, reason, pd2.af)) {
4671				DPFPRINTF(PF_DEBUG_MISC,
4672				    ("pf: ICMP error message too short "
4673				    "(udp)\n"));
4674				return (PF_DROP);
4675			}
4676
4677			key.af = pd2.af;
4678			key.proto = IPPROTO_UDP;
4679			PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
4680			PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
4681			key.port[pd2.sidx] = uh.uh_sport;
4682			key.port[pd2.didx] = uh.uh_dport;
4683
4684			STATE_LOOKUP(kif, &key, direction, *state, pd);
4685
4686			/* translate source/destination address, if necessary */
4687			if ((*state)->key[PF_SK_WIRE] !=
4688			    (*state)->key[PF_SK_STACK]) {
4689				struct pf_state_key *nk =
4690				    (*state)->key[pd->didx];
4691
4692				if (PF_ANEQ(pd2.src,
4693				    &nk->addr[pd2.sidx], pd2.af) ||
4694				    nk->port[pd2.sidx] != uh.uh_sport)
4695					pf_change_icmp(pd2.src, &uh.uh_sport,
4696					    daddr, &nk->addr[pd2.sidx],
4697					    nk->port[pd2.sidx], &uh.uh_sum,
4698					    pd2.ip_sum, icmpsum,
4699					    pd->ip_sum, 1, pd2.af);
4700
4701				if (PF_ANEQ(pd2.dst,
4702				    &nk->addr[pd2.didx], pd2.af) ||
4703				    nk->port[pd2.didx] != uh.uh_dport)
4704					pf_change_icmp(pd2.dst, &uh.uh_dport,
4705					    NULL, /* XXX Inbound NAT? */
4706					    &nk->addr[pd2.didx],
4707					    nk->port[pd2.didx], &uh.uh_sum,
4708					    pd2.ip_sum, icmpsum,
4709					    pd->ip_sum, 1, pd2.af);
4710
4711				switch (pd2.af) {
4712#ifdef INET
4713				case AF_INET:
4714					m_copyback(m, off, ICMP_MINLEN,
4715					    (caddr_t )pd->hdr.icmp);
4716					m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2);
4717					break;
4718#endif /* INET */
4719#ifdef INET6
4720				case AF_INET6:
4721					m_copyback(m, off,
4722					    sizeof(struct icmp6_hdr),
4723					    (caddr_t )pd->hdr.icmp6);
4724					m_copyback(m, ipoff2, sizeof(h2_6),
4725					    (caddr_t )&h2_6);
4726					break;
4727#endif /* INET6 */
4728				}
4729				m_copyback(m, off2, sizeof(uh), (caddr_t)&uh);
4730			}
4731			return (PF_PASS);
4732			break;
4733		}
4734#ifdef INET
4735		case IPPROTO_ICMP: {
4736			struct icmp		iih;
4737
4738			if (!pf_pull_hdr(m, off2, &iih, ICMP_MINLEN,
4739			    NULL, reason, pd2.af)) {
4740				DPFPRINTF(PF_DEBUG_MISC,
4741				    ("pf: ICMP error message too short "
4742				    "(icmp)\n"));
4743				return (PF_DROP);
4744			}
4745
4746			key.af = pd2.af;
4747			key.proto = IPPROTO_ICMP;
4748			PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
4749			PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
4750			key.port[0] = key.port[1] = iih.icmp_id;
4751
4752			STATE_LOOKUP(kif, &key, direction, *state, pd);
4753
4754			/* translate source/destination address, if necessary */
4755			if ((*state)->key[PF_SK_WIRE] !=
4756			    (*state)->key[PF_SK_STACK]) {
4757				struct pf_state_key *nk =
4758				    (*state)->key[pd->didx];
4759
4760				if (PF_ANEQ(pd2.src,
4761				    &nk->addr[pd2.sidx], pd2.af) ||
4762				    nk->port[pd2.sidx] != iih.icmp_id)
4763					pf_change_icmp(pd2.src, &iih.icmp_id,
4764					    daddr, &nk->addr[pd2.sidx],
4765					    nk->port[pd2.sidx], NULL,
4766					    pd2.ip_sum, icmpsum,
4767					    pd->ip_sum, 0, AF_INET);
4768
4769				if (PF_ANEQ(pd2.dst,
4770				    &nk->addr[pd2.didx], pd2.af) ||
4771				    nk->port[pd2.didx] != iih.icmp_id)
4772					pf_change_icmp(pd2.dst, &iih.icmp_id,
4773					    NULL, /* XXX Inbound NAT? */
4774					    &nk->addr[pd2.didx],
4775					    nk->port[pd2.didx], NULL,
4776					    pd2.ip_sum, icmpsum,
4777					    pd->ip_sum, 0, AF_INET);
4778
4779				m_copyback(m, off, ICMP_MINLEN, (caddr_t)pd->hdr.icmp);
4780				m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2);
4781				m_copyback(m, off2, ICMP_MINLEN, (caddr_t)&iih);
4782			}
4783			return (PF_PASS);
4784			break;
4785		}
4786#endif /* INET */
4787#ifdef INET6
4788		case IPPROTO_ICMPV6: {
4789			struct icmp6_hdr	iih;
4790
4791			if (!pf_pull_hdr(m, off2, &iih,
4792			    sizeof(struct icmp6_hdr), NULL, reason, pd2.af)) {
4793				DPFPRINTF(PF_DEBUG_MISC,
4794				    ("pf: ICMP error message too short "
4795				    "(icmp6)\n"));
4796				return (PF_DROP);
4797			}
4798
4799			key.af = pd2.af;
4800			key.proto = IPPROTO_ICMPV6;
4801			PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
4802			PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
4803			key.port[0] = key.port[1] = iih.icmp6_id;
4804
4805			STATE_LOOKUP(kif, &key, direction, *state, pd);
4806
4807			/* translate source/destination address, if necessary */
4808			if ((*state)->key[PF_SK_WIRE] !=
4809			    (*state)->key[PF_SK_STACK]) {
4810				struct pf_state_key *nk =
4811				    (*state)->key[pd->didx];
4812
4813				if (PF_ANEQ(pd2.src,
4814				    &nk->addr[pd2.sidx], pd2.af) ||
4815				    nk->port[pd2.sidx] != iih.icmp6_id)
4816					pf_change_icmp(pd2.src, &iih.icmp6_id,
4817					    daddr, &nk->addr[pd2.sidx],
4818					    nk->port[pd2.sidx], NULL,
4819					    pd2.ip_sum, icmpsum,
4820					    pd->ip_sum, 0, AF_INET6);
4821
4822				if (PF_ANEQ(pd2.dst,
4823				    &nk->addr[pd2.didx], pd2.af) ||
4824				    nk->port[pd2.didx] != iih.icmp6_id)
4825					pf_change_icmp(pd2.dst, &iih.icmp6_id,
4826					    NULL, /* XXX Inbound NAT? */
4827					    &nk->addr[pd2.didx],
4828					    nk->port[pd2.didx], NULL,
4829					    pd2.ip_sum, icmpsum,
4830					    pd->ip_sum, 0, AF_INET6);
4831
4832				m_copyback(m, off, sizeof(struct icmp6_hdr),
4833				    (caddr_t)pd->hdr.icmp6);
4834				m_copyback(m, ipoff2, sizeof(h2_6), (caddr_t)&h2_6);
4835				m_copyback(m, off2, sizeof(struct icmp6_hdr),
4836				    (caddr_t)&iih);
4837			}
4838			return (PF_PASS);
4839			break;
4840		}
4841#endif /* INET6 */
4842		default: {
4843			key.af = pd2.af;
4844			key.proto = pd2.proto;
4845			PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
4846			PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
4847			key.port[0] = key.port[1] = 0;
4848
4849			STATE_LOOKUP(kif, &key, direction, *state, pd);
4850
4851			/* translate source/destination address, if necessary */
4852			if ((*state)->key[PF_SK_WIRE] !=
4853			    (*state)->key[PF_SK_STACK]) {
4854				struct pf_state_key *nk =
4855				    (*state)->key[pd->didx];
4856
4857				if (PF_ANEQ(pd2.src,
4858				    &nk->addr[pd2.sidx], pd2.af))
4859					pf_change_icmp(pd2.src, NULL, daddr,
4860					    &nk->addr[pd2.sidx], 0, NULL,
4861					    pd2.ip_sum, icmpsum,
4862					    pd->ip_sum, 0, pd2.af);
4863
4864				if (PF_ANEQ(pd2.dst,
4865				    &nk->addr[pd2.didx], pd2.af))
4866					pf_change_icmp(pd2.dst, NULL,
4867					    NULL, /* XXX Inbound NAT? */
4868					    &nk->addr[pd2.didx], 0, NULL,
4869					    pd2.ip_sum, icmpsum,
4870					    pd->ip_sum, 0, pd2.af);
4871
4872				switch (pd2.af) {
4873#ifdef INET
4874				case AF_INET:
4875					m_copyback(m, off, ICMP_MINLEN,
4876					    (caddr_t)pd->hdr.icmp);
4877					m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2);
4878					break;
4879#endif /* INET */
4880#ifdef INET6
4881				case AF_INET6:
4882					m_copyback(m, off,
4883					    sizeof(struct icmp6_hdr),
4884					    (caddr_t )pd->hdr.icmp6);
4885					m_copyback(m, ipoff2, sizeof(h2_6),
4886					    (caddr_t )&h2_6);
4887					break;
4888#endif /* INET6 */
4889				}
4890			}
4891			return (PF_PASS);
4892			break;
4893		}
4894		}
4895	}
4896}
4897
4898static int
4899pf_test_state_other(struct pf_state **state, int direction, struct pfi_kif *kif,
4900    struct mbuf *m, struct pf_pdesc *pd)
4901{
4902	struct pf_state_peer	*src, *dst;
4903	struct pf_state_key_cmp	 key;
4904
4905	bzero(&key, sizeof(key));
4906	key.af = pd->af;
4907	key.proto = pd->proto;
4908	if (direction == PF_IN)	{
4909		PF_ACPY(&key.addr[0], pd->src, key.af);
4910		PF_ACPY(&key.addr[1], pd->dst, key.af);
4911		key.port[0] = key.port[1] = 0;
4912	} else {
4913		PF_ACPY(&key.addr[1], pd->src, key.af);
4914		PF_ACPY(&key.addr[0], pd->dst, key.af);
4915		key.port[1] = key.port[0] = 0;
4916	}
4917
4918	STATE_LOOKUP(kif, &key, direction, *state, pd);
4919
4920	if (direction == (*state)->direction) {
4921		src = &(*state)->src;
4922		dst = &(*state)->dst;
4923	} else {
4924		src = &(*state)->dst;
4925		dst = &(*state)->src;
4926	}
4927
4928	/* update states */
4929	if (src->state < PFOTHERS_SINGLE)
4930		src->state = PFOTHERS_SINGLE;
4931	if (dst->state == PFOTHERS_SINGLE)
4932		dst->state = PFOTHERS_MULTIPLE;
4933
4934	/* update expire time */
4935	(*state)->expire = time_uptime;
4936	if (src->state == PFOTHERS_MULTIPLE && dst->state == PFOTHERS_MULTIPLE)
4937		(*state)->timeout = PFTM_OTHER_MULTIPLE;
4938	else
4939		(*state)->timeout = PFTM_OTHER_SINGLE;
4940
4941	/* translate source/destination address, if necessary */
4942	if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
4943		struct pf_state_key *nk = (*state)->key[pd->didx];
4944
4945		KASSERT(nk, ("%s: nk is null", __func__));
4946		KASSERT(pd, ("%s: pd is null", __func__));
4947		KASSERT(pd->src, ("%s: pd->src is null", __func__));
4948		KASSERT(pd->dst, ("%s: pd->dst is null", __func__));
4949		switch (pd->af) {
4950#ifdef INET
4951		case AF_INET:
4952			if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], AF_INET))
4953				pf_change_a(&pd->src->v4.s_addr,
4954				    pd->ip_sum,
4955				    nk->addr[pd->sidx].v4.s_addr,
4956				    0);
4957
4958
4959			if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], AF_INET))
4960				pf_change_a(&pd->dst->v4.s_addr,
4961				    pd->ip_sum,
4962				    nk->addr[pd->didx].v4.s_addr,
4963				    0);
4964
4965			break;
4966#endif /* INET */
4967#ifdef INET6
4968		case AF_INET6:
4969			if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], AF_INET6))
4970				PF_ACPY(pd->src, &nk->addr[pd->sidx], pd->af);
4971
4972			if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], AF_INET6))
4973				PF_ACPY(pd->dst, &nk->addr[pd->didx], pd->af);
4974#endif /* INET6 */
4975		}
4976	}
4977	return (PF_PASS);
4978}
4979
4980/*
4981 * ipoff and off are measured from the start of the mbuf chain.
4982 * h must be at "ipoff" on the mbuf chain.
4983 */
4984void *
4985pf_pull_hdr(struct mbuf *m, int off, void *p, int len,
4986    u_short *actionp, u_short *reasonp, sa_family_t af)
4987{
4988	switch (af) {
4989#ifdef INET
4990	case AF_INET: {
4991		struct ip	*h = mtod(m, struct ip *);
4992		u_int16_t	 fragoff = (ntohs(h->ip_off) & IP_OFFMASK) << 3;
4993
4994		if (fragoff) {
4995			if (fragoff >= len)
4996				ACTION_SET(actionp, PF_PASS);
4997			else {
4998				ACTION_SET(actionp, PF_DROP);
4999				REASON_SET(reasonp, PFRES_FRAG);
5000			}
5001			return (NULL);
5002		}
5003		if (m->m_pkthdr.len < off + len ||
5004		    ntohs(h->ip_len) < off + len) {
5005			ACTION_SET(actionp, PF_DROP);
5006			REASON_SET(reasonp, PFRES_SHORT);
5007			return (NULL);
5008		}
5009		break;
5010	}
5011#endif /* INET */
5012#ifdef INET6
5013	case AF_INET6: {
5014		struct ip6_hdr	*h = mtod(m, struct ip6_hdr *);
5015
5016		if (m->m_pkthdr.len < off + len ||
5017		    (ntohs(h->ip6_plen) + sizeof(struct ip6_hdr)) <
5018		    (unsigned)(off + len)) {
5019			ACTION_SET(actionp, PF_DROP);
5020			REASON_SET(reasonp, PFRES_SHORT);
5021			return (NULL);
5022		}
5023		break;
5024	}
5025#endif /* INET6 */
5026	}
5027	m_copydata(m, off, len, p);
5028	return (p);
5029}
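/*
 * Typical call pattern for pf_pull_hdr(), mirroring its uses in pf_test()
 * below: on failure the action and reason selected above are already set
 * for the caller, e.g.
 *
 *	if (!pf_pull_hdr(m, off, &th, sizeof(th), &action, &reason,
 *	    AF_INET)) {
 *		log = action != PF_PASS;
 *		goto done;
 *	}
 */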
5030
5031int
5032pf_routable(struct pf_addr *addr, sa_family_t af, struct pfi_kif *kif,
5033    int rtableid)
5034{
5035#ifdef RADIX_MPATH
5036	struct radix_node_head	*rnh;
5037#endif
5038	struct sockaddr_in	*dst;
5039	int			 ret = 1;
5040	int			 check_mpath;
5041#ifdef INET6
5042	struct sockaddr_in6	*dst6;
5043	struct route_in6	 ro;
5044#else
5045	struct route		 ro;
5046#endif
5047	struct radix_node	*rn;
5048	struct rtentry		*rt;
5049	struct ifnet		*ifp;
5050
5051	check_mpath = 0;
5052#ifdef RADIX_MPATH
5053	/* XXX: stick to table 0 for now */
5054	rnh = rt_tables_get_rnh(0, af);
5055	if (rnh != NULL && rn_mpath_capable(rnh))
5056		check_mpath = 1;
5057#endif
5058	bzero(&ro, sizeof(ro));
5059	switch (af) {
5060	case AF_INET:
5061		dst = satosin(&ro.ro_dst);
5062		dst->sin_family = AF_INET;
5063		dst->sin_len = sizeof(*dst);
5064		dst->sin_addr = addr->v4;
5065		break;
5066#ifdef INET6
5067	case AF_INET6:
5068		/*
5069		 * Skip check for addresses with embedded interface scope,
5070		 * as they would always match anyway.
5071		 */
5072		if (IN6_IS_SCOPE_EMBED(&addr->v6))
5073			goto out;
5074		dst6 = (struct sockaddr_in6 *)&ro.ro_dst;
5075		dst6->sin6_family = AF_INET6;
5076		dst6->sin6_len = sizeof(*dst6);
5077		dst6->sin6_addr = addr->v6;
5078		break;
5079#endif /* INET6 */
5080	default:
5081		return (0);
5082	}
5083
5084	/* Skip checks for ipsec interfaces */
5085	if (kif != NULL && kif->pfik_ifp->if_type == IFT_ENC)
5086		goto out;
5087
5088	switch (af) {
5089#ifdef INET6
5090	case AF_INET6:
5091		in6_rtalloc_ign(&ro, 0, rtableid);
5092		break;
5093#endif
5094#ifdef INET
5095	case AF_INET:
5096		in_rtalloc_ign((struct route *)&ro, 0, rtableid);
5097		break;
5098#endif
5099	default:
5100		rtalloc_ign((struct route *)&ro, 0);	/* No/default FIB. */
5101		break;
5102	}
5103
5104	if (ro.ro_rt != NULL) {
5105		/* No interface given, this is a no-route check */
5106		if (kif == NULL)
5107			goto out;
5108
5109		if (kif->pfik_ifp == NULL) {
5110			ret = 0;
5111			goto out;
5112		}
5113
5114		/* Perform uRPF check if an input interface was given */
5115		ret = 0;
5116		rn = (struct radix_node *)ro.ro_rt;
5117		do {
5118			rt = (struct rtentry *)rn;
5119			ifp = rt->rt_ifp;
5120
5121			if (kif->pfik_ifp == ifp)
5122				ret = 1;
5123#ifdef RADIX_MPATH
5124			rn = rn_mpath_next(rn);
5125#endif
5126		} while (check_mpath == 1 && rn != NULL && ret == 0);
5127	} else
5128		ret = 0;
5129out:
5130	if (ro.ro_rt != NULL)
5131		RTFREE(ro.ro_rt);
5132	return (ret);
5133}
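/*
 * pf_routable() backs both the plain no-route check (kif == NULL) and the
 * uRPF check behind the urpf-failed address keyword; a hypothetical
 * ruleset line exercising the latter:
 *
 *	block in quick from urpf-failed
 */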
5134
5135#ifdef INET
5136static void
5137pf_route(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp,
5138    struct pf_state *s, struct pf_pdesc *pd)
5139{
5140	struct mbuf		*m0, *m1;
5141	struct sockaddr_in	dst;
5142	struct ip		*ip;
5143	struct ifnet		*ifp = NULL;
5144	struct pf_addr		 naddr;
5145	struct pf_src_node	*sn = NULL;
5146	int			 error = 0;
5147	uint16_t		 ip_len, ip_off;
5148
5149	KASSERT(m && *m && r && oifp, ("%s: invalid parameters", __func__));
5150	KASSERT(dir == PF_IN || dir == PF_OUT, ("%s: invalid direction",
5151	    __func__));
5152
5153	if ((pd->pf_mtag == NULL &&
5154	    ((pd->pf_mtag = pf_get_mtag(*m)) == NULL)) ||
5155	    pd->pf_mtag->routed++ > 3) {
5156		m0 = *m;
5157		*m = NULL;
5158		goto bad_locked;
5159	}
5160
5161	if (r->rt == PF_DUPTO) {
5162		if ((m0 = m_dup(*m, M_NOWAIT)) == NULL) {
5163			if (s)
5164				PF_STATE_UNLOCK(s);
5165			return;
5166		}
5167	} else {
5168		if ((r->rt == PF_REPLYTO) == (r->direction == dir)) {
5169			if (s)
5170				PF_STATE_UNLOCK(s);
5171			return;
5172		}
5173		m0 = *m;
5174	}
5175
5176	ip = mtod(m0, struct ip *);
5177
5178	bzero(&dst, sizeof(dst));
5179	dst.sin_family = AF_INET;
5180	dst.sin_len = sizeof(dst);
5181	dst.sin_addr = ip->ip_dst;
5182
5183	if (r->rt == PF_FASTROUTE) {
5184		struct rtentry *rt;
5185
5186		if (s)
5187			PF_STATE_UNLOCK(s);
5188		rt = rtalloc1_fib(sintosa(&dst), 0, 0, M_GETFIB(m0));
5189		if (rt == NULL) {
5191			KMOD_IPSTAT_INC(ips_noroute);
5192			error = EHOSTUNREACH;
5193			goto bad;
5194		}
5195
5196		ifp = rt->rt_ifp;
5197		rt->rt_rmx.rmx_pksent++;
5198
5199		if (rt->rt_flags & RTF_GATEWAY)
5200			bcopy(satosin(rt->rt_gateway), &dst, sizeof(dst));
5201		RTFREE_LOCKED(rt);
5202	} else {
5203		if (TAILQ_EMPTY(&r->rpool.list)) {
5204			DPFPRINTF(PF_DEBUG_URGENT,
5205			    ("%s: TAILQ_EMPTY(&r->rpool.list)\n", __func__));
5206			goto bad_locked;
5207		}
5208		if (s == NULL) {
5209			pf_map_addr(AF_INET, r, (struct pf_addr *)&ip->ip_src,
5210			    &naddr, NULL, &sn);
5211			if (!PF_AZERO(&naddr, AF_INET))
5212				dst.sin_addr.s_addr = naddr.v4.s_addr;
5213			ifp = r->rpool.cur->kif ?
5214			    r->rpool.cur->kif->pfik_ifp : NULL;
5215		} else {
5216			if (!PF_AZERO(&s->rt_addr, AF_INET))
5217				dst.sin_addr.s_addr =
5218				    s->rt_addr.v4.s_addr;
5219			ifp = s->rt_kif ? s->rt_kif->pfik_ifp : NULL;
5220			PF_STATE_UNLOCK(s);
5221		}
5222	}
5223	if (ifp == NULL)
5224		goto bad;
5225
5226	if (oifp != ifp) {
5227		if (pf_test(PF_OUT, ifp, &m0, NULL) != PF_PASS)
5228			goto bad;
5229		else if (m0 == NULL)
5230			goto done;
5231		if (m0->m_len < sizeof(struct ip)) {
5232			DPFPRINTF(PF_DEBUG_URGENT,
5233			    ("%s: m0->m_len < sizeof(struct ip)\n", __func__));
5234			goto bad;
5235		}
5236		ip = mtod(m0, struct ip *);
5237	}
5238
5239	if (ifp->if_flags & IFF_LOOPBACK)
5240		m0->m_flags |= M_SKIP_FIREWALL;
5241
5242	ip_len = ntohs(ip->ip_len);
5243	ip_off = ntohs(ip->ip_off);
5244
5245	/* Copied from FreeBSD 10.0-CURRENT ip_output. */
5246	m0->m_pkthdr.csum_flags |= CSUM_IP;
5247	if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA & ~ifp->if_hwassist) {
5248		in_delayed_cksum(m0);
5249		m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
5250	}
5251#ifdef SCTP
5252	if (m0->m_pkthdr.csum_flags & CSUM_SCTP & ~ifp->if_hwassist) {
5253		sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2));
5254		m0->m_pkthdr.csum_flags &= ~CSUM_SCTP;
5255	}
5256#endif
5257
5258	/*
5259	 * If small enough for interface, or the interface will take
5260	 * care of the fragmentation for us, we can just send directly.
5261	 */
5262	if (ip_len <= ifp->if_mtu ||
5263	    (m0->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0 ||
5264	    ((ip_off & IP_DF) == 0 && (ifp->if_hwassist & CSUM_FRAGMENT))) {
5265		ip->ip_sum = 0;
5266		if (m0->m_pkthdr.csum_flags & CSUM_IP & ~ifp->if_hwassist) {
5267			ip->ip_sum = in_cksum(m0, ip->ip_hl << 2);
5268			m0->m_pkthdr.csum_flags &= ~CSUM_IP;
5269		}
5270		m0->m_flags &= ~(M_PROTOFLAGS);
5271		error = (*ifp->if_output)(ifp, m0, sintosa(&dst), NULL);
5272		goto done;
5273	}
5274
5275	/* Balk when the DF bit is set or the interface doesn't support TSO. */
5276	if ((ip_off & IP_DF) || (m0->m_pkthdr.csum_flags & CSUM_TSO)) {
5277		error = EMSGSIZE;
5278		KMOD_IPSTAT_INC(ips_cantfrag);
5279		if (r->rt != PF_DUPTO) {
5280			icmp_error(m0, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG, 0,
5281			    ifp->if_mtu);
5282			goto done;
5283		} else
5284			goto bad;
5285	}
5286
5287	error = ip_fragment(ip, &m0, ifp->if_mtu, ifp->if_hwassist);
5288	if (error)
5289		goto bad;
5290
5291	for (; m0; m0 = m1) {
5292		m1 = m0->m_nextpkt;
5293		m0->m_nextpkt = NULL;
5294		if (error == 0) {
5295			m0->m_flags &= ~(M_PROTOFLAGS);
5296			error = (*ifp->if_output)(ifp, m0, sintosa(&dst), NULL);
5297		} else
5298			m_freem(m0);
5299	}
5300
5301	if (error == 0)
5302		KMOD_IPSTAT_INC(ips_fragmented);
5303
5304done:
5305	if (r->rt != PF_DUPTO)
5306		*m = NULL;
5307	return;
5308
5309bad_locked:
5310	if (s)
5311		PF_STATE_UNLOCK(s);
5312bad:
5313	m_freem(m0);
5314	goto done;
5315}
5316#endif /* INET */
5317
5318#ifdef INET6
5319static void
5320pf_route6(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp,
5321    struct pf_state *s, struct pf_pdesc *pd)
5322{
5323	struct mbuf		*m0;
5324	struct sockaddr_in6	dst;
5325	struct ip6_hdr		*ip6;
5326	struct ifnet		*ifp = NULL;
5327	struct pf_addr		 naddr;
5328	struct pf_src_node	*sn = NULL;
5329
5330	KASSERT(m && *m && r && oifp, ("%s: invalid parameters", __func__));
5331	KASSERT(dir == PF_IN || dir == PF_OUT, ("%s: invalid direction",
5332	    __func__));
5333
5334	if ((pd->pf_mtag == NULL &&
5335	    ((pd->pf_mtag = pf_get_mtag(*m)) == NULL)) ||
5336	    pd->pf_mtag->routed++ > 3) {
5337		m0 = *m;
5338		*m = NULL;
5339		goto bad_locked;
5340	}
5341
5342	if (r->rt == PF_DUPTO) {
5343		if ((m0 = m_dup(*m, M_NOWAIT)) == NULL) {
5344			if (s)
5345				PF_STATE_UNLOCK(s);
5346			return;
5347		}
5348	} else {
5349		if ((r->rt == PF_REPLYTO) == (r->direction == dir)) {
5350			if (s)
5351				PF_STATE_UNLOCK(s);
5352			return;
5353		}
5354		m0 = *m;
5355	}
5356
5357	ip6 = mtod(m0, struct ip6_hdr *);
5358
5359	bzero(&dst, sizeof(dst));
5360	dst.sin6_family = AF_INET6;
5361	dst.sin6_len = sizeof(dst);
5362	dst.sin6_addr = ip6->ip6_dst;
5363
5364	/* Cheat. XXX why only in the v6 case??? */
5365	if (r->rt == PF_FASTROUTE) {
5366		if (s)
5367			PF_STATE_UNLOCK(s);
5368		m0->m_flags |= M_SKIP_FIREWALL;
5369		ip6_output(m0, NULL, NULL, 0, NULL, NULL, NULL);
5370		return;
5371	}
5372
5373	if (TAILQ_EMPTY(&r->rpool.list)) {
5374		DPFPRINTF(PF_DEBUG_URGENT,
5375		    ("%s: TAILQ_EMPTY(&r->rpool.list)\n", __func__));
5376		goto bad_locked;
5377	}
5378	if (s == NULL) {
5379		pf_map_addr(AF_INET6, r, (struct pf_addr *)&ip6->ip6_src,
5380		    &naddr, NULL, &sn);
5381		if (!PF_AZERO(&naddr, AF_INET6))
5382			PF_ACPY((struct pf_addr *)&dst.sin6_addr,
5383			    &naddr, AF_INET6);
5384		ifp = r->rpool.cur->kif ? r->rpool.cur->kif->pfik_ifp : NULL;
5385	} else {
5386		if (!PF_AZERO(&s->rt_addr, AF_INET6))
5387			PF_ACPY((struct pf_addr *)&dst.sin6_addr,
5388			    &s->rt_addr, AF_INET6);
5389		ifp = s->rt_kif ? s->rt_kif->pfik_ifp : NULL;
5390	}
5391
5392	if (s)
5393		PF_STATE_UNLOCK(s);
5394
5395	if (ifp == NULL)
5396		goto bad;
5397
5398	if (oifp != ifp) {
5399		if (pf_test6(PF_OUT, ifp, &m0, NULL) != PF_PASS)
5400			goto bad;
5401		else if (m0 == NULL)
5402			goto done;
5403		if (m0->m_len < sizeof(struct ip6_hdr)) {
5404			DPFPRINTF(PF_DEBUG_URGENT,
5405			    ("%s: m0->m_len < sizeof(struct ip6_hdr)\n",
5406			    __func__));
5407			goto bad;
5408		}
5409		ip6 = mtod(m0, struct ip6_hdr *);
5410	}
5411
5412	if (ifp->if_flags & IFF_LOOPBACK)
5413		m0->m_flags |= M_SKIP_FIREWALL;
5414
5415	/*
5416	 * If the packet is too large for the outgoing interface,
5417	 * send back an icmp6 error.
5418	 */
5419	if (IN6_IS_SCOPE_EMBED(&dst.sin6_addr))
5420		dst.sin6_addr.s6_addr16[1] = htons(ifp->if_index);
5421	if ((u_long)m0->m_pkthdr.len <= ifp->if_mtu)
5422		nd6_output(ifp, ifp, m0, &dst, NULL);
5423	else {
5424		in6_ifstat_inc(ifp, ifs6_in_toobig);
5425		if (r->rt != PF_DUPTO)
5426			icmp6_error(m0, ICMP6_PACKET_TOO_BIG, 0, ifp->if_mtu);
5427		else
5428			goto bad;
5429	}
5430
5431done:
5432	if (r->rt != PF_DUPTO)
5433		*m = NULL;
5434	return;
5435
5436bad_locked:
5437	if (s)
5438		PF_STATE_UNLOCK(s);
5439bad:
5440	m_freem(m0);
5441	goto done;
5442}
5443#endif /* INET6 */
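/*
 * pf_route() and pf_route6() implement the route-to, reply-to and dup-to
 * rule options (plus fastroute).  A hypothetical rule handled by the code
 * above:
 *
 *	pass out route-to (em0 192.0.2.1) from 10.0.0.0/8 to any
 */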
5444
5445/*
5446 * FreeBSD supports cksum offloads for the following drivers:
5447 *  em(4), fxp(4), ixgb(4), lge(4), ndis(4), nge(4), re(4),
5448 *  ti(4), txp(4), xl(4)
5449 *
5450 * CSUM_DATA_VALID | CSUM_PSEUDO_HDR :
5451 *  the network driver performed the cksum including the pseudo header;
5452 *  we need only verify csum_data
5453 * CSUM_DATA_VALID :
5454 *  the network driver performed the cksum, but an additional pseudo header
5455 *  cksum computation with the partial csum_data is required (i.e. no H/W
5456 *  support for the pseudo header, for instance hme(4), sk(4) and possibly
5457 *  gem(4))
5458 *
5459 * After validating the cksum of the packet, set both CSUM_DATA_VALID and
5460 * CSUM_PSEUDO_HDR in order to avoid recomputation of the cksum in the
5461 * upper TCP/UDP layer.  Also set csum_data to 0xffff to force validation.
5462 */
5463static int
5464pf_check_proto_cksum(struct mbuf *m, int off, int len, u_int8_t p, sa_family_t af)
5465{
5466	u_int16_t sum = 0;
5467	int hw_assist = 0;
5468	struct ip *ip;
5469
5470	if (off < sizeof(struct ip) || len < sizeof(struct udphdr))
5471		return (1);
5472	if (m->m_pkthdr.len < off + len)
5473		return (1);
5474
5475	switch (p) {
5476	case IPPROTO_TCP:
5477		if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
5478			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) {
5479				sum = m->m_pkthdr.csum_data;
5480			} else {
5481				ip = mtod(m, struct ip *);
5482				sum = in_pseudo(ip->ip_src.s_addr,
5483				    ip->ip_dst.s_addr, htonl((u_short)len +
5484				    m->m_pkthdr.csum_data + IPPROTO_TCP));
5485			}
5486			sum ^= 0xffff;
5487			++hw_assist;
5488		}
5489		break;
5490	case IPPROTO_UDP:
5491		if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
5492			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) {
5493				sum = m->m_pkthdr.csum_data;
5494			} else {
5495				ip = mtod(m, struct ip *);
5496				sum = in_pseudo(ip->ip_src.s_addr,
5497				    ip->ip_dst.s_addr, htonl((u_short)len +
5498				    m->m_pkthdr.csum_data + IPPROTO_UDP));
5499			}
5500			sum ^= 0xffff;
5501			++hw_assist;
5502		}
5503		break;
5504	case IPPROTO_ICMP:
5505#ifdef INET6
5506	case IPPROTO_ICMPV6:
5507#endif /* INET6 */
5508		break;
5509	default:
5510		return (1);
5511	}
5512
5513	if (!hw_assist) {
5514		switch (af) {
5515		case AF_INET:
5516			if (p == IPPROTO_ICMP) {
5517				if (m->m_len < off)
5518					return (1);
5519				m->m_data += off;
5520				m->m_len -= off;
5521				sum = in_cksum(m, len);
5522				m->m_data -= off;
5523				m->m_len += off;
5524			} else {
5525				if (m->m_len < sizeof(struct ip))
5526					return (1);
5527				sum = in4_cksum(m, p, off, len);
5528			}
5529			break;
5530#ifdef INET6
5531		case AF_INET6:
5532			if (m->m_len < sizeof(struct ip6_hdr))
5533				return (1);
5534			sum = in6_cksum(m, p, off, len);
5535			break;
5536#endif /* INET6 */
5537		default:
5538			return (1);
5539		}
5540	}
5541	if (sum) {
5542		switch (p) {
5543		case IPPROTO_TCP:
5544		    {
5545			KMOD_TCPSTAT_INC(tcps_rcvbadsum);
5546			break;
5547		    }
5548		case IPPROTO_UDP:
5549		    {
5550			KMOD_UDPSTAT_INC(udps_badsum);
5551			break;
5552		    }
5553#ifdef INET
5554		case IPPROTO_ICMP:
5555		    {
5556			KMOD_ICMPSTAT_INC(icps_checksum);
5557			break;
5558		    }
5559#endif
5560#ifdef INET6
5561		case IPPROTO_ICMPV6:
5562		    {
5563			KMOD_ICMP6STAT_INC(icp6s_checksum);
5564			break;
5565		    }
5566#endif /* INET6 */
5567		}
5568		return (1);
5569	} else {
5570		if (p == IPPROTO_TCP || p == IPPROTO_UDP) {
5571			m->m_pkthdr.csum_flags |=
5572			    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
5573			m->m_pkthdr.csum_data = 0xffff;
5574		}
5575	}
5576	return (0);
5577}
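/*
 * On success for TCP and UDP the function above leaves the mbuf marked as
 * if the hardware had already verified the checksum, so upper layers skip
 * their own check:
 *
 *	m->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
 *	m->m_pkthdr.csum_data = 0xffff;
 */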
5578
5579
5580#ifdef INET
5581int
5582pf_test(int dir, struct ifnet *ifp, struct mbuf **m0, struct inpcb *inp)
5583{
5584	struct pfi_kif		*kif;
5585	u_short			 action, reason = 0, log = 0;
5586	struct mbuf		*m = *m0;
5587	struct ip		*h = NULL;
5588	struct m_tag		*ipfwtag;
5589	struct pf_rule		*a = NULL, *r = &V_pf_default_rule, *tr, *nr;
5590	struct pf_state		*s = NULL;
5591	struct pf_ruleset	*ruleset = NULL;
5592	struct pf_pdesc		 pd;
5593	int			 off, dirndx, pqid = 0;
5594
5595	M_ASSERTPKTHDR(m);
5596
5597	if (!V_pf_status.running)
5598		return (PF_PASS);
5599
5600	memset(&pd, 0, sizeof(pd));
5601
5602	kif = (struct pfi_kif *)ifp->if_pf_kif;
5603
5604	if (kif == NULL) {
5605		DPFPRINTF(PF_DEBUG_URGENT,
5606		    ("pf_test: kif == NULL, if_xname %s\n", ifp->if_xname));
5607		return (PF_DROP);
5608	}
5609	if (kif->pfik_flags & PFI_IFLAG_SKIP)
5610		return (PF_PASS);
5611
5612	if (m->m_flags & M_SKIP_FIREWALL)
5613		return (PF_PASS);
5614
5615	pd.pf_mtag = pf_find_mtag(m);
5616
5617	PF_RULES_RLOCK();
5618
5619	if (ip_divert_ptr != NULL &&
5620	    ((ipfwtag = m_tag_locate(m, MTAG_IPFW_RULE, 0, NULL)) != NULL)) {
5621		struct ipfw_rule_ref *rr = (struct ipfw_rule_ref *)(ipfwtag+1);
5622		if (rr->info & IPFW_IS_DIVERT && rr->rulenum == 0) {
5623			if (pd.pf_mtag == NULL &&
5624			    ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) {
5625				action = PF_DROP;
5626				goto done;
5627			}
5628			pd.pf_mtag->flags |= PF_PACKET_LOOPED;
5629			m_tag_delete(m, ipfwtag);
5630		}
5631		if (pd.pf_mtag && pd.pf_mtag->flags & PF_FASTFWD_OURS_PRESENT) {
5632			m->m_flags |= M_FASTFWD_OURS;
5633			pd.pf_mtag->flags &= ~PF_FASTFWD_OURS_PRESENT;
5634		}
5635	} else if (pf_normalize_ip(m0, dir, kif, &reason, &pd) != PF_PASS) {
5636		/* We do IP header normalization and packet reassembly here */
5637		action = PF_DROP;
5638		goto done;
5639	}
5640	m = *m0;	/* pf_normalize messes with m0 */
5641	h = mtod(m, struct ip *);
5642
5643	off = h->ip_hl << 2;
5644	if (off < (int)sizeof(struct ip)) {
5645		action = PF_DROP;
5646		REASON_SET(&reason, PFRES_SHORT);
5647		log = 1;
5648		goto done;
5649	}
5650
5651	pd.src = (struct pf_addr *)&h->ip_src;
5652	pd.dst = (struct pf_addr *)&h->ip_dst;
5653	pd.sport = pd.dport = NULL;
5654	pd.ip_sum = &h->ip_sum;
5655	pd.proto_sum = NULL;
5656	pd.proto = h->ip_p;
5657	pd.dir = dir;
5658	pd.sidx = (dir == PF_IN) ? 0 : 1;
5659	pd.didx = (dir == PF_IN) ? 1 : 0;
5660	pd.af = AF_INET;
5661	pd.tos = h->ip_tos;
5662	pd.tot_len = ntohs(h->ip_len);
5663
5664	/* handle fragments that didn't get reassembled by normalization */
5665	if (h->ip_off & htons(IP_MF | IP_OFFMASK)) {
5666		action = pf_test_fragment(&r, dir, kif, m, h,
5667		    &pd, &a, &ruleset);
5668		goto done;
5669	}
5670
5671	switch (h->ip_p) {
5672
5673	case IPPROTO_TCP: {
5674		struct tcphdr	th;
5675
5676		pd.hdr.tcp = &th;
5677		if (!pf_pull_hdr(m, off, &th, sizeof(th),
5678		    &action, &reason, AF_INET)) {
5679			log = action != PF_PASS;
5680			goto done;
5681		}
5682		pd.p_len = pd.tot_len - off - (th.th_off << 2);
5683		if ((th.th_flags & TH_ACK) && pd.p_len == 0)
5684			pqid = 1;
5685		action = pf_normalize_tcp(dir, kif, m, 0, off, h, &pd);
5686		if (action == PF_DROP)
5687			goto done;
5688		action = pf_test_state_tcp(&s, dir, kif, m, off, h, &pd,
5689		    &reason);
5690		if (action == PF_PASS) {
5691			if (pfsync_update_state_ptr != NULL)
5692				pfsync_update_state_ptr(s);
5693			r = s->rule.ptr;
5694			a = s->anchor.ptr;
5695			log = s->log;
5696		} else if (s == NULL)
5697			action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
5698			    &a, &ruleset, inp);
5699		break;
5700	}
5701
5702	case IPPROTO_UDP: {
5703		struct udphdr	uh;
5704
5705		pd.hdr.udp = &uh;
5706		if (!pf_pull_hdr(m, off, &uh, sizeof(uh),
5707		    &action, &reason, AF_INET)) {
5708			log = action != PF_PASS;
5709			goto done;
5710		}
5711		if (uh.uh_dport == 0 ||
5712		    ntohs(uh.uh_ulen) > m->m_pkthdr.len - off ||
5713		    ntohs(uh.uh_ulen) < sizeof(struct udphdr)) {
5714			action = PF_DROP;
5715			REASON_SET(&reason, PFRES_SHORT);
5716			goto done;
5717		}
5718		action = pf_test_state_udp(&s, dir, kif, m, off, h, &pd);
5719		if (action == PF_PASS) {
5720			if (pfsync_update_state_ptr != NULL)
5721				pfsync_update_state_ptr(s);
5722			r = s->rule.ptr;
5723			a = s->anchor.ptr;
5724			log = s->log;
5725		} else if (s == NULL)
5726			action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
5727			    &a, &ruleset, inp);
5728		break;
5729	}
5730
5731	case IPPROTO_ICMP: {
5732		struct icmp	ih;
5733
5734		pd.hdr.icmp = &ih;
5735		if (!pf_pull_hdr(m, off, &ih, ICMP_MINLEN,
5736		    &action, &reason, AF_INET)) {
5737			log = action != PF_PASS;
5738			goto done;
5739		}
5740		action = pf_test_state_icmp(&s, dir, kif, m, off, h, &pd,
5741		    &reason);
5742		if (action == PF_PASS) {
5743			if (pfsync_update_state_ptr != NULL)
5744				pfsync_update_state_ptr(s);
5745			r = s->rule.ptr;
5746			a = s->anchor.ptr;
5747			log = s->log;
5748		} else if (s == NULL)
5749			action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
5750			    &a, &ruleset, inp);
5751		break;
5752	}
5753
5754#ifdef INET6
5755	case IPPROTO_ICMPV6: {
5756		action = PF_DROP;
5757		DPFPRINTF(PF_DEBUG_MISC,
5758		    ("pf: dropping IPv4 packet with ICMPv6 payload\n"));
5759		goto done;
5760	}
5761#endif
5762
5763	default:
5764		action = pf_test_state_other(&s, dir, kif, m, &pd);
5765		if (action == PF_PASS) {
5766			if (pfsync_update_state_ptr != NULL)
5767				pfsync_update_state_ptr(s);
5768			r = s->rule.ptr;
5769			a = s->anchor.ptr;
5770			log = s->log;
5771		} else if (s == NULL)
5772			action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
5773			    &a, &ruleset, inp);
5774		break;
5775	}
5776
5777done:
5778	PF_RULES_RUNLOCK();
5779	if (action == PF_PASS && h->ip_hl > 5 &&
5780	    !((s && s->state_flags & PFSTATE_ALLOWOPTS) || r->allow_opts)) {
5781		action = PF_DROP;
5782		REASON_SET(&reason, PFRES_IPOPTIONS);
5783		log = 1;
5784		DPFPRINTF(PF_DEBUG_MISC,
5785		    ("pf: dropping packet with ip options\n"));
5786	}
5787
5788	if (s && s->tag > 0 && pf_tag_packet(m, &pd, s->tag)) {
5789		action = PF_DROP;
5790		REASON_SET(&reason, PFRES_MEMORY);
5791	}
5792	if (r->rtableid >= 0)
5793		M_SETFIB(m, r->rtableid);
5794
5795#ifdef ALTQ
5796	if (action == PF_PASS && r->qid) {
5797		if (pd.pf_mtag == NULL &&
5798		    ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) {
5799			action = PF_DROP;
5800			REASON_SET(&reason, PFRES_MEMORY);
5801		} else {
5802			if (pqid || (pd.tos & IPTOS_LOWDELAY))
5803				pd.pf_mtag->qid = r->pqid;
5804			else
5805				pd.pf_mtag->qid = r->qid;
5806			/* add hints for ECN */
5807			pd.pf_mtag->hdr = h;
5808		}
5809	}
5810#endif /* ALTQ */
5811
5812	/*
5813	 * Connections redirected to loopback should not match sockets
5814	 * bound specifically to loopback due to security implications; see
5815	 * tcp_input() and in_pcblookup_listen().  The test matches 127/8.
5816	 */
5817	if (dir == PF_IN && action == PF_PASS && (pd.proto == IPPROTO_TCP ||
5818	    pd.proto == IPPROTO_UDP) && s != NULL && s->nat_rule.ptr != NULL &&
5819	    (s->nat_rule.ptr->action == PF_RDR ||
5820	    s->nat_rule.ptr->action == PF_BINAT) &&
5821	    (ntohl(pd.dst->v4.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)
5822		m->m_flags |= M_SKIP_FIREWALL;
5823
5824	if (action == PF_PASS && r->divert.port && ip_divert_ptr != NULL &&
5825	    !PACKET_LOOPED(&pd)) {
5826
5827		ipfwtag = m_tag_alloc(MTAG_IPFW_RULE, 0,
5828		    sizeof(struct ipfw_rule_ref), M_NOWAIT | M_ZERO);
5829		if (ipfwtag != NULL) {
5830			((struct ipfw_rule_ref *)(ipfwtag+1))->info =
5831			    ntohs(r->divert.port);
5832			((struct ipfw_rule_ref *)(ipfwtag+1))->rulenum = dir;
5833
5834			if (s)
5835				PF_STATE_UNLOCK(s);
5836
5837			m_tag_prepend(m, ipfwtag);
5838			if (m->m_flags & M_FASTFWD_OURS) {
5839				if (pd.pf_mtag == NULL &&
5840				    ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) {
5841					action = PF_DROP;
5842					REASON_SET(&reason, PFRES_MEMORY);
5843					log = 1;
5844					DPFPRINTF(PF_DEBUG_MISC, ("pf: failed to allocate tag\n"));
5845				} else {
5846					pd.pf_mtag->flags |= PF_FASTFWD_OURS_PRESENT;
5847					m->m_flags &= ~M_FASTFWD_OURS;
5848				}
5849			}
5850			ip_divert_ptr(*m0, dir ==  PF_IN ? DIR_IN : DIR_OUT);
5851			*m0 = NULL;
5852
5853			return (action);
5854		} else {
5855			/* XXX: ipfw has the same behaviour! */
5856			action = PF_DROP;
5857			REASON_SET(&reason, PFRES_MEMORY);
5858			log = 1;
5859			DPFPRINTF(PF_DEBUG_MISC,
5860			    ("pf: failed to allocate divert tag\n"));
5861		}
5862	}
5863
5864	if (log) {
5865		struct pf_rule *lr;
5866
5867		if (s != NULL && s->nat_rule.ptr != NULL &&
5868		    s->nat_rule.ptr->log & PF_LOG_ALL)
5869			lr = s->nat_rule.ptr;
5870		else
5871			lr = r;
5872		PFLOG_PACKET(kif, m, AF_INET, dir, reason, lr, a, ruleset, &pd,
5873		    (s == NULL));
5874	}
5875
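	/* Interface counters are indexed [af: 0=inet, 1=inet6][out][dropped]. */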
5876	kif->pfik_bytes[0][dir == PF_OUT][action != PF_PASS] += pd.tot_len;
5877	kif->pfik_packets[0][dir == PF_OUT][action != PF_PASS]++;
5878
5879	if (action == PF_PASS || r->action == PF_DROP) {
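		/* Rule and state counters are indexed by direction: 0=in, 1=out. */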
5880		dirndx = (dir == PF_OUT);
5881		r->packets[dirndx]++;
5882		r->bytes[dirndx] += pd.tot_len;
5883		if (a != NULL) {
5884			a->packets[dirndx]++;
5885			a->bytes[dirndx] += pd.tot_len;
5886		}
5887		if (s != NULL) {
5888			if (s->nat_rule.ptr != NULL) {
5889				s->nat_rule.ptr->packets[dirndx]++;
5890				s->nat_rule.ptr->bytes[dirndx] += pd.tot_len;
5891			}
5892			if (s->src_node != NULL) {
5893				s->src_node->packets[dirndx]++;
5894				s->src_node->bytes[dirndx] += pd.tot_len;
5895			}
5896			if (s->nat_src_node != NULL) {
5897				s->nat_src_node->packets[dirndx]++;
5898				s->nat_src_node->bytes[dirndx] += pd.tot_len;
5899			}
5900			dirndx = (dir == s->direction) ? 0 : 1;
5901			s->packets[dirndx]++;
5902			s->bytes[dirndx] += pd.tot_len;
5903		}
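		/*
		 * Charge table stats to the NAT rule when only the default
		 * rule matched the filtering decision.
		 */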
5904		tr = r;
5905		nr = (s != NULL) ? s->nat_rule.ptr : pd.nat_rule;
5906		if (nr != NULL && r == &V_pf_default_rule)
5907			tr = nr;
5908		if (tr->src.addr.type == PF_ADDR_TABLE)
5909			pfr_update_stats(tr->src.addr.p.tbl,
5910			    (s == NULL) ? pd.src :
5911			    &s->key[(s->direction == PF_IN)]->
5912				addr[(s->direction == PF_OUT)],
5913			    pd.af, pd.tot_len, dir == PF_OUT,
5914			    r->action == PF_PASS, tr->src.neg);
5915		if (tr->dst.addr.type == PF_ADDR_TABLE)
5916			pfr_update_stats(tr->dst.addr.p.tbl,
5917			    (s == NULL) ? pd.dst :
5918			    &s->key[(s->direction == PF_IN)]->
5919				addr[(s->direction == PF_IN)],
5920			    pd.af, pd.tot_len, dir == PF_OUT,
5921			    r->action == PF_PASS, tr->dst.neg);
5922	}
5923
5924	switch (action) {
5925	case PF_SYNPROXY_DROP:
5926		m_freem(*m0);
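		/* FALLTHROUGH */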
5927	case PF_DEFER:
5928		*m0 = NULL;
5929		action = PF_PASS;
5930		break;
5931	default:
5932		/* pf_route() returns unlocked. */
5933		if (r->rt) {
5934			pf_route(m0, r, dir, kif->pfik_ifp, s, &pd);
5935			return (action);
5936		}
5937		break;
5938	}
5939	if (s)
5940		PF_STATE_UNLOCK(s);
5941
5942	return (action);
5943}
5944#endif /* INET */
5945
5946#ifdef INET6
5947int
5948pf_test6(int dir, struct ifnet *ifp, struct mbuf **m0, struct inpcb *inp)
5949{
5950	struct pfi_kif		*kif;
5951	u_short			 action, reason = 0, log = 0;
5952	struct mbuf		*m = *m0, *n = NULL;
5953	struct ip6_hdr		*h = NULL;
5954	struct pf_rule		*a = NULL, *r = &V_pf_default_rule, *tr, *nr;
5955	struct pf_state		*s = NULL;
5956	struct pf_ruleset	*ruleset = NULL;
5957	struct pf_pdesc		 pd;
5958	int			 off, terminal = 0, dirndx, rh_cnt = 0;
5959
5960	M_ASSERTPKTHDR(m);
5961
5962	if (!V_pf_status.running)
5963		return (PF_PASS);
5964
5965	memset(&pd, 0, sizeof(pd));
5966	pd.pf_mtag = pf_find_mtag(m);
5967
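	/* Packets pf generated itself (RST, ICMP, synproxy) pass unfiltered. */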
5968	if (pd.pf_mtag && pd.pf_mtag->flags & PF_TAG_GENERATED)
5969		return (PF_PASS);
5970
5971	kif = (struct pfi_kif *)ifp->if_pf_kif;
5972	if (kif == NULL) {
5973		DPFPRINTF(PF_DEBUG_URGENT,
5974		    ("pf_test6: kif == NULL, if_xname %s\n", ifp->if_xname));
5975		return (PF_DROP);
5976	}
5977	if (kif->pfik_flags & PFI_IFLAG_SKIP)
5978		return (PF_PASS);
5979
5980	PF_RULES_RLOCK();
5981
5982	/* We do IP header normalization and packet reassembly here */
5983	if (pf_normalize_ip6(m0, dir, kif, &reason, &pd) != PF_PASS) {
5984		action = PF_DROP;
5985		goto done;
5986	}
5987	m = *m0;	/* pf_normalize messes with m0 */
5988	h = mtod(m, struct ip6_hdr *);
5989
5990#if 1
5991	/*
5992	 * We do not support jumbograms yet.  If we kept going, a zero
5993	 * ip6_plen would yield a bogus pd.tot_len below; drop for now.
5994	 */
5995	if (h->ip6_plen == 0) {
5996		action = PF_DROP;
5997		REASON_SET(&reason, PFRES_NORM);	/*XXX*/
5998		goto done;
5999	}
6000#endif
6001
6002	pd.src = (struct pf_addr *)&h->ip6_src;
6003	pd.dst = (struct pf_addr *)&h->ip6_dst;
6004	pd.sport = pd.dport = NULL;
6005	pd.ip_sum = NULL;
6006	pd.proto_sum = NULL;
6007	pd.dir = dir;
6008	pd.sidx = (dir == PF_IN) ? 0 : 1;
6009	pd.didx = (dir == PF_IN) ? 1 : 0;
6010	pd.af = AF_INET6;
6011	pd.tos = 0;
6012	pd.tot_len = ntohs(h->ip6_plen) + sizeof(struct ip6_hdr);
6013
6014	off = ((caddr_t)h - m->m_data) + sizeof(struct ip6_hdr);
6015	pd.proto = h->ip6_nxt;
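	/*
	 * Walk the extension header chain to the terminal header.  AH
	 * counts ip6e_len in 32-bit words excluding the first two, all
	 * other extension headers in 8-byte units excluding the first;
	 * e.g. ip6e_len == 0 means 8 bytes for hop-by-hop options.
	 */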
6016	do {
6017		switch (pd.proto) {
6018		case IPPROTO_FRAGMENT:
6019			action = pf_test_fragment(&r, dir, kif, m, h,
6020			    &pd, &a, &ruleset);
6021			if (action == PF_DROP)
6022				REASON_SET(&reason, PFRES_FRAG);
6023			goto done;
6024		case IPPROTO_ROUTING: {
6025			struct ip6_rthdr rthdr;
6026
6027			if (rh_cnt++) {
6028				DPFPRINTF(PF_DEBUG_MISC,
6029				    ("pf: IPv6 more than one rthdr\n"));
6030				action = PF_DROP;
6031				REASON_SET(&reason, PFRES_IPOPTIONS);
6032				log = 1;
6033				goto done;
6034			}
6035			if (!pf_pull_hdr(m, off, &rthdr, sizeof(rthdr), NULL,
6036			    &reason, pd.af)) {
6037				DPFPRINTF(PF_DEBUG_MISC,
6038				    ("pf: IPv6 short rthdr\n"));
6039				action = PF_DROP;
6040				REASON_SET(&reason, PFRES_SHORT);
6041				log = 1;
6042				goto done;
6043			}
6044			if (rthdr.ip6r_type == IPV6_RTHDR_TYPE_0) {
6045				DPFPRINTF(PF_DEBUG_MISC,
6046				    ("pf: IPv6 rthdr0\n"));
6047				action = PF_DROP;
6048				REASON_SET(&reason, PFRES_IPOPTIONS);
6049				log = 1;
6050				goto done;
6051			}
6052			/* FALLTHROUGH */
6053		}
6054		case IPPROTO_AH:
6055		case IPPROTO_HOPOPTS:
6056		case IPPROTO_DSTOPTS: {
6057			/* get next header and header length */
6058			struct ip6_ext	opt6;
6059
6060			if (!pf_pull_hdr(m, off, &opt6, sizeof(opt6),
6061			    NULL, &reason, pd.af)) {
6062				DPFPRINTF(PF_DEBUG_MISC,
6063				    ("pf: IPv6 short opt\n"));
6064				action = PF_DROP;
6065				log = 1;
6066				goto done;
6067			}
6068			if (pd.proto == IPPROTO_AH)
6069				off += (opt6.ip6e_len + 2) * 4;
6070			else
6071				off += (opt6.ip6e_len + 1) * 8;
6072			pd.proto = opt6.ip6e_nxt;
6073			/* go to the next header */
6074			break;
6075		}
6076		default:
6077			terminal++;
6078			break;
6079		}
6080	} while (!terminal);
6081
6082	/* if there's no routing header, use unmodified mbuf for checksumming */
6083	if (!n)
6084		n = m;
6085
6086	switch (pd.proto) {
6087
6088	case IPPROTO_TCP: {
6089		struct tcphdr	th;
6090
6091		pd.hdr.tcp = &th;
6092		if (!pf_pull_hdr(m, off, &th, sizeof(th),
6093		    &action, &reason, AF_INET6)) {
6094			log = action != PF_PASS;
6095			goto done;
6096		}
6097		pd.p_len = pd.tot_len - off - (th.th_off << 2);
6098		action = pf_normalize_tcp(dir, kif, m, 0, off, h, &pd);
6099		if (action == PF_DROP)
6100			goto done;
6101		action = pf_test_state_tcp(&s, dir, kif, m, off, h, &pd,
6102		    &reason);
6103		if (action == PF_PASS) {
6104			if (pfsync_update_state_ptr != NULL)
6105				pfsync_update_state_ptr(s);
6106			r = s->rule.ptr;
6107			a = s->anchor.ptr;
6108			log = s->log;
6109		} else if (s == NULL)
6110			action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
6111			    &a, &ruleset, inp);
6112		break;
6113	}
6114
6115	case IPPROTO_UDP: {
6116		struct udphdr	uh;
6117
6118		pd.hdr.udp = &uh;
6119		if (!pf_pull_hdr(m, off, &uh, sizeof(uh),
6120		    &action, &reason, AF_INET6)) {
6121			log = action != PF_PASS;
6122			goto done;
6123		}
6124		if (uh.uh_dport == 0 ||
6125		    ntohs(uh.uh_ulen) > m->m_pkthdr.len - off ||
6126		    ntohs(uh.uh_ulen) < sizeof(struct udphdr)) {
6127			action = PF_DROP;
6128			REASON_SET(&reason, PFRES_SHORT);
6129			goto done;
6130		}
6131		action = pf_test_state_udp(&s, dir, kif, m, off, h, &pd);
6132		if (action == PF_PASS) {
6133			if (pfsync_update_state_ptr != NULL)
6134				pfsync_update_state_ptr(s);
6135			r = s->rule.ptr;
6136			a = s->anchor.ptr;
6137			log = s->log;
6138		} else if (s == NULL)
6139			action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
6140			    &a, &ruleset, inp);
6141		break;
6142	}
6143
6144	case IPPROTO_ICMP: {
6145		action = PF_DROP;
6146		DPFPRINTF(PF_DEBUG_MISC,
6147		    ("pf: dropping IPv6 packet with ICMPv4 payload\n"));
6148		goto done;
6149	}
6150
6151	case IPPROTO_ICMPV6: {
6152		struct icmp6_hdr	ih;
6153
6154		pd.hdr.icmp6 = &ih;
6155		if (!pf_pull_hdr(m, off, &ih, sizeof(ih),
6156		    &action, &reason, AF_INET6)) {
6157			log = action != PF_PASS;
6158			goto done;
6159		}
6160		action = pf_test_state_icmp(&s, dir, kif,
6161		    m, off, h, &pd, &reason);
6162		if (action == PF_PASS) {
6163			if (pfsync_update_state_ptr != NULL)
6164				pfsync_update_state_ptr(s);
6165			r = s->rule.ptr;
6166			a = s->anchor.ptr;
6167			log = s->log;
6168		} else if (s == NULL)
6169			action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
6170			    &a, &ruleset, inp);
6171		break;
6172	}
6173
6174	default:
6175		action = pf_test_state_other(&s, dir, kif, m, &pd);
6176		if (action == PF_PASS) {
6177			if (pfsync_update_state_ptr != NULL)
6178				pfsync_update_state_ptr(s);
6179			r = s->rule.ptr;
6180			a = s->anchor.ptr;
6181			log = s->log;
6182		} else if (s == NULL)
6183			action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
6184			    &a, &ruleset, inp);
6185		break;
6186	}
6187
6188done:
6189	PF_RULES_RUNLOCK();
6190	if (n != m) {
6191		m_freem(n);
6192		n = NULL;
6193	}
6194
6195	/* handle dangerous IPv6 extension headers. */
6196	if (action == PF_PASS && rh_cnt &&
6197	    !((s && s->state_flags & PFSTATE_ALLOWOPTS) || r->allow_opts)) {
6198		action = PF_DROP;
6199		REASON_SET(&reason, PFRES_IPOPTIONS);
6200		log = 1;
6201		DPFPRINTF(PF_DEBUG_MISC,
6202		    ("pf: dropping packet with dangerous v6 headers\n"));
6203	}
6204
6205	if (s && s->tag > 0 && pf_tag_packet(m, &pd, s->tag)) {
6206		action = PF_DROP;
6207		REASON_SET(&reason, PFRES_MEMORY);
6208	}
6209	if (r->rtableid >= 0)
6210		M_SETFIB(m, r->rtableid);
6211
6212#ifdef ALTQ
6213	if (action == PF_PASS && r->qid) {
6214		if (pd.pf_mtag == NULL &&
6215		    ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) {
6216			action = PF_DROP;
6217			REASON_SET(&reason, PFRES_MEMORY);
6218		} else {
6219			if (pd.tos & IPTOS_LOWDELAY)
6220				pd.pf_mtag->qid = r->pqid;
6221			else
6222				pd.pf_mtag->qid = r->qid;
6223			pd.pf_mtag->hdr = h;	/* add hints for ECN */
6224		}
6225	}
6226#endif /* ALTQ */
6227
6228	if (dir == PF_IN && action == PF_PASS && (pd.proto == IPPROTO_TCP ||
6229	    pd.proto == IPPROTO_UDP) && s != NULL && s->nat_rule.ptr != NULL &&
6230	    (s->nat_rule.ptr->action == PF_RDR ||
6231	    s->nat_rule.ptr->action == PF_BINAT) &&
6232	    IN6_IS_ADDR_LOOPBACK(&pd.dst->v6))
6233		m->m_flags |= M_SKIP_FIREWALL;
6234
6235	/* XXX: Anybody working on it?! */
6236	if (r->divert.port)
6237		printf("pf: divert(9) is not supported for IPv6\n");
6238
6239	if (log) {
6240		struct pf_rule *lr;
6241
6242		if (s != NULL && s->nat_rule.ptr != NULL &&
6243		    s->nat_rule.ptr->log & PF_LOG_ALL)
6244			lr = s->nat_rule.ptr;
6245		else
6246			lr = r;
6247		PFLOG_PACKET(kif, m, AF_INET6, dir, reason, lr, a, ruleset,
6248		    &pd, (s == NULL));
6249	}
6250
6251	kif->pfik_bytes[1][dir == PF_OUT][action != PF_PASS] += pd.tot_len;
6252	kif->pfik_packets[1][dir == PF_OUT][action != PF_PASS]++;
6253
6254	if (action == PF_PASS || r->action == PF_DROP) {
6255		dirndx = (dir == PF_OUT);
6256		r->packets[dirndx]++;
6257		r->bytes[dirndx] += pd.tot_len;
6258		if (a != NULL) {
6259			a->packets[dirndx]++;
6260			a->bytes[dirndx] += pd.tot_len;
6261		}
6262		if (s != NULL) {
6263			if (s->nat_rule.ptr != NULL) {
6264				s->nat_rule.ptr->packets[dirndx]++;
6265				s->nat_rule.ptr->bytes[dirndx] += pd.tot_len;
6266			}
6267			if (s->src_node != NULL) {
6268				s->src_node->packets[dirndx]++;
6269				s->src_node->bytes[dirndx] += pd.tot_len;
6270			}
6271			if (s->nat_src_node != NULL) {
6272				s->nat_src_node->packets[dirndx]++;
6273				s->nat_src_node->bytes[dirndx] += pd.tot_len;
6274			}
6275			dirndx = (dir == s->direction) ? 0 : 1;
6276			s->packets[dirndx]++;
6277			s->bytes[dirndx] += pd.tot_len;
6278		}
6279		tr = r;
6280		nr = (s != NULL) ? s->nat_rule.ptr : pd.nat_rule;
6281		if (nr != NULL && r == &V_pf_default_rule)
6282			tr = nr;
6283		if (tr->src.addr.type == PF_ADDR_TABLE)
6284			pfr_update_stats(tr->src.addr.p.tbl,
6285			    (s == NULL) ? pd.src :
6286			    &s->key[(s->direction == PF_IN)]->addr[0],
6287			    pd.af, pd.tot_len, dir == PF_OUT,
6288			    r->action == PF_PASS, tr->src.neg);
6289		if (tr->dst.addr.type == PF_ADDR_TABLE)
6290			pfr_update_stats(tr->dst.addr.p.tbl,
6291			    (s == NULL) ? pd.dst :
6292			    &s->key[(s->direction == PF_IN)]->addr[1],
6293			    pd.af, pd.tot_len, dir == PF_OUT,
6294			    r->action == PF_PASS, tr->dst.neg);
6295	}
6296
6297	switch (action) {
6298	case PF_SYNPROXY_DROP:
6299		m_freem(*m0);
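		/* FALLTHROUGH */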
6300	case PF_DEFER:
6301		*m0 = NULL;
6302		action = PF_PASS;
6303		break;
6304	default:
6305		/* pf_route6() returns unlocked. */
6306		if (r->rt) {
6307			pf_route6(m0, r, dir, kif->pfik_ifp, s, &pd);
6308			return (action);
6309		}
6310		break;
6311	}
6312
6313	if (s)
6314		PF_STATE_UNLOCK(s);
6315
6316	return (action);
6317}
6318#endif /* INET6 */
6319