/*-
 * Copyright (c) 2001 Daniel Hartmeier
 * Copyright (c) 2002 - 2008 Henning Brauer
 * Copyright (c) 2012 Gleb Smirnoff <glebius@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *    - Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    - Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following
 *      disclaimer in the documentation and/or other materials provided
 *      with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Effort sponsored in part by the Defense Advanced Research Projects
 * Agency (DARPA) and Air Force Research Laboratory, Air Force
 * Materiel Command, USAF, under agreement number F30602-01-2-0537.
 *
 *	$OpenBSD: pf.c,v 1.634 2009/02/27 12:37:45 henning Exp $
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/10/sys/netpfil/pf/pf.c 264454 2014-04-14 09:36:15Z mm $");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_bpf.h"
#include "opt_pf.h"

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/endian.h>
#include <sys/hash.h>
#include <sys/interrupt.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/limits.h>
#include <sys/mbuf.h>
#include <sys/md5.h>
#include <sys/random.h>
#include <sys/refcount.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/taskqueue.h>
#include <sys/ucred.h>

#include <net/if.h>
#include <net/if_types.h>
#include <net/route.h>
#include <net/radix_mpath.h>
#include <net/vnet.h>

#include <net/pfvar.h>
#include <net/if_pflog.h>
#include <net/if_pfsync.h>

#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/ip_fw.h>
#include <netinet/ip_icmp.h>
#include <netinet/icmp_var.h>
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/udp.h>
#include <netinet/udp_var.h>

#include <netpfil/ipfw/ip_fw_private.h> /* XXX: only for DIR_IN/DIR_OUT */

#ifdef INET6
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet6/nd6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_pcb.h>
#endif /* INET6 */

#include <machine/in_cksum.h>
#include <security/mac/mac_framework.h>
#define	DPFPRINTF(n, x)	do { if (V_pf_status.debug >= (n)) printf x; } while (0)

/*
 * Global variables
 */

/* state tables */
VNET_DEFINE(struct pf_altqqueue,	 pf_altqs[2]);
VNET_DEFINE(struct pf_palist,		 pf_pabuf);
VNET_DEFINE(struct pf_altqqueue *,	 pf_altqs_active);
VNET_DEFINE(struct pf_altqqueue *,	 pf_altqs_inactive);
VNET_DEFINE(struct pf_status,		 pf_status);

VNET_DEFINE(u_int32_t,			 ticket_altqs_active);
VNET_DEFINE(u_int32_t,			 ticket_altqs_inactive);
VNET_DEFINE(int,			 altqs_inactive_open);
VNET_DEFINE(u_int32_t,			 ticket_pabuf);

VNET_DEFINE(MD5_CTX,			 pf_tcp_secret_ctx);
#define	V_pf_tcp_secret_ctx		 VNET(pf_tcp_secret_ctx)
VNET_DEFINE(u_char,			 pf_tcp_secret[16]);
#define	V_pf_tcp_secret			 VNET(pf_tcp_secret)
VNET_DEFINE(int,			 pf_tcp_secret_init);
#define	V_pf_tcp_secret_init		 VNET(pf_tcp_secret_init)
VNET_DEFINE(int,			 pf_tcp_iss_off);
#define	V_pf_tcp_iss_off		 VNET(pf_tcp_iss_off)

/*
 * Queue for pf_intr() sends.
 */
static MALLOC_DEFINE(M_PFTEMP, "pf_temp", "pf(4) temporary allocations");
struct pf_send_entry {
	STAILQ_ENTRY(pf_send_entry)	pfse_next;
	struct mbuf			*pfse_m;
	enum {
		PFSE_IP,
		PFSE_IP6,
		PFSE_ICMP,
		PFSE_ICMP6,
	}				pfse_type;
	union {
		struct route		ro;
		struct {
			int		type;
			int		code;
			int		mtu;
		} icmpopts;
	} u;
#define	pfse_ro		u.ro
#define	pfse_icmp_type	u.icmpopts.type
#define	pfse_icmp_code	u.icmpopts.code
#define	pfse_icmp_mtu	u.icmpopts.mtu
};

STAILQ_HEAD(pf_send_head, pf_send_entry);
static VNET_DEFINE(struct pf_send_head, pf_sendqueue);
#define	V_pf_sendqueue	VNET(pf_sendqueue)

static struct mtx pf_sendqueue_mtx;
#define	PF_SENDQ_LOCK()		mtx_lock(&pf_sendqueue_mtx)
#define	PF_SENDQ_UNLOCK()	mtx_unlock(&pf_sendqueue_mtx)

/*
 * Queue for pf_overload_task() tasks.
 */
struct pf_overload_entry {
	SLIST_ENTRY(pf_overload_entry)	next;
	struct pf_addr  		addr;
	sa_family_t			af;
	uint8_t				dir;
	struct pf_rule  		*rule;
};

SLIST_HEAD(pf_overload_head, pf_overload_entry);
static VNET_DEFINE(struct pf_overload_head, pf_overloadqueue);
#define V_pf_overloadqueue	VNET(pf_overloadqueue)
static VNET_DEFINE(struct task, pf_overloadtask);
#define	V_pf_overloadtask	VNET(pf_overloadtask)

static struct mtx pf_overloadqueue_mtx;
#define	PF_OVERLOADQ_LOCK()	mtx_lock(&pf_overloadqueue_mtx)
#define	PF_OVERLOADQ_UNLOCK()	mtx_unlock(&pf_overloadqueue_mtx)

VNET_DEFINE(struct pf_rulequeue, pf_unlinked_rules);
struct mtx pf_unlnkdrules_mtx;

static VNET_DEFINE(uma_zone_t,	pf_sources_z);
#define	V_pf_sources_z	VNET(pf_sources_z)
static VNET_DEFINE(uma_zone_t,	pf_mtag_z);
#define	V_pf_mtag_z	VNET(pf_mtag_z)
VNET_DEFINE(uma_zone_t,	 pf_state_z);
VNET_DEFINE(uma_zone_t,	 pf_state_key_z);

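/*
 * State IDs are allocated per-CPU without locking: the top PFID_CPUBITS
 * bits of the 64-bit ID encode the allocating CPU and the remaining bits
 * hold a per-CPU counter, so concurrent allocations cannot collide.
 */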
VNET_DEFINE(uint64_t, pf_stateid[MAXCPU]);
#define	PFID_CPUBITS	8
#define	PFID_CPUSHIFT	(sizeof(uint64_t) * NBBY - PFID_CPUBITS)
#define	PFID_CPUMASK	((uint64_t)((1 << PFID_CPUBITS) - 1) <<	PFID_CPUSHIFT)
#define	PFID_MAXID	(~PFID_CPUMASK)
CTASSERT((1 << PFID_CPUBITS) > MAXCPU);

static void		 pf_src_tree_remove_state(struct pf_state *);
static void		 pf_init_threshold(struct pf_threshold *, u_int32_t,
			    u_int32_t);
static void		 pf_add_threshold(struct pf_threshold *);
static int		 pf_check_threshold(struct pf_threshold *);

static void		 pf_change_ap(struct pf_addr *, u_int16_t *,
			    u_int16_t *, u_int16_t *, struct pf_addr *,
			    u_int16_t, u_int8_t, sa_family_t);
static int		 pf_modulate_sack(struct mbuf *, int, struct pf_pdesc *,
			    struct tcphdr *, struct pf_state_peer *);
static void		 pf_change_icmp(struct pf_addr *, u_int16_t *,
			    struct pf_addr *, struct pf_addr *, u_int16_t,
			    u_int16_t *, u_int16_t *, u_int16_t *,
			    u_int16_t *, u_int8_t, sa_family_t);
static void		 pf_send_tcp(struct mbuf *,
			    const struct pf_rule *, sa_family_t,
			    const struct pf_addr *, const struct pf_addr *,
			    u_int16_t, u_int16_t, u_int32_t, u_int32_t,
			    u_int8_t, u_int16_t, u_int16_t, u_int8_t, int,
			    u_int16_t, struct ifnet *);
static void		 pf_send_icmp(struct mbuf *, u_int8_t, u_int8_t,
			    sa_family_t, struct pf_rule *);
static void		 pf_detach_state(struct pf_state *);
static int		 pf_state_key_attach(struct pf_state_key *,
			    struct pf_state_key *, struct pf_state *);
static void		 pf_state_key_detach(struct pf_state *, int);
static int		 pf_state_key_ctor(void *, int, void *, int);
static u_int32_t	 pf_tcp_iss(struct pf_pdesc *);
static int		 pf_test_rule(struct pf_rule **, struct pf_state **,
			    int, struct pfi_kif *, struct mbuf *, int,
			    struct pf_pdesc *, struct pf_rule **,
			    struct pf_ruleset **, struct inpcb *);
static int		 pf_create_state(struct pf_rule *, struct pf_rule *,
			    struct pf_rule *, struct pf_pdesc *,
			    struct pf_src_node *, struct pf_state_key *,
			    struct pf_state_key *, struct mbuf *, int,
			    u_int16_t, u_int16_t, int *, struct pfi_kif *,
			    struct pf_state **, int, u_int16_t, u_int16_t,
			    int);
static int		 pf_test_fragment(struct pf_rule **, int,
			    struct pfi_kif *, struct mbuf *, void *,
			    struct pf_pdesc *, struct pf_rule **,
			    struct pf_ruleset **);
static int		 pf_tcp_track_full(struct pf_state_peer *,
			    struct pf_state_peer *, struct pf_state **,
			    struct pfi_kif *, struct mbuf *, int,
			    struct pf_pdesc *, u_short *, int *);
static int		 pf_tcp_track_sloppy(struct pf_state_peer *,
			    struct pf_state_peer *, struct pf_state **,
			    struct pf_pdesc *, u_short *);
static int		 pf_test_state_tcp(struct pf_state **, int,
			    struct pfi_kif *, struct mbuf *, int,
			    void *, struct pf_pdesc *, u_short *);
static int		 pf_test_state_udp(struct pf_state **, int,
			    struct pfi_kif *, struct mbuf *, int,
			    void *, struct pf_pdesc *);
static int		 pf_test_state_icmp(struct pf_state **, int,
			    struct pfi_kif *, struct mbuf *, int,
			    void *, struct pf_pdesc *, u_short *);
static int		 pf_test_state_other(struct pf_state **, int,
			    struct pfi_kif *, struct mbuf *, struct pf_pdesc *);
static u_int8_t		 pf_get_wscale(struct mbuf *, int, u_int16_t,
			    sa_family_t);
static u_int16_t	 pf_get_mss(struct mbuf *, int, u_int16_t,
			    sa_family_t);
static u_int16_t	 pf_calc_mss(struct pf_addr *, sa_family_t,
				int, u_int16_t);
static void		 pf_set_rt_ifp(struct pf_state *,
			    struct pf_addr *);
static int		 pf_check_proto_cksum(struct mbuf *, int, int,
			    u_int8_t, sa_family_t);
static void		 pf_print_state_parts(struct pf_state *,
			    struct pf_state_key *, struct pf_state_key *);
static int		 pf_addr_wrap_neq(struct pf_addr_wrap *,
			    struct pf_addr_wrap *);
static struct pf_state	*pf_find_state(struct pfi_kif *,
			    struct pf_state_key_cmp *, u_int);
static int		 pf_src_connlimit(struct pf_state **);
static void		 pf_overload_task(void *v, int pending);
static int		 pf_insert_src_node(struct pf_src_node **,
			    struct pf_rule *, struct pf_addr *, sa_family_t);
static u_int		 pf_purge_expired_states(u_int, int);
static void		 pf_purge_unlinked_rules(void);
static int		 pf_mtag_init(void *, int, int);
static void		 pf_mtag_free(struct m_tag *);
#ifdef INET
static void		 pf_route(struct mbuf **, struct pf_rule *, int,
			    struct ifnet *, struct pf_state *,
			    struct pf_pdesc *);
#endif /* INET */
#ifdef INET6
static void		 pf_change_a6(struct pf_addr *, u_int16_t *,
			    struct pf_addr *, u_int8_t);
static void		 pf_route6(struct mbuf **, struct pf_rule *, int,
			    struct ifnet *, struct pf_state *,
			    struct pf_pdesc *);
#endif /* INET6 */

int in4_cksum(struct mbuf *m, u_int8_t nxt, int off, int len);

VNET_DECLARE(int, pf_end_threads);

VNET_DEFINE(struct pf_limit, pf_limits[PF_LIMIT_MAX]);

#define	PACKET_LOOPED(pd)	((pd)->pf_mtag &&			\
				 (pd)->pf_mtag->flags & PF_PACKET_LOOPED)

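/*
 * Look up the state for the current packet and decide early: drop if no
 * state exists, pass if pf has already processed this (looped) packet,
 * and pass outbound packets that belong to a route-to/reply-to state
 * bound to a different interface.
 */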
#define	STATE_LOOKUP(i, k, d, s, pd)					\
	do {								\
		(s) = pf_find_state((i), (k), (d));			\
		if ((s) == NULL)					\
			return (PF_DROP);				\
		if (PACKET_LOOPED(pd))					\
			return (PF_PASS);				\
		if ((d) == PF_OUT &&					\
		    (((s)->rule.ptr->rt == PF_ROUTETO &&		\
		    (s)->rule.ptr->direction == PF_OUT) ||		\
		    ((s)->rule.ptr->rt == PF_REPLYTO &&			\
		    (s)->rule.ptr->direction == PF_IN)) &&		\
		    (s)->rt_kif != NULL &&				\
		    (s)->rt_kif != (i))					\
			return (PF_PASS);				\
	} while (0)

#define	BOUND_IFACE(r, k) \
	(((r)->rule_flag & PFRULE_IFBOUND) ? (k) : V_pfi_all)

#define	STATE_INC_COUNTERS(s)						\
	do {								\
		counter_u64_add(s->rule.ptr->states_cur, 1);		\
		counter_u64_add(s->rule.ptr->states_tot, 1);		\
		if (s->anchor.ptr != NULL) {				\
			counter_u64_add(s->anchor.ptr->states_cur, 1);	\
			counter_u64_add(s->anchor.ptr->states_tot, 1);	\
		}							\
		if (s->nat_rule.ptr != NULL) {				\
			counter_u64_add(s->nat_rule.ptr->states_cur, 1);\
			counter_u64_add(s->nat_rule.ptr->states_tot, 1);\
		}							\
	} while (0)

#define	STATE_DEC_COUNTERS(s)						\
	do {								\
		if (s->nat_rule.ptr != NULL)				\
			counter_u64_add(s->nat_rule.ptr->states_cur, -1);\
		if (s->anchor.ptr != NULL)				\
			counter_u64_add(s->anchor.ptr->states_cur, -1);	\
		counter_u64_add(s->rule.ptr->states_cur, -1);		\
	} while (0)

static MALLOC_DEFINE(M_PFHASH, "pf_hash", "pf(4) hash header structures");
VNET_DEFINE(struct pf_keyhash *, pf_keyhash);
VNET_DEFINE(struct pf_idhash *, pf_idhash);
VNET_DEFINE(u_long, pf_hashmask);
VNET_DEFINE(struct pf_srchash *, pf_srchash);
VNET_DEFINE(u_long, pf_srchashmask);

SYSCTL_NODE(_net, OID_AUTO, pf, CTLFLAG_RW, 0, "pf(4)");

VNET_DEFINE(u_long, pf_hashsize);
#define	V_pf_hashsize	VNET(pf_hashsize)
SYSCTL_VNET_UINT(_net_pf, OID_AUTO, states_hashsize, CTLFLAG_RDTUN,
    &VNET_NAME(pf_hashsize), 0, "Size of pf(4) states hashtable");

VNET_DEFINE(u_long, pf_srchashsize);
#define	V_pf_srchashsize	VNET(pf_srchashsize)
SYSCTL_VNET_UINT(_net_pf, OID_AUTO, source_nodes_hashsize, CTLFLAG_RDTUN,
    &VNET_NAME(pf_srchashsize), 0, "Size of pf(4) source nodes hashtable");

VNET_DEFINE(void *, pf_swi_cookie);

VNET_DEFINE(uint32_t, pf_hashseed);
#define	V_pf_hashseed	VNET(pf_hashseed)

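/*
 * Only the pf_state_key_cmp prefix of a key (addresses, ports, address
 * family and protocol) participates in hashing; the list linkage that
 * follows it does not, and lookups compare keys with bcmp() over the
 * same prefix.
 */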
static __inline uint32_t
pf_hashkey(struct pf_state_key *sk)
{
	uint32_t h;

	h = jenkins_hash32((uint32_t *)sk,
	    sizeof(struct pf_state_key_cmp)/sizeof(uint32_t),
	    V_pf_hashseed);

	return (h & V_pf_hashmask);
}

static __inline uint32_t
pf_hashsrc(struct pf_addr *addr, sa_family_t af)
{
	uint32_t h;

	switch (af) {
	case AF_INET:
		h = jenkins_hash32((uint32_t *)&addr->v4,
		    sizeof(addr->v4)/sizeof(uint32_t), V_pf_hashseed);
		break;
	case AF_INET6:
		h = jenkins_hash32((uint32_t *)&addr->v6,
		    sizeof(addr->v6)/sizeof(uint32_t), V_pf_hashseed);
		break;
	default:
		panic("%s: unknown address family %u", __func__, af);
	}

	return (h & V_pf_srchashmask);
}

#ifdef INET6
void
pf_addrcpy(struct pf_addr *dst, struct pf_addr *src, sa_family_t af)
{
	switch (af) {
#ifdef INET
	case AF_INET:
		dst->addr32[0] = src->addr32[0];
		break;
#endif /* INET */
	case AF_INET6:
		dst->addr32[0] = src->addr32[0];
		dst->addr32[1] = src->addr32[1];
		dst->addr32[2] = src->addr32[2];
		dst->addr32[3] = src->addr32[3];
		break;
	}
}
#endif /* INET6 */

static void
pf_init_threshold(struct pf_threshold *threshold,
    u_int32_t limit, u_int32_t seconds)
{
	threshold->limit = limit * PF_THRESHOLD_MULT;
	threshold->seconds = seconds;
	threshold->count = 0;
	threshold->last = time_uptime;
}

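/*
 * Thresholds use fixed-point arithmetic: the limit is pre-multiplied by
 * PF_THRESHOLD_MULT in pf_init_threshold(), every new connection adds
 * PF_THRESHOLD_MULT to the count, and the count decays linearly over
 * the configured window.  E.g. for "max-src-conn-rate 10/5", a decayed
 * count exceeding 10 * PF_THRESHOLD_MULT within 5 seconds trips
 * pf_check_threshold().
 */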
static void
pf_add_threshold(struct pf_threshold *threshold)
{
	u_int32_t t = time_uptime, diff = t - threshold->last;

	if (diff >= threshold->seconds)
		threshold->count = 0;
	else
		threshold->count -= threshold->count * diff /
		    threshold->seconds;
	threshold->count += PF_THRESHOLD_MULT;
	threshold->last = t;
}

static int
pf_check_threshold(struct pf_threshold *threshold)
{
	return (threshold->count > threshold->limit);
}

static int
pf_src_connlimit(struct pf_state **state)
{
	struct pf_overload_entry *pfoe;
	int bad = 0;

	PF_STATE_LOCK_ASSERT(*state);

	(*state)->src_node->conn++;
	(*state)->src.tcp_est = 1;
	pf_add_threshold(&(*state)->src_node->conn_rate);

	if ((*state)->rule.ptr->max_src_conn &&
	    (*state)->rule.ptr->max_src_conn <
	    (*state)->src_node->conn) {
		V_pf_status.lcounters[LCNT_SRCCONN]++;
		bad++;
	}

	if ((*state)->rule.ptr->max_src_conn_rate.limit &&
	    pf_check_threshold(&(*state)->src_node->conn_rate)) {
		V_pf_status.lcounters[LCNT_SRCCONNRATE]++;
		bad++;
	}

	if (!bad)
		return (0);

	/* Kill this state. */
	(*state)->timeout = PFTM_PURGE;
	(*state)->src.state = (*state)->dst.state = TCPS_CLOSED;

	if ((*state)->rule.ptr->overload_tbl == NULL)
		return (1);

	/* Schedule overloading and flushing task. */
	pfoe = malloc(sizeof(*pfoe), M_PFTEMP, M_NOWAIT);
	if (pfoe == NULL)
		return (1);	/* too bad :( */

	bcopy(&(*state)->src_node->addr, &pfoe->addr, sizeof(pfoe->addr));
	pfoe->af = (*state)->key[PF_SK_WIRE]->af;
	pfoe->rule = (*state)->rule.ptr;
	pfoe->dir = (*state)->direction;
	PF_OVERLOADQ_LOCK();
	SLIST_INSERT_HEAD(&V_pf_overloadqueue, pfoe, next);
	PF_OVERLOADQ_UNLOCK();
	taskqueue_enqueue(taskqueue_swi, &V_pf_overloadtask);

	return (1);
}

static void
pf_overload_task(void *v, int pending)
{
	struct pf_overload_head queue;
	struct pfr_addr p;
	struct pf_overload_entry *pfoe, *pfoe1;
	uint32_t killed = 0;

	CURVNET_SET((struct vnet *)v);

	PF_OVERLOADQ_LOCK();
	queue = V_pf_overloadqueue;
	SLIST_INIT(&V_pf_overloadqueue);
	PF_OVERLOADQ_UNLOCK();

	bzero(&p, sizeof(p));
	SLIST_FOREACH(pfoe, &queue, next) {
		V_pf_status.lcounters[LCNT_OVERLOAD_TABLE]++;
		if (V_pf_status.debug >= PF_DEBUG_MISC) {
			printf("%s: blocking address ", __func__);
			pf_print_host(&pfoe->addr, 0, pfoe->af);
			printf("\n");
		}

		p.pfra_af = pfoe->af;
		switch (pfoe->af) {
#ifdef INET
		case AF_INET:
			p.pfra_net = 32;
			p.pfra_ip4addr = pfoe->addr.v4;
			break;
#endif
#ifdef INET6
		case AF_INET6:
			p.pfra_net = 128;
			p.pfra_ip6addr = pfoe->addr.v6;
			break;
#endif
		}

		PF_RULES_WLOCK();
		pfr_insert_kentry(pfoe->rule->overload_tbl, &p, time_second);
		PF_RULES_WUNLOCK();
	}

	/*
	 * Remove entries that don't need flushing.
	 */
	SLIST_FOREACH_SAFE(pfoe, &queue, next, pfoe1)
		if (pfoe->rule->flush == 0) {
			SLIST_REMOVE(&queue, pfoe, pf_overload_entry, next);
			free(pfoe, M_PFTEMP);
		} else
			V_pf_status.lcounters[LCNT_OVERLOAD_FLUSH]++;

	/* If nothing to flush, return. */
	if (SLIST_EMPTY(&queue)) {
		CURVNET_RESTORE();
		return;
	}

	for (int i = 0; i <= V_pf_hashmask; i++) {
		struct pf_idhash *ih = &V_pf_idhash[i];
		struct pf_state_key *sk;
		struct pf_state *s;

		PF_HASHROW_LOCK(ih);
		LIST_FOREACH(s, &ih->states, entry) {
		    sk = s->key[PF_SK_WIRE];
		    SLIST_FOREACH(pfoe, &queue, next)
			if (sk->af == pfoe->af &&
			    ((pfoe->rule->flush & PF_FLUSH_GLOBAL) ||
			    pfoe->rule == s->rule.ptr) &&
			    ((pfoe->dir == PF_OUT &&
			    PF_AEQ(&pfoe->addr, &sk->addr[1], sk->af)) ||
			    (pfoe->dir == PF_IN &&
			    PF_AEQ(&pfoe->addr, &sk->addr[0], sk->af)))) {
				s->timeout = PFTM_PURGE;
				s->src.state = s->dst.state = TCPS_CLOSED;
				killed++;
			}
		}
		PF_HASHROW_UNLOCK(ih);
	}
	SLIST_FOREACH_SAFE(pfoe, &queue, next, pfoe1)
		free(pfoe, M_PFTEMP);
	if (V_pf_status.debug >= PF_DEBUG_MISC)
		printf("%s: %u states killed\n", __func__, killed);

	CURVNET_RESTORE();
}

/*
 * If returnlocked is set and the node is not found, return with the
 * hash row still locked, so that the caller can consistently allocate
 * and insert a new one.
 */
struct pf_src_node *
pf_find_src_node(struct pf_addr *src, struct pf_rule *rule, sa_family_t af,
	int returnlocked)
{
	struct pf_srchash *sh;
	struct pf_src_node *n;

	V_pf_status.scounters[SCNT_SRC_NODE_SEARCH]++;

	sh = &V_pf_srchash[pf_hashsrc(src, af)];
	PF_HASHROW_LOCK(sh);
	LIST_FOREACH(n, &sh->nodes, entry)
		if (n->rule.ptr == rule && n->af == af &&
		    ((af == AF_INET && n->addr.v4.s_addr == src->v4.s_addr) ||
		    (af == AF_INET6 && bcmp(&n->addr, src, sizeof(*src)) == 0)))
			break;
	if (n != NULL || returnlocked == 0)
		PF_HASHROW_UNLOCK(sh);

	return (n);
}

static int
pf_insert_src_node(struct pf_src_node **sn, struct pf_rule *rule,
    struct pf_addr *src, sa_family_t af)
{

	KASSERT((rule->rule_flag & PFRULE_RULESRCTRACK ||
	    rule->rpool.opts & PF_POOL_STICKYADDR),
	    ("%s for non-tracking rule %p", __func__, rule));

	if (*sn == NULL)
		*sn = pf_find_src_node(src, rule, af, 1);

	if (*sn == NULL) {
		struct pf_srchash *sh = &V_pf_srchash[pf_hashsrc(src, af)];

		PF_HASHROW_ASSERT(sh);

		if (!rule->max_src_nodes ||
		    counter_u64_fetch(rule->src_nodes) < rule->max_src_nodes)
			(*sn) = uma_zalloc(V_pf_sources_z, M_NOWAIT | M_ZERO);
		else
			V_pf_status.lcounters[LCNT_SRCNODES]++;
		if ((*sn) == NULL) {
			PF_HASHROW_UNLOCK(sh);
			return (-1);
		}

		pf_init_threshold(&(*sn)->conn_rate,
		    rule->max_src_conn_rate.limit,
		    rule->max_src_conn_rate.seconds);

		(*sn)->af = af;
		(*sn)->rule.ptr = rule;
		PF_ACPY(&(*sn)->addr, src, af);
		LIST_INSERT_HEAD(&sh->nodes, *sn, entry);
		(*sn)->creation = time_uptime;
		(*sn)->ruletype = rule->action;
		if ((*sn)->rule.ptr != NULL)
			counter_u64_add((*sn)->rule.ptr->src_nodes, 1);
		PF_HASHROW_UNLOCK(sh);
		V_pf_status.scounters[SCNT_SRC_NODE_INSERT]++;
		V_pf_status.src_nodes++;
	} else {
		if (rule->max_src_states &&
		    (*sn)->states >= rule->max_src_states) {
			V_pf_status.lcounters[LCNT_SRCSTATES]++;
			return (-1);
		}
	}
	return (0);
}

void
pf_unlink_src_node_locked(struct pf_src_node *src)
{
#ifdef INVARIANTS
	struct pf_srchash *sh;

	sh = &V_pf_srchash[pf_hashsrc(&src->addr, src->af)];
	PF_HASHROW_ASSERT(sh);
#endif
	LIST_REMOVE(src, entry);
	if (src->rule.ptr)
		counter_u64_add(src->rule.ptr->src_nodes, -1);
	V_pf_status.scounters[SCNT_SRC_NODE_REMOVALS]++;
	V_pf_status.src_nodes--;
}

void
pf_unlink_src_node(struct pf_src_node *src)
{
	struct pf_srchash *sh;

	sh = &V_pf_srchash[pf_hashsrc(&src->addr, src->af)];
	PF_HASHROW_LOCK(sh);
	pf_unlink_src_node_locked(src);
	PF_HASHROW_UNLOCK(sh);
}

static void
pf_free_src_node(struct pf_src_node *sn)
{

	KASSERT(sn->states == 0, ("%s: %p has refs", __func__, sn));
	uma_zfree(V_pf_sources_z, sn);
}

u_int
pf_free_src_nodes(struct pf_src_node_list *head)
{
	struct pf_src_node *sn, *tmp;
	u_int count = 0;

	LIST_FOREACH_SAFE(sn, head, entry, tmp) {
		pf_free_src_node(sn);
		count++;
	}

	return (count);
}

/* Data storage structures initialization. */
void
pf_initialize()
{
	struct pf_keyhash	*kh;
	struct pf_idhash	*ih;
	struct pf_srchash	*sh;
	u_int i;

	TUNABLE_ULONG_FETCH("net.pf.states_hashsize", &V_pf_hashsize);
	if (V_pf_hashsize == 0 || !powerof2(V_pf_hashsize))
		V_pf_hashsize = PF_HASHSIZ;
	TUNABLE_ULONG_FETCH("net.pf.source_nodes_hashsize", &V_pf_srchashsize);
	if (V_pf_srchashsize == 0 || !powerof2(V_pf_srchashsize))
		V_pf_srchashsize = PF_HASHSIZ / 4;

	V_pf_hashseed = arc4random();

	/* States and state keys storage. */
	V_pf_state_z = uma_zcreate("pf states", sizeof(struct pf_state),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	V_pf_limits[PF_LIMIT_STATES].zone = V_pf_state_z;
	uma_zone_set_max(V_pf_state_z, PFSTATE_HIWAT);
	uma_zone_set_warning(V_pf_state_z, "PF states limit reached");

	V_pf_state_key_z = uma_zcreate("pf state keys",
	    sizeof(struct pf_state_key), pf_state_key_ctor, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, 0);
	V_pf_keyhash = malloc(V_pf_hashsize * sizeof(struct pf_keyhash),
	    M_PFHASH, M_WAITOK | M_ZERO);
	V_pf_idhash = malloc(V_pf_hashsize * sizeof(struct pf_idhash),
	    M_PFHASH, M_WAITOK | M_ZERO);
	V_pf_hashmask = V_pf_hashsize - 1;
	for (i = 0, kh = V_pf_keyhash, ih = V_pf_idhash; i <= V_pf_hashmask;
	    i++, kh++, ih++) {
		mtx_init(&kh->lock, "pf_keyhash", NULL, MTX_DEF | MTX_DUPOK);
		mtx_init(&ih->lock, "pf_idhash", NULL, MTX_DEF);
	}

	/* Source nodes. */
	V_pf_sources_z = uma_zcreate("pf source nodes",
	    sizeof(struct pf_src_node), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
	    0);
	V_pf_limits[PF_LIMIT_SRC_NODES].zone = V_pf_sources_z;
	uma_zone_set_max(V_pf_sources_z, PFSNODE_HIWAT);
	uma_zone_set_warning(V_pf_sources_z, "PF source nodes limit reached");
	V_pf_srchash = malloc(V_pf_srchashsize * sizeof(struct pf_srchash),
	  M_PFHASH, M_WAITOK|M_ZERO);
	V_pf_srchashmask = V_pf_srchashsize - 1;
	for (i = 0, sh = V_pf_srchash; i <= V_pf_srchashmask; i++, sh++)
		mtx_init(&sh->lock, "pf_srchash", NULL, MTX_DEF);

	/* ALTQ */
	TAILQ_INIT(&V_pf_altqs[0]);
	TAILQ_INIT(&V_pf_altqs[1]);
	TAILQ_INIT(&V_pf_pabuf);
	V_pf_altqs_active = &V_pf_altqs[0];
	V_pf_altqs_inactive = &V_pf_altqs[1];

	/* Mbuf tags */
	V_pf_mtag_z = uma_zcreate("pf mtags", sizeof(struct m_tag) +
	    sizeof(struct pf_mtag), NULL, NULL, pf_mtag_init, NULL,
	    UMA_ALIGN_PTR, 0);

	/* Send & overload+flush queues. */
	STAILQ_INIT(&V_pf_sendqueue);
	SLIST_INIT(&V_pf_overloadqueue);
	TASK_INIT(&V_pf_overloadtask, 0, pf_overload_task, curvnet);
	mtx_init(&pf_sendqueue_mtx, "pf send queue", NULL, MTX_DEF);
	mtx_init(&pf_overloadqueue_mtx, "pf overload/flush queue", NULL,
	    MTX_DEF);

	/* Rules that are unlinked but may still be referenced. */
	TAILQ_INIT(&V_pf_unlinked_rules);
	mtx_init(&pf_unlnkdrules_mtx, "pf unlinked rules", NULL, MTX_DEF);
}

void
pf_cleanup()
{
	struct pf_keyhash	*kh;
	struct pf_idhash	*ih;
	struct pf_srchash	*sh;
	struct pf_send_entry	*pfse, *next;
	u_int i;

	for (i = 0, kh = V_pf_keyhash, ih = V_pf_idhash; i <= V_pf_hashmask;
	    i++, kh++, ih++) {
		KASSERT(LIST_EMPTY(&kh->keys), ("%s: key hash not empty",
		    __func__));
		KASSERT(LIST_EMPTY(&ih->states), ("%s: id hash not empty",
		    __func__));
		mtx_destroy(&kh->lock);
		mtx_destroy(&ih->lock);
	}
	free(V_pf_keyhash, M_PFHASH);
	free(V_pf_idhash, M_PFHASH);

	for (i = 0, sh = V_pf_srchash; i <= V_pf_srchashmask; i++, sh++) {
		KASSERT(LIST_EMPTY(&sh->nodes),
		    ("%s: source node hash not empty", __func__));
		mtx_destroy(&sh->lock);
	}
	free(V_pf_srchash, M_PFHASH);

	STAILQ_FOREACH_SAFE(pfse, &V_pf_sendqueue, pfse_next, next) {
		m_freem(pfse->pfse_m);
		free(pfse, M_PFTEMP);
	}

	mtx_destroy(&pf_sendqueue_mtx);
	mtx_destroy(&pf_overloadqueue_mtx);
	mtx_destroy(&pf_unlnkdrules_mtx);

	uma_zdestroy(V_pf_mtag_z);
	uma_zdestroy(V_pf_sources_z);
	uma_zdestroy(V_pf_state_z);
	uma_zdestroy(V_pf_state_key_z);
}

static int
pf_mtag_init(void *mem, int size, int how)
{
	struct m_tag *t;

	t = (struct m_tag *)mem;
	t->m_tag_cookie = MTAG_ABI_COMPAT;
	t->m_tag_id = PACKET_TAG_PF;
	t->m_tag_len = sizeof(struct pf_mtag);
	t->m_tag_free = pf_mtag_free;

	return (0);
}

static void
pf_mtag_free(struct m_tag *t)
{

	uma_zfree(V_pf_mtag_z, t);
}

struct pf_mtag *
pf_get_mtag(struct mbuf *m)
{
	struct m_tag *mtag;

	if ((mtag = m_tag_find(m, PACKET_TAG_PF, NULL)) != NULL)
		return ((struct pf_mtag *)(mtag + 1));

	mtag = uma_zalloc(V_pf_mtag_z, M_NOWAIT);
	if (mtag == NULL)
		return (NULL);
	bzero(mtag + 1, sizeof(struct pf_mtag));
	m_tag_prepend(m, mtag);

	return ((struct pf_mtag *)(mtag + 1));
}

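/*
 * Attach the wire and stack keys to a freshly created state, sharing an
 * existing hashed key if an identical one is present.  On success the
 * state's ID hash row is returned locked; on a collision with a live
 * state, EEXIST is returned and the state is left detached.
 */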
static int
pf_state_key_attach(struct pf_state_key *skw, struct pf_state_key *sks,
    struct pf_state *s)
{
	struct pf_keyhash	*khs, *khw, *kh;
	struct pf_state_key	*sk, *cur;
	struct pf_state		*si, *olds = NULL;
	int idx;

	KASSERT(s->refs == 0, ("%s: state not pristine", __func__));
	KASSERT(s->key[PF_SK_WIRE] == NULL, ("%s: state has key", __func__));
	KASSERT(s->key[PF_SK_STACK] == NULL, ("%s: state has key", __func__));

	/*
	 * We need to lock hash slots of both keys. To avoid deadlock
	 * we always lock the slot with lower address first. Unlock order
	 * isn't important.
	 *
	 * We also need to lock ID hash slot before dropping key
	 * locks. On success we return with ID hash slot locked.
	 */

	if (skw == sks) {
		khs = khw = &V_pf_keyhash[pf_hashkey(skw)];
		PF_HASHROW_LOCK(khs);
	} else {
		khs = &V_pf_keyhash[pf_hashkey(sks)];
		khw = &V_pf_keyhash[pf_hashkey(skw)];
		if (khs == khw) {
			PF_HASHROW_LOCK(khs);
		} else if (khs < khw) {
			PF_HASHROW_LOCK(khs);
			PF_HASHROW_LOCK(khw);
		} else {
			PF_HASHROW_LOCK(khw);
			PF_HASHROW_LOCK(khs);
		}
	}

#define	KEYS_UNLOCK()	do {			\
	if (khs != khw) {			\
		PF_HASHROW_UNLOCK(khs);		\
		PF_HASHROW_UNLOCK(khw);		\
	} else					\
		PF_HASHROW_UNLOCK(khs);		\
} while (0)

	/*
	 * First run: start with wire key.
	 */
	sk = skw;
	kh = khw;
	idx = PF_SK_WIRE;

keyattach:
	LIST_FOREACH(cur, &kh->keys, entry)
		if (bcmp(cur, sk, sizeof(struct pf_state_key_cmp)) == 0)
			break;

	if (cur != NULL) {
		/* Key exists. Check for a colliding state; if none, share the key. */
		TAILQ_FOREACH(si, &cur->states[idx], key_list[idx]) {
			struct pf_idhash *ih = &V_pf_idhash[PF_IDHASH(si)];

			PF_HASHROW_LOCK(ih);
			if (si->kif == s->kif &&
			    si->direction == s->direction) {
				if (sk->proto == IPPROTO_TCP &&
				    si->src.state >= TCPS_FIN_WAIT_2 &&
				    si->dst.state >= TCPS_FIN_WAIT_2) {
					/*
					 * New state matches an old >FIN_WAIT_2
					 * state. We can't drop key hash locks,
					 * thus we can't unlink it properly.
					 *
					 * As a workaround we drop it into
					 * TCPS_CLOSED state, schedule purge
					 * ASAP and push it into the very end
					 * of the slot TAILQ, so that it won't
					 * conflict with our new state.
					 */
					si->src.state = si->dst.state =
					    TCPS_CLOSED;
					si->timeout = PFTM_PURGE;
					olds = si;
				} else {
					if (V_pf_status.debug >= PF_DEBUG_MISC) {
						printf("pf: %s key attach "
						    "failed on %s: ",
						    (idx == PF_SK_WIRE) ?
						    "wire" : "stack",
						    s->kif->pfik_name);
						pf_print_state_parts(s,
						    (idx == PF_SK_WIRE) ?
						    sk : NULL,
						    (idx == PF_SK_STACK) ?
						    sk : NULL);
						printf(", existing: ");
						pf_print_state_parts(si,
						    (idx == PF_SK_WIRE) ?
						    sk : NULL,
						    (idx == PF_SK_STACK) ?
						    sk : NULL);
						printf("\n");
					}
					PF_HASHROW_UNLOCK(ih);
					KEYS_UNLOCK();
					uma_zfree(V_pf_state_key_z, sk);
					if (idx == PF_SK_STACK)
						pf_detach_state(s);
					return (EEXIST); /* collision! */
				}
			}
			PF_HASHROW_UNLOCK(ih);
		}
		uma_zfree(V_pf_state_key_z, sk);
		s->key[idx] = cur;
	} else {
		LIST_INSERT_HEAD(&kh->keys, sk, entry);
		s->key[idx] = sk;
	}

stateattach:
	/* List is sorted, if-bound states before floating. */
	if (s->kif == V_pfi_all)
		TAILQ_INSERT_TAIL(&s->key[idx]->states[idx], s, key_list[idx]);
	else
		TAILQ_INSERT_HEAD(&s->key[idx]->states[idx], s, key_list[idx]);

	if (olds) {
		TAILQ_REMOVE(&s->key[idx]->states[idx], olds, key_list[idx]);
		TAILQ_INSERT_TAIL(&s->key[idx]->states[idx], olds,
		    key_list[idx]);
		olds = NULL;
	}

	/*
	 * Attach done. Decide whether a second key must be
	 * attached, and how.
	 */
	if (sks == skw) {
		s->key[PF_SK_STACK] = s->key[PF_SK_WIRE];
		idx = PF_SK_STACK;
		sks = NULL;
		goto stateattach;
	} else if (sks != NULL) {
		/*
		 * Continue attaching with stack key.
		 */
		sk = sks;
		kh = khs;
		idx = PF_SK_STACK;
		sks = NULL;
		goto keyattach;
	}

	PF_STATE_LOCK(s);
	KEYS_UNLOCK();

	KASSERT(s->key[PF_SK_WIRE] != NULL && s->key[PF_SK_STACK] != NULL,
	    ("%s failure", __func__));

	return (0);
#undef	KEYS_UNLOCK
}

static void
pf_detach_state(struct pf_state *s)
{
	struct pf_state_key *sks = s->key[PF_SK_STACK];
	struct pf_keyhash *kh;

	if (sks != NULL) {
		kh = &V_pf_keyhash[pf_hashkey(sks)];
		PF_HASHROW_LOCK(kh);
		if (s->key[PF_SK_STACK] != NULL)
			pf_state_key_detach(s, PF_SK_STACK);
		/*
		 * If both slots point to the same key, we are done.
		 */
		if (sks == s->key[PF_SK_WIRE]) {
			pf_state_key_detach(s, PF_SK_WIRE);
			PF_HASHROW_UNLOCK(kh);
			return;
		}
		PF_HASHROW_UNLOCK(kh);
	}

	if (s->key[PF_SK_WIRE] != NULL) {
		kh = &V_pf_keyhash[pf_hashkey(s->key[PF_SK_WIRE])];
		PF_HASHROW_LOCK(kh);
		if (s->key[PF_SK_WIRE] != NULL)
			pf_state_key_detach(s, PF_SK_WIRE);
		PF_HASHROW_UNLOCK(kh);
	}
}

static void
pf_state_key_detach(struct pf_state *s, int idx)
{
	struct pf_state_key *sk = s->key[idx];
#ifdef INVARIANTS
	struct pf_keyhash *kh = &V_pf_keyhash[pf_hashkey(sk)];

	PF_HASHROW_ASSERT(kh);
#endif
	TAILQ_REMOVE(&sk->states[idx], s, key_list[idx]);
	s->key[idx] = NULL;

	if (TAILQ_EMPTY(&sk->states[0]) && TAILQ_EMPTY(&sk->states[1])) {
		LIST_REMOVE(sk, entry);
		uma_zfree(V_pf_state_key_z, sk);
	}
}

static int
pf_state_key_ctor(void *mem, int size, void *arg, int flags)
{
	struct pf_state_key *sk = mem;

	bzero(sk, sizeof(struct pf_state_key_cmp));
	TAILQ_INIT(&sk->states[PF_SK_WIRE]);
	TAILQ_INIT(&sk->states[PF_SK_STACK]);

	return (0);
}

struct pf_state_key *
pf_state_key_setup(struct pf_pdesc *pd, struct pf_addr *saddr,
	struct pf_addr *daddr, u_int16_t sport, u_int16_t dport)
{
	struct pf_state_key *sk;

	sk = uma_zalloc(V_pf_state_key_z, M_NOWAIT);
	if (sk == NULL)
		return (NULL);

	PF_ACPY(&sk->addr[pd->sidx], saddr, pd->af);
	PF_ACPY(&sk->addr[pd->didx], daddr, pd->af);
	sk->port[pd->sidx] = sport;
	sk->port[pd->didx] = dport;
	sk->proto = pd->proto;
	sk->af = pd->af;

	return (sk);
}

struct pf_state_key *
pf_state_key_clone(struct pf_state_key *orig)
{
	struct pf_state_key *sk;

	sk = uma_zalloc(V_pf_state_key_z, M_NOWAIT);
	if (sk == NULL)
		return (NULL);

	bcopy(orig, sk, sizeof(struct pf_state_key_cmp));

	return (sk);
}

int
pf_state_insert(struct pfi_kif *kif, struct pf_state_key *skw,
    struct pf_state_key *sks, struct pf_state *s)
{
	struct pf_idhash *ih;
	struct pf_state *cur;
	int error;

	KASSERT(TAILQ_EMPTY(&sks->states[0]) && TAILQ_EMPTY(&sks->states[1]),
	    ("%s: sks not pristine", __func__));
	KASSERT(TAILQ_EMPTY(&skw->states[0]) && TAILQ_EMPTY(&skw->states[1]),
	    ("%s: skw not pristine", __func__));
	KASSERT(s->refs == 0, ("%s: state not pristine", __func__));

	s->kif = kif;

	if (s->id == 0 && s->creatorid == 0) {
		/* XXX: should be atomic, but probability of collision low */
		if ((s->id = V_pf_stateid[curcpu]++) == PFID_MAXID)
			V_pf_stateid[curcpu] = 1;
		s->id |= (uint64_t)curcpu << PFID_CPUSHIFT;
		s->id = htobe64(s->id);
		s->creatorid = V_pf_status.hostid;
	}

	/* Returns with ID locked on success. */
	if ((error = pf_state_key_attach(skw, sks, s)) != 0)
		return (error);

	ih = &V_pf_idhash[PF_IDHASH(s)];
	PF_HASHROW_ASSERT(ih);
	LIST_FOREACH(cur, &ih->states, entry)
		if (cur->id == s->id && cur->creatorid == s->creatorid)
			break;

	if (cur != NULL) {
		PF_HASHROW_UNLOCK(ih);
		if (V_pf_status.debug >= PF_DEBUG_MISC) {
			printf("pf: state ID collision: "
			    "id: %016llx creatorid: %08x\n",
			    (unsigned long long)be64toh(s->id),
			    ntohl(s->creatorid));
		}
		pf_detach_state(s);
		return (EEXIST);
	}
	LIST_INSERT_HEAD(&ih->states, s, entry);
	/* One for keys, one for ID hash. */
	refcount_init(&s->refs, 2);

	V_pf_status.fcounters[FCNT_STATE_INSERT]++;
	if (pfsync_insert_state_ptr != NULL)
		pfsync_insert_state_ptr(s);

	/* Returns locked. */
	return (0);
}

/*
 * Find state by ID: returns with locked row on success.
 */
struct pf_state *
pf_find_state_byid(uint64_t id, uint32_t creatorid)
{
	struct pf_idhash *ih;
	struct pf_state *s;

	V_pf_status.fcounters[FCNT_STATE_SEARCH]++;

	ih = &V_pf_idhash[(be64toh(id) % (V_pf_hashmask + 1))];

	PF_HASHROW_LOCK(ih);
	LIST_FOREACH(s, &ih->states, entry)
		if (s->id == id && s->creatorid == creatorid)
			break;

	if (s == NULL)
		PF_HASHROW_UNLOCK(ih);

	return (s);
}

/*
 * Find state by key.
 * Returns with ID hash slot locked on success.
 */
static struct pf_state *
pf_find_state(struct pfi_kif *kif, struct pf_state_key_cmp *key, u_int dir)
{
	struct pf_keyhash	*kh;
	struct pf_state_key	*sk;
	struct pf_state		*s;
	int idx;

	V_pf_status.fcounters[FCNT_STATE_SEARCH]++;

	kh = &V_pf_keyhash[pf_hashkey((struct pf_state_key *)key)];

	PF_HASHROW_LOCK(kh);
	LIST_FOREACH(sk, &kh->keys, entry)
		if (bcmp(sk, key, sizeof(struct pf_state_key_cmp)) == 0)
			break;
	if (sk == NULL) {
		PF_HASHROW_UNLOCK(kh);
		return (NULL);
	}

	idx = (dir == PF_IN ? PF_SK_WIRE : PF_SK_STACK);

	/* List is sorted, if-bound states before floating ones. */
	TAILQ_FOREACH(s, &sk->states[idx], key_list[idx])
		if (s->kif == V_pfi_all || s->kif == kif) {
			PF_STATE_LOCK(s);
			PF_HASHROW_UNLOCK(kh);
			if (s->timeout >= PFTM_MAX) {
				/*
				 * State is either being processed by
				 * pf_unlink_state() in another thread, or
				 * is scheduled for immediate expiry.
				 */
				PF_STATE_UNLOCK(s);
				return (NULL);
			}
			return (s);
		}
	PF_HASHROW_UNLOCK(kh);

	return (NULL);
}

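/*
 * Like pf_find_state(), but not bound to an interface: with PF_INOUT
 * both the wire and stack directions are searched.  If 'more' is
 * non-NULL, it is incremented for every match beyond the first one,
 * which is returned.  Unlike pf_find_state(), returns unlocked.
 */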
struct pf_state *
pf_find_state_all(struct pf_state_key_cmp *key, u_int dir, int *more)
{
	struct pf_keyhash	*kh;
	struct pf_state_key	*sk;
	struct pf_state		*s, *ret = NULL;
	int			 idx, inout = 0;

	V_pf_status.fcounters[FCNT_STATE_SEARCH]++;

	kh = &V_pf_keyhash[pf_hashkey((struct pf_state_key *)key)];

	PF_HASHROW_LOCK(kh);
	LIST_FOREACH(sk, &kh->keys, entry)
		if (bcmp(sk, key, sizeof(struct pf_state_key_cmp)) == 0)
			break;
	if (sk == NULL) {
		PF_HASHROW_UNLOCK(kh);
		return (NULL);
	}
	switch (dir) {
	case PF_IN:
		idx = PF_SK_WIRE;
		break;
	case PF_OUT:
		idx = PF_SK_STACK;
		break;
	case PF_INOUT:
		idx = PF_SK_WIRE;
		inout = 1;
		break;
	default:
		panic("%s: dir %u", __func__, dir);
	}
second_run:
	TAILQ_FOREACH(s, &sk->states[idx], key_list[idx]) {
		if (more == NULL) {
			PF_HASHROW_UNLOCK(kh);
			return (s);
		}

		if (ret)
			(*more)++;
		else
			ret = s;
	}
	if (inout == 1) {
		inout = 0;
		idx = PF_SK_STACK;
		goto second_run;
	}
	PF_HASHROW_UNLOCK(kh);

	return (ret);
}

/* END state table stuff */

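/*
 * Packets generated by pf are not transmitted from within packet
 * processing.  pf_send() only queues them and schedules a software
 * interrupt; pf_intr() later dequeues the entries and hands them to
 * ip{,6}_output() or the ICMP error routines.
 */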
static void
pf_send(struct pf_send_entry *pfse)
{

	PF_SENDQ_LOCK();
	STAILQ_INSERT_TAIL(&V_pf_sendqueue, pfse, pfse_next);
	PF_SENDQ_UNLOCK();
	swi_sched(V_pf_swi_cookie, 0);
}

void
pf_intr(void *v)
{
	struct pf_send_head queue;
	struct pf_send_entry *pfse, *next;

	CURVNET_SET((struct vnet *)v);

	PF_SENDQ_LOCK();
	queue = V_pf_sendqueue;
	STAILQ_INIT(&V_pf_sendqueue);
	PF_SENDQ_UNLOCK();

	STAILQ_FOREACH_SAFE(pfse, &queue, pfse_next, next) {
		switch (pfse->pfse_type) {
#ifdef INET
		case PFSE_IP:
			ip_output(pfse->pfse_m, NULL, NULL, 0, NULL, NULL);
			break;
		case PFSE_ICMP:
			icmp_error(pfse->pfse_m, pfse->pfse_icmp_type,
			    pfse->pfse_icmp_code, 0, pfse->pfse_icmp_mtu);
			break;
#endif /* INET */
#ifdef INET6
		case PFSE_IP6:
			ip6_output(pfse->pfse_m, NULL, NULL, 0, NULL, NULL,
			    NULL);
			break;
		case PFSE_ICMP6:
			icmp6_error(pfse->pfse_m, pfse->pfse_icmp_type,
			    pfse->pfse_icmp_code, pfse->pfse_icmp_mtu);
			break;
#endif /* INET6 */
		default:
			panic("%s: unknown type", __func__);
		}
		free(pfse, M_PFTEMP);
	}
	CURVNET_RESTORE();
}

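/*
 * The purge thread wakes up ten times per second (hz / 10) and walks a
 * fraction of the state table on each run, so that a full sweep over
 * all states completes once per PFTM_INTERVAL seconds.
 */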
void
pf_purge_thread(void *v)
{
	u_int idx = 0;

	CURVNET_SET((struct vnet *)v);

	for (;;) {
		PF_RULES_RLOCK();
		rw_sleep(pf_purge_thread, &pf_rules_lock, 0, "pftm", hz / 10);

		if (V_pf_end_threads) {
			/*
			 * To clean up all kifs and rules we need two
			 * runs: the first one clears reference flags,
			 * then pf_purge_expired_states() won't raise
			 * them again, and the second run frees.
			 */
			PF_RULES_RUNLOCK();
			pf_purge_unlinked_rules();
			pfi_kif_purge();

			/*
			 * Now purge everything.
			 */
			pf_purge_expired_states(0, V_pf_hashmask);
			pf_purge_expired_fragments();
			pf_purge_expired_src_nodes();

			/*
			 * Now all kifs & rules should be unreferenced,
			 * thus should be successfully freed.
			 */
			pf_purge_unlinked_rules();
			pfi_kif_purge();

			/*
			 * Announce success and exit.
			 */
			PF_RULES_RLOCK();
			V_pf_end_threads++;
			PF_RULES_RUNLOCK();
			wakeup(pf_purge_thread);
			kproc_exit(0);
		}
		PF_RULES_RUNLOCK();

		/* Process 1/interval fraction of the state table every run. */
		idx = pf_purge_expired_states(idx, V_pf_hashmask /
			    (V_pf_default_rule.timeout[PFTM_INTERVAL] * 10));

		/* Purge other expired types every PFTM_INTERVAL seconds. */
		if (idx == 0) {
			/*
			 * Order is important:
			 * - states and src nodes reference rules
			 * - states and rules reference kifs
			 */
			pf_purge_expired_fragments();
			pf_purge_expired_src_nodes();
			pf_purge_unlinked_rules();
			pfi_kif_purge();
		}
	}
	/* not reached */
	CURVNET_RESTORE();
}

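/*
 * Compute the absolute expiry time of a state.  When adaptive timeouts
 * are active and the number of states lies between adaptive.start and
 * adaptive.end, the base timeout shrinks linearly:
 *
 *	expire + timeout * (end - states) / (end - start)
 *
 * and collapses to "expire now" once states reaches adaptive.end.
 */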
u_int32_t
pf_state_expires(const struct pf_state *state)
{
	u_int32_t	timeout;
	u_int32_t	start;
	u_int32_t	end;
	u_int32_t	states;

	/* handle all PFTM_* > PFTM_MAX here */
	if (state->timeout == PFTM_PURGE)
		return (time_uptime);
	KASSERT(state->timeout != PFTM_UNLINKED,
	    ("pf_state_expires: timeout == PFTM_UNLINKED"));
	KASSERT((state->timeout < PFTM_MAX),
	    ("pf_state_expires: timeout > PFTM_MAX"));
	timeout = state->rule.ptr->timeout[state->timeout];
	if (!timeout)
		timeout = V_pf_default_rule.timeout[state->timeout];
	start = state->rule.ptr->timeout[PFTM_ADAPTIVE_START];
	if (start) {
		end = state->rule.ptr->timeout[PFTM_ADAPTIVE_END];
		states = counter_u64_fetch(state->rule.ptr->states_cur);
	} else {
		start = V_pf_default_rule.timeout[PFTM_ADAPTIVE_START];
		end = V_pf_default_rule.timeout[PFTM_ADAPTIVE_END];
		states = V_pf_status.states;
	}
	if (end && states > start && start < end) {
		if (states < end)
			return (state->expire + timeout * (end - states) /
			    (end - start));
		else
			return (time_uptime);
	}
	return (state->expire + timeout);
}

void
pf_purge_expired_src_nodes()
{
	struct pf_src_node_list	 freelist;
	struct pf_srchash	*sh;
	struct pf_src_node	*cur, *next;
	int i;

	LIST_INIT(&freelist);
	for (i = 0, sh = V_pf_srchash; i <= V_pf_srchashmask; i++, sh++) {
	    PF_HASHROW_LOCK(sh);
	    LIST_FOREACH_SAFE(cur, &sh->nodes, entry, next)
		if (cur->states == 0 && cur->expire <= time_uptime) {
			pf_unlink_src_node_locked(cur);
			LIST_INSERT_HEAD(&freelist, cur, entry);
		} else if (cur->rule.ptr != NULL)
			cur->rule.ptr->rule_flag |= PFRULE_REFS;
	    PF_HASHROW_UNLOCK(sh);
	}

	pf_free_src_nodes(&freelist);
}

static void
pf_src_tree_remove_state(struct pf_state *s)
{
	u_int32_t timeout;

	if (s->src_node != NULL) {
		if (s->src.tcp_est)
			--s->src_node->conn;
		if (--s->src_node->states == 0) {
			timeout = s->rule.ptr->timeout[PFTM_SRC_NODE];
			if (!timeout)
				timeout =
				    V_pf_default_rule.timeout[PFTM_SRC_NODE];
			s->src_node->expire = time_uptime + timeout;
		}
	}
	if (s->nat_src_node != s->src_node && s->nat_src_node != NULL) {
		if (--s->nat_src_node->states == 0) {
			timeout = s->rule.ptr->timeout[PFTM_SRC_NODE];
			if (!timeout)
				timeout =
				    V_pf_default_rule.timeout[PFTM_SRC_NODE];
			s->nat_src_node->expire = time_uptime + timeout;
		}
	}
	s->src_node = s->nat_src_node = NULL;
}

/*
 * Unlink and potentially free a state. Function may be
 * called with ID hash row locked, but always returns
 * unlocked, since it needs to go through key hash locking.
 */
int
pf_unlink_state(struct pf_state *s, u_int flags)
{
	struct pf_idhash *ih = &V_pf_idhash[PF_IDHASH(s)];

	if ((flags & PF_ENTER_LOCKED) == 0)
		PF_HASHROW_LOCK(ih);
	else
		PF_HASHROW_ASSERT(ih);

	if (s->timeout == PFTM_UNLINKED) {
		/*
		 * State is being processed
		 * by pf_unlink_state() in
		 * another thread.
		 */
		PF_HASHROW_UNLOCK(ih);
		return (0);	/* XXXGL: undefined actually */
	}

	if (s->src.state == PF_TCPS_PROXY_DST) {
		/* XXX wire key the right one? */
		pf_send_tcp(NULL, s->rule.ptr, s->key[PF_SK_WIRE]->af,
		    &s->key[PF_SK_WIRE]->addr[1],
		    &s->key[PF_SK_WIRE]->addr[0],
		    s->key[PF_SK_WIRE]->port[1],
		    s->key[PF_SK_WIRE]->port[0],
		    s->src.seqhi, s->src.seqlo + 1,
		    TH_RST|TH_ACK, 0, 0, 0, 1, s->tag, NULL);
	}

	LIST_REMOVE(s, entry);
	pf_src_tree_remove_state(s);

	if (pfsync_delete_state_ptr != NULL)
		pfsync_delete_state_ptr(s);

	STATE_DEC_COUNTERS(s);

	s->timeout = PFTM_UNLINKED;

	PF_HASHROW_UNLOCK(ih);

	pf_detach_state(s);
	refcount_release(&s->refs);

	return (pf_release_state(s));
}

void
pf_free_state(struct pf_state *cur)
{

	KASSERT(cur->refs == 0, ("%s: %p has refs", __func__, cur));
	KASSERT(cur->timeout == PFTM_UNLINKED, ("%s: timeout %u", __func__,
	    cur->timeout));

	pf_normalize_tcp_cleanup(cur);
	uma_zfree(V_pf_state_z, cur);
	V_pf_status.fcounters[FCNT_STATE_REMOVALS]++;
}

/*
 * Called only from pf_purge_thread(), thus serialized.
 */
static u_int
pf_purge_expired_states(u_int i, int maxcheck)
{
	struct pf_idhash *ih;
	struct pf_state *s;

	V_pf_status.states = uma_zone_get_cur(V_pf_state_z);

	/*
	 * Go through hash and unlink states that expire now.
	 */
	while (maxcheck > 0) {

		ih = &V_pf_idhash[i];
relock:
		PF_HASHROW_LOCK(ih);
		LIST_FOREACH(s, &ih->states, entry) {
			if (pf_state_expires(s) <= time_uptime) {
				V_pf_status.states -=
				    pf_unlink_state(s, PF_ENTER_LOCKED);
				goto relock;
			}
			s->rule.ptr->rule_flag |= PFRULE_REFS;
			if (s->nat_rule.ptr != NULL)
				s->nat_rule.ptr->rule_flag |= PFRULE_REFS;
			if (s->anchor.ptr != NULL)
				s->anchor.ptr->rule_flag |= PFRULE_REFS;
			s->kif->pfik_flags |= PFI_IFLAG_REFS;
			if (s->rt_kif)
				s->rt_kif->pfik_flags |= PFI_IFLAG_REFS;
		}
		PF_HASHROW_UNLOCK(ih);

		/* Return when we hit end of hash. */
		if (++i > V_pf_hashmask) {
			V_pf_status.states = uma_zone_get_cur(V_pf_state_z);
			return (0);
		}

		maxcheck--;
	}

	V_pf_status.states = uma_zone_get_cur(V_pf_state_z);

	return (i);
}

static void
pf_purge_unlinked_rules()
{
	struct pf_rulequeue tmpq;
	struct pf_rule *r, *r1;

	/*
	 * If an overload task is pending, we'd better skip
	 * purging this time: there is a tiny probability that
	 * the overload task references an already unlinked
	 * rule.
	 */
	PF_OVERLOADQ_LOCK();
	if (!SLIST_EMPTY(&V_pf_overloadqueue)) {
		PF_OVERLOADQ_UNLOCK();
		return;
	}
	PF_OVERLOADQ_UNLOCK();

	/*
	 * Do naive mark-and-sweep garbage collecting of old rules.
	 * Reference flag is raised by pf_purge_expired_states()
	 * and pf_purge_expired_src_nodes().
	 *
	 * To avoid LOR between PF_UNLNKDRULES_LOCK/PF_RULES_WLOCK,
	 * use a temporary queue.
	 */
	TAILQ_INIT(&tmpq);
	PF_UNLNKDRULES_LOCK();
	TAILQ_FOREACH_SAFE(r, &V_pf_unlinked_rules, entries, r1) {
		if (!(r->rule_flag & PFRULE_REFS)) {
			TAILQ_REMOVE(&V_pf_unlinked_rules, r, entries);
			TAILQ_INSERT_TAIL(&tmpq, r, entries);
		} else
			r->rule_flag &= ~PFRULE_REFS;
	}
	PF_UNLNKDRULES_UNLOCK();

	if (!TAILQ_EMPTY(&tmpq)) {
		PF_RULES_WLOCK();
		TAILQ_FOREACH_SAFE(r, &tmpq, entries, r1) {
			TAILQ_REMOVE(&tmpq, r, entries);
			pf_free_rule(r);
		}
		PF_RULES_WUNLOCK();
	}
}

void
pf_print_host(struct pf_addr *addr, u_int16_t p, sa_family_t af)
{
	switch (af) {
#ifdef INET
	case AF_INET: {
		u_int32_t a = ntohl(addr->addr32[0]);
		printf("%u.%u.%u.%u", (a>>24)&255, (a>>16)&255,
		    (a>>8)&255, a&255);
		if (p) {
			p = ntohs(p);
			printf(":%u", p);
		}
		break;
	}
#endif /* INET */
#ifdef INET6
	case AF_INET6: {
		u_int16_t b;
		u_int8_t i, curstart, curend, maxstart, maxend;
		curstart = curend = maxstart = maxend = 255;
		for (i = 0; i < 8; i++) {
			if (!addr->addr16[i]) {
				if (curstart == 255)
					curstart = i;
				curend = i;
			} else {
				if ((curend - curstart) >
				    (maxend - maxstart)) {
					maxstart = curstart;
					maxend = curend;
				}
				curstart = curend = 255;
			}
		}
		if ((curend - curstart) >
		    (maxend - maxstart)) {
			maxstart = curstart;
			maxend = curend;
		}
		for (i = 0; i < 8; i++) {
			if (i >= maxstart && i <= maxend) {
				if (i == 0)
					printf(":");
				if (i == maxend)
					printf(":");
			} else {
				b = ntohs(addr->addr16[i]);
				printf("%x", b);
				if (i < 7)
					printf(":");
			}
		}
		if (p) {
			p = ntohs(p);
			printf("[%u]", p);
		}
		break;
	}
#endif /* INET6 */
	}
}

void
pf_print_state(struct pf_state *s)
{
	pf_print_state_parts(s, NULL, NULL);
}

static void
pf_print_state_parts(struct pf_state *s,
    struct pf_state_key *skwp, struct pf_state_key *sksp)
{
	struct pf_state_key *skw, *sks;
	u_int8_t proto, dir;

	/* Do our best to fill these, but they're skipped if NULL */
	skw = skwp ? skwp : (s ? s->key[PF_SK_WIRE] : NULL);
	sks = sksp ? sksp : (s ? s->key[PF_SK_STACK] : NULL);
	proto = skw ? skw->proto : (sks ? sks->proto : 0);
	dir = s ? s->direction : 0;

	switch (proto) {
	case IPPROTO_IPV4:
		printf("IPv4");
		break;
	case IPPROTO_IPV6:
		printf("IPv6");
		break;
	case IPPROTO_TCP:
		printf("TCP");
		break;
	case IPPROTO_UDP:
		printf("UDP");
		break;
	case IPPROTO_ICMP:
		printf("ICMP");
		break;
	case IPPROTO_ICMPV6:
		printf("ICMPv6");
		break;
	default:
		printf("%u", proto);
		break;
	}
	switch (dir) {
	case PF_IN:
		printf(" in");
		break;
	case PF_OUT:
		printf(" out");
		break;
	}
	if (skw) {
		printf(" wire: ");
		pf_print_host(&skw->addr[0], skw->port[0], skw->af);
		printf(" ");
		pf_print_host(&skw->addr[1], skw->port[1], skw->af);
	}
	if (sks) {
		printf(" stack: ");
		if (sks != skw) {
			pf_print_host(&sks->addr[0], sks->port[0], sks->af);
			printf(" ");
			pf_print_host(&sks->addr[1], sks->port[1], sks->af);
		} else
			printf("-");
	}
	if (s) {
		if (proto == IPPROTO_TCP) {
			printf(" [lo=%u high=%u win=%u modulator=%u",
			    s->src.seqlo, s->src.seqhi,
			    s->src.max_win, s->src.seqdiff);
			if (s->src.wscale && s->dst.wscale)
				printf(" wscale=%u",
				    s->src.wscale & PF_WSCALE_MASK);
			printf("]");
			printf(" [lo=%u high=%u win=%u modulator=%u",
			    s->dst.seqlo, s->dst.seqhi,
			    s->dst.max_win, s->dst.seqdiff);
			if (s->src.wscale && s->dst.wscale)
				printf(" wscale=%u",
				s->dst.wscale & PF_WSCALE_MASK);
			printf("]");
		}
		printf(" %u:%u", s->src.state, s->dst.state);
	}
}

void
pf_print_flags(u_int8_t f)
{
	if (f)
		printf(" ");
	if (f & TH_FIN)
		printf("F");
	if (f & TH_SYN)
		printf("S");
	if (f & TH_RST)
		printf("R");
	if (f & TH_PUSH)
		printf("P");
	if (f & TH_ACK)
		printf("A");
	if (f & TH_URG)
		printf("U");
	if (f & TH_ECE)
		printf("E");
	if (f & TH_CWR)
		printf("W");
}

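/*
 * Skip steps accelerate ruleset evaluation: for each field (interface,
 * direction, address family, protocol, addresses, ports) a rule's
 * skip[i] points at the next rule that differs in that field, so the
 * evaluator can jump over whole runs of rules that cannot match.
 */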
1882#define	PF_SET_SKIP_STEPS(i)					\
1883	do {							\
1884		while (head[i] != cur) {			\
1885			head[i]->skip[i].ptr = cur;		\
1886			head[i] = TAILQ_NEXT(head[i], entries);	\
1887		}						\
1888	} while (0)
1889
1890void
1891pf_calc_skip_steps(struct pf_rulequeue *rules)
1892{
1893	struct pf_rule *cur, *prev, *head[PF_SKIP_COUNT];
1894	int i;
1895
1896	cur = TAILQ_FIRST(rules);
1897	prev = cur;
1898	for (i = 0; i < PF_SKIP_COUNT; ++i)
1899		head[i] = cur;
1900	while (cur != NULL) {
1901
1902		if (cur->kif != prev->kif || cur->ifnot != prev->ifnot)
1903			PF_SET_SKIP_STEPS(PF_SKIP_IFP);
1904		if (cur->direction != prev->direction)
1905			PF_SET_SKIP_STEPS(PF_SKIP_DIR);
1906		if (cur->af != prev->af)
1907			PF_SET_SKIP_STEPS(PF_SKIP_AF);
1908		if (cur->proto != prev->proto)
1909			PF_SET_SKIP_STEPS(PF_SKIP_PROTO);
1910		if (cur->src.neg != prev->src.neg ||
1911		    pf_addr_wrap_neq(&cur->src.addr, &prev->src.addr))
1912			PF_SET_SKIP_STEPS(PF_SKIP_SRC_ADDR);
1913		if (cur->src.port[0] != prev->src.port[0] ||
1914		    cur->src.port[1] != prev->src.port[1] ||
1915		    cur->src.port_op != prev->src.port_op)
1916			PF_SET_SKIP_STEPS(PF_SKIP_SRC_PORT);
1917		if (cur->dst.neg != prev->dst.neg ||
1918		    pf_addr_wrap_neq(&cur->dst.addr, &prev->dst.addr))
1919			PF_SET_SKIP_STEPS(PF_SKIP_DST_ADDR);
1920		if (cur->dst.port[0] != prev->dst.port[0] ||
1921		    cur->dst.port[1] != prev->dst.port[1] ||
1922		    cur->dst.port_op != prev->dst.port_op)
1923			PF_SET_SKIP_STEPS(PF_SKIP_DST_PORT);
1924
1925		prev = cur;
1926		cur = TAILQ_NEXT(cur, entries);
1927	}
1928	for (i = 0; i < PF_SKIP_COUNT; ++i)
1929		PF_SET_SKIP_STEPS(i);
1930}
1931
1932static int
1933pf_addr_wrap_neq(struct pf_addr_wrap *aw1, struct pf_addr_wrap *aw2)
1934{
1935	if (aw1->type != aw2->type)
1936		return (1);
1937	switch (aw1->type) {
1938	case PF_ADDR_ADDRMASK:
1939	case PF_ADDR_RANGE:
1940		if (PF_ANEQ(&aw1->v.a.addr, &aw2->v.a.addr, 0))
1941			return (1);
1942		if (PF_ANEQ(&aw1->v.a.mask, &aw2->v.a.mask, 0))
1943			return (1);
1944		return (0);
1945	case PF_ADDR_DYNIFTL:
1946		return (aw1->p.dyn->pfid_kt != aw2->p.dyn->pfid_kt);
1947	case PF_ADDR_NOROUTE:
1948	case PF_ADDR_URPFFAILED:
1949		return (0);
1950	case PF_ADDR_TABLE:
1951		return (aw1->p.tbl != aw2->p.tbl);
1952	default:
1953		printf("invalid address type: %d\n", aw1->type);
1954		return (1);
1955	}
1956}
1957
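/*
 * Editor's note (illustrative): this is the classic incremental checksum
 * update in the spirit of RFC 1624, applied one 16-bit word at a time.
 * A worked example, assuming a header word changes from 0x1234 to 0x5678
 * under an existing checksum of 0xabcd:
 *
 *	l = 0xabcd + 0x1234 - 0x5678 = 0x6789	(no carry to fold)
 *
 * so 0x6789 is the updated checksum.  The udp flag preserves the special
 * UDP encoding: a checksum of zero means "none" and is left alone, and a
 * recomputed value that folds to zero goes out as 0xFFFF.
 */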
1958u_int16_t
1959pf_cksum_fixup(u_int16_t cksum, u_int16_t old, u_int16_t new, u_int8_t udp)
1960{
1961	u_int32_t	l;
1962
1963	if (udp && !cksum)
1964		return (0x0000);
1965	l = cksum + old - new;
1966	l = (l >> 16) + (l & 65535);
1967	l = l & 65535;
1968	if (udp && !l)
1969		return (0xFFFF);
1970	return (l);
1971}
1972
1973static void
1974pf_change_ap(struct pf_addr *a, u_int16_t *p, u_int16_t *ic, u_int16_t *pc,
1975    struct pf_addr *an, u_int16_t pn, u_int8_t u, sa_family_t af)
1976{
1977	struct pf_addr	ao;
1978	u_int16_t	po = *p;
1979
1980	PF_ACPY(&ao, a, af);
1981	PF_ACPY(a, an, af);
1982
1983	*p = pn;
1984
1985	switch (af) {
1986#ifdef INET
1987	case AF_INET:
1988		*ic = pf_cksum_fixup(pf_cksum_fixup(*ic,
1989		    ao.addr16[0], an->addr16[0], 0),
1990		    ao.addr16[1], an->addr16[1], 0);
1991		*p = pn;
1992		*pc = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(*pc,
1993		    ao.addr16[0], an->addr16[0], u),
1994		    ao.addr16[1], an->addr16[1], u),
1995		    po, pn, u);
1996		break;
1997#endif /* INET */
1998#ifdef INET6
1999	case AF_INET6:
2000		*pc = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
2001		    pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
2002		    pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(*pc,
2003		    ao.addr16[0], an->addr16[0], u),
2004		    ao.addr16[1], an->addr16[1], u),
2005		    ao.addr16[2], an->addr16[2], u),
2006		    ao.addr16[3], an->addr16[3], u),
2007		    ao.addr16[4], an->addr16[4], u),
2008		    ao.addr16[5], an->addr16[5], u),
2009		    ao.addr16[6], an->addr16[6], u),
2010		    ao.addr16[7], an->addr16[7], u),
2011		    po, pn, u);
2012		break;
2013#endif /* INET6 */
2014	}
2015}
2016
2017
2018/* Changes a u_int32_t.  Uses a void * so there are no alignment restrictions. */
2019void
2020pf_change_a(void *a, u_int16_t *c, u_int32_t an, u_int8_t u)
2021{
2022	u_int32_t	ao;
2023
2024	memcpy(&ao, a, sizeof(ao));
2025	memcpy(a, &an, sizeof(u_int32_t));
2026	*c = pf_cksum_fixup(pf_cksum_fixup(*c, ao / 65536, an / 65536, u),
2027	    ao % 65536, an % 65536, u);
2028}
2029
2030#ifdef INET6
2031static void
2032pf_change_a6(struct pf_addr *a, u_int16_t *c, struct pf_addr *an, u_int8_t u)
2033{
2034	struct pf_addr	ao;
2035
2036	PF_ACPY(&ao, a, AF_INET6);
2037	PF_ACPY(a, an, AF_INET6);
2038
2039	*c = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
2040	    pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
2041	    pf_cksum_fixup(pf_cksum_fixup(*c,
2042	    ao.addr16[0], an->addr16[0], u),
2043	    ao.addr16[1], an->addr16[1], u),
2044	    ao.addr16[2], an->addr16[2], u),
2045	    ao.addr16[3], an->addr16[3], u),
2046	    ao.addr16[4], an->addr16[4], u),
2047	    ao.addr16[5], an->addr16[5], u),
2048	    ao.addr16[6], an->addr16[6], u),
2049	    ao.addr16[7], an->addr16[7], u);
2050}
2051#endif /* INET6 */
2052
2053static void
2054pf_change_icmp(struct pf_addr *ia, u_int16_t *ip, struct pf_addr *oa,
2055    struct pf_addr *na, u_int16_t np, u_int16_t *pc, u_int16_t *h2c,
2056    u_int16_t *ic, u_int16_t *hc, u_int8_t u, sa_family_t af)
2057{
2058	struct pf_addr	oia, ooa;
2059
2060	PF_ACPY(&oia, ia, af);
2061	if (oa)
2062		PF_ACPY(&ooa, oa, af);
2063
2064	/* Change inner protocol port, fix inner protocol checksum. */
2065	if (ip != NULL) {
2066		u_int16_t	oip = *ip;
2067		u_int32_t	opc;
2068
2069		if (pc != NULL)
2070			opc = *pc;
2071		*ip = np;
2072		if (pc != NULL)
2073			*pc = pf_cksum_fixup(*pc, oip, *ip, u);
2074		*ic = pf_cksum_fixup(*ic, oip, *ip, 0);
2075		if (pc != NULL)
2076			*ic = pf_cksum_fixup(*ic, opc, *pc, 0);
2077	}
2078	/* Change inner ip address, fix inner ip and icmp checksums. */
2079	PF_ACPY(ia, na, af);
2080	switch (af) {
2081#ifdef INET
2082	case AF_INET: {
2083		u_int32_t	 oh2c = *h2c;
2084
2085		*h2c = pf_cksum_fixup(pf_cksum_fixup(*h2c,
2086		    oia.addr16[0], ia->addr16[0], 0),
2087		    oia.addr16[1], ia->addr16[1], 0);
2088		*ic = pf_cksum_fixup(pf_cksum_fixup(*ic,
2089		    oia.addr16[0], ia->addr16[0], 0),
2090		    oia.addr16[1], ia->addr16[1], 0);
2091		*ic = pf_cksum_fixup(*ic, oh2c, *h2c, 0);
2092		break;
2093	}
2094#endif /* INET */
2095#ifdef INET6
2096	case AF_INET6:
2097		*ic = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
2098		    pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
2099		    pf_cksum_fixup(pf_cksum_fixup(*ic,
2100		    oia.addr16[0], ia->addr16[0], u),
2101		    oia.addr16[1], ia->addr16[1], u),
2102		    oia.addr16[2], ia->addr16[2], u),
2103		    oia.addr16[3], ia->addr16[3], u),
2104		    oia.addr16[4], ia->addr16[4], u),
2105		    oia.addr16[5], ia->addr16[5], u),
2106		    oia.addr16[6], ia->addr16[6], u),
2107		    oia.addr16[7], ia->addr16[7], u);
2108		break;
2109#endif /* INET6 */
2110	}
2111	/* Outer ip address, fix outer ip or icmpv6 checksum, if necessary. */
2112	if (oa) {
2113		PF_ACPY(oa, na, af);
2114		switch (af) {
2115#ifdef INET
2116		case AF_INET:
2117			*hc = pf_cksum_fixup(pf_cksum_fixup(*hc,
2118			    ooa.addr16[0], oa->addr16[0], 0),
2119			    ooa.addr16[1], oa->addr16[1], 0);
2120			break;
2121#endif /* INET */
2122#ifdef INET6
2123		case AF_INET6:
2124			*ic = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
2125			    pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
2126			    pf_cksum_fixup(pf_cksum_fixup(*ic,
2127			    ooa.addr16[0], oa->addr16[0], u),
2128			    ooa.addr16[1], oa->addr16[1], u),
2129			    ooa.addr16[2], oa->addr16[2], u),
2130			    ooa.addr16[3], oa->addr16[3], u),
2131			    ooa.addr16[4], oa->addr16[4], u),
2132			    ooa.addr16[5], oa->addr16[5], u),
2133			    ooa.addr16[6], oa->addr16[6], u),
2134			    ooa.addr16[7], oa->addr16[7], u);
2135			break;
2136#endif /* INET6 */
2137		}
2138	}
2139}
2140
2141
2142/*
2143 * Need to modulate the sequence numbers in the TCP SACK option
2144 * (credits to Krzysztof Pfaff for report and patch)
2145 */
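/*
 * Editor's note (illustrative): on the wire a SACK option is laid out as
 *
 *	kind=5, len=2+8n, then n (left edge, right edge) pairs of 32-bit
 *	sequence numbers,
 *
 * which is why the loop below steps through the option body in
 * TCPOLEN_SACK-sized strides and shifts both edges of every block by the
 * same seqdiff that modulates th_seq.
 */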
2146static int
2147pf_modulate_sack(struct mbuf *m, int off, struct pf_pdesc *pd,
2148    struct tcphdr *th, struct pf_state_peer *dst)
2149{
2150	int hlen = (th->th_off << 2) - sizeof(*th), thoptlen = hlen;
2151	u_int8_t opts[TCP_MAXOLEN], *opt = opts;
2152	int copyback = 0, i, olen;
2153	struct sackblk sack;
2154
2155#define	TCPOLEN_SACKLEN	(TCPOLEN_SACK + 2)
2156	if (hlen < TCPOLEN_SACKLEN ||
2157	    !pf_pull_hdr(m, off + sizeof(*th), opts, hlen, NULL, NULL, pd->af))
2158		return (0);
2159
2160	while (hlen >= TCPOLEN_SACKLEN) {
2161		olen = opt[1];
2162		switch (*opt) {
2163		case TCPOPT_EOL:	/* FALLTHROUGH */
2164		case TCPOPT_NOP:
2165			opt++;
2166			hlen--;
2167			break;
2168		case TCPOPT_SACK:
2169			if (olen > hlen)
2170				olen = hlen;
2171			if (olen >= TCPOLEN_SACKLEN) {
2172				for (i = 2; i + TCPOLEN_SACK <= olen;
2173				    i += TCPOLEN_SACK) {
2174					memcpy(&sack, &opt[i], sizeof(sack));
2175					pf_change_a(&sack.start, &th->th_sum,
2176					    htonl(ntohl(sack.start) -
2177					    dst->seqdiff), 0);
2178					pf_change_a(&sack.end, &th->th_sum,
2179					    htonl(ntohl(sack.end) -
2180					    dst->seqdiff), 0);
2181					memcpy(&opt[i], &sack, sizeof(sack));
2182				}
2183				copyback = 1;
2184			}
2185			/* FALLTHROUGH */
2186		default:
2187			if (olen < 2)
2188				olen = 2;
2189			hlen -= olen;
2190			opt += olen;
2191		}
2192	}
2193
2194	if (copyback)
2195		m_copyback(m, off + sizeof(*th), thoptlen, (caddr_t)opts);
2196	return (copyback);
2197}
2198
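/*
 * Editor's note: pf_send_tcp() builds a bare TCP segment, optionally
 * carrying a single MSS option, and queues it on a pf_send_entry for
 * deferred transmission via pf_send().  In this file it generates the
 * RST+ACK segments for rules that return-rst and the SYN+ACK leg of the
 * synproxy handshake; the ifp argument appears unused in this revision.
 */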
2199static void
2200pf_send_tcp(struct mbuf *replyto, const struct pf_rule *r, sa_family_t af,
2201    const struct pf_addr *saddr, const struct pf_addr *daddr,
2202    u_int16_t sport, u_int16_t dport, u_int32_t seq, u_int32_t ack,
2203    u_int8_t flags, u_int16_t win, u_int16_t mss, u_int8_t ttl, int tag,
2204    u_int16_t rtag, struct ifnet *ifp)
2205{
2206	struct pf_send_entry *pfse;
2207	struct mbuf	*m;
2208	int		 len, tlen;
2209#ifdef INET
2210	struct ip	*h = NULL;
2211#endif /* INET */
2212#ifdef INET6
2213	struct ip6_hdr	*h6 = NULL;
2214#endif /* INET6 */
2215	struct tcphdr	*th;
2216	char		*opt;
2217	struct pf_mtag  *pf_mtag;
2218
2219	len = 0;
2220	th = NULL;
2221
2222	/* maximum segment size tcp option */
2223	tlen = sizeof(struct tcphdr);
2224	if (mss)
2225		tlen += 4;
2226
2227	switch (af) {
2228#ifdef INET
2229	case AF_INET:
2230		len = sizeof(struct ip) + tlen;
2231		break;
2232#endif /* INET */
2233#ifdef INET6
2234	case AF_INET6:
2235		len = sizeof(struct ip6_hdr) + tlen;
2236		break;
2237#endif /* INET6 */
2238	default:
2239		panic("%s: unsupported af %d", __func__, af);
2240	}
2241
2242	/* Allocate outgoing queue entry, mbuf and mbuf tag. */
2243	pfse = malloc(sizeof(*pfse), M_PFTEMP, M_NOWAIT);
2244	if (pfse == NULL)
2245		return;
2246	m = m_gethdr(M_NOWAIT, MT_DATA);
2247	if (m == NULL) {
2248		free(pfse, M_PFTEMP);
2249		return;
2250	}
2251#ifdef MAC
2252	mac_netinet_firewall_send(m);
2253#endif
2254	if ((pf_mtag = pf_get_mtag(m)) == NULL) {
2255		free(pfse, M_PFTEMP);
2256		m_freem(m);
2257		return;
2258	}
2259	if (tag)
2260		m->m_flags |= M_SKIP_FIREWALL;
2261	pf_mtag->tag = rtag;
2262
2263	if (r != NULL && r->rtableid >= 0)
2264		M_SETFIB(m, r->rtableid);
2265
2266#ifdef ALTQ
2267	if (r != NULL && r->qid) {
2268		pf_mtag->qid = r->qid;
2269
2270		/* add hints for ecn */
2271		pf_mtag->hdr = mtod(m, struct ip *);
2272	}
2273#endif /* ALTQ */
2274	m->m_data += max_linkhdr;
2275	m->m_pkthdr.len = m->m_len = len;
2276	m->m_pkthdr.rcvif = NULL;
2277	bzero(m->m_data, len);
2278	switch (af) {
2279#ifdef INET
2280	case AF_INET:
2281		h = mtod(m, struct ip *);
2282
2283		/* IP header fields included in the TCP checksum */
2284		h->ip_p = IPPROTO_TCP;
2285		h->ip_len = htons(tlen);
2286		h->ip_src.s_addr = saddr->v4.s_addr;
2287		h->ip_dst.s_addr = daddr->v4.s_addr;
2288
2289		th = (struct tcphdr *)((caddr_t)h + sizeof(struct ip));
2290		break;
2291#endif /* INET */
2292#ifdef INET6
2293	case AF_INET6:
2294		h6 = mtod(m, struct ip6_hdr *);
2295
2296		/* IP header fields included in the TCP checksum */
2297		h6->ip6_nxt = IPPROTO_TCP;
2298		h6->ip6_plen = htons(tlen);
2299		memcpy(&h6->ip6_src, &saddr->v6, sizeof(struct in6_addr));
2300		memcpy(&h6->ip6_dst, &daddr->v6, sizeof(struct in6_addr));
2301
2302		th = (struct tcphdr *)((caddr_t)h6 + sizeof(struct ip6_hdr));
2303		break;
2304#endif /* INET6 */
2305	}
2306
2307	/* TCP header */
2308	th->th_sport = sport;
2309	th->th_dport = dport;
2310	th->th_seq = htonl(seq);
2311	th->th_ack = htonl(ack);
2312	th->th_off = tlen >> 2;
2313	th->th_flags = flags;
2314	th->th_win = htons(win);
2315
2316	if (mss) {
2317		opt = (char *)(th + 1);
2318		opt[0] = TCPOPT_MAXSEG;
2319		opt[1] = 4;
2320		HTONS(mss);
2321		bcopy((caddr_t)&mss, (caddr_t)(opt + 2), 2);
2322	}
2323
2324	switch (af) {
2325#ifdef INET
2326	case AF_INET:
2327		/* TCP checksum */
2328		th->th_sum = in_cksum(m, len);
2329
2330		/* Finish the IP header */
2331		h->ip_v = 4;
2332		h->ip_hl = sizeof(*h) >> 2;
2333		h->ip_tos = IPTOS_LOWDELAY;
2334		h->ip_off = htons(V_path_mtu_discovery ? IP_DF : 0);
2335		h->ip_len = htons(len);
2336		h->ip_ttl = ttl ? ttl : V_ip_defttl;
2337		h->ip_sum = 0;
2338
2339		pfse->pfse_type = PFSE_IP;
2340		break;
2341#endif /* INET */
2342#ifdef INET6
2343	case AF_INET6:
2344		/* TCP checksum */
2345		th->th_sum = in6_cksum(m, IPPROTO_TCP,
2346		    sizeof(struct ip6_hdr), tlen);
2347
2348		h6->ip6_vfc |= IPV6_VERSION;
2349		h6->ip6_hlim = IPV6_DEFHLIM;
2350
2351		pfse->pfse_type = PFSE_IP6;
2352		break;
2353#endif /* INET6 */
2354	}
2355	pfse->pfse_m = m;
2356	pf_send(pfse);
2357}
2358
2359static void
2360pf_send_icmp(struct mbuf *m, u_int8_t type, u_int8_t code, sa_family_t af,
2361    struct pf_rule *r)
2362{
2363	struct pf_send_entry *pfse;
2364	struct mbuf *m0;
2365	struct pf_mtag *pf_mtag;
2366
2367	/* Allocate outgoing queue entry, mbuf and mbuf tag. */
2368	pfse = malloc(sizeof(*pfse), M_PFTEMP, M_NOWAIT);
2369	if (pfse == NULL)
2370		return;
2371
2372	if ((m0 = m_copypacket(m, M_NOWAIT)) == NULL) {
2373		free(pfse, M_PFTEMP);
2374		return;
2375	}
2376
2377	if ((pf_mtag = pf_get_mtag(m0)) == NULL) {
2378		free(pfse, M_PFTEMP);
		m_freem(m0);	/* don't leak the copied packet */
2379		return;
2380	}
2381	/* XXX: revisit */
2382	m0->m_flags |= M_SKIP_FIREWALL;
2383
2384	if (r->rtableid >= 0)
2385		M_SETFIB(m0, r->rtableid);
2386
2387#ifdef ALTQ
2388	if (r->qid) {
2389		pf_mtag->qid = r->qid;
2390		/* add hints for ecn */
2391		pf_mtag->hdr = mtod(m0, struct ip *);
2392	}
2393#endif /* ALTQ */
2394
2395	switch (af) {
2396#ifdef INET
2397	case AF_INET:
2398		pfse->pfse_type = PFSE_ICMP;
2399		break;
2400#endif /* INET */
2401#ifdef INET6
2402	case AF_INET6:
2403		pfse->pfse_type = PFSE_ICMP6;
2404		break;
2405#endif /* INET6 */
2406	}
2407	pfse->pfse_m = m0;
2408	pfse->pfse_icmp_type = type;
2409	pfse->pfse_icmp_code = code;
2410	pf_send(pfse);
2411}
2412
2413/*
2414 * Return 1 if the addresses a and b match (with mask m), otherwise return 0.
2415 * If n is 0, they match if they are equal. If n is != 0, they match if they
2416 * are different.
2417 */
2418int
2419pf_match_addr(u_int8_t n, struct pf_addr *a, struct pf_addr *m,
2420    struct pf_addr *b, sa_family_t af)
2421{
2422	int	match = 0;
2423
2424	switch (af) {
2425#ifdef INET
2426	case AF_INET:
2427		if ((a->addr32[0] & m->addr32[0]) ==
2428		    (b->addr32[0] & m->addr32[0]))
2429			match++;
2430		break;
2431#endif /* INET */
2432#ifdef INET6
2433	case AF_INET6:
2434		if (((a->addr32[0] & m->addr32[0]) ==
2435		     (b->addr32[0] & m->addr32[0])) &&
2436		    ((a->addr32[1] & m->addr32[1]) ==
2437		     (b->addr32[1] & m->addr32[1])) &&
2438		    ((a->addr32[2] & m->addr32[2]) ==
2439		     (b->addr32[2] & m->addr32[2])) &&
2440		    ((a->addr32[3] & m->addr32[3]) ==
2441		     (b->addr32[3] & m->addr32[3])))
2442			match++;
2443		break;
2444#endif /* INET6 */
2445	}
2446	if (match) {
2447		if (n)
2448			return (0);
2449		else
2450			return (1);
2451	} else {
2452		if (n)
2453			return (1);
2454		else
2455			return (0);
2456	}
2457}
2458
2459/*
2460 * Return 1 if b <= a <= e, otherwise return 0.
2461 */
2462int
2463pf_match_addr_range(struct pf_addr *b, struct pf_addr *e,
2464    struct pf_addr *a, sa_family_t af)
2465{
2466	switch (af) {
2467#ifdef INET
2468	case AF_INET:
2469		if ((a->addr32[0] < b->addr32[0]) ||
2470		    (a->addr32[0] > e->addr32[0]))
2471			return (0);
2472		break;
2473#endif /* INET */
2474#ifdef INET6
2475	case AF_INET6: {
2476		int	i;
2477
2478		/* check a >= b */
2479		for (i = 0; i < 4; ++i)
2480			if (a->addr32[i] > b->addr32[i])
2481				break;
2482			else if (a->addr32[i] < b->addr32[i])
2483				return (0);
2484		/* check a <= e */
2485		for (i = 0; i < 4; ++i)
2486			if (a->addr32[i] < e->addr32[i])
2487				break;
2488			else if (a->addr32[i] > e->addr32[i])
2489				return (0);
2490		break;
2491	}
2492#endif /* INET6 */
2493	}
2494	return (1);
2495}
2496
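/*
 * Editor's note (illustrative): generic comparison helper behind the
 * port, uid and gid matchers below.  The range operators correspond to
 * the pf.conf syntax as follows:
 *
 *	PF_OP_IRG:	a1 <  p <  a2		("><", exclusive range)
 *	PF_OP_XRG:	p < a1 || p > a2	("<>", except range)
 *	PF_OP_RRG:	a1 <= p <= a2		(":", inclusive range)
 */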
2497static int
2498pf_match(u_int8_t op, u_int32_t a1, u_int32_t a2, u_int32_t p)
2499{
2500	switch (op) {
2501	case PF_OP_IRG:
2502		return ((p > a1) && (p < a2));
2503	case PF_OP_XRG:
2504		return ((p < a1) || (p > a2));
2505	case PF_OP_RRG:
2506		return ((p >= a1) && (p <= a2));
2507	case PF_OP_EQ:
2508		return (p == a1);
2509	case PF_OP_NE:
2510		return (p != a1);
2511	case PF_OP_LT:
2512		return (p < a1);
2513	case PF_OP_LE:
2514		return (p <= a1);
2515	case PF_OP_GT:
2516		return (p > a1);
2517	case PF_OP_GE:
2518		return (p >= a1);
2519	}
2520	return (0); /* never reached */
2521}
2522
2523int
2524pf_match_port(u_int8_t op, u_int16_t a1, u_int16_t a2, u_int16_t p)
2525{
2526	NTOHS(a1);
2527	NTOHS(a2);
2528	NTOHS(p);
2529	return (pf_match(op, a1, a2, p));
2530}
2531
2532static int
2533pf_match_uid(u_int8_t op, uid_t a1, uid_t a2, uid_t u)
2534{
2535	if (u == UID_MAX && op != PF_OP_EQ && op != PF_OP_NE)
2536		return (0);
2537	return (pf_match(op, a1, a2, u));
2538}
2539
2540static int
2541pf_match_gid(u_int8_t op, gid_t a1, gid_t a2, gid_t g)
2542{
2543	if (g == GID_MAX && op != PF_OP_EQ && op != PF_OP_NE)
2544		return (0);
2545	return (pf_match(op, a1, a2, g));
2546}
2547
2548int
2549pf_match_tag(struct mbuf *m, struct pf_rule *r, int *tag, int mtag)
2550{
2551	if (*tag == -1)
2552		*tag = mtag;
2553
2554	return ((!r->match_tag_not && r->match_tag == *tag) ||
2555	    (r->match_tag_not && r->match_tag != *tag));
2556}
2557
2558int
2559pf_tag_packet(struct mbuf *m, struct pf_pdesc *pd, int tag)
2560{
2561
2562	KASSERT(tag > 0, ("%s: tag %d", __func__, tag));
2563
2564	if (pd->pf_mtag == NULL && ((pd->pf_mtag = pf_get_mtag(m)) == NULL))
2565		return (ENOMEM);
2566
2567	pd->pf_mtag->tag = tag;
2568
2569	return (0);
2570}
2571
2572#define	PF_ANCHOR_STACKSIZE	32
2573struct pf_anchor_stackframe {
2574	struct pf_ruleset	*rs;
2575	struct pf_rule		*r;	/* XXX: + match bit */
2576	struct pf_anchor	*child;
2577};
2578
2579/*
2580 * XXX: We rely on malloc(9) returning pointer aligned addresses.
2581 */
2582#define	PF_ANCHORSTACK_MATCH	0x00000001
2583#define	PF_ANCHORSTACK_MASK	(PF_ANCHORSTACK_MATCH)
2584
2585#define	PF_ANCHOR_MATCH(f)	((uintptr_t)(f)->r & PF_ANCHORSTACK_MATCH)
2586#define	PF_ANCHOR_RULE(f)	(struct pf_rule *)			\
2587				((uintptr_t)(f)->r & ~PF_ANCHORSTACK_MASK)
2588#define	PF_ANCHOR_SET_MATCH(f)	do { (f)->r = (void *) 			\
2589				((uintptr_t)(f)->r | PF_ANCHORSTACK_MATCH);  \
2590} while (0)
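/*
 * Editor's note (illustrative): the match flag is carried in the least
 * significant bit of the frame's rule pointer, e.g.
 *
 *	f->r == 0x...b0		rule pointer, match bit clear
 *	f->r == 0x...b1		same rule with PF_ANCHORSTACK_MATCH set
 *
 * which is only safe because rule allocations are at least pointer
 * aligned, so a valid pointer always has its low bit clear;
 * PF_ANCHOR_RULE() strips the bit before the pointer is dereferenced.
 */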
2591
2592void
2593pf_step_into_anchor(struct pf_anchor_stackframe *stack, int *depth,
2594    struct pf_ruleset **rs, int n, struct pf_rule **r, struct pf_rule **a,
2595    int *match)
2596{
2597	struct pf_anchor_stackframe	*f;
2598
2599	PF_RULES_RASSERT();
2600
2601	if (match)
2602		*match = 0;
2603	if (*depth >= PF_ANCHOR_STACKSIZE) {
2604		printf("%s: anchor stack overflow on %s\n",
2605		    __func__, (*r)->anchor->name);
2606		*r = TAILQ_NEXT(*r, entries);
2607		return;
2608	} else if (*depth == 0 && a != NULL)
2609		*a = *r;
2610	f = stack + (*depth)++;
2611	f->rs = *rs;
2612	f->r = *r;
2613	if ((*r)->anchor_wildcard) {
2614		struct pf_anchor_node *parent = &(*r)->anchor->children;
2615
2616		if ((f->child = RB_MIN(pf_anchor_node, parent)) == NULL) {
2617			*r = NULL;
2618			return;
2619		}
2620		*rs = &f->child->ruleset;
2621	} else {
2622		f->child = NULL;
2623		*rs = &(*r)->anchor->ruleset;
2624	}
2625	*r = TAILQ_FIRST((*rs)->rules[n].active.ptr);
2626}
2627
2628int
2629pf_step_out_of_anchor(struct pf_anchor_stackframe *stack, int *depth,
2630    struct pf_ruleset **rs, int n, struct pf_rule **r, struct pf_rule **a,
2631    int *match)
2632{
2633	struct pf_anchor_stackframe	*f;
2634	struct pf_rule *fr;
2635	int quick = 0;
2636
2637	PF_RULES_RASSERT();
2638
2639	do {
2640		if (*depth <= 0)
2641			break;
2642		f = stack + *depth - 1;
2643		fr = PF_ANCHOR_RULE(f);
2644		if (f->child != NULL) {
2645			struct pf_anchor_node *parent;
2646
2647			/*
2648			 * This block traverses the children
2649			 * of a wildcard anchor.
2650			 */
2651			parent = &fr->anchor->children;
2652			if (match != NULL && *match) {
2653				/*
2654				 * If any of "*" matched, then
2655				 * "foo/ *" matched, mark frame
2656				 * appropriately.
2657				 */
2658				PF_ANCHOR_SET_MATCH(f);
2659				*match = 0;
2660			}
2661			f->child = RB_NEXT(pf_anchor_node, parent, f->child);
2662			if (f->child != NULL) {
2663				*rs = &f->child->ruleset;
2664				*r = TAILQ_FIRST((*rs)->rules[n].active.ptr);
2665				if (*r == NULL)
2666					continue;
2667				else
2668					break;
2669			}
2670		}
2671		(*depth)--;
2672		if (*depth == 0 && a != NULL)
2673			*a = NULL;
2674		*rs = f->rs;
2675		if (PF_ANCHOR_MATCH(f) || (match != NULL && *match))
2676			quick = fr->quick;
2677		*r = TAILQ_NEXT(fr, entries);
2678	} while (*r == NULL);
2679
2680	return (quick);
2681}
2682
2683#ifdef INET6
2684void
2685pf_poolmask(struct pf_addr *naddr, struct pf_addr *raddr,
2686    struct pf_addr *rmask, struct pf_addr *saddr, sa_family_t af)
2687{
2688	switch (af) {
2689#ifdef INET
2690	case AF_INET:
2691		naddr->addr32[0] = (raddr->addr32[0] & rmask->addr32[0]) |
2692		((rmask->addr32[0] ^ 0xffffffff) & saddr->addr32[0]);
2693		break;
2694#endif /* INET */
2695	case AF_INET6:
2696		naddr->addr32[0] = (raddr->addr32[0] & rmask->addr32[0]) |
2697		((rmask->addr32[0] ^ 0xffffffff) & saddr->addr32[0]);
2698		naddr->addr32[1] = (raddr->addr32[1] & rmask->addr32[1]) |
2699		((rmask->addr32[1] ^ 0xffffffff) & saddr->addr32[1]);
2700		naddr->addr32[2] = (raddr->addr32[2] & rmask->addr32[2]) |
2701		((rmask->addr32[2] ^ 0xffffffff) & saddr->addr32[2]);
2702		naddr->addr32[3] = (raddr->addr32[3] & rmask->addr32[3]) |
2703		((rmask->addr32[3] ^ 0xffffffff) & saddr->addr32[3]);
2704		break;
2705	}
2706}
2707
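/*
 * Editor's note (illustrative): pf_addr_inc() increments the address as
 * one big-endian integer, rippling the carry across the 32-bit words by
 * hand, e.g.
 *
 *	2001:db8:ffff:ffff:ffff:ffff:ffff:ffff + 1 = 2001:db9::
 *
 * Only a word that holds 0xffffffff wraps to zero and propagates the
 * carry into the next more significant word.
 */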
2708void
2709pf_addr_inc(struct pf_addr *addr, sa_family_t af)
2710{
2711	switch (af) {
2712#ifdef INET
2713	case AF_INET:
2714		addr->addr32[0] = htonl(ntohl(addr->addr32[0]) + 1);
2715		break;
2716#endif /* INET */
2717	case AF_INET6:
2718		if (addr->addr32[3] == 0xffffffff) {
2719			addr->addr32[3] = 0;
2720			if (addr->addr32[2] == 0xffffffff) {
2721				addr->addr32[2] = 0;
2722				if (addr->addr32[1] == 0xffffffff) {
2723					addr->addr32[1] = 0;
2724					addr->addr32[0] =
2725					    htonl(ntohl(addr->addr32[0]) + 1);
2726				} else
2727					addr->addr32[1] =
2728					    htonl(ntohl(addr->addr32[1]) + 1);
2729			} else
2730				addr->addr32[2] =
2731				    htonl(ntohl(addr->addr32[2]) + 1);
2732		} else
2733			addr->addr32[3] =
2734			    htonl(ntohl(addr->addr32[3]) + 1);
2735		break;
2736	}
2737}
2738#endif /* INET6 */
2739
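/*
 * Editor's note: pf_socket_lookup() maps a TCP or UDP packet to the
 * local PCB that owns the connection, so rules with "user"/"group"
 * criteria can be matched against the socket's credentials.  On output
 * the address/port tuple is swapped first, keeping the foreign endpoint
 * in the position the PCB lookup expects.  Returns 1 with
 * pd->lookup.{uid,gid} filled in, or -1 when no PCB is found.
 */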
2740int
2741pf_socket_lookup(int direction, struct pf_pdesc *pd, struct mbuf *m)
2742{
2743	struct pf_addr		*saddr, *daddr;
2744	u_int16_t		 sport, dport;
2745	struct inpcbinfo	*pi;
2746	struct inpcb		*inp;
2747
2748	pd->lookup.uid = UID_MAX;
2749	pd->lookup.gid = GID_MAX;
2750
2751	switch (pd->proto) {
2752	case IPPROTO_TCP:
2753		if (pd->hdr.tcp == NULL)
2754			return (-1);
2755		sport = pd->hdr.tcp->th_sport;
2756		dport = pd->hdr.tcp->th_dport;
2757		pi = &V_tcbinfo;
2758		break;
2759	case IPPROTO_UDP:
2760		if (pd->hdr.udp == NULL)
2761			return (-1);
2762		sport = pd->hdr.udp->uh_sport;
2763		dport = pd->hdr.udp->uh_dport;
2764		pi = &V_udbinfo;
2765		break;
2766	default:
2767		return (-1);
2768	}
2769	if (direction == PF_IN) {
2770		saddr = pd->src;
2771		daddr = pd->dst;
2772	} else {
2773		u_int16_t	p;
2774
2775		p = sport;
2776		sport = dport;
2777		dport = p;
2778		saddr = pd->dst;
2779		daddr = pd->src;
2780	}
2781	switch (pd->af) {
2782#ifdef INET
2783	case AF_INET:
2784		inp = in_pcblookup_mbuf(pi, saddr->v4, sport, daddr->v4,
2785		    dport, INPLOOKUP_RLOCKPCB, NULL, m);
2786		if (inp == NULL) {
2787			inp = in_pcblookup_mbuf(pi, saddr->v4, sport,
2788			   daddr->v4, dport, INPLOOKUP_WILDCARD |
2789			   INPLOOKUP_RLOCKPCB, NULL, m);
2790			if (inp == NULL)
2791				return (-1);
2792		}
2793		break;
2794#endif /* INET */
2795#ifdef INET6
2796	case AF_INET6:
2797		inp = in6_pcblookup_mbuf(pi, &saddr->v6, sport, &daddr->v6,
2798		    dport, INPLOOKUP_RLOCKPCB, NULL, m);
2799		if (inp == NULL) {
2800			inp = in6_pcblookup_mbuf(pi, &saddr->v6, sport,
2801			    &daddr->v6, dport, INPLOOKUP_WILDCARD |
2802			    INPLOOKUP_RLOCKPCB, NULL, m);
2803			if (inp == NULL)
2804				return (-1);
2805		}
2806		break;
2807#endif /* INET6 */
2808
2809	default:
2810		return (-1);
2811	}
2812	INP_RLOCK_ASSERT(inp);
2813	pd->lookup.uid = inp->inp_cred->cr_uid;
2814	pd->lookup.gid = inp->inp_cred->cr_groups[0];
2815	INP_RUNLOCK(inp);
2816
2817	return (1);
2818}
2819
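/*
 * Editor's note (illustrative): pf_get_wscale() and pf_get_mss() below
 * share the same pattern: pull up to 60 bytes (the largest possible TCP
 * header, as th_off is four bits of 32-bit words) and walk the options
 * as kind/length/value records, e.g.
 *
 *	kind=3 len=3 shift	window scale
 *	kind=2 len=4 mss	maximum segment size (16 bits)
 *
 * with EOL (kind 0) and NOP (kind 1) taking a single byte each.
 */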
2820static u_int8_t
2821pf_get_wscale(struct mbuf *m, int off, u_int16_t th_off, sa_family_t af)
2822{
2823	int		 hlen;
2824	u_int8_t	 hdr[60];
2825	u_int8_t	*opt, optlen;
2826	u_int8_t	 wscale = 0;
2827
2828	hlen = th_off << 2;		/* hlen <= sizeof(hdr) */
2829	if (hlen <= sizeof(struct tcphdr))
2830		return (0);
2831	if (!pf_pull_hdr(m, off, hdr, hlen, NULL, NULL, af))
2832		return (0);
2833	opt = hdr + sizeof(struct tcphdr);
2834	hlen -= sizeof(struct tcphdr);
2835	while (hlen >= 3) {
2836		switch (*opt) {
2837		case TCPOPT_EOL:
2838		case TCPOPT_NOP:
2839			++opt;
2840			--hlen;
2841			break;
2842		case TCPOPT_WINDOW:
2843			wscale = opt[2];
2844			if (wscale > TCP_MAX_WINSHIFT)
2845				wscale = TCP_MAX_WINSHIFT;
2846			wscale |= PF_WSCALE_FLAG;
2847			/* FALLTHROUGH */
2848		default:
2849			optlen = opt[1];
2850			if (optlen < 2)
2851				optlen = 2;
2852			hlen -= optlen;
2853			opt += optlen;
2854			break;
2855		}
2856	}
2857	return (wscale);
2858}
2859
2860static u_int16_t
2861pf_get_mss(struct mbuf *m, int off, u_int16_t th_off, sa_family_t af)
2862{
2863	int		 hlen;
2864	u_int8_t	 hdr[60];
2865	u_int8_t	*opt, optlen;
2866	u_int16_t	 mss = V_tcp_mssdflt;
2867
2868	hlen = th_off << 2;	/* hlen <= sizeof(hdr) */
2869	if (hlen <= sizeof(struct tcphdr))
2870		return (0);
2871	if (!pf_pull_hdr(m, off, hdr, hlen, NULL, NULL, af))
2872		return (0);
2873	opt = hdr + sizeof(struct tcphdr);
2874	hlen -= sizeof(struct tcphdr);
2875	while (hlen >= TCPOLEN_MAXSEG) {
2876		switch (*opt) {
2877		case TCPOPT_EOL:
2878		case TCPOPT_NOP:
2879			++opt;
2880			--hlen;
2881			break;
2882		case TCPOPT_MAXSEG:
2883			bcopy((caddr_t)(opt + 2), (caddr_t)&mss, 2);
2884			NTOHS(mss);
2885			/* FALLTHROUGH */
2886		default:
2887			optlen = opt[1];
2888			if (optlen < 2)
2889				optlen = 2;
2890			hlen -= optlen;
2891			opt += optlen;
2892			break;
2893		}
2894	}
2895	return (mss);
2896}
2897
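/*
 * Editor's note (illustrative): pf_calc_mss() clamps the peer's offered
 * MSS to what the route toward it can carry.  For a typical IPv4 path
 * over a 1500 byte MTU interface:
 *
 *	mss = 1500 - sizeof(struct ip) - sizeof(struct tcphdr)
 *	    = 1500 - 20 - 20 = 1460
 *
 * The result is never raised above the offer and never lowered past 64.
 */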
2898static u_int16_t
2899pf_calc_mss(struct pf_addr *addr, sa_family_t af, int rtableid, u_int16_t offer)
2900{
2901#ifdef INET
2902	struct sockaddr_in	*dst;
2903	struct route		 ro;
2904#endif /* INET */
2905#ifdef INET6
2906	struct sockaddr_in6	*dst6;
2907	struct route_in6	 ro6;
2908#endif /* INET6 */
2909	struct rtentry		*rt = NULL;
2910	int			 hlen = 0;
2911	u_int16_t		 mss = V_tcp_mssdflt;
2912
2913	switch (af) {
2914#ifdef INET
2915	case AF_INET:
2916		hlen = sizeof(struct ip);
2917		bzero(&ro, sizeof(ro));
2918		dst = (struct sockaddr_in *)&ro.ro_dst;
2919		dst->sin_family = AF_INET;
2920		dst->sin_len = sizeof(*dst);
2921		dst->sin_addr = addr->v4;
2922		in_rtalloc_ign(&ro, 0, rtableid);
2923		rt = ro.ro_rt;
2924		break;
2925#endif /* INET */
2926#ifdef INET6
2927	case AF_INET6:
2928		hlen = sizeof(struct ip6_hdr);
2929		bzero(&ro6, sizeof(ro6));
2930		dst6 = (struct sockaddr_in6 *)&ro6.ro_dst;
2931		dst6->sin6_family = AF_INET6;
2932		dst6->sin6_len = sizeof(*dst6);
2933		dst6->sin6_addr = addr->v6;
2934		in6_rtalloc_ign(&ro6, 0, rtableid);
2935		rt = ro6.ro_rt;
2936		break;
2937#endif /* INET6 */
2938	}
2939
2940	if (rt && rt->rt_ifp) {
2941		mss = rt->rt_ifp->if_mtu - hlen - sizeof(struct tcphdr);
2942		mss = max(V_tcp_mssdflt, mss);
2943		RTFREE(rt);
2944	}
2945	mss = min(mss, offer);
2946	mss = max(mss, 64);		/* sanity - at least max opt space */
2947	return (mss);
2948}
2949
2950static void
2951pf_set_rt_ifp(struct pf_state *s, struct pf_addr *saddr)
2952{
2953	struct pf_rule *r = s->rule.ptr;
2954	struct pf_src_node *sn = NULL;
2955
2956	s->rt_kif = NULL;
2957	if (!r->rt || r->rt == PF_FASTROUTE)
2958		return;
2959	switch (s->key[PF_SK_WIRE]->af) {
2960#ifdef INET
2961	case AF_INET:
2962		pf_map_addr(AF_INET, r, saddr, &s->rt_addr, NULL, &sn);
2963		s->rt_kif = r->rpool.cur->kif;
2964		break;
2965#endif /* INET */
2966#ifdef INET6
2967	case AF_INET6:
2968		pf_map_addr(AF_INET6, r, saddr, &s->rt_addr, NULL, &sn);
2969		s->rt_kif = r->rpool.cur->kif;
2970		break;
2971#endif /* INET6 */
2972	}
2973}
2974
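/*
 * Editor's note: pf_tcp_iss() derives the initial sequence numbers used
 * by the sequence number modulator, in the spirit of RFC 6528: MD5 over
 * a boot-time random secret plus the connection's ports and addresses,
 * with a random increment and a monotonically advancing offset, so ISNs
 * are unpredictable to outsiders yet still move forward when the same
 * 4-tuple is reused.
 */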
2975static u_int32_t
2976pf_tcp_iss(struct pf_pdesc *pd)
2977{
2978	MD5_CTX ctx;
2979	u_int32_t digest[4];
2980
2981	if (V_pf_tcp_secret_init == 0) {
2982		read_random(&V_pf_tcp_secret, sizeof(V_pf_tcp_secret));
2983		MD5Init(&V_pf_tcp_secret_ctx);
2984		MD5Update(&V_pf_tcp_secret_ctx, V_pf_tcp_secret,
2985		    sizeof(V_pf_tcp_secret));
2986		V_pf_tcp_secret_init = 1;
2987	}
2988
2989	ctx = V_pf_tcp_secret_ctx;
2990
2991	MD5Update(&ctx, (char *)&pd->hdr.tcp->th_sport, sizeof(u_short));
2992	MD5Update(&ctx, (char *)&pd->hdr.tcp->th_dport, sizeof(u_short));
2993	if (pd->af == AF_INET6) {
2994		MD5Update(&ctx, (char *)&pd->src->v6, sizeof(struct in6_addr));
2995		MD5Update(&ctx, (char *)&pd->dst->v6, sizeof(struct in6_addr));
2996	} else {
2997		MD5Update(&ctx, (char *)&pd->src->v4, sizeof(struct in_addr));
2998		MD5Update(&ctx, (char *)&pd->dst->v4, sizeof(struct in_addr));
2999	}
3000	MD5Final((u_char *)digest, &ctx);
3001	V_pf_tcp_iss_off += 4096;
3002#define	ISN_RANDOM_INCREMENT (4096 - 1)
3003	return (digest[0] + (arc4random() & ISN_RANDOM_INCREMENT) +
3004	    V_pf_tcp_iss_off);
3005#undef	ISN_RANDOM_INCREMENT
3006}
3007
3008static int
3009pf_test_rule(struct pf_rule **rm, struct pf_state **sm, int direction,
3010    struct pfi_kif *kif, struct mbuf *m, int off, struct pf_pdesc *pd,
3011    struct pf_rule **am, struct pf_ruleset **rsm, struct inpcb *inp)
3012{
3013	struct pf_rule		*nr = NULL;
3014	struct pf_addr		* const saddr = pd->src;
3015	struct pf_addr		* const daddr = pd->dst;
3016	sa_family_t		 af = pd->af;
3017	struct pf_rule		*r, *a = NULL;
3018	struct pf_ruleset	*ruleset = NULL;
3019	struct pf_src_node	*nsn = NULL;
3020	struct tcphdr		*th = pd->hdr.tcp;
3021	struct pf_state_key	*sk = NULL, *nk = NULL;
3022	u_short			 reason;
3023	int			 rewrite = 0, hdrlen = 0;
3024	int			 tag = -1, rtableid = -1;
3025	int			 asd = 0;
3026	int			 match = 0;
3027	int			 state_icmp = 0;
3028	u_int16_t		 sport = 0, dport = 0;
3029	u_int16_t		 bproto_sum = 0, bip_sum = 0;
3030	u_int8_t		 icmptype = 0, icmpcode = 0;
3031	struct pf_anchor_stackframe	anchor_stack[PF_ANCHOR_STACKSIZE];
3032
3033	PF_RULES_RASSERT();
3034
3035	if (inp != NULL) {
3036		INP_LOCK_ASSERT(inp);
3037		pd->lookup.uid = inp->inp_cred->cr_uid;
3038		pd->lookup.gid = inp->inp_cred->cr_groups[0];
3039		pd->lookup.done = 1;
3040	}
3041
3042	switch (pd->proto) {
3043	case IPPROTO_TCP:
3044		sport = th->th_sport;
3045		dport = th->th_dport;
3046		hdrlen = sizeof(*th);
3047		break;
3048	case IPPROTO_UDP:
3049		sport = pd->hdr.udp->uh_sport;
3050		dport = pd->hdr.udp->uh_dport;
3051		hdrlen = sizeof(*pd->hdr.udp);
3052		break;
3053#ifdef INET
3054	case IPPROTO_ICMP:
3055		if (pd->af != AF_INET)
3056			break;
3057		sport = dport = pd->hdr.icmp->icmp_id;
3058		hdrlen = sizeof(*pd->hdr.icmp);
3059		icmptype = pd->hdr.icmp->icmp_type;
3060		icmpcode = pd->hdr.icmp->icmp_code;
3061
3062		if (icmptype == ICMP_UNREACH ||
3063		    icmptype == ICMP_SOURCEQUENCH ||
3064		    icmptype == ICMP_REDIRECT ||
3065		    icmptype == ICMP_TIMXCEED ||
3066		    icmptype == ICMP_PARAMPROB)
3067			state_icmp++;
3068		break;
3069#endif /* INET */
3070#ifdef INET6
3071	case IPPROTO_ICMPV6:
3072		if (af != AF_INET6)
3073			break;
3074		sport = dport = pd->hdr.icmp6->icmp6_id;
3075		hdrlen = sizeof(*pd->hdr.icmp6);
3076		icmptype = pd->hdr.icmp6->icmp6_type;
3077		icmpcode = pd->hdr.icmp6->icmp6_code;
3078
3079		if (icmptype == ICMP6_DST_UNREACH ||
3080		    icmptype == ICMP6_PACKET_TOO_BIG ||
3081		    icmptype == ICMP6_TIME_EXCEEDED ||
3082		    icmptype == ICMP6_PARAM_PROB)
3083			state_icmp++;
3084		break;
3085#endif /* INET6 */
3086	default:
3087		sport = dport = hdrlen = 0;
3088		break;
3089	}
3090
3091	r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr);
3092
3093	/* check packet for BINAT/NAT/RDR */
3094	if ((nr = pf_get_translation(pd, m, off, direction, kif, &nsn, &sk,
3095	    &nk, saddr, daddr, sport, dport, anchor_stack)) != NULL) {
3096		KASSERT(sk != NULL, ("%s: null sk", __func__));
3097		KASSERT(nk != NULL, ("%s: null nk", __func__));
3098
3099		if (pd->ip_sum)
3100			bip_sum = *pd->ip_sum;
3101
3102		switch (pd->proto) {
3103		case IPPROTO_TCP:
3104			bproto_sum = th->th_sum;
3105			pd->proto_sum = &th->th_sum;
3106
3107			if (PF_ANEQ(saddr, &nk->addr[pd->sidx], af) ||
3108			    nk->port[pd->sidx] != sport) {
3109				pf_change_ap(saddr, &th->th_sport, pd->ip_sum,
3110				    &th->th_sum, &nk->addr[pd->sidx],
3111				    nk->port[pd->sidx], 0, af);
3112				pd->sport = &th->th_sport;
3113				sport = th->th_sport;
3114			}
3115
3116			if (PF_ANEQ(daddr, &nk->addr[pd->didx], af) ||
3117			    nk->port[pd->didx] != dport) {
3118				pf_change_ap(daddr, &th->th_dport, pd->ip_sum,
3119				    &th->th_sum, &nk->addr[pd->didx],
3120				    nk->port[pd->didx], 0, af);
3121				dport = th->th_dport;
3122				pd->dport = &th->th_dport;
3123			}
3124			rewrite++;
3125			break;
3126		case IPPROTO_UDP:
3127			bproto_sum = pd->hdr.udp->uh_sum;
3128			pd->proto_sum = &pd->hdr.udp->uh_sum;
3129
3130			if (PF_ANEQ(saddr, &nk->addr[pd->sidx], af) ||
3131			    nk->port[pd->sidx] != sport) {
3132				pf_change_ap(saddr, &pd->hdr.udp->uh_sport,
3133				    pd->ip_sum, &pd->hdr.udp->uh_sum,
3134				    &nk->addr[pd->sidx],
3135				    nk->port[pd->sidx], 1, af);
3136				sport = pd->hdr.udp->uh_sport;
3137				pd->sport = &pd->hdr.udp->uh_sport;
3138			}
3139
3140			if (PF_ANEQ(daddr, &nk->addr[pd->didx], af) ||
3141			    nk->port[pd->didx] != dport) {
3142				pf_change_ap(daddr, &pd->hdr.udp->uh_dport,
3143				    pd->ip_sum, &pd->hdr.udp->uh_sum,
3144				    &nk->addr[pd->didx],
3145				    nk->port[pd->didx], 1, af);
3146				dport = pd->hdr.udp->uh_dport;
3147				pd->dport = &pd->hdr.udp->uh_dport;
3148			}
3149			rewrite++;
3150			break;
3151#ifdef INET
3152		case IPPROTO_ICMP:
3153			nk->port[0] = nk->port[1];
3154			if (PF_ANEQ(saddr, &nk->addr[pd->sidx], AF_INET))
3155				pf_change_a(&saddr->v4.s_addr, pd->ip_sum,
3156				    nk->addr[pd->sidx].v4.s_addr, 0);
3157
3158			if (PF_ANEQ(daddr, &nk->addr[pd->didx], AF_INET))
3159				pf_change_a(&daddr->v4.s_addr, pd->ip_sum,
3160				    nk->addr[pd->didx].v4.s_addr, 0);
3161
3162			if (nk->port[1] != pd->hdr.icmp->icmp_id) {
3163				pd->hdr.icmp->icmp_cksum = pf_cksum_fixup(
3164				    pd->hdr.icmp->icmp_cksum, sport,
3165				    nk->port[1], 0);
3166				pd->hdr.icmp->icmp_id = nk->port[1];
3167				pd->sport = &pd->hdr.icmp->icmp_id;
3168			}
3169			m_copyback(m, off, ICMP_MINLEN, (caddr_t)pd->hdr.icmp);
3170			break;
3171#endif /* INET */
3172#ifdef INET6
3173		case IPPROTO_ICMPV6:
3174			nk->port[0] = nk->port[1];
3175			if (PF_ANEQ(saddr, &nk->addr[pd->sidx], AF_INET6))
3176				pf_change_a6(saddr, &pd->hdr.icmp6->icmp6_cksum,
3177				    &nk->addr[pd->sidx], 0);
3178
3179			if (PF_ANEQ(daddr, &nk->addr[pd->didx], AF_INET6))
3180				pf_change_a6(daddr, &pd->hdr.icmp6->icmp6_cksum,
3181				    &nk->addr[pd->didx], 0);
3182			rewrite++;
3183			break;
3184#endif /* INET6 */
3185		default:
3186			switch (af) {
3187#ifdef INET
3188			case AF_INET:
3189				if (PF_ANEQ(saddr,
3190				    &nk->addr[pd->sidx], AF_INET))
3191					pf_change_a(&saddr->v4.s_addr,
3192					    pd->ip_sum,
3193					    nk->addr[pd->sidx].v4.s_addr, 0);
3194
3195				if (PF_ANEQ(daddr,
3196				    &nk->addr[pd->didx], AF_INET))
3197					pf_change_a(&daddr->v4.s_addr,
3198					    pd->ip_sum,
3199					    nk->addr[pd->didx].v4.s_addr, 0);
3200				break;
3201#endif /* INET */
3202#ifdef INET6
3203			case AF_INET6:
3204				if (PF_ANEQ(saddr,
3205				    &nk->addr[pd->sidx], AF_INET6))
3206					PF_ACPY(saddr, &nk->addr[pd->sidx], af);
3207
3208				if (PF_ANEQ(daddr,
3209				    &nk->addr[pd->didx], AF_INET6))
3210					PF_ACPY(daddr, &nk->addr[pd->didx], af);
3211				break;
3212#endif /* INET6 */
3213			}
3214			break;
3215		}
3216		if (nr->natpass)
3217			r = NULL;
3218		pd->nat_rule = nr;
3219	}
3220
3221	while (r != NULL) {
3222		r->evaluations++;
3223		if (pfi_kif_match(r->kif, kif) == r->ifnot)
3224			r = r->skip[PF_SKIP_IFP].ptr;
3225		else if (r->direction && r->direction != direction)
3226			r = r->skip[PF_SKIP_DIR].ptr;
3227		else if (r->af && r->af != af)
3228			r = r->skip[PF_SKIP_AF].ptr;
3229		else if (r->proto && r->proto != pd->proto)
3230			r = r->skip[PF_SKIP_PROTO].ptr;
3231		else if (PF_MISMATCHAW(&r->src.addr, saddr, af,
3232		    r->src.neg, kif, M_GETFIB(m)))
3233			r = r->skip[PF_SKIP_SRC_ADDR].ptr;
3234		/* tcp/udp only. port_op always 0 in other cases */
3235		else if (r->src.port_op && !pf_match_port(r->src.port_op,
3236		    r->src.port[0], r->src.port[1], sport))
3237			r = r->skip[PF_SKIP_SRC_PORT].ptr;
3238		else if (PF_MISMATCHAW(&r->dst.addr, daddr, af,
3239		    r->dst.neg, NULL, M_GETFIB(m)))
3240			r = r->skip[PF_SKIP_DST_ADDR].ptr;
3241		/* tcp/udp only. port_op always 0 in other cases */
3242		else if (r->dst.port_op && !pf_match_port(r->dst.port_op,
3243		    r->dst.port[0], r->dst.port[1], dport))
3244			r = r->skip[PF_SKIP_DST_PORT].ptr;
3245		/* icmp only. type always 0 in other cases */
3246		else if (r->type && r->type != icmptype + 1)
3247			r = TAILQ_NEXT(r, entries);
3248		/* icmp only. type always 0 in other cases */
3249		else if (r->code && r->code != icmpcode + 1)
3250			r = TAILQ_NEXT(r, entries);
3251		else if (r->tos && !(r->tos == pd->tos))
3252			r = TAILQ_NEXT(r, entries);
3253		else if (r->rule_flag & PFRULE_FRAGMENT)
3254			r = TAILQ_NEXT(r, entries);
3255		else if (pd->proto == IPPROTO_TCP &&
3256		    (r->flagset & th->th_flags) != r->flags)
3257			r = TAILQ_NEXT(r, entries);
3258		/* tcp/udp only. uid.op always 0 in other cases */
3259		else if (r->uid.op && (pd->lookup.done || (pd->lookup.done =
3260		    pf_socket_lookup(direction, pd, m), 1)) &&
3261		    !pf_match_uid(r->uid.op, r->uid.uid[0], r->uid.uid[1],
3262		    pd->lookup.uid))
3263			r = TAILQ_NEXT(r, entries);
3264		/* tcp/udp only. gid.op always 0 in other cases */
3265		else if (r->gid.op && (pd->lookup.done || (pd->lookup.done =
3266		    pf_socket_lookup(direction, pd, m), 1)) &&
3267		    !pf_match_gid(r->gid.op, r->gid.gid[0], r->gid.gid[1],
3268		    pd->lookup.gid))
3269			r = TAILQ_NEXT(r, entries);
3270		else if (r->prob &&
3271		    r->prob <= arc4random())
3272			r = TAILQ_NEXT(r, entries);
3273		else if (r->match_tag && !pf_match_tag(m, r, &tag,
3274		    pd->pf_mtag ? pd->pf_mtag->tag : 0))
3275			r = TAILQ_NEXT(r, entries);
3276		else if (r->os_fingerprint != PF_OSFP_ANY &&
3277		    (pd->proto != IPPROTO_TCP || !pf_osfp_match(
3278		    pf_osfp_fingerprint(pd, m, off, th),
3279		    r->os_fingerprint)))
3280			r = TAILQ_NEXT(r, entries);
3281		else {
3282			if (r->tag)
3283				tag = r->tag;
3284			if (r->rtableid >= 0)
3285				rtableid = r->rtableid;
3286			if (r->anchor == NULL) {
3287				match = 1;
3288				*rm = r;
3289				*am = a;
3290				*rsm = ruleset;
3291				if ((*rm)->quick)
3292					break;
3293				r = TAILQ_NEXT(r, entries);
3294			} else
3295				pf_step_into_anchor(anchor_stack, &asd,
3296				    &ruleset, PF_RULESET_FILTER, &r, &a,
3297				    &match);
3298		}
3299		if (r == NULL && pf_step_out_of_anchor(anchor_stack, &asd,
3300		    &ruleset, PF_RULESET_FILTER, &r, &a, &match))
3301			break;
3302	}
3303	r = *rm;
3304	a = *am;
3305	ruleset = *rsm;
3306
3307	REASON_SET(&reason, PFRES_MATCH);
3308
3309	if (r->log || (nr != NULL && nr->log)) {
3310		if (rewrite)
3311			m_copyback(m, off, hdrlen, pd->hdr.any);
3312		PFLOG_PACKET(kif, m, af, direction, reason, r->log ? r : nr, a,
3313		    ruleset, pd, 1);
3314	}
3315
3316	if ((r->action == PF_DROP) &&
3317	    ((r->rule_flag & PFRULE_RETURNRST) ||
3318	    (r->rule_flag & PFRULE_RETURNICMP) ||
3319	    (r->rule_flag & PFRULE_RETURN))) {
3320		/* undo NAT changes, if they have taken place */
3321		if (nr != NULL) {
3322			PF_ACPY(saddr, &sk->addr[pd->sidx], af);
3323			PF_ACPY(daddr, &sk->addr[pd->didx], af);
3324			if (pd->sport)
3325				*pd->sport = sk->port[pd->sidx];
3326			if (pd->dport)
3327				*pd->dport = sk->port[pd->didx];
3328			if (pd->proto_sum)
3329				*pd->proto_sum = bproto_sum;
3330			if (pd->ip_sum)
3331				*pd->ip_sum = bip_sum;
3332			m_copyback(m, off, hdrlen, pd->hdr.any);
3333		}
3334		if (pd->proto == IPPROTO_TCP &&
3335		    ((r->rule_flag & PFRULE_RETURNRST) ||
3336		    (r->rule_flag & PFRULE_RETURN)) &&
3337		    !(th->th_flags & TH_RST)) {
3338			u_int32_t	 ack = ntohl(th->th_seq) + pd->p_len;
3339			int		 len = 0;
3340#ifdef INET
3341			struct ip	*h4;
3342#endif
3343#ifdef INET6
3344			struct ip6_hdr	*h6;
3345#endif
3346
3347			switch (af) {
3348#ifdef INET
3349			case AF_INET:
3350				h4 = mtod(m, struct ip *);
3351				len = ntohs(h4->ip_len) - off;
3352				break;
3353#endif
3354#ifdef INET6
3355			case AF_INET6:
3356				h6 = mtod(m, struct ip6_hdr *);
3357				len = ntohs(h6->ip6_plen) - (off - sizeof(*h6));
3358				break;
3359#endif
3360			}
3361
3362			if (pf_check_proto_cksum(m, off, len, IPPROTO_TCP, af))
3363				REASON_SET(&reason, PFRES_PROTCKSUM);
3364			else {
3365				if (th->th_flags & TH_SYN)
3366					ack++;
3367				if (th->th_flags & TH_FIN)
3368					ack++;
3369				pf_send_tcp(m, r, af, pd->dst,
3370				    pd->src, th->th_dport, th->th_sport,
3371				    ntohl(th->th_ack), ack, TH_RST|TH_ACK, 0, 0,
3372				    r->return_ttl, 1, 0, kif->pfik_ifp);
3373			}
3374		} else if (pd->proto != IPPROTO_ICMP && af == AF_INET &&
3375		    r->return_icmp)
3376			pf_send_icmp(m, r->return_icmp >> 8,
3377			    r->return_icmp & 255, af, r);
3378		else if (pd->proto != IPPROTO_ICMPV6 && af == AF_INET6 &&
3379		    r->return_icmp6)
3380			pf_send_icmp(m, r->return_icmp6 >> 8,
3381			    r->return_icmp6 & 255, af, r);
3382	}
3383
3384	if (r->action == PF_DROP)
3385		goto cleanup;
3386
3387	if (tag > 0 && pf_tag_packet(m, pd, tag)) {
3388		REASON_SET(&reason, PFRES_MEMORY);
3389		goto cleanup;
3390	}
3391	if (rtableid >= 0)
3392		M_SETFIB(m, rtableid);
3393
3394	if (!state_icmp && (r->keep_state || nr != NULL ||
3395	    (pd->flags & PFDESC_TCP_NORM))) {
3396		int action;
3397		action = pf_create_state(r, nr, a, pd, nsn, nk, sk, m, off,
3398		    sport, dport, &rewrite, kif, sm, tag, bproto_sum, bip_sum,
3399		    hdrlen);
3400		if (action != PF_PASS)
3401			return (action);
3402	} else {
3403		if (sk != NULL)
3404			uma_zfree(V_pf_state_key_z, sk);
3405		if (nk != NULL)
3406			uma_zfree(V_pf_state_key_z, nk);
3407	}
3408
3409	/* copy back packet headers if we performed NAT operations */
3410	if (rewrite)
3411		m_copyback(m, off, hdrlen, pd->hdr.any);
3412
3413	if (*sm != NULL && !((*sm)->state_flags & PFSTATE_NOSYNC) &&
3414	    direction == PF_OUT &&
3415	    pfsync_defer_ptr != NULL && pfsync_defer_ptr(*sm, m))
3416		/*
3417		 * We want the state created, but we don't
3418		 * want to send the packet yet, in case a
3419		 * partner firewall has to learn about the
3420		 * state first to let the replies through.
3421		 */
3422		return (PF_DEFER);
3423
3424	return (PF_PASS);
3425
3426cleanup:
3427	if (sk != NULL)
3428		uma_zfree(V_pf_state_key_z, sk);
3429	if (nk != NULL)
3430		uma_zfree(V_pf_state_key_z, nk);
3431	return (PF_DROP);
3432}
3433
3434static int
3435pf_create_state(struct pf_rule *r, struct pf_rule *nr, struct pf_rule *a,
3436    struct pf_pdesc *pd, struct pf_src_node *nsn, struct pf_state_key *nk,
3437    struct pf_state_key *sk, struct mbuf *m, int off, u_int16_t sport,
3438    u_int16_t dport, int *rewrite, struct pfi_kif *kif, struct pf_state **sm,
3439    int tag, u_int16_t bproto_sum, u_int16_t bip_sum, int hdrlen)
3440{
3441	struct pf_state		*s = NULL;
3442	struct pf_src_node	*sn = NULL;
3443	struct tcphdr		*th = pd->hdr.tcp;
3444	u_int16_t		 mss = V_tcp_mssdflt;
3445	u_short			 reason;
3446
3447	/* check maximums */
3448	if (r->max_states &&
3449	    (counter_u64_fetch(r->states_cur) >= r->max_states)) {
3450		V_pf_status.lcounters[LCNT_STATES]++;
3451		REASON_SET(&reason, PFRES_MAXSTATES);
3452		return (PF_DROP);
3453	}
3454	/* src node for filter rule */
3455	if ((r->rule_flag & PFRULE_SRCTRACK ||
3456	    r->rpool.opts & PF_POOL_STICKYADDR) &&
3457	    pf_insert_src_node(&sn, r, pd->src, pd->af) != 0) {
3458		REASON_SET(&reason, PFRES_SRCLIMIT);
3459		goto csfailed;
3460	}
3461	/* src node for translation rule */
3462	if (nr != NULL && (nr->rpool.opts & PF_POOL_STICKYADDR) &&
3463	    pf_insert_src_node(&nsn, nr, &sk->addr[pd->sidx], pd->af)) {
3464		REASON_SET(&reason, PFRES_SRCLIMIT);
3465		goto csfailed;
3466	}
3467	s = uma_zalloc(V_pf_state_z, M_NOWAIT | M_ZERO);
3468	if (s == NULL) {
3469		REASON_SET(&reason, PFRES_MEMORY);
3470		goto csfailed;
3471	}
3472	s->rule.ptr = r;
3473	s->nat_rule.ptr = nr;
3474	s->anchor.ptr = a;
3475	STATE_INC_COUNTERS(s);
3476	if (r->allow_opts)
3477		s->state_flags |= PFSTATE_ALLOWOPTS;
3478	if (r->rule_flag & PFRULE_STATESLOPPY)
3479		s->state_flags |= PFSTATE_SLOPPY;
3480	s->log = r->log & PF_LOG_ALL;
3481	s->sync_state = PFSYNC_S_NONE;
3482	if (nr != NULL)
3483		s->log |= nr->log & PF_LOG_ALL;
3484	switch (pd->proto) {
3485	case IPPROTO_TCP:
3486		s->src.seqlo = ntohl(th->th_seq);
3487		s->src.seqhi = s->src.seqlo + pd->p_len + 1;
3488		if ((th->th_flags & (TH_SYN|TH_ACK)) == TH_SYN &&
3489		    r->keep_state == PF_STATE_MODULATE) {
3490			/* Generate sequence number modulator */
3491			if ((s->src.seqdiff = pf_tcp_iss(pd) - s->src.seqlo) ==
3492			    0)
3493				s->src.seqdiff = 1;
3494			pf_change_a(&th->th_seq, &th->th_sum,
3495			    htonl(s->src.seqlo + s->src.seqdiff), 0);
3496			*rewrite = 1;
3497		} else
3498			s->src.seqdiff = 0;
3499		if (th->th_flags & TH_SYN) {
3500			s->src.seqhi++;
3501			s->src.wscale = pf_get_wscale(m, off,
3502			    th->th_off, pd->af);
3503		}
3504		s->src.max_win = MAX(ntohs(th->th_win), 1);
3505		if (s->src.wscale & PF_WSCALE_MASK) {
3506			/* Remove scale factor from initial window */
3507			int win = s->src.max_win;
3508			win += 1 << (s->src.wscale & PF_WSCALE_MASK);
3509			s->src.max_win = (win - 1) >>
3510			    (s->src.wscale & PF_WSCALE_MASK);
3511		}
3512		if (th->th_flags & TH_FIN)
3513			s->src.seqhi++;
3514		s->dst.seqhi = 1;
3515		s->dst.max_win = 1;
3516		s->src.state = TCPS_SYN_SENT;
3517		s->dst.state = TCPS_CLOSED;
3518		s->timeout = PFTM_TCP_FIRST_PACKET;
3519		break;
3520	case IPPROTO_UDP:
3521		s->src.state = PFUDPS_SINGLE;
3522		s->dst.state = PFUDPS_NO_TRAFFIC;
3523		s->timeout = PFTM_UDP_FIRST_PACKET;
3524		break;
3525	case IPPROTO_ICMP:
3526#ifdef INET6
3527	case IPPROTO_ICMPV6:
3528#endif
3529		s->timeout = PFTM_ICMP_FIRST_PACKET;
3530		break;
3531	default:
3532		s->src.state = PFOTHERS_SINGLE;
3533		s->dst.state = PFOTHERS_NO_TRAFFIC;
3534		s->timeout = PFTM_OTHER_FIRST_PACKET;
3535	}
3536
3537	s->creation = time_uptime;
3538	s->expire = time_uptime;
3539
3540	if (sn != NULL) {
3541		s->src_node = sn;
3542		s->src_node->states++;
3543	}
3544	if (nsn != NULL) {
3545		/* XXX We only modify one side for now. */
3546		PF_ACPY(&nsn->raddr, &nk->addr[1], pd->af);
3547		s->nat_src_node = nsn;
3548		s->nat_src_node->states++;
3549	}
3550	if (pd->proto == IPPROTO_TCP) {
3551		if ((pd->flags & PFDESC_TCP_NORM) && pf_normalize_tcp_init(m,
3552		    off, pd, th, &s->src, &s->dst)) {
3553			REASON_SET(&reason, PFRES_MEMORY);
3554			pf_src_tree_remove_state(s);
3555			STATE_DEC_COUNTERS(s);
3556			uma_zfree(V_pf_state_z, s);
3557			return (PF_DROP);
3558		}
3559		if ((pd->flags & PFDESC_TCP_NORM) && s->src.scrub &&
3560		    pf_normalize_tcp_stateful(m, off, pd, &reason, th, s,
3561		    &s->src, &s->dst, rewrite)) {
3562			/* This really shouldn't happen!!! */
3563			DPFPRINTF(PF_DEBUG_URGENT,
3564			    ("pf_normalize_tcp_stateful failed on first pkt"));
3565			pf_normalize_tcp_cleanup(s);
3566			pf_src_tree_remove_state(s);
3567			STATE_DEC_COUNTERS(s);
3568			uma_zfree(V_pf_state_z, s);
3569			return (PF_DROP);
3570		}
3571	}
3572	s->direction = pd->dir;
3573
3574	/*
3575	 * sk/nk could already have been set up by pf_get_translation().
3576	 */
3577	if (nr == NULL) {
3578		KASSERT((sk == NULL && nk == NULL), ("%s: nr %p sk %p, nk %p",
3579		    __func__, nr, sk, nk));
3580		sk = pf_state_key_setup(pd, pd->src, pd->dst, sport, dport);
3581		if (sk == NULL)
3582			goto csfailed;
3583		nk = sk;
3584	} else
3585		KASSERT((sk != NULL && nk != NULL), ("%s: nr %p sk %p, nk %p",
3586		    __func__, nr, sk, nk));
3587
3588	/* Swap sk/nk for PF_OUT. */
3589	if (pf_state_insert(BOUND_IFACE(r, kif),
3590	    (pd->dir == PF_IN) ? sk : nk,
3591	    (pd->dir == PF_IN) ? nk : sk, s)) {
3592		if (pd->proto == IPPROTO_TCP)
3593			pf_normalize_tcp_cleanup(s);
3594		REASON_SET(&reason, PFRES_STATEINS);
3595		pf_src_tree_remove_state(s);
3596		STATE_DEC_COUNTERS(s);
3597		uma_zfree(V_pf_state_z, s);
3598		return (PF_DROP);
3599	} else
3600		*sm = s;
3601
3602	pf_set_rt_ifp(s, pd->src);	/* needs s->state_key set */
3603	if (tag > 0)
3604		s->tag = tag;
3605	if (pd->proto == IPPROTO_TCP && (th->th_flags & (TH_SYN|TH_ACK)) ==
3606	    TH_SYN && r->keep_state == PF_STATE_SYNPROXY) {
3607		s->src.state = PF_TCPS_PROXY_SRC;
3608		/* undo NAT changes, if they have taken place */
3609		if (nr != NULL) {
3610			struct pf_state_key *skt = s->key[PF_SK_WIRE];
3611			if (pd->dir == PF_OUT)
3612				skt = s->key[PF_SK_STACK];
3613			PF_ACPY(pd->src, &skt->addr[pd->sidx], pd->af);
3614			PF_ACPY(pd->dst, &skt->addr[pd->didx], pd->af);
3615			if (pd->sport)
3616				*pd->sport = skt->port[pd->sidx];
3617			if (pd->dport)
3618				*pd->dport = skt->port[pd->didx];
3619			if (pd->proto_sum)
3620				*pd->proto_sum = bproto_sum;
3621			if (pd->ip_sum)
3622				*pd->ip_sum = bip_sum;
3623			m_copyback(m, off, hdrlen, pd->hdr.any);
3624		}
3625		s->src.seqhi = htonl(arc4random());
3626		/* Find mss option */
3627		int rtid = M_GETFIB(m);
3628		mss = pf_get_mss(m, off, th->th_off, pd->af);
3629		mss = pf_calc_mss(pd->src, pd->af, rtid, mss);
3630		mss = pf_calc_mss(pd->dst, pd->af, rtid, mss);
3631		s->src.mss = mss;
3632		pf_send_tcp(NULL, r, pd->af, pd->dst, pd->src, th->th_dport,
3633		    th->th_sport, s->src.seqhi, ntohl(th->th_seq) + 1,
3634		    TH_SYN|TH_ACK, 0, s->src.mss, 0, 1, 0, NULL);
3635		REASON_SET(&reason, PFRES_SYNPROXY);
3636		return (PF_SYNPROXY_DROP);
3637	}
3638
3639	return (PF_PASS);
3640
3641csfailed:
3642	if (sk != NULL)
3643		uma_zfree(V_pf_state_key_z, sk);
3644	if (nk != NULL)
3645		uma_zfree(V_pf_state_key_z, nk);
3646
3647	if (sn != NULL && sn->states == 0 && sn->expire == 0) {
3648		pf_unlink_src_node(sn);
3649		pf_free_src_node(sn);
3650	}
3651
3652	if (nsn != sn && nsn != NULL && nsn->states == 0 && nsn->expire == 0) {
3653		pf_unlink_src_node(nsn);
3654		pf_free_src_node(nsn);
3655	}
3656
3657	return (PF_DROP);
3658}
3659
3660static int
3661pf_test_fragment(struct pf_rule **rm, int direction, struct pfi_kif *kif,
3662    struct mbuf *m, void *h, struct pf_pdesc *pd, struct pf_rule **am,
3663    struct pf_ruleset **rsm)
3664{
3665	struct pf_rule		*r, *a = NULL;
3666	struct pf_ruleset	*ruleset = NULL;
3667	sa_family_t		 af = pd->af;
3668	u_short			 reason;
3669	int			 tag = -1;
3670	int			 asd = 0;
3671	int			 match = 0;
3672	struct pf_anchor_stackframe	anchor_stack[PF_ANCHOR_STACKSIZE];
3673
3674	PF_RULES_RASSERT();
3675
3676	r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr);
3677	while (r != NULL) {
3678		r->evaluations++;
3679		if (pfi_kif_match(r->kif, kif) == r->ifnot)
3680			r = r->skip[PF_SKIP_IFP].ptr;
3681		else if (r->direction && r->direction != direction)
3682			r = r->skip[PF_SKIP_DIR].ptr;
3683		else if (r->af && r->af != af)
3684			r = r->skip[PF_SKIP_AF].ptr;
3685		else if (r->proto && r->proto != pd->proto)
3686			r = r->skip[PF_SKIP_PROTO].ptr;
3687		else if (PF_MISMATCHAW(&r->src.addr, pd->src, af,
3688		    r->src.neg, kif, M_GETFIB(m)))
3689			r = r->skip[PF_SKIP_SRC_ADDR].ptr;
3690		else if (PF_MISMATCHAW(&r->dst.addr, pd->dst, af,
3691		    r->dst.neg, NULL, M_GETFIB(m)))
3692			r = r->skip[PF_SKIP_DST_ADDR].ptr;
3693		else if (r->tos && !(r->tos == pd->tos))
3694			r = TAILQ_NEXT(r, entries);
3695		else if (r->os_fingerprint != PF_OSFP_ANY)
3696			r = TAILQ_NEXT(r, entries);
3697		else if (pd->proto == IPPROTO_UDP &&
3698		    (r->src.port_op || r->dst.port_op))
3699			r = TAILQ_NEXT(r, entries);
3700		else if (pd->proto == IPPROTO_TCP &&
3701		    (r->src.port_op || r->dst.port_op || r->flagset))
3702			r = TAILQ_NEXT(r, entries);
3703		else if ((pd->proto == IPPROTO_ICMP ||
3704		    pd->proto == IPPROTO_ICMPV6) &&
3705		    (r->type || r->code))
3706			r = TAILQ_NEXT(r, entries);
3707		else if (r->prob && r->prob <=
3708		    (arc4random() % (UINT_MAX - 1) + 1))
3709			r = TAILQ_NEXT(r, entries);
3710		else if (r->match_tag && !pf_match_tag(m, r, &tag,
3711		    pd->pf_mtag ? pd->pf_mtag->tag : 0))
3712			r = TAILQ_NEXT(r, entries);
3713		else {
3714			if (r->anchor == NULL) {
3715				match = 1;
3716				*rm = r;
3717				*am = a;
3718				*rsm = ruleset;
3719				if ((*rm)->quick)
3720					break;
3721				r = TAILQ_NEXT(r, entries);
3722			} else
3723				pf_step_into_anchor(anchor_stack, &asd,
3724				    &ruleset, PF_RULESET_FILTER, &r, &a,
3725				    &match);
3726		}
3727		if (r == NULL && pf_step_out_of_anchor(anchor_stack, &asd,
3728		    &ruleset, PF_RULESET_FILTER, &r, &a, &match))
3729			break;
3730	}
3731	r = *rm;
3732	a = *am;
3733	ruleset = *rsm;
3734
3735	REASON_SET(&reason, PFRES_MATCH);
3736
3737	if (r->log)
3738		PFLOG_PACKET(kif, m, af, direction, reason, r, a, ruleset, pd,
3739		    1);
3740
3741	if (r->action != PF_PASS)
3742		return (PF_DROP);
3743
3744	if (tag > 0 && pf_tag_packet(m, pd, tag)) {
3745		REASON_SET(&reason, PFRES_MEMORY);
3746		return (PF_DROP);
3747	}
3748
3749	return (PF_PASS);
3750}
3751
3752static int
3753pf_tcp_track_full(struct pf_state_peer *src, struct pf_state_peer *dst,
3754	struct pf_state **state, struct pfi_kif *kif, struct mbuf *m, int off,
3755	struct pf_pdesc *pd, u_short *reason, int *copyback)
3756{
3757	struct tcphdr		*th = pd->hdr.tcp;
3758	u_int16_t		 win = ntohs(th->th_win);
3759	u_int32_t		 ack, end, seq, orig_seq;
3760	u_int8_t		 sws, dws;
3761	int			 ackskew;
3762
3763	if (src->wscale && dst->wscale && !(th->th_flags & TH_SYN)) {
3764		sws = src->wscale & PF_WSCALE_MASK;
3765		dws = dst->wscale & PF_WSCALE_MASK;
3766	} else
3767		sws = dws = 0;
3768
3769	/*
3770	 * Sequence tracking algorithm from Guido van Rooij's paper:
3771	 *   http://www.madison-gurkha.com/publications/tcp_filtering/
3772	 *	tcp_filtering.ps
3773	 */
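	/*
	 * Editor's note (a rough sketch, not from the original source):
	 * for each peer pf keeps
	 *
	 *	seqlo   - leading edge of the data seen from this peer
	 *	seqhi   - how far this peer may send into the other's window
	 *	max_win - largest (descaled) window this peer advertised
	 *
	 * The acceptance test below requires, roughly, that the segment
	 * ends at or before src->seqhi, starts no more than one window
	 * behind src->seqlo, and carries an ACK within MAXACKWINDOW
	 * (scaled) of dst->seqlo.
	 */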
3774
3775	orig_seq = seq = ntohl(th->th_seq);
3776	if (src->seqlo == 0) {
3777		/* First packet from this end. Set its state */
3778
3779		if ((pd->flags & PFDESC_TCP_NORM || dst->scrub) &&
3780		    src->scrub == NULL) {
3781			if (pf_normalize_tcp_init(m, off, pd, th, src, dst)) {
3782				REASON_SET(reason, PFRES_MEMORY);
3783				return (PF_DROP);
3784			}
3785		}
3786
3787		/* Deferred generation of sequence number modulator */
3788		if (dst->seqdiff && !src->seqdiff) {
3789			/* use random iss for the TCP server */
3790			while ((src->seqdiff = arc4random() - seq) == 0)
3791				;
3792			ack = ntohl(th->th_ack) - dst->seqdiff;
3793			pf_change_a(&th->th_seq, &th->th_sum, htonl(seq +
3794			    src->seqdiff), 0);
3795			pf_change_a(&th->th_ack, &th->th_sum, htonl(ack), 0);
3796			*copyback = 1;
3797		} else {
3798			ack = ntohl(th->th_ack);
3799		}
3800
3801		end = seq + pd->p_len;
3802		if (th->th_flags & TH_SYN) {
3803			end++;
3804			if (dst->wscale & PF_WSCALE_FLAG) {
3805				src->wscale = pf_get_wscale(m, off, th->th_off,
3806				    pd->af);
3807				if (src->wscale & PF_WSCALE_FLAG) {
3808					/* Remove scale factor from initial
3809					 * window */
3810					sws = src->wscale & PF_WSCALE_MASK;
3811					win = ((u_int32_t)win + (1 << sws) - 1)
3812					    >> sws;
3813					dws = dst->wscale & PF_WSCALE_MASK;
3814				} else {
3815					/* fixup other window */
3816					dst->max_win <<= dst->wscale &
3817					    PF_WSCALE_MASK;
3818					/* in case of a retrans SYN|ACK */
3819					dst->wscale = 0;
3820				}
3821			}
3822		}
3823		if (th->th_flags & TH_FIN)
3824			end++;
3825
3826		src->seqlo = seq;
3827		if (src->state < TCPS_SYN_SENT)
3828			src->state = TCPS_SYN_SENT;
3829
3830		/*
3831		 * May need to slide the window (seqhi may have been set by
3832		 * the crappy stack check or if we picked up the connection
3833		 * after establishment)
3834		 */
3835		if (src->seqhi == 1 ||
3836		    SEQ_GEQ(end + MAX(1, dst->max_win << dws), src->seqhi))
3837			src->seqhi = end + MAX(1, dst->max_win << dws);
3838		if (win > src->max_win)
3839			src->max_win = win;
3840
3841	} else {
3842		ack = ntohl(th->th_ack) - dst->seqdiff;
3843		if (src->seqdiff) {
3844			/* Modulate sequence numbers */
3845			pf_change_a(&th->th_seq, &th->th_sum, htonl(seq +
3846			    src->seqdiff), 0);
3847			pf_change_a(&th->th_ack, &th->th_sum, htonl(ack), 0);
3848			*copyback = 1;
3849		}
3850		end = seq + pd->p_len;
3851		if (th->th_flags & TH_SYN)
3852			end++;
3853		if (th->th_flags & TH_FIN)
3854			end++;
3855	}
3856
3857	if ((th->th_flags & TH_ACK) == 0) {
3858		/* Let it pass through the ack skew check */
3859		ack = dst->seqlo;
3860	} else if ((ack == 0 &&
3861	    (th->th_flags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) ||
3862	    /* broken tcp stacks do not set ack */
3863	    (dst->state < TCPS_SYN_SENT)) {
3864		/*
3865		 * Many stacks (ours included) will set the ACK number in a
3866		 * FIN|ACK if the SYN times out -- no sequence to ACK.
3867		 */
3868		ack = dst->seqlo;
3869	}
3870
3871	if (seq == end) {
3872		/* Ease sequencing restrictions on packets carrying no data */
3873		seq = src->seqlo;
3874		end = seq;
3875	}
3876
3877	ackskew = dst->seqlo - ack;
3878
3880	/*
3881	 * Need to demodulate the sequence numbers in any TCP SACK options
3882	 * (Selective ACK). We could optionally validate the SACK values
3883	 * against the current ACK window, either forwards or backwards, but
3884	 * I'm not confident that SACK has been implemented properly
3885	 * everywhere. It wouldn't surprise me if several stacks accidentally
3886	 * SACK too far backwards of previously ACKed data. There really aren't
3887	 * any security implications of bad SACKing unless the target stack
3888	 * doesn't validate the option length correctly. Someone trying to
3889	 * spoof into a TCP connection won't bother blindly sending SACK
3890	 * options anyway.
3891	 */
3892	if (dst->seqdiff && (th->th_off << 2) > sizeof(struct tcphdr)) {
3893		if (pf_modulate_sack(m, off, pd, th, dst))
3894			*copyback = 1;
3895	}
3896
3898#define	MAXACKWINDOW (0xffff + 1500)	/* 1500 is an arbitrary fudge factor */
3899	if (SEQ_GEQ(src->seqhi, end) &&
3900	    /* Last octet inside other's window space */
3901	    SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) &&
3902	    /* Retrans: not more than one window back */
3903	    (ackskew >= -MAXACKWINDOW) &&
3904	    /* Acking not more than one reassembled fragment backwards */
3905	    (ackskew <= (MAXACKWINDOW << sws)) &&
3906	    /* Acking not more than one window forward */
3907	    ((th->th_flags & TH_RST) == 0 || orig_seq == src->seqlo ||
3908	    (orig_seq == src->seqlo + 1) || (orig_seq + 1 == src->seqlo) ||
3909	    (pd->flags & PFDESC_IP_REAS) == 0)) {
3910	    /* Require an exact/+1 sequence match on resets when possible */
3911
3912		if (dst->scrub || src->scrub) {
3913			if (pf_normalize_tcp_stateful(m, off, pd, reason, th,
3914			    *state, src, dst, copyback))
3915				return (PF_DROP);
3916		}
3917
3918		/* update max window */
3919		if (src->max_win < win)
3920			src->max_win = win;
3921		/* synchronize sequencing */
3922		if (SEQ_GT(end, src->seqlo))
3923			src->seqlo = end;
3924		/* slide the window of what the other end can send */
3925		if (SEQ_GEQ(ack + (win << sws), dst->seqhi))
3926			dst->seqhi = ack + MAX((win << sws), 1);
3927
3929		/* update states */
3930		if (th->th_flags & TH_SYN)
3931			if (src->state < TCPS_SYN_SENT)
3932				src->state = TCPS_SYN_SENT;
3933		if (th->th_flags & TH_FIN)
3934			if (src->state < TCPS_CLOSING)
3935				src->state = TCPS_CLOSING;
3936		if (th->th_flags & TH_ACK) {
3937			if (dst->state == TCPS_SYN_SENT) {
3938				dst->state = TCPS_ESTABLISHED;
3939				if (src->state == TCPS_ESTABLISHED &&
3940				    (*state)->src_node != NULL &&
3941				    pf_src_connlimit(state)) {
3942					REASON_SET(reason, PFRES_SRCLIMIT);
3943					return (PF_DROP);
3944				}
3945			} else if (dst->state == TCPS_CLOSING)
3946				dst->state = TCPS_FIN_WAIT_2;
3947		}
3948		if (th->th_flags & TH_RST)
3949			src->state = dst->state = TCPS_TIME_WAIT;
3950
3951		/* update expire time */
3952		(*state)->expire = time_uptime;
3953		if (src->state >= TCPS_FIN_WAIT_2 &&
3954		    dst->state >= TCPS_FIN_WAIT_2)
3955			(*state)->timeout = PFTM_TCP_CLOSED;
3956		else if (src->state >= TCPS_CLOSING &&
3957		    dst->state >= TCPS_CLOSING)
3958			(*state)->timeout = PFTM_TCP_FIN_WAIT;
3959		else if (src->state < TCPS_ESTABLISHED ||
3960		    dst->state < TCPS_ESTABLISHED)
3961			(*state)->timeout = PFTM_TCP_OPENING;
3962		else if (src->state >= TCPS_CLOSING ||
3963		    dst->state >= TCPS_CLOSING)
3964			(*state)->timeout = PFTM_TCP_CLOSING;
3965		else
3966			(*state)->timeout = PFTM_TCP_ESTABLISHED;
3967
3968		/* Fall through to PASS packet */
3969
3970	} else if ((dst->state < TCPS_SYN_SENT ||
3971		dst->state >= TCPS_FIN_WAIT_2 ||
3972		src->state >= TCPS_FIN_WAIT_2) &&
3973	    SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) &&
3974	    /* Within a window forward of the originating packet */
3975	    SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW)) {
3976	    /* Within a window backward of the originating packet */
3977
3978		/*
3979		 * This currently handles three situations:
3980		 *  1) Stupid stacks will shotgun SYNs before their peer
3981		 *     replies.
3982		 *  2) When PF catches an already established stream (the
3983		 *     firewall rebooted, the state table was flushed, routes
3984		 *     changed...)
3985		 *  3) Packets get funky immediately after the connection
3986		 *     closes (this should catch Solaris spurious ACK|FINs
3987		 *     that web servers like to spew after a close)
3988		 *
3989		 * This must be a little more careful than the above code
3990		 * since packet floods will also be caught here. We don't
3991		 * update the TTL here to mitigate the damage of a packet
3992		 * flood and so the same code can handle awkward establishment
3993		 * and a loosened connection close.
3994		 * In the establishment case, a correct peer response will
3995		 * validate the connection, go through the normal state code
3996		 * and keep updating the state TTL.
3997		 */
3998
3999		if (V_pf_status.debug >= PF_DEBUG_MISC) {
4000			printf("pf: loose state match: ");
4001			pf_print_state(*state);
4002			pf_print_flags(th->th_flags);
4003			printf(" seq=%u (%u) ack=%u len=%u ackskew=%d "
4004			    "pkts=%llu:%llu dir=%s,%s\n", seq, orig_seq, ack,
4005			    pd->p_len, ackskew, (unsigned long long)(*state)->packets[0],
4006			    (unsigned long long)(*state)->packets[1],
4007			    pd->dir == PF_IN ? "in" : "out",
4008			    pd->dir == (*state)->direction ? "fwd" : "rev");
4009		}
4010
4011		if (dst->scrub || src->scrub) {
4012			if (pf_normalize_tcp_stateful(m, off, pd, reason, th,
4013			    *state, src, dst, copyback))
4014				return (PF_DROP);
4015		}
4016
4017		/* update max window */
4018		if (src->max_win < win)
4019			src->max_win = win;
4020		/* synchronize sequencing */
4021		if (SEQ_GT(end, src->seqlo))
4022			src->seqlo = end;
4023		/* slide the window of what the other end can send */
4024		if (SEQ_GEQ(ack + (win << sws), dst->seqhi))
4025			dst->seqhi = ack + MAX((win << sws), 1);
4026
4027		/*
4028		 * Cannot set dst->seqhi here since this could be a shotgunned
4029		 * SYN and not an already established connection.
4030		 */
4031
4032		if (th->th_flags & TH_FIN)
4033			if (src->state < TCPS_CLOSING)
4034				src->state = TCPS_CLOSING;
4035		if (th->th_flags & TH_RST)
4036			src->state = dst->state = TCPS_TIME_WAIT;
4037
4038		/* Fall through to PASS packet */
4039
4040	} else {
4041		if ((*state)->dst.state == TCPS_SYN_SENT &&
4042		    (*state)->src.state == TCPS_SYN_SENT) {
4043			/* Send RST for state mismatches during handshake */
4044			if (!(th->th_flags & TH_RST))
4045				pf_send_tcp(NULL, (*state)->rule.ptr, pd->af,
4046				    pd->dst, pd->src, th->th_dport,
4047				    th->th_sport, ntohl(th->th_ack), 0,
4048				    TH_RST, 0, 0,
4049				    (*state)->rule.ptr->return_ttl, 1, 0,
4050				    kif->pfik_ifp);
4051			src->seqlo = 0;
4052			src->seqhi = 1;
4053			src->max_win = 1;
4054		} else if (V_pf_status.debug >= PF_DEBUG_MISC) {
4055			printf("pf: BAD state: ");
4056			pf_print_state(*state);
4057			pf_print_flags(th->th_flags);
4058			printf(" seq=%u (%u) ack=%u len=%u ackskew=%d "
4059			    "pkts=%llu:%llu dir=%s,%s\n",
4060			    seq, orig_seq, ack, pd->p_len, ackskew,
4061			    (unsigned long long)(*state)->packets[0],
4062			    (unsigned long long)(*state)->packets[1],
4063			    pd->dir == PF_IN ? "in" : "out",
4064			    pd->dir == (*state)->direction ? "fwd" : "rev");
4065			printf("pf: State failure on: %c %c %c %c | %c %c\n",
4066			    SEQ_GEQ(src->seqhi, end) ? ' ' : '1',
4067			    SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) ?
4068			    ' ': '2',
4069			    (ackskew >= -MAXACKWINDOW) ? ' ' : '3',
4070			    (ackskew <= (MAXACKWINDOW << sws)) ? ' ' : '4',
4071			    SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) ?' ' :'5',
4072			    SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW) ?' ' :'6');
4073		}
4074		REASON_SET(reason, PFRES_BADSTATE);
4075		return (PF_DROP);
4076	}
4077
4078	return (PF_PASS);
4079}
4080
4081static int
4082pf_tcp_track_sloppy(struct pf_state_peer *src, struct pf_state_peer *dst,
4083	struct pf_state **state, struct pf_pdesc *pd, u_short *reason)
4084{
4085	struct tcphdr		*th = pd->hdr.tcp;
4086
4087	if (th->th_flags & TH_SYN)
4088		if (src->state < TCPS_SYN_SENT)
4089			src->state = TCPS_SYN_SENT;
4090	if (th->th_flags & TH_FIN)
4091		if (src->state < TCPS_CLOSING)
4092			src->state = TCPS_CLOSING;
4093	if (th->th_flags & TH_ACK) {
4094		if (dst->state == TCPS_SYN_SENT) {
4095			dst->state = TCPS_ESTABLISHED;
4096			if (src->state == TCPS_ESTABLISHED &&
4097			    (*state)->src_node != NULL &&
4098			    pf_src_connlimit(state)) {
4099				REASON_SET(reason, PFRES_SRCLIMIT);
4100				return (PF_DROP);
4101			}
4102		} else if (dst->state == TCPS_CLOSING) {
4103			dst->state = TCPS_FIN_WAIT_2;
4104		} else if (src->state == TCPS_SYN_SENT &&
4105		    dst->state < TCPS_SYN_SENT) {
4106			/*
4107			 * Handle a special sloppy case where we only see one
4108			 * half of the connection. If there is an ACK after
4109			 * the initial SYN without ever seeing a packet from
4110			 * the destination, set the connection to established.
4111			 */
4112			dst->state = src->state = TCPS_ESTABLISHED;
4113			if ((*state)->src_node != NULL &&
4114			    pf_src_connlimit(state)) {
4115				REASON_SET(reason, PFRES_SRCLIMIT);
4116				return (PF_DROP);
4117			}
4118		} else if (src->state == TCPS_CLOSING &&
4119		    dst->state == TCPS_ESTABLISHED &&
4120		    dst->seqlo == 0) {
4121			/*
4122			 * Handle the closing of half connections where we
4123			 * don't see the full bidirectional FIN/ACK+ACK
4124			 * handshake.
4125			 */
4126			dst->state = TCPS_CLOSING;
4127		}
4128	}
4129	if (th->th_flags & TH_RST)
4130		src->state = dst->state = TCPS_TIME_WAIT;
4131
4132	/* update expire time */
4133	(*state)->expire = time_uptime;
4134	if (src->state >= TCPS_FIN_WAIT_2 &&
4135	    dst->state >= TCPS_FIN_WAIT_2)
4136		(*state)->timeout = PFTM_TCP_CLOSED;
4137	else if (src->state >= TCPS_CLOSING &&
4138	    dst->state >= TCPS_CLOSING)
4139		(*state)->timeout = PFTM_TCP_FIN_WAIT;
4140	else if (src->state < TCPS_ESTABLISHED ||
4141	    dst->state < TCPS_ESTABLISHED)
4142		(*state)->timeout = PFTM_TCP_OPENING;
4143	else if (src->state >= TCPS_CLOSING ||
4144	    dst->state >= TCPS_CLOSING)
4145		(*state)->timeout = PFTM_TCP_CLOSING;
4146	else
4147		(*state)->timeout = PFTM_TCP_ESTABLISHED;
4148
4149	return (PF_PASS);
4150}
4151
4152static int
4153pf_test_state_tcp(struct pf_state **state, int direction, struct pfi_kif *kif,
4154    struct mbuf *m, int off, void *h, struct pf_pdesc *pd,
4155    u_short *reason)
4156{
4157	struct pf_state_key_cmp	 key;
4158	struct tcphdr		*th = pd->hdr.tcp;
4159	int			 copyback = 0;
4160	struct pf_state_peer	*src, *dst;
4161	struct pf_state_key	*sk;
4162
4163	bzero(&key, sizeof(key));
4164	key.af = pd->af;
4165	key.proto = IPPROTO_TCP;
4166	if (direction == PF_IN)	{	/* wire side, straight */
4167		PF_ACPY(&key.addr[0], pd->src, key.af);
4168		PF_ACPY(&key.addr[1], pd->dst, key.af);
4169		key.port[0] = th->th_sport;
4170		key.port[1] = th->th_dport;
4171	} else {			/* stack side, reverse */
4172		PF_ACPY(&key.addr[1], pd->src, key.af);
4173		PF_ACPY(&key.addr[0], pd->dst, key.af);
4174		key.port[1] = th->th_sport;
4175		key.port[0] = th->th_dport;
4176	}
4177
4178	STATE_LOOKUP(kif, &key, direction, *state, pd);
4179
4180	if (direction == (*state)->direction) {
4181		src = &(*state)->src;
4182		dst = &(*state)->dst;
4183	} else {
4184		src = &(*state)->dst;
4185		dst = &(*state)->src;
4186	}
4187
4188	sk = (*state)->key[pd->didx];
4189
4190	if ((*state)->src.state == PF_TCPS_PROXY_SRC) {
4191		if (direction != (*state)->direction) {
4192			REASON_SET(reason, PFRES_SYNPROXY);
4193			return (PF_SYNPROXY_DROP);
4194		}
4195		if (th->th_flags & TH_SYN) {
4196			if (ntohl(th->th_seq) != (*state)->src.seqlo) {
4197				REASON_SET(reason, PFRES_SYNPROXY);
4198				return (PF_DROP);
4199			}
4200			pf_send_tcp(NULL, (*state)->rule.ptr, pd->af, pd->dst,
4201			    pd->src, th->th_dport, th->th_sport,
4202			    (*state)->src.seqhi, ntohl(th->th_seq) + 1,
4203			    TH_SYN|TH_ACK, 0, (*state)->src.mss, 0, 1, 0, NULL);
4204			REASON_SET(reason, PFRES_SYNPROXY);
4205			return (PF_SYNPROXY_DROP);
4206		} else if (!(th->th_flags & TH_ACK) ||
4207		    (ntohl(th->th_ack) != (*state)->src.seqhi + 1) ||
4208		    (ntohl(th->th_seq) != (*state)->src.seqlo + 1)) {
4209			REASON_SET(reason, PFRES_SYNPROXY);
4210			return (PF_DROP);
4211		} else if ((*state)->src_node != NULL &&
4212		    pf_src_connlimit(state)) {
4213			REASON_SET(reason, PFRES_SRCLIMIT);
4214			return (PF_DROP);
4215		} else
4216			(*state)->src.state = PF_TCPS_PROXY_DST;
4217	}
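	/*
	 * Editor's note: in the PF_TCPS_PROXY_SRC stage above, pf has
	 * completed the handshake with the client on its own.  In the
	 * PF_TCPS_PROXY_DST stage below it replays a SYN to the real
	 * destination; once the server's SYN|ACK arrives it ACKs both
	 * sides and records seqdiff offsets so that the two
	 * half-connections' sequence spaces line up from then on.
	 */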
4218	if ((*state)->src.state == PF_TCPS_PROXY_DST) {
4219		if (direction == (*state)->direction) {
4220			if (((th->th_flags & (TH_SYN|TH_ACK)) != TH_ACK) ||
4221			    (ntohl(th->th_ack) != (*state)->src.seqhi + 1) ||
4222			    (ntohl(th->th_seq) != (*state)->src.seqlo + 1)) {
4223				REASON_SET(reason, PFRES_SYNPROXY);
4224				return (PF_DROP);
4225			}
4226			(*state)->src.max_win = MAX(ntohs(th->th_win), 1);
4227			if ((*state)->dst.seqhi == 1)
4228				(*state)->dst.seqhi = htonl(arc4random());
4229			pf_send_tcp(NULL, (*state)->rule.ptr, pd->af,
4230			    &sk->addr[pd->sidx], &sk->addr[pd->didx],
4231			    sk->port[pd->sidx], sk->port[pd->didx],
4232			    (*state)->dst.seqhi, 0, TH_SYN, 0,
4233			    (*state)->src.mss, 0, 0, (*state)->tag, NULL);
4234			REASON_SET(reason, PFRES_SYNPROXY);
4235			return (PF_SYNPROXY_DROP);
4236		} else if (((th->th_flags & (TH_SYN|TH_ACK)) !=
4237		    (TH_SYN|TH_ACK)) ||
4238		    (ntohl(th->th_ack) != (*state)->dst.seqhi + 1)) {
4239			REASON_SET(reason, PFRES_SYNPROXY);
4240			return (PF_DROP);
4241		} else {
4242			(*state)->dst.max_win = MAX(ntohs(th->th_win), 1);
4243			(*state)->dst.seqlo = ntohl(th->th_seq);
4244			pf_send_tcp(NULL, (*state)->rule.ptr, pd->af, pd->dst,
4245			    pd->src, th->th_dport, th->th_sport,
4246			    ntohl(th->th_ack), ntohl(th->th_seq) + 1,
4247			    TH_ACK, (*state)->src.max_win, 0, 0, 0,
4248			    (*state)->tag, NULL);
4249			pf_send_tcp(NULL, (*state)->rule.ptr, pd->af,
4250			    &sk->addr[pd->sidx], &sk->addr[pd->didx],
4251			    sk->port[pd->sidx], sk->port[pd->didx],
4252			    (*state)->src.seqhi + 1, (*state)->src.seqlo + 1,
4253			    TH_ACK, (*state)->dst.max_win, 0, 0, 1, 0, NULL);
4254			(*state)->src.seqdiff = (*state)->dst.seqhi -
4255			    (*state)->src.seqlo;
4256			(*state)->dst.seqdiff = (*state)->src.seqhi -
4257			    (*state)->dst.seqlo;
4258			(*state)->src.seqhi = (*state)->src.seqlo +
4259			    (*state)->dst.max_win;
4260			(*state)->dst.seqhi = (*state)->dst.seqlo +
4261			    (*state)->src.max_win;
4262			(*state)->src.wscale = (*state)->dst.wscale = 0;
4263			(*state)->src.state = (*state)->dst.state =
4264			    TCPS_ESTABLISHED;
4265			REASON_SET(reason, PFRES_SYNPROXY);
4266			return (PF_SYNPROXY_DROP);
4267		}
4268	}
4269
4270	if (((th->th_flags & (TH_SYN|TH_ACK)) == TH_SYN) &&
4271	    dst->state >= TCPS_FIN_WAIT_2 &&
4272	    src->state >= TCPS_FIN_WAIT_2) {
4273		if (V_pf_status.debug >= PF_DEBUG_MISC) {
4274			printf("pf: state reuse ");
4275			pf_print_state(*state);
4276			pf_print_flags(th->th_flags);
4277			printf("\n");
4278		}
4279		/* XXX make sure it's the same direction ?? */
4280		(*state)->src.state = (*state)->dst.state = TCPS_CLOSED;
4281		pf_unlink_state(*state, PF_ENTER_LOCKED);
4282		*state = NULL;
4283		return (PF_DROP);
4284	}
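	/*
	 * Editor's note: the SYN that triggered the teardown above is
	 * itself dropped; the client's retransmitted SYN will no longer
	 * match a state and will create a fresh one via the ruleset.
	 */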
4285
4286	if ((*state)->state_flags & PFSTATE_SLOPPY) {
4287		if (pf_tcp_track_sloppy(src, dst, state, pd, reason) == PF_DROP)
4288			return (PF_DROP);
4289	} else {
4290		if (pf_tcp_track_full(src, dst, state, kif, m, off, pd, reason,
4291		    &copyback) == PF_DROP)
4292			return (PF_DROP);
4293	}
4294
4295	/* translate source/destination address, if necessary */
4296	if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
4297		struct pf_state_key *nk = (*state)->key[pd->didx];
4298
4299		if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], pd->af) ||
4300		    nk->port[pd->sidx] != th->th_sport)
4301			pf_change_ap(pd->src, &th->th_sport, pd->ip_sum,
4302			    &th->th_sum, &nk->addr[pd->sidx],
4303			    nk->port[pd->sidx], 0, pd->af);
4304
4305		if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], pd->af) ||
4306		    nk->port[pd->didx] != th->th_dport)
4307			pf_change_ap(pd->dst, &th->th_dport, pd->ip_sum,
4308			    &th->th_sum, &nk->addr[pd->didx],
4309			    nk->port[pd->didx], 0, pd->af);
4310		copyback = 1;
4311	}
4312
4313	/* Copyback sequence modulation or stateful scrub changes if needed */
4314	if (copyback)
4315		m_copyback(m, off, sizeof(*th), (caddr_t)th);
4316
4317	return (PF_PASS);
4318}
4319
4320static int
4321pf_test_state_udp(struct pf_state **state, int direction, struct pfi_kif *kif,
4322    struct mbuf *m, int off, void *h, struct pf_pdesc *pd)
4323{
4324	struct pf_state_peer	*src, *dst;
4325	struct pf_state_key_cmp	 key;
4326	struct udphdr		*uh = pd->hdr.udp;
4327
4328	bzero(&key, sizeof(key));
4329	key.af = pd->af;
4330	key.proto = IPPROTO_UDP;
4331	if (direction == PF_IN)	{	/* wire side, straight */
4332		PF_ACPY(&key.addr[0], pd->src, key.af);
4333		PF_ACPY(&key.addr[1], pd->dst, key.af);
4334		key.port[0] = uh->uh_sport;
4335		key.port[1] = uh->uh_dport;
4336	} else {			/* stack side, reverse */
4337		PF_ACPY(&key.addr[1], pd->src, key.af);
4338		PF_ACPY(&key.addr[0], pd->dst, key.af);
4339		key.port[1] = uh->uh_sport;
4340		key.port[0] = uh->uh_dport;
4341	}
4342
4343	STATE_LOOKUP(kif, &key, direction, *state, pd);
4344
4345	if (direction == (*state)->direction) {
4346		src = &(*state)->src;
4347		dst = &(*state)->dst;
4348	} else {
4349		src = &(*state)->dst;
4350		dst = &(*state)->src;
4351	}
4352
4353	/* update states */
4354	if (src->state < PFUDPS_SINGLE)
4355		src->state = PFUDPS_SINGLE;
4356	if (dst->state == PFUDPS_SINGLE)
4357		dst->state = PFUDPS_MULTIPLE;
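	/*
	 * Editor's note: a peer is marked SINGLE once it has sent traffic
	 * and MULTIPLE once traffic has been seen in both directions; the
	 * longer PFTM_UDP_MULTIPLE timeout below applies only when both
	 * peers have reached MULTIPLE.
	 */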
4358
4359	/* update expire time */
4360	(*state)->expire = time_uptime;
4361	if (src->state == PFUDPS_MULTIPLE && dst->state == PFUDPS_MULTIPLE)
4362		(*state)->timeout = PFTM_UDP_MULTIPLE;
4363	else
4364		(*state)->timeout = PFTM_UDP_SINGLE;
4365
4366	/* translate source/destination address, if necessary */
4367	if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
4368		struct pf_state_key *nk = (*state)->key[pd->didx];
4369
4370		if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], pd->af) ||
4371		    nk->port[pd->sidx] != uh->uh_sport)
4372			pf_change_ap(pd->src, &uh->uh_sport, pd->ip_sum,
4373			    &uh->uh_sum, &nk->addr[pd->sidx],
4374			    nk->port[pd->sidx], 1, pd->af);
4375
4376		if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], pd->af) ||
4377		    nk->port[pd->didx] != uh->uh_dport)
4378			pf_change_ap(pd->dst, &uh->uh_dport, pd->ip_sum,
4379			    &uh->uh_sum, &nk->addr[pd->didx],
4380			    nk->port[pd->didx], 1, pd->af);
4381		m_copyback(m, off, sizeof(*uh), (caddr_t)uh);
4382	}
4383
4384	return (PF_PASS);
4385}
4386
4387static int
4388pf_test_state_icmp(struct pf_state **state, int direction, struct pfi_kif *kif,
4389    struct mbuf *m, int off, void *h, struct pf_pdesc *pd, u_short *reason)
4390{
4391	struct pf_addr  *saddr = pd->src, *daddr = pd->dst;
4392	u_int16_t	 icmpid = 0, *icmpsum;
4393	u_int8_t	 icmptype;
4394	int		 state_icmp = 0;
4395	struct pf_state_key_cmp key;
4396
4397	bzero(&key, sizeof(key));
4398	switch (pd->proto) {
4399#ifdef INET
4400	case IPPROTO_ICMP:
4401		icmptype = pd->hdr.icmp->icmp_type;
4402		icmpid = pd->hdr.icmp->icmp_id;
4403		icmpsum = &pd->hdr.icmp->icmp_cksum;
4404
4405		if (icmptype == ICMP_UNREACH ||
4406		    icmptype == ICMP_SOURCEQUENCH ||
4407		    icmptype == ICMP_REDIRECT ||
4408		    icmptype == ICMP_TIMXCEED ||
4409		    icmptype == ICMP_PARAMPROB)
4410			state_icmp++;
4411		break;
4412#endif /* INET */
4413#ifdef INET6
4414	case IPPROTO_ICMPV6:
4415		icmptype = pd->hdr.icmp6->icmp6_type;
4416		icmpid = pd->hdr.icmp6->icmp6_id;
4417		icmpsum = &pd->hdr.icmp6->icmp6_cksum;
4418
4419		if (icmptype == ICMP6_DST_UNREACH ||
4420		    icmptype == ICMP6_PACKET_TOO_BIG ||
4421		    icmptype == ICMP6_TIME_EXCEEDED ||
4422		    icmptype == ICMP6_PARAM_PROB)
4423			state_icmp++;
4424		break;
4425#endif /* INET6 */
4426	}
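	/*
	 * Editor's note: state_icmp is set for ICMP error messages, which
	 * quote the offending packet.  Queries and replies (echo etc.) are
	 * matched directly against an ICMP state below; errors are matched
	 * against the state of the quoted inner packet instead.
	 */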
4427
4428	if (!state_icmp) {
4429
4430		/*
4431		 * ICMP query/reply message not related to a TCP/UDP packet.
4432		 * Search for an ICMP state.
4433		 */
4434		key.af = pd->af;
4435		key.proto = pd->proto;
4436		key.port[0] = key.port[1] = icmpid;
4437		if (direction == PF_IN)	{	/* wire side, straight */
4438			PF_ACPY(&key.addr[0], pd->src, key.af);
4439			PF_ACPY(&key.addr[1], pd->dst, key.af);
4440		} else {			/* stack side, reverse */
4441			PF_ACPY(&key.addr[1], pd->src, key.af);
4442			PF_ACPY(&key.addr[0], pd->dst, key.af);
4443		}
4444
4445		STATE_LOOKUP(kif, &key, direction, *state, pd);
4446
4447		(*state)->expire = time_uptime;
4448		(*state)->timeout = PFTM_ICMP_ERROR_REPLY;
4449
4450		/* translate source/destination address, if necessary */
4451		if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
4452			struct pf_state_key *nk = (*state)->key[pd->didx];
4453
4454			switch (pd->af) {
4455#ifdef INET
4456			case AF_INET:
4457				if (PF_ANEQ(pd->src,
4458				    &nk->addr[pd->sidx], AF_INET))
4459					pf_change_a(&saddr->v4.s_addr,
4460					    pd->ip_sum,
4461					    nk->addr[pd->sidx].v4.s_addr, 0);
4462
4463				if (PF_ANEQ(pd->dst, &nk->addr[pd->didx],
4464				    AF_INET))
4465					pf_change_a(&daddr->v4.s_addr,
4466					    pd->ip_sum,
4467					    nk->addr[pd->didx].v4.s_addr, 0);
4468
4469				if (nk->port[0] !=
4470				    pd->hdr.icmp->icmp_id) {
4471					pd->hdr.icmp->icmp_cksum =
4472					    pf_cksum_fixup(
4473					    pd->hdr.icmp->icmp_cksum, icmpid,
4474					    nk->port[pd->sidx], 0);
4475					pd->hdr.icmp->icmp_id =
4476					    nk->port[pd->sidx];
4477				}
4478
4479				m_copyback(m, off, ICMP_MINLEN,
4480				    (caddr_t )pd->hdr.icmp);
4481				break;
4482#endif /* INET */
4483#ifdef INET6
4484			case AF_INET6:
4485				if (PF_ANEQ(pd->src,
4486				    &nk->addr[pd->sidx], AF_INET6))
4487					pf_change_a6(saddr,
4488					    &pd->hdr.icmp6->icmp6_cksum,
4489					    &nk->addr[pd->sidx], 0);
4490
4491				if (PF_ANEQ(pd->dst,
4492				    &nk->addr[pd->didx], AF_INET6))
4493					pf_change_a6(daddr,
4494					    &pd->hdr.icmp6->icmp6_cksum,
4495					    &nk->addr[pd->didx], 0);
4496
4497				m_copyback(m, off, sizeof(struct icmp6_hdr),
4498				    (caddr_t )pd->hdr.icmp6);
4499				break;
4500#endif /* INET6 */
4501			}
4502		}
4503		return (PF_PASS);
4504
4505	} else {
4506		/*
4507		 * ICMP error message in response to a TCP/UDP packet.
4508		 * Extract the inner TCP/UDP header and search for that state.
4509		 */
4510
4511		struct pf_pdesc	pd2;
4512		bzero(&pd2, sizeof pd2);
4513#ifdef INET
4514		struct ip	h2;
4515#endif /* INET */
4516#ifdef INET6
4517		struct ip6_hdr	h2_6;
4518		int		terminal = 0;
4519#endif /* INET6 */
4520		int		ipoff2 = 0;
4521		int		off2 = 0;
4522
4523		pd2.af = pd->af;
4524		/* Payload packet is from the opposite direction. */
4525		pd2.sidx = (direction == PF_IN) ? 1 : 0;
4526		pd2.didx = (direction == PF_IN) ? 0 : 1;
4527		switch (pd->af) {
4528#ifdef INET
4529		case AF_INET:
4530			/* offset of h2 in mbuf chain */
4531			ipoff2 = off + ICMP_MINLEN;
4532
4533			if (!pf_pull_hdr(m, ipoff2, &h2, sizeof(h2),
4534			    NULL, reason, pd2.af)) {
4535				DPFPRINTF(PF_DEBUG_MISC,
4536				    ("pf: ICMP error message too short "
4537				    "(ip)\n"));
4538				return (PF_DROP);
4539			}
4540			/*
4541			 * ICMP error messages don't refer to non-first
4542			 * fragments
4543			 */
4544			if (h2.ip_off & htons(IP_OFFMASK)) {
4545				REASON_SET(reason, PFRES_FRAG);
4546				return (PF_DROP);
4547			}
4548
4549			/* offset of protocol header that follows h2 */
4550			off2 = ipoff2 + (h2.ip_hl << 2);
4551
4552			pd2.proto = h2.ip_p;
4553			pd2.src = (struct pf_addr *)&h2.ip_src;
4554			pd2.dst = (struct pf_addr *)&h2.ip_dst;
4555			pd2.ip_sum = &h2.ip_sum;
4556			break;
4557#endif /* INET */
4558#ifdef INET6
4559		case AF_INET6:
4560			ipoff2 = off + sizeof(struct icmp6_hdr);
4561
4562			if (!pf_pull_hdr(m, ipoff2, &h2_6, sizeof(h2_6),
4563			    NULL, reason, pd2.af)) {
4564				DPFPRINTF(PF_DEBUG_MISC,
4565				    ("pf: ICMP error message too short "
4566				    "(ip6)\n"));
4567				return (PF_DROP);
4568			}
4569			pd2.proto = h2_6.ip6_nxt;
4570			pd2.src = (struct pf_addr *)&h2_6.ip6_src;
4571			pd2.dst = (struct pf_addr *)&h2_6.ip6_dst;
4572			pd2.ip_sum = NULL;
4573			off2 = ipoff2 + sizeof(h2_6);
4574			do {
4575				switch (pd2.proto) {
4576				case IPPROTO_FRAGMENT:
4577					/*
4578					 * ICMPv6 error messages don't refer
4579					 * to non-first fragments
4580					 */
4581					REASON_SET(reason, PFRES_FRAG);
4582					return (PF_DROP);
4583				case IPPROTO_AH:
4584				case IPPROTO_HOPOPTS:
4585				case IPPROTO_ROUTING:
4586				case IPPROTO_DSTOPTS: {
4587					/* get next header and header length */
4588					struct ip6_ext opt6;
4589
4590					if (!pf_pull_hdr(m, off2, &opt6,
4591					    sizeof(opt6), NULL, reason,
4592					    pd2.af)) {
4593						DPFPRINTF(PF_DEBUG_MISC,
4594						    ("pf: ICMPv6 short opt\n"));
4595						return (PF_DROP);
4596					}
4597					if (pd2.proto == IPPROTO_AH)
4598						off2 += (opt6.ip6e_len + 2) * 4;
4599					else
4600						off2 += (opt6.ip6e_len + 1) * 8;
4601					pd2.proto = opt6.ip6e_nxt;
4602					/* go to the next header */
4603					break;
4604				}
4605				default:
4606					terminal++;
4607					break;
4608				}
4609			} while (!terminal);
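			/*
			 * Editor's note on the length arithmetic above: for
			 * AH, ip6e_len counts 32-bit words minus two, hence
			 * (len + 2) * 4 bytes; the other options count
			 * 64-bit units excluding the first, hence
			 * (len + 1) * 8 bytes.
			 */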
4610			break;
4611#endif /* INET6 */
4612		}
4613
4614		switch (pd2.proto) {
4615		case IPPROTO_TCP: {
4616			struct tcphdr		 th;
4617			u_int32_t		 seq;
4618			struct pf_state_peer	*src, *dst;
4619			u_int8_t		 dws;
4620			int			 copyback = 0;
4621
4622			/*
4623			 * Only the first 8 bytes of the TCP header (the ports
4624			 * and the sequence number) can be expected.  Don't access
4625			 * fields beyond th_seq; an ackskew test is not possible.
4626			 */
4627			if (!pf_pull_hdr(m, off2, &th, 8, NULL, reason,
4628			    pd2.af)) {
4629				DPFPRINTF(PF_DEBUG_MISC,
4630				    ("pf: ICMP error message too short "
4631				    "(tcp)\n"));
4632				return (PF_DROP);
4633			}
4634
4635			key.af = pd2.af;
4636			key.proto = IPPROTO_TCP;
4637			PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
4638			PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
4639			key.port[pd2.sidx] = th.th_sport;
4640			key.port[pd2.didx] = th.th_dport;
4641
4642			STATE_LOOKUP(kif, &key, direction, *state, pd);
4643
4644			if (direction == (*state)->direction) {
4645				src = &(*state)->dst;
4646				dst = &(*state)->src;
4647			} else {
4648				src = &(*state)->src;
4649				dst = &(*state)->dst;
4650			}
4651
4652			if (src->wscale && dst->wscale)
4653				dws = dst->wscale & PF_WSCALE_MASK;
4654			else
4655				dws = 0;
4656
4657			/* Demodulate sequence number */
4658			seq = ntohl(th.th_seq) - src->seqdiff;
4659			if (src->seqdiff) {
4660				pf_change_a(&th.th_seq, icmpsum,
4661				    htonl(seq), 0);
4662				copyback = 1;
4663			}
4664
4665			if (!((*state)->state_flags & PFSTATE_SLOPPY) &&
4666			    (!SEQ_GEQ(src->seqhi, seq) ||
4667			    !SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)))) {
4668				if (V_pf_status.debug >= PF_DEBUG_MISC) {
4669					printf("pf: BAD ICMP %d:%d ",
4670					    icmptype, pd->hdr.icmp->icmp_code);
4671					pf_print_host(pd->src, 0, pd->af);
4672					printf(" -> ");
4673					pf_print_host(pd->dst, 0, pd->af);
4674					printf(" state: ");
4675					pf_print_state(*state);
4676					printf(" seq=%u\n", seq);
4677				}
4678				REASON_SET(reason, PFRES_BADSTATE);
4679				return (PF_DROP);
4680			} else {
4681				if (V_pf_status.debug >= PF_DEBUG_MISC) {
4682					printf("pf: OK ICMP %d:%d ",
4683					    icmptype, pd->hdr.icmp->icmp_code);
4684					pf_print_host(pd->src, 0, pd->af);
4685					printf(" -> ");
4686					pf_print_host(pd->dst, 0, pd->af);
4687					printf(" state: ");
4688					pf_print_state(*state);
4689					printf(" seq=%u\n", seq);
4690				}
4691			}
4692
4693			/* translate source/destination address, if necessary */
4694			if ((*state)->key[PF_SK_WIRE] !=
4695			    (*state)->key[PF_SK_STACK]) {
4696				struct pf_state_key *nk =
4697				    (*state)->key[pd->didx];
4698
4699				if (PF_ANEQ(pd2.src,
4700				    &nk->addr[pd2.sidx], pd2.af) ||
4701				    nk->port[pd2.sidx] != th.th_sport)
4702					pf_change_icmp(pd2.src, &th.th_sport,
4703					    daddr, &nk->addr[pd2.sidx],
4704					    nk->port[pd2.sidx], NULL,
4705					    pd2.ip_sum, icmpsum,
4706					    pd->ip_sum, 0, pd2.af);
4707
4708				if (PF_ANEQ(pd2.dst,
4709				    &nk->addr[pd2.didx], pd2.af) ||
4710				    nk->port[pd2.didx] != th.th_dport)
4711					pf_change_icmp(pd2.dst, &th.th_dport,
4712					    NULL, /* XXX Inbound NAT? */
4713					    &nk->addr[pd2.didx],
4714					    nk->port[pd2.didx], NULL,
4715					    pd2.ip_sum, icmpsum,
4716					    pd->ip_sum, 0, pd2.af);
4717				copyback = 1;
4718			}
4719
4720			if (copyback) {
4721				switch (pd2.af) {
4722#ifdef INET
4723				case AF_INET:
4724					m_copyback(m, off, ICMP_MINLEN,
4725					    (caddr_t )pd->hdr.icmp);
4726					m_copyback(m, ipoff2, sizeof(h2),
4727					    (caddr_t )&h2);
4728					break;
4729#endif /* INET */
4730#ifdef INET6
4731				case AF_INET6:
4732					m_copyback(m, off,
4733					    sizeof(struct icmp6_hdr),
4734					    (caddr_t )pd->hdr.icmp6);
4735					m_copyback(m, ipoff2, sizeof(h2_6),
4736					    (caddr_t )&h2_6);
4737					break;
4738#endif /* INET6 */
4739				}
4740				m_copyback(m, off2, 8, (caddr_t)&th);
4741			}
4742
4743			return (PF_PASS);
4744			break;
4745		}
4746		case IPPROTO_UDP: {
4747			struct udphdr		uh;
4748
4749			if (!pf_pull_hdr(m, off2, &uh, sizeof(uh),
4750			    NULL, reason, pd2.af)) {
4751				DPFPRINTF(PF_DEBUG_MISC,
4752				    ("pf: ICMP error message too short "
4753				    "(udp)\n"));
4754				return (PF_DROP);
4755			}
4756
4757			key.af = pd2.af;
4758			key.proto = IPPROTO_UDP;
4759			PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
4760			PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
4761			key.port[pd2.sidx] = uh.uh_sport;
4762			key.port[pd2.didx] = uh.uh_dport;
4763
4764			STATE_LOOKUP(kif, &key, direction, *state, pd);
4765
4766			/* translate source/destination address, if necessary */
4767			if ((*state)->key[PF_SK_WIRE] !=
4768			    (*state)->key[PF_SK_STACK]) {
4769				struct pf_state_key *nk =
4770				    (*state)->key[pd->didx];
4771
4772				if (PF_ANEQ(pd2.src,
4773				    &nk->addr[pd2.sidx], pd2.af) ||
4774				    nk->port[pd2.sidx] != uh.uh_sport)
4775					pf_change_icmp(pd2.src, &uh.uh_sport,
4776					    daddr, &nk->addr[pd2.sidx],
4777					    nk->port[pd2.sidx], &uh.uh_sum,
4778					    pd2.ip_sum, icmpsum,
4779					    pd->ip_sum, 1, pd2.af);
4780
4781				if (PF_ANEQ(pd2.dst,
4782				    &nk->addr[pd2.didx], pd2.af) ||
4783				    nk->port[pd2.didx] != uh.uh_dport)
4784					pf_change_icmp(pd2.dst, &uh.uh_dport,
4785					    NULL, /* XXX Inbound NAT? */
4786					    &nk->addr[pd2.didx],
4787					    nk->port[pd2.didx], &uh.uh_sum,
4788					    pd2.ip_sum, icmpsum,
4789					    pd->ip_sum, 1, pd2.af);
4790
4791				switch (pd2.af) {
4792#ifdef INET
4793				case AF_INET:
4794					m_copyback(m, off, ICMP_MINLEN,
4795					    (caddr_t )pd->hdr.icmp);
4796					m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2);
4797					break;
4798#endif /* INET */
4799#ifdef INET6
4800				case AF_INET6:
4801					m_copyback(m, off,
4802					    sizeof(struct icmp6_hdr),
4803					    (caddr_t )pd->hdr.icmp6);
4804					m_copyback(m, ipoff2, sizeof(h2_6),
4805					    (caddr_t )&h2_6);
4806					break;
4807#endif /* INET6 */
4808				}
4809				m_copyback(m, off2, sizeof(uh), (caddr_t)&uh);
4810			}
4811			return (PF_PASS);
4812			break;
4813		}
4814#ifdef INET
4815		case IPPROTO_ICMP: {
4816			struct icmp		iih;
4817
4818			if (!pf_pull_hdr(m, off2, &iih, ICMP_MINLEN,
4819			    NULL, reason, pd2.af)) {
4820				DPFPRINTF(PF_DEBUG_MISC,
4821				    ("pf: ICMP error message too short "
4822				    "(icmp)\n"));
4823				return (PF_DROP);
4824			}
4825
4826			key.af = pd2.af;
4827			key.proto = IPPROTO_ICMP;
4828			PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
4829			PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
4830			key.port[0] = key.port[1] = iih.icmp_id;
4831
4832			STATE_LOOKUP(kif, &key, direction, *state, pd);
4833
4834			/* translate source/destination address, if necessary */
4835			if ((*state)->key[PF_SK_WIRE] !=
4836			    (*state)->key[PF_SK_STACK]) {
4837				struct pf_state_key *nk =
4838				    (*state)->key[pd->didx];
4839
4840				if (PF_ANEQ(pd2.src,
4841				    &nk->addr[pd2.sidx], pd2.af) ||
4842				    nk->port[pd2.sidx] != iih.icmp_id)
4843					pf_change_icmp(pd2.src, &iih.icmp_id,
4844					    daddr, &nk->addr[pd2.sidx],
4845					    nk->port[pd2.sidx], NULL,
4846					    pd2.ip_sum, icmpsum,
4847					    pd->ip_sum, 0, AF_INET);
4848
4849				if (PF_ANEQ(pd2.dst,
4850				    &nk->addr[pd2.didx], pd2.af) ||
4851				    nk->port[pd2.didx] != iih.icmp_id)
4852					pf_change_icmp(pd2.dst, &iih.icmp_id,
4853					    NULL, /* XXX Inbound NAT? */
4854					    &nk->addr[pd2.didx],
4855					    nk->port[pd2.didx], NULL,
4856					    pd2.ip_sum, icmpsum,
4857					    pd->ip_sum, 0, AF_INET);
4858
4859				m_copyback(m, off, ICMP_MINLEN, (caddr_t)pd->hdr.icmp);
4860				m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2);
4861				m_copyback(m, off2, ICMP_MINLEN, (caddr_t)&iih);
4862			}
4863			return (PF_PASS);
4864			break;
4865		}
4866#endif /* INET */
4867#ifdef INET6
4868		case IPPROTO_ICMPV6: {
4869			struct icmp6_hdr	iih;
4870
4871			if (!pf_pull_hdr(m, off2, &iih,
4872			    sizeof(struct icmp6_hdr), NULL, reason, pd2.af)) {
4873				DPFPRINTF(PF_DEBUG_MISC,
4874				    ("pf: ICMP error message too short "
4875				    "(icmp6)\n"));
4876				return (PF_DROP);
4877			}
4878
4879			key.af = pd2.af;
4880			key.proto = IPPROTO_ICMPV6;
4881			PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
4882			PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
4883			key.port[0] = key.port[1] = iih.icmp6_id;
4884
4885			STATE_LOOKUP(kif, &key, direction, *state, pd);
4886
4887			/* translate source/destination address, if necessary */
4888			if ((*state)->key[PF_SK_WIRE] !=
4889			    (*state)->key[PF_SK_STACK]) {
4890				struct pf_state_key *nk =
4891				    (*state)->key[pd->didx];
4892
4893				if (PF_ANEQ(pd2.src,
4894				    &nk->addr[pd2.sidx], pd2.af) ||
4895				    nk->port[pd2.sidx] != iih.icmp6_id)
4896					pf_change_icmp(pd2.src, &iih.icmp6_id,
4897					    daddr, &nk->addr[pd2.sidx],
4898					    nk->port[pd2.sidx], NULL,
4899					    pd2.ip_sum, icmpsum,
4900					    pd->ip_sum, 0, AF_INET6);
4901
4902				if (PF_ANEQ(pd2.dst,
4903				    &nk->addr[pd2.didx], pd2.af) ||
4904				    nk->port[pd2.didx] != iih.icmp6_id)
4905					pf_change_icmp(pd2.dst, &iih.icmp6_id,
4906					    NULL, /* XXX Inbound NAT? */
4907					    &nk->addr[pd2.didx],
4908					    nk->port[pd2.didx], NULL,
4909					    pd2.ip_sum, icmpsum,
4910					    pd->ip_sum, 0, AF_INET6);
4911
4912				m_copyback(m, off, sizeof(struct icmp6_hdr),
4913				    (caddr_t)pd->hdr.icmp6);
4914				m_copyback(m, ipoff2, sizeof(h2_6), (caddr_t)&h2_6);
4915				m_copyback(m, off2, sizeof(struct icmp6_hdr),
4916				    (caddr_t)&iih);
4917			}
4918			return (PF_PASS);
4919			break;
4920		}
4921#endif /* INET6 */
4922		default: {
4923			key.af = pd2.af;
4924			key.proto = pd2.proto;
4925			PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
4926			PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
4927			key.port[0] = key.port[1] = 0;
4928
4929			STATE_LOOKUP(kif, &key, direction, *state, pd);
4930
4931			/* translate source/destination address, if necessary */
4932			if ((*state)->key[PF_SK_WIRE] !=
4933			    (*state)->key[PF_SK_STACK]) {
4934				struct pf_state_key *nk =
4935				    (*state)->key[pd->didx];
4936
4937				if (PF_ANEQ(pd2.src,
4938				    &nk->addr[pd2.sidx], pd2.af))
4939					pf_change_icmp(pd2.src, NULL, daddr,
4940					    &nk->addr[pd2.sidx], 0, NULL,
4941					    pd2.ip_sum, icmpsum,
4942					    pd->ip_sum, 0, pd2.af);
4943
4944				if (PF_ANEQ(pd2.dst,
4945				    &nk->addr[pd2.didx], pd2.af))
4946					pf_change_icmp(pd2.dst, NULL,
4947					    NULL, /* XXX Inbound NAT? */
4948					    &nk->addr[pd2.didx], 0, NULL,
4949					    pd2.ip_sum, icmpsum,
4950					    pd->ip_sum, 0, pd2.af);
4951
4952				switch (pd2.af) {
4953#ifdef INET
4954				case AF_INET:
4955					m_copyback(m, off, ICMP_MINLEN,
4956					    (caddr_t)pd->hdr.icmp);
4957					m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2);
4958					break;
4959#endif /* INET */
4960#ifdef INET6
4961				case AF_INET6:
4962					m_copyback(m, off,
4963					    sizeof(struct icmp6_hdr),
4964					    (caddr_t )pd->hdr.icmp6);
4965					m_copyback(m, ipoff2, sizeof(h2_6),
4966					    (caddr_t )&h2_6);
4967					break;
4968#endif /* INET6 */
4969				}
4970			}
4971			return (PF_PASS);
4972			break;
4973		}
4974		}
4975	}
4976}
4977
4978static int
4979pf_test_state_other(struct pf_state **state, int direction, struct pfi_kif *kif,
4980    struct mbuf *m, struct pf_pdesc *pd)
4981{
4982	struct pf_state_peer	*src, *dst;
4983	struct pf_state_key_cmp	 key;
4984
4985	bzero(&key, sizeof(key));
4986	key.af = pd->af;
4987	key.proto = pd->proto;
4988	if (direction == PF_IN)	{
4989		PF_ACPY(&key.addr[0], pd->src, key.af);
4990		PF_ACPY(&key.addr[1], pd->dst, key.af);
4991		key.port[0] = key.port[1] = 0;
4992	} else {
4993		PF_ACPY(&key.addr[1], pd->src, key.af);
4994		PF_ACPY(&key.addr[0], pd->dst, key.af);
4995		key.port[1] = key.port[0] = 0;
4996	}
4997
4998	STATE_LOOKUP(kif, &key, direction, *state, pd);
4999
5000	if (direction == (*state)->direction) {
5001		src = &(*state)->src;
5002		dst = &(*state)->dst;
5003	} else {
5004		src = &(*state)->dst;
5005		dst = &(*state)->src;
5006	}
5007
5008	/* update states */
5009	if (src->state < PFOTHERS_SINGLE)
5010		src->state = PFOTHERS_SINGLE;
5011	if (dst->state == PFOTHERS_SINGLE)
5012		dst->state = PFOTHERS_MULTIPLE;
5013
5014	/* update expire time */
5015	(*state)->expire = time_uptime;
5016	if (src->state == PFOTHERS_MULTIPLE && dst->state == PFOTHERS_MULTIPLE)
5017		(*state)->timeout = PFTM_OTHER_MULTIPLE;
5018	else
5019		(*state)->timeout = PFTM_OTHER_SINGLE;
5020
5021	/* translate source/destination address, if necessary */
5022	if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
5023		struct pf_state_key *nk = (*state)->key[pd->didx];
5024
5025		KASSERT(nk, ("%s: nk is null", __func__));
5026		KASSERT(pd, ("%s: pd is null", __func__));
5027		KASSERT(pd->src, ("%s: pd->src is null", __func__));
5028		KASSERT(pd->dst, ("%s: pd->dst is null", __func__));
5029		switch (pd->af) {
5030#ifdef INET
5031		case AF_INET:
5032			if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], AF_INET))
5033				pf_change_a(&pd->src->v4.s_addr,
5034				    pd->ip_sum,
5035				    nk->addr[pd->sidx].v4.s_addr,
5036				    0);
5037
5039			if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], AF_INET))
5040				pf_change_a(&pd->dst->v4.s_addr,
5041				    pd->ip_sum,
5042				    nk->addr[pd->didx].v4.s_addr,
5043				    0);
5044
5045			break;
5046#endif /* INET */
5047#ifdef INET6
5048		case AF_INET6:
5049			if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], AF_INET6))
5050				PF_ACPY(pd->src, &nk->addr[pd->sidx], pd->af);
5051
5052			if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], AF_INET6))
5053				PF_ACPY(pd->dst, &nk->addr[pd->didx], pd->af);
5054#endif /* INET6 */
5055		}
5056	}
5057	return (PF_PASS);
5058}
5059
5060/*
5061 * ipoff and off are measured from the start of the mbuf chain.
5062 * h must be at "ipoff" on the mbuf chain.
5063 */
5064void *
5065pf_pull_hdr(struct mbuf *m, int off, void *p, int len,
5066    u_short *actionp, u_short *reasonp, sa_family_t af)
5067{
5068	switch (af) {
5069#ifdef INET
5070	case AF_INET: {
5071		struct ip	*h = mtod(m, struct ip *);
5072		u_int16_t	 fragoff = (ntohs(h->ip_off) & IP_OFFMASK) << 3;
5073
5074		if (fragoff) {
5075			if (fragoff >= len)
5076				ACTION_SET(actionp, PF_PASS);
5077			else {
5078				ACTION_SET(actionp, PF_DROP);
5079				REASON_SET(reasonp, PFRES_FRAG);
5080			}
5081			return (NULL);
5082		}
5083		if (m->m_pkthdr.len < off + len ||
5084		    ntohs(h->ip_len) < off + len) {
5085			ACTION_SET(actionp, PF_DROP);
5086			REASON_SET(reasonp, PFRES_SHORT);
5087			return (NULL);
5088		}
5089		break;
5090	}
5091#endif /* INET */
5092#ifdef INET6
5093	case AF_INET6: {
5094		struct ip6_hdr	*h = mtod(m, struct ip6_hdr *);
5095
5096		if (m->m_pkthdr.len < off + len ||
5097		    (ntohs(h->ip6_plen) + sizeof(struct ip6_hdr)) <
5098		    (unsigned)(off + len)) {
5099			ACTION_SET(actionp, PF_DROP);
5100			REASON_SET(reasonp, PFRES_SHORT);
5101			return (NULL);
5102		}
5103		break;
5104	}
5105#endif /* INET6 */
5106	}
5107	m_copydata(m, off, len, p);
5108	return (p);
5109}
5110
5111int
5112pf_routable(struct pf_addr *addr, sa_family_t af, struct pfi_kif *kif,
5113    int rtableid)
5114{
5115#ifdef RADIX_MPATH
5116	struct radix_node_head	*rnh;
5117#endif
5118	struct sockaddr_in	*dst;
5119	int			 ret = 1;
5120	int			 check_mpath;
5121#ifdef INET6
5122	struct sockaddr_in6	*dst6;
5123	struct route_in6	 ro;
5124#else
5125	struct route		 ro;
5126#endif
5127	struct radix_node	*rn;
5128	struct rtentry		*rt;
5129	struct ifnet		*ifp;
5130
5131	check_mpath = 0;
5132#ifdef RADIX_MPATH
5133	/* XXX: stick to table 0 for now */
5134	rnh = rt_tables_get_rnh(0, af);
5135	if (rnh != NULL && rn_mpath_capable(rnh))
5136		check_mpath = 1;
5137#endif
5138	bzero(&ro, sizeof(ro));
5139	switch (af) {
5140	case AF_INET:
5141		dst = satosin(&ro.ro_dst);
5142		dst->sin_family = AF_INET;
5143		dst->sin_len = sizeof(*dst);
5144		dst->sin_addr = addr->v4;
5145		break;
5146#ifdef INET6
5147	case AF_INET6:
5148		/*
5149		 * Skip check for addresses with embedded interface scope,
5150		 * as they would always match anyway.
5151		 */
5152		if (IN6_IS_SCOPE_EMBED(&addr->v6))
5153			goto out;
5154		dst6 = (struct sockaddr_in6 *)&ro.ro_dst;
5155		dst6->sin6_family = AF_INET6;
5156		dst6->sin6_len = sizeof(*dst6);
5157		dst6->sin6_addr = addr->v6;
5158		break;
5159#endif /* INET6 */
5160	default:
5161		return (0);
5162	}
5163
5164	/* Skip checks for ipsec interfaces */
5165	if (kif != NULL && kif->pfik_ifp->if_type == IFT_ENC)
5166		goto out;
5167
5168	switch (af) {
5169#ifdef INET6
5170	case AF_INET6:
5171		in6_rtalloc_ign(&ro, 0, rtableid);
5172		break;
5173#endif
5174#ifdef INET
5175	case AF_INET:
5176		in_rtalloc_ign((struct route *)&ro, 0, rtableid);
5177		break;
5178#endif
5179	default:
5180		rtalloc_ign((struct route *)&ro, 0);	/* No/default FIB. */
5181		break;
5182	}
5183
5184	if (ro.ro_rt != NULL) {
5185		/* No interface given, this is a no-route check */
5186		if (kif == NULL)
5187			goto out;
5188
5189		if (kif->pfik_ifp == NULL) {
5190			ret = 0;
5191			goto out;
5192		}
5193
5194		/* Perform uRPF check if an input interface was given */
5195		ret = 0;
5196		rn = (struct radix_node *)ro.ro_rt;
5197		do {
5198			rt = (struct rtentry *)rn;
5199			ifp = rt->rt_ifp;
5200
5201			if (kif->pfik_ifp == ifp)
5202				ret = 1;
5203#ifdef RADIX_MPATH
5204			rn = rn_mpath_next(rn);
5205#endif
5206		} while (check_mpath == 1 && rn != NULL && ret == 0);
5207	} else
5208		ret = 0;
5209out:
5210	if (ro.ro_rt != NULL)
5211		RTFREE(ro.ro_rt);
5212	return (ret);
5213}
5214
5215#ifdef INET
5216static void
5217pf_route(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp,
5218    struct pf_state *s, struct pf_pdesc *pd)
5219{
5220	struct mbuf		*m0, *m1;
5221	struct sockaddr_in	dst;
5222	struct ip		*ip;
5223	struct ifnet		*ifp = NULL;
5224	struct pf_addr		 naddr;
5225	struct pf_src_node	*sn = NULL;
5226	int			 error = 0;
5227	uint16_t		 ip_len, ip_off;
5228
5229	KASSERT(m && *m && r && oifp, ("%s: invalid parameters", __func__));
5230	KASSERT(dir == PF_IN || dir == PF_OUT, ("%s: invalid direction",
5231	    __func__));
5232
5233	if ((pd->pf_mtag == NULL &&
5234	    ((pd->pf_mtag = pf_get_mtag(*m)) == NULL)) ||
5235	    pd->pf_mtag->routed++ > 3) {
5236		m0 = *m;
5237		*m = NULL;
5238		goto bad_locked;
5239	}
5240
5241	if (r->rt == PF_DUPTO) {
5242		if ((m0 = m_dup(*m, M_NOWAIT)) == NULL) {
5243			if (s)
5244				PF_STATE_UNLOCK(s);
5245			return;
5246		}
5247	} else {
5248		if ((r->rt == PF_REPLYTO) == (r->direction == dir)) {
5249			if (s)
5250				PF_STATE_UNLOCK(s);
5251			return;
5252		}
5253		m0 = *m;
5254	}
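	/*
	 * Editor's note: the test above bails out unless this is a
	 * route-to rule seen in the rule's own direction or a reply-to
	 * rule seen in the opposite direction; dup-to was handled
	 * separately and always duplicates.
	 */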
5255
5256	ip = mtod(m0, struct ip *);
5257
5258	bzero(&dst, sizeof(dst));
5259	dst.sin_family = AF_INET;
5260	dst.sin_len = sizeof(dst);
5261	dst.sin_addr = ip->ip_dst;
5262
5263	if (r->rt == PF_FASTROUTE) {
5264		struct rtentry *rt;
5265
5266		if (s)
5267			PF_STATE_UNLOCK(s);
5268		rt = rtalloc1_fib(sintosa(&dst), 0, 0, M_GETFIB(m0));
5269		if (rt == NULL) {
5270			KMOD_IPSTAT_INC(ips_noroute);
5271			error = EHOSTUNREACH;
5272			goto bad;
5273		}
5274
5275		ifp = rt->rt_ifp;
5276		counter_u64_add(rt->rt_pksent, 1);
5277
5278		if (rt->rt_flags & RTF_GATEWAY)
5279			bcopy(satosin(rt->rt_gateway), &dst, sizeof(dst));
5280		RTFREE_LOCKED(rt);
5281	} else {
5282		if (TAILQ_EMPTY(&r->rpool.list)) {
5283			DPFPRINTF(PF_DEBUG_URGENT,
5284			    ("%s: TAILQ_EMPTY(&r->rpool.list)\n", __func__));
5285			goto bad_locked;
5286		}
5287		if (s == NULL) {
5288			pf_map_addr(AF_INET, r, (struct pf_addr *)&ip->ip_src,
5289			    &naddr, NULL, &sn);
5290			if (!PF_AZERO(&naddr, AF_INET))
5291				dst.sin_addr.s_addr = naddr.v4.s_addr;
5292			ifp = r->rpool.cur->kif ?
5293			    r->rpool.cur->kif->pfik_ifp : NULL;
5294		} else {
5295			if (!PF_AZERO(&s->rt_addr, AF_INET))
5296				dst.sin_addr.s_addr =
5297				    s->rt_addr.v4.s_addr;
5298			ifp = s->rt_kif ? s->rt_kif->pfik_ifp : NULL;
5299			PF_STATE_UNLOCK(s);
5300		}
5301	}
5302	if (ifp == NULL)
5303		goto bad;
5304
5305	if (oifp != ifp) {
5306		if (pf_test(PF_OUT, ifp, &m0, NULL) != PF_PASS)
5307			goto bad;
5308		else if (m0 == NULL)
5309			goto done;
5310		if (m0->m_len < sizeof(struct ip)) {
5311			DPFPRINTF(PF_DEBUG_URGENT,
5312			    ("%s: m0->m_len < sizeof(struct ip)\n", __func__));
5313			goto bad;
5314		}
5315		ip = mtod(m0, struct ip *);
5316	}
5317
5318	if (ifp->if_flags & IFF_LOOPBACK)
5319		m0->m_flags |= M_SKIP_FIREWALL;
5320
5321	ip_len = ntohs(ip->ip_len);
5322	ip_off = ntohs(ip->ip_off);
5323
5324	/* Copied from FreeBSD 10.0-CURRENT ip_output. */
5325	m0->m_pkthdr.csum_flags |= CSUM_IP;
5326	if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA & ~ifp->if_hwassist) {
5327		in_delayed_cksum(m0);
5328		m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
5329	}
5330#ifdef SCTP
5331	if (m0->m_pkthdr.csum_flags & CSUM_SCTP & ~ifp->if_hwassist) {
5332		sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2));
5333		m0->m_pkthdr.csum_flags &= ~CSUM_SCTP;
5334	}
5335#endif
5336
5337	/*
5338	 * If small enough for interface, or the interface will take
5339	 * care of the fragmentation for us, we can just send directly.
5340	 */
5341	if (ip_len <= ifp->if_mtu ||
5342	    (m0->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0 ||
5343	    ((ip_off & IP_DF) == 0 && (ifp->if_hwassist & CSUM_FRAGMENT))) {
5344		ip->ip_sum = 0;
5345		if (m0->m_pkthdr.csum_flags & CSUM_IP & ~ifp->if_hwassist) {
5346			ip->ip_sum = in_cksum(m0, ip->ip_hl << 2);
5347			m0->m_pkthdr.csum_flags &= ~CSUM_IP;
5348		}
5349		m_clrprotoflags(m0);	/* Avoid confusing lower layers. */
5350		error = (*ifp->if_output)(ifp, m0, sintosa(&dst), NULL);
5351		goto done;
5352	}
5353
5354	/* Balk when the DF bit is set or the interface doesn't support TSO. */
5355	if ((ip_off & IP_DF) || (m0->m_pkthdr.csum_flags & CSUM_TSO)) {
5356		error = EMSGSIZE;
5357		KMOD_IPSTAT_INC(ips_cantfrag);
5358		if (r->rt != PF_DUPTO) {
5359			icmp_error(m0, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG, 0,
5360			    ifp->if_mtu);
5361			goto done;
5362		} else
5363			goto bad;
5364	}
5365
5366	error = ip_fragment(ip, &m0, ifp->if_mtu, ifp->if_hwassist);
5367	if (error)
5368		goto bad;
5369
5370	for (; m0; m0 = m1) {
5371		m1 = m0->m_nextpkt;
5372		m0->m_nextpkt = NULL;
5373		if (error == 0) {
5374			m_clrprotoflags(m0);
5375			error = (*ifp->if_output)(ifp, m0, sintosa(&dst), NULL);
5376		} else
5377			m_freem(m0);
5378	}
5379
5380	if (error == 0)
5381		KMOD_IPSTAT_INC(ips_fragmented);
5382
5383done:
5384	if (r->rt != PF_DUPTO)
5385		*m = NULL;
5386	return;
5387
5388bad_locked:
5389	if (s)
5390		PF_STATE_UNLOCK(s);
5391bad:
5392	m_freem(m0);
5393	goto done;
5394}
5395#endif /* INET */
5396
5397#ifdef INET6
5398static void
5399pf_route6(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp,
5400    struct pf_state *s, struct pf_pdesc *pd)
5401{
5402	struct mbuf		*m0;
5403	struct sockaddr_in6	dst;
5404	struct ip6_hdr		*ip6;
5405	struct ifnet		*ifp = NULL;
5406	struct pf_addr		 naddr;
5407	struct pf_src_node	*sn = NULL;
5408
5409	KASSERT(m && *m && r && oifp, ("%s: invalid parameters", __func__));
5410	KASSERT(dir == PF_IN || dir == PF_OUT, ("%s: invalid direction",
5411	    __func__));
5412
5413	if ((pd->pf_mtag == NULL &&
5414	    ((pd->pf_mtag = pf_get_mtag(*m)) == NULL)) ||
5415	    pd->pf_mtag->routed++ > 3) {
5416		m0 = *m;
5417		*m = NULL;
5418		goto bad_locked;
5419	}
5420
5421	if (r->rt == PF_DUPTO) {
5422		if ((m0 = m_dup(*m, M_NOWAIT)) == NULL) {
5423			if (s)
5424				PF_STATE_UNLOCK(s);
5425			return;
5426		}
5427	} else {
5428		if ((r->rt == PF_REPLYTO) == (r->direction == dir)) {
5429			if (s)
5430				PF_STATE_UNLOCK(s);
5431			return;
5432		}
5433		m0 = *m;
5434	}
5435
5436	ip6 = mtod(m0, struct ip6_hdr *);
5437
5438	bzero(&dst, sizeof(dst));
5439	dst.sin6_family = AF_INET6;
5440	dst.sin6_len = sizeof(dst);
5441	dst.sin6_addr = ip6->ip6_dst;
5442
5443	/* Cheat. XXX why only in the v6 case??? */
5444	if (r->rt == PF_FASTROUTE) {
5445		if (s)
5446			PF_STATE_UNLOCK(s);
5447		m0->m_flags |= M_SKIP_FIREWALL;
5448		ip6_output(m0, NULL, NULL, 0, NULL, NULL, NULL);
5449		return;
5450	}
5451
5452	if (TAILQ_EMPTY(&r->rpool.list)) {
5453		DPFPRINTF(PF_DEBUG_URGENT,
5454		    ("%s: TAILQ_EMPTY(&r->rpool.list)\n", __func__));
5455		goto bad_locked;
5456	}
5457	if (s == NULL) {
5458		pf_map_addr(AF_INET6, r, (struct pf_addr *)&ip6->ip6_src,
5459		    &naddr, NULL, &sn);
5460		if (!PF_AZERO(&naddr, AF_INET6))
5461			PF_ACPY((struct pf_addr *)&dst.sin6_addr,
5462			    &naddr, AF_INET6);
5463		ifp = r->rpool.cur->kif ? r->rpool.cur->kif->pfik_ifp : NULL;
5464	} else {
5465		if (!PF_AZERO(&s->rt_addr, AF_INET6))
5466			PF_ACPY((struct pf_addr *)&dst.sin6_addr,
5467			    &s->rt_addr, AF_INET6);
5468		ifp = s->rt_kif ? s->rt_kif->pfik_ifp : NULL;
5469	}
5470
5471	if (s)
5472		PF_STATE_UNLOCK(s);
5473
5474	if (ifp == NULL)
5475		goto bad;
5476
5477	if (oifp != ifp) {
5478		if (pf_test6(PF_OUT, ifp, &m0, NULL) != PF_PASS)
5479			goto bad;
5480		else if (m0 == NULL)
5481			goto done;
5482		if (m0->m_len < sizeof(struct ip6_hdr)) {
5483			DPFPRINTF(PF_DEBUG_URGENT,
5484			    ("%s: m0->m_len < sizeof(struct ip6_hdr)\n",
5485			    __func__));
5486			goto bad;
5487		}
5488		ip6 = mtod(m0, struct ip6_hdr *);
5489	}
5490
5491	if (ifp->if_flags & IFF_LOOPBACK)
5492		m0->m_flags |= M_SKIP_FIREWALL;
5493
5494	/*
5495	 * If the packet is too large for the outgoing interface,
5496	 * send back an icmp6 error.
5497	 */
5498	if (IN6_IS_SCOPE_EMBED(&dst.sin6_addr))
5499		dst.sin6_addr.s6_addr16[1] = htons(ifp->if_index);
5500	if ((u_long)m0->m_pkthdr.len <= ifp->if_mtu)
5501		nd6_output(ifp, ifp, m0, &dst, NULL);
5502	else {
5503		in6_ifstat_inc(ifp, ifs6_in_toobig);
5504		if (r->rt != PF_DUPTO)
5505			icmp6_error(m0, ICMP6_PACKET_TOO_BIG, 0, ifp->if_mtu);
5506		else
5507			goto bad;
5508	}
5509
5510done:
5511	if (r->rt != PF_DUPTO)
5512		*m = NULL;
5513	return;
5514
5515bad_locked:
5516	if (s)
5517		PF_STATE_UNLOCK(s);
5518bad:
5519	m_freem(m0);
5520	goto done;
5521}
5522#endif /* INET6 */
5523
5524/*
5525 * FreeBSD supports cksum offloads for the following drivers:
5526 *  em(4), fxp(4), ixgb(4), lge(4), ndis(4), nge(4), re(4),
5527 *  ti(4), txp(4), xl(4)
5528 *
5529 * CSUM_DATA_VALID | CSUM_PSEUDO_HDR :
5530 *  network driver performed cksum including pseudo header; need to
5531 *  verify csum_data
5532 * CSUM_DATA_VALID :
5533 *  network driver performed cksum; additional pseudo header cksum
5534 *  computation with the partial csum_data is needed (i.e. lack of H/W
5535 *  support for the pseudo header, e.g. hme(4), sk(4), possibly gem(4))
5536 *
5537 * After validating the cksum of the packet, set both CSUM_DATA_VALID and
5538 * CSUM_PSEUDO_HDR in order to avoid recomputing the cksum in the upper
5539 * TCP/UDP layer.
5540 * Also, set csum_data to 0xffff to force cksum validation.
5541 */
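/*
 * Editor's sketch of the CSUM_DATA_VALID path below: csum_data holds the
 * hardware-computed one's complement sum of the segment; when
 * CSUM_PSEUDO_HDR is not also set, in_pseudo() folds the pseudo header
 * into it.  A result of 0xffff (i.e. sum == 0 after the xor) means the
 * checksum verified.
 */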
5542static int
5543pf_check_proto_cksum(struct mbuf *m, int off, int len, u_int8_t p, sa_family_t af)
5544{
5545	u_int16_t sum = 0;
5546	int hw_assist = 0;
5547	struct ip *ip;
5548
5549	if (off < sizeof(struct ip) || len < sizeof(struct udphdr))
5550		return (1);
5551	if (m->m_pkthdr.len < off + len)
5552		return (1);
5553
5554	switch (p) {
5555	case IPPROTO_TCP:
5556		if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
5557			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) {
5558				sum = m->m_pkthdr.csum_data;
5559			} else {
5560				ip = mtod(m, struct ip *);
5561				sum = in_pseudo(ip->ip_src.s_addr,
5562				    ip->ip_dst.s_addr, htonl((u_short)len +
5563				    m->m_pkthdr.csum_data + IPPROTO_TCP));
5564			}
5565			sum ^= 0xffff;
5566			++hw_assist;
5567		}
5568		break;
5569	case IPPROTO_UDP:
5570		if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
5571			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) {
5572				sum = m->m_pkthdr.csum_data;
5573			} else {
5574				ip = mtod(m, struct ip *);
5575				sum = in_pseudo(ip->ip_src.s_addr,
5576				    ip->ip_dst.s_addr, htonl((u_short)len +
5577				    m->m_pkthdr.csum_data + IPPROTO_UDP));
5578			}
5579			sum ^= 0xffff;
5580			++hw_assist;
5581		}
5582		break;
5583	case IPPROTO_ICMP:
5584#ifdef INET6
5585	case IPPROTO_ICMPV6:
5586#endif /* INET6 */
5587		break;
5588	default:
5589		return (1);
5590	}
5591
5592	if (!hw_assist) {
5593		switch (af) {
5594		case AF_INET:
5595			if (p == IPPROTO_ICMP) {
5596				if (m->m_len < off)
5597					return (1);
5598				m->m_data += off;
5599				m->m_len -= off;
5600				sum = in_cksum(m, len);
5601				m->m_data -= off;
5602				m->m_len += off;
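				/*
				 * Editor's note: in_cksum() starts at
				 * m_data, so the mbuf was temporarily
				 * advanced past the IP header above and
				 * restored afterwards.
				 */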
5603			} else {
5604				if (m->m_len < sizeof(struct ip))
5605					return (1);
5606				sum = in4_cksum(m, p, off, len);
5607			}
5608			break;
5609#ifdef INET6
5610		case AF_INET6:
5611			if (m->m_len < sizeof(struct ip6_hdr))
5612				return (1);
5613			sum = in6_cksum(m, p, off, len);
5614			break;
5615#endif /* INET6 */
5616		default:
5617			return (1);
5618		}
5619	}
5620	if (sum) {
5621		switch (p) {
5622		case IPPROTO_TCP:
5623		    {
5624			KMOD_TCPSTAT_INC(tcps_rcvbadsum);
5625			break;
5626		    }
5627		case IPPROTO_UDP:
5628		    {
5629			KMOD_UDPSTAT_INC(udps_badsum);
5630			break;
5631		    }
5632#ifdef INET
5633		case IPPROTO_ICMP:
5634		    {
5635			KMOD_ICMPSTAT_INC(icps_checksum);
5636			break;
5637		    }
5638#endif
5639#ifdef INET6
5640		case IPPROTO_ICMPV6:
5641		    {
5642			KMOD_ICMP6STAT_INC(icp6s_checksum);
5643			break;
5644		    }
5645#endif /* INET6 */
5646		}
5647		return (1);
5648	} else {
5649		if (p == IPPROTO_TCP || p == IPPROTO_UDP) {
5650			m->m_pkthdr.csum_flags |=
5651			    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
5652			m->m_pkthdr.csum_data = 0xffff;
5653		}
5654	}
5655	return (0);
5656}
5657
5658
5659#ifdef INET
int
pf_test(int dir, struct ifnet *ifp, struct mbuf **m0, struct inpcb *inp)
{
	struct pfi_kif		*kif;
	u_short			 action, reason = 0, log = 0;
	struct mbuf		*m = *m0;
	struct ip		*h = NULL;
	struct m_tag		*ipfwtag;
	struct pf_rule		*a = NULL, *r = &V_pf_default_rule, *tr, *nr;
	struct pf_state		*s = NULL;
	struct pf_ruleset	*ruleset = NULL;
	struct pf_pdesc		 pd;
	int			 off, dirndx, pqid = 0;

	M_ASSERTPKTHDR(m);

	if (!V_pf_status.running)
		return (PF_PASS);

	memset(&pd, 0, sizeof(pd));

	kif = (struct pfi_kif *)ifp->if_pf_kif;

	if (kif == NULL) {
		DPFPRINTF(PF_DEBUG_URGENT,
		    ("pf_test: kif == NULL, if_xname %s\n", ifp->if_xname));
		return (PF_DROP);
	}
	if (kif->pfik_flags & PFI_IFLAG_SKIP)
		return (PF_PASS);

	if (m->m_flags & M_SKIP_FIREWALL)
		return (PF_PASS);

	pd.pf_mtag = pf_find_mtag(m);

	PF_RULES_RLOCK();

	if (ip_divert_ptr != NULL &&
	    ((ipfwtag = m_tag_locate(m, MTAG_IPFW_RULE, 0, NULL)) != NULL)) {
		struct ipfw_rule_ref *rr = (struct ipfw_rule_ref *)(ipfwtag+1);
		if (rr->info & IPFW_IS_DIVERT && rr->rulenum == 0) {
			if (pd.pf_mtag == NULL &&
			    ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) {
				action = PF_DROP;
				goto done;
			}
			pd.pf_mtag->flags |= PF_PACKET_LOOPED;
			m_tag_delete(m, ipfwtag);
		}
		if (pd.pf_mtag && pd.pf_mtag->flags & PF_FASTFWD_OURS_PRESENT) {
			m->m_flags |= M_FASTFWD_OURS;
			pd.pf_mtag->flags &= ~PF_FASTFWD_OURS_PRESENT;
		}
	} else if (pf_normalize_ip(m0, dir, kif, &reason, &pd) != PF_PASS) {
		/* We do IP header normalization and packet reassembly here */
		action = PF_DROP;
		goto done;
	}
	m = *m0;	/* pf_normalize messes with m0 */
	h = mtod(m, struct ip *);

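	/*
	 * ip_hl counts 32-bit words, so the smallest legal value is 5
	 * (a 20-byte header); e.g. ip_hl == 6 means 4 bytes of IP options.
	 */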
	off = h->ip_hl << 2;
	if (off < (int)sizeof(struct ip)) {
		action = PF_DROP;
		REASON_SET(&reason, PFRES_SHORT);
		log = 1;
		goto done;
	}

	pd.src = (struct pf_addr *)&h->ip_src;
	pd.dst = (struct pf_addr *)&h->ip_dst;
	pd.sport = pd.dport = NULL;
	pd.ip_sum = &h->ip_sum;
	pd.proto_sum = NULL;
	pd.proto = h->ip_p;
	pd.dir = dir;
	pd.sidx = (dir == PF_IN) ? 0 : 1;
	pd.didx = (dir == PF_IN) ? 1 : 0;
	pd.af = AF_INET;
	pd.tos = h->ip_tos;
	pd.tot_len = ntohs(h->ip_len);

	/* handle fragments that didn't get reassembled by normalization */
	if (h->ip_off & htons(IP_MF | IP_OFFMASK)) {
		action = pf_test_fragment(&r, dir, kif, m, h,
		    &pd, &a, &ruleset);
		goto done;
	}
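	/*
	 * Fragments that survive normalization (e.g. when no scrub rule
	 * forced reassembly) cannot be tracked statefully; pf_test_fragment()
	 * matches them against the ruleset only, and they never reach the
	 * per-protocol state handling below.
	 */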

	switch (h->ip_p) {

	case IPPROTO_TCP: {
		struct tcphdr	th;

		pd.hdr.tcp = &th;
		if (!pf_pull_hdr(m, off, &th, sizeof(th),
		    &action, &reason, AF_INET)) {
			log = action != PF_PASS;
			goto done;
		}
		pd.p_len = pd.tot_len - off - (th.th_off << 2);
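		/*
		 * A pure ACK (TH_ACK set and no payload) is flagged so the
		 * ALTQ block below can assign it to the rule's priority
		 * queue (r->pqid) instead of the regular queue.
		 */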
		if ((th.th_flags & TH_ACK) && pd.p_len == 0)
			pqid = 1;
		action = pf_normalize_tcp(dir, kif, m, 0, off, h, &pd);
		if (action == PF_DROP)
			goto done;
		action = pf_test_state_tcp(&s, dir, kif, m, off, h, &pd,
		    &reason);
		if (action == PF_PASS) {
			if (pfsync_update_state_ptr != NULL)
				pfsync_update_state_ptr(s);
			r = s->rule.ptr;
			a = s->anchor.ptr;
			log = s->log;
		} else if (s == NULL)
			action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
			    &a, &ruleset, inp);
		break;
	}

	case IPPROTO_UDP: {
		struct udphdr	uh;

		pd.hdr.udp = &uh;
		if (!pf_pull_hdr(m, off, &uh, sizeof(uh),
		    &action, &reason, AF_INET)) {
			log = action != PF_PASS;
			goto done;
		}
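		/*
		 * Sanity-check the UDP length: uh_ulen includes the 8-byte
		 * UDP header itself, so anything smaller than
		 * sizeof(struct udphdr) is corrupt, and anything larger
		 * than the data remaining in the packet would overrun it.
		 */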
		if (uh.uh_dport == 0 ||
		    ntohs(uh.uh_ulen) > m->m_pkthdr.len - off ||
		    ntohs(uh.uh_ulen) < sizeof(struct udphdr)) {
			action = PF_DROP;
			REASON_SET(&reason, PFRES_SHORT);
			goto done;
		}
		action = pf_test_state_udp(&s, dir, kif, m, off, h, &pd);
		if (action == PF_PASS) {
			if (pfsync_update_state_ptr != NULL)
				pfsync_update_state_ptr(s);
			r = s->rule.ptr;
			a = s->anchor.ptr;
			log = s->log;
		} else if (s == NULL)
			action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
			    &a, &ruleset, inp);
		break;
	}

	case IPPROTO_ICMP: {
		struct icmp	ih;

		pd.hdr.icmp = &ih;
		if (!pf_pull_hdr(m, off, &ih, ICMP_MINLEN,
		    &action, &reason, AF_INET)) {
			log = action != PF_PASS;
			goto done;
		}
		action = pf_test_state_icmp(&s, dir, kif, m, off, h, &pd,
		    &reason);
		if (action == PF_PASS) {
			if (pfsync_update_state_ptr != NULL)
				pfsync_update_state_ptr(s);
			r = s->rule.ptr;
			a = s->anchor.ptr;
			log = s->log;
		} else if (s == NULL)
			action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
			    &a, &ruleset, inp);
		break;
	}

#ifdef INET6
	case IPPROTO_ICMPV6: {
		action = PF_DROP;
		DPFPRINTF(PF_DEBUG_MISC,
		    ("pf: dropping IPv4 packet with ICMPv6 payload\n"));
		goto done;
	}
#endif

	default:
		action = pf_test_state_other(&s, dir, kif, m, &pd);
		if (action == PF_PASS) {
			if (pfsync_update_state_ptr != NULL)
				pfsync_update_state_ptr(s);
			r = s->rule.ptr;
			a = s->anchor.ptr;
			log = s->log;
		} else if (s == NULL)
			action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
			    &a, &ruleset, inp);
		break;
	}

done:
	PF_RULES_RUNLOCK();
	if (action == PF_PASS && h->ip_hl > 5 &&
	    !((s && s->state_flags & PFSTATE_ALLOWOPTS) || r->allow_opts)) {
		action = PF_DROP;
		REASON_SET(&reason, PFRES_IPOPTIONS);
		log = 1;
		DPFPRINTF(PF_DEBUG_MISC,
		    ("pf: dropping packet with ip options\n"));
	}

	if (s && s->tag > 0 && pf_tag_packet(m, &pd, s->tag)) {
		action = PF_DROP;
		REASON_SET(&reason, PFRES_MEMORY);
	}
	if (r->rtableid >= 0)
		M_SETFIB(m, r->rtableid);

#ifdef ALTQ
	if (action == PF_PASS && r->qid) {
		if (pd.pf_mtag == NULL &&
		    ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) {
			action = PF_DROP;
			REASON_SET(&reason, PFRES_MEMORY);
		} else {
			if (pqid || (pd.tos & IPTOS_LOWDELAY))
				pd.pf_mtag->qid = r->pqid;
			else
				pd.pf_mtag->qid = r->qid;
			/* add hints for ecn */
			pd.pf_mtag->hdr = h;
		}
	}
#endif /* ALTQ */

	/*
	 * connections redirected to loopback should not match sockets
	 * bound specifically to loopback due to security implications,
	 * see tcp_input() and in_pcblookup_listen().
	 */
	if (dir == PF_IN && action == PF_PASS && (pd.proto == IPPROTO_TCP ||
	    pd.proto == IPPROTO_UDP) && s != NULL && s->nat_rule.ptr != NULL &&
	    (s->nat_rule.ptr->action == PF_RDR ||
	    s->nat_rule.ptr->action == PF_BINAT) &&
	    (ntohl(pd.dst->v4.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)
		m->m_flags |= M_SKIP_FIREWALL;
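	/*
	 * The address test above matches all of 127.0.0.0/8 (IN_LOOPBACKNET
	 * is 127, IN_CLASSA_NSHIFT is 24); e.g. traffic redirected by a
	 * "rdr ... -> 127.0.0.1" style rule is excused from further
	 * firewall processing here.
	 */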

	if (action == PF_PASS && r->divert.port && ip_divert_ptr != NULL &&
	    !PACKET_LOOPED(&pd)) {

		ipfwtag = m_tag_alloc(MTAG_IPFW_RULE, 0,
		    sizeof(struct ipfw_rule_ref), M_NOWAIT | M_ZERO);
		if (ipfwtag != NULL) {
			((struct ipfw_rule_ref *)(ipfwtag+1))->info =
			    ntohs(r->divert.port);
			((struct ipfw_rule_ref *)(ipfwtag+1))->rulenum = dir;

			if (s)
				PF_STATE_UNLOCK(s);

			m_tag_prepend(m, ipfwtag);
			if (m->m_flags & M_FASTFWD_OURS) {
				if (pd.pf_mtag == NULL &&
				    ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) {
					action = PF_DROP;
					REASON_SET(&reason, PFRES_MEMORY);
					log = 1;
					DPFPRINTF(PF_DEBUG_MISC,
					    ("pf: failed to allocate tag\n"));
				} else {
					pd.pf_mtag->flags |=
					    PF_FASTFWD_OURS_PRESENT;
					m->m_flags &= ~M_FASTFWD_OURS;
				}
			}
			ip_divert_ptr(*m0, dir == PF_IN ? DIR_IN : DIR_OUT);
			*m0 = NULL;

			return (action);
		} else {
			/* XXX: ipfw has the same behaviour! */
			action = PF_DROP;
			REASON_SET(&reason, PFRES_MEMORY);
			log = 1;
			DPFPRINTF(PF_DEBUG_MISC,
			    ("pf: failed to allocate divert tag\n"));
		}
	}
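	/*
	 * On a divert match the packet is handed to divert(4) carrying an
	 * ipfw rule reference tag; "info" holds the divert port and
	 * "rulenum" the direction.  Re-injected packets are recognized by
	 * the MTAG_IPFW_RULE handling at the top of this function and
	 * marked PF_PACKET_LOOPED, so they are not diverted a second time.
	 */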

	if (log) {
		struct pf_rule *lr;

		if (s != NULL && s->nat_rule.ptr != NULL &&
		    s->nat_rule.ptr->log & PF_LOG_ALL)
			lr = s->nat_rule.ptr;
		else
			lr = r;
		PFLOG_PACKET(kif, m, AF_INET, dir, reason, lr, a, ruleset, &pd,
		    (s == NULL));
	}

	kif->pfik_bytes[0][dir == PF_OUT][action != PF_PASS] += pd.tot_len;
	kif->pfik_packets[0][dir == PF_OUT][action != PF_PASS]++;

	if (action == PF_PASS || r->action == PF_DROP) {
		dirndx = (dir == PF_OUT);
		r->packets[dirndx]++;
		r->bytes[dirndx] += pd.tot_len;
		if (a != NULL) {
			a->packets[dirndx]++;
			a->bytes[dirndx] += pd.tot_len;
		}
		if (s != NULL) {
			if (s->nat_rule.ptr != NULL) {
				s->nat_rule.ptr->packets[dirndx]++;
				s->nat_rule.ptr->bytes[dirndx] += pd.tot_len;
			}
			if (s->src_node != NULL) {
				s->src_node->packets[dirndx]++;
				s->src_node->bytes[dirndx] += pd.tot_len;
			}
			if (s->nat_src_node != NULL) {
				s->nat_src_node->packets[dirndx]++;
				s->nat_src_node->bytes[dirndx] += pd.tot_len;
			}
			dirndx = (dir == s->direction) ? 0 : 1;
			s->packets[dirndx]++;
			s->bytes[dirndx] += pd.tot_len;
		}
		tr = r;
		nr = (s != NULL) ? s->nat_rule.ptr : pd.nat_rule;
		if (nr != NULL && r == &V_pf_default_rule)
			tr = nr;
		if (tr->src.addr.type == PF_ADDR_TABLE)
			pfr_update_stats(tr->src.addr.p.tbl,
			    (s == NULL) ? pd.src :
			    &s->key[(s->direction == PF_IN)]->
				addr[(s->direction == PF_OUT)],
			    pd.af, pd.tot_len, dir == PF_OUT,
			    r->action == PF_PASS, tr->src.neg);
		if (tr->dst.addr.type == PF_ADDR_TABLE)
			pfr_update_stats(tr->dst.addr.p.tbl,
			    (s == NULL) ? pd.dst :
			    &s->key[(s->direction == PF_IN)]->
				addr[(s->direction == PF_IN)],
			    pd.af, pd.tot_len, dir == PF_OUT,
			    r->action == PF_PASS, tr->dst.neg);
	}

	switch (action) {
	case PF_SYNPROXY_DROP:
		m_freem(*m0);
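		/* FALLTHROUGH */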
	case PF_DEFER:
		*m0 = NULL;
		action = PF_PASS;
		break;
	default:
		/* pf_route() returns unlocked. */
		if (r->rt) {
			pf_route(m0, r, dir, kif->pfik_ifp, s, &pd);
			return (action);
		}
		break;
	}
	if (s)
		PF_STATE_UNLOCK(s);

	return (action);
}
#endif /* INET */

#ifdef INET6
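/*
 * pf_test6() is the IPv6 counterpart of pf_test().  The overall flow is
 * the same; the main difference is that the IPv6 extension header chain
 * must be walked to find the transport protocol before the per-protocol
 * tests can run.
 */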
int
pf_test6(int dir, struct ifnet *ifp, struct mbuf **m0, struct inpcb *inp)
{
	struct pfi_kif		*kif;
	u_short			 action, reason = 0, log = 0;
	struct mbuf		*m = *m0, *n = NULL;
	struct ip6_hdr		*h = NULL;
	struct pf_rule		*a = NULL, *r = &V_pf_default_rule, *tr, *nr;
	struct pf_state		*s = NULL;
	struct pf_ruleset	*ruleset = NULL;
	struct pf_pdesc		 pd;
	int			 off, terminal = 0, dirndx, rh_cnt = 0;

	M_ASSERTPKTHDR(m);

	if (!V_pf_status.running)
		return (PF_PASS);

	memset(&pd, 0, sizeof(pd));
	pd.pf_mtag = pf_find_mtag(m);

	if (pd.pf_mtag && pd.pf_mtag->flags & PF_TAG_GENERATED)
		return (PF_PASS);

	kif = (struct pfi_kif *)ifp->if_pf_kif;
	if (kif == NULL) {
		DPFPRINTF(PF_DEBUG_URGENT,
		    ("pf_test6: kif == NULL, if_xname %s\n", ifp->if_xname));
		return (PF_DROP);
	}
	if (kif->pfik_flags & PFI_IFLAG_SKIP)
		return (PF_PASS);

	PF_RULES_RLOCK();

	/* We do IP header normalization and packet reassembly here */
	if (pf_normalize_ip6(m0, dir, kif, &reason, &pd) != PF_PASS) {
		action = PF_DROP;
		goto done;
	}
	m = *m0;	/* pf_normalize messes with m0 */
	h = mtod(m, struct ip6_hdr *);

#if 1
	/*
	 * we do not support jumbogram yet.  if we keep going, zero ip6_plen
	 * will do something bad, so drop the packet for now.
	 */
	if (htons(h->ip6_plen) == 0) {
		action = PF_DROP;
		REASON_SET(&reason, PFRES_NORM);	/*XXX*/
		goto done;
	}
#endif

	pd.src = (struct pf_addr *)&h->ip6_src;
	pd.dst = (struct pf_addr *)&h->ip6_dst;
	pd.sport = pd.dport = NULL;
	pd.ip_sum = NULL;
	pd.proto_sum = NULL;
	pd.dir = dir;
	pd.sidx = (dir == PF_IN) ? 0 : 1;
	pd.didx = (dir == PF_IN) ? 1 : 0;
	pd.af = AF_INET6;
	pd.tos = 0;
	pd.tot_len = ntohs(h->ip6_plen) + sizeof(struct ip6_hdr);

	off = ((caddr_t)h - m->m_data) + sizeof(struct ip6_hdr);
	pd.proto = h->ip6_nxt;
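	/*
	 * Walk the extension header chain until a transport protocol is
	 * found.  Per RFC 2460/4302 the ip6e_len field counts 8-byte units
	 * excluding the first, except for AH, where it counts 4-byte units
	 * minus two; e.g. a hop-by-hop header with ip6e_len == 0 is 8 bytes
	 * long, while an AH with ip6e_len == 4 is 24 bytes.
	 */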
	do {
		switch (pd.proto) {
		case IPPROTO_FRAGMENT:
			action = pf_test_fragment(&r, dir, kif, m, h,
			    &pd, &a, &ruleset);
			if (action == PF_DROP)
				REASON_SET(&reason, PFRES_FRAG);
			goto done;
		case IPPROTO_ROUTING: {
			struct ip6_rthdr rthdr;

			if (rh_cnt++) {
				DPFPRINTF(PF_DEBUG_MISC,
				    ("pf: IPv6 more than one rthdr\n"));
				action = PF_DROP;
				REASON_SET(&reason, PFRES_IPOPTIONS);
				log = 1;
				goto done;
			}
			if (!pf_pull_hdr(m, off, &rthdr, sizeof(rthdr), NULL,
			    &reason, pd.af)) {
				DPFPRINTF(PF_DEBUG_MISC,
				    ("pf: IPv6 short rthdr\n"));
				action = PF_DROP;
				REASON_SET(&reason, PFRES_SHORT);
				log = 1;
				goto done;
			}
			if (rthdr.ip6r_type == IPV6_RTHDR_TYPE_0) {
				DPFPRINTF(PF_DEBUG_MISC,
				    ("pf: IPv6 rthdr0\n"));
				action = PF_DROP;
				REASON_SET(&reason, PFRES_IPOPTIONS);
				log = 1;
				goto done;
			}
			/* FALLTHROUGH */
		}
		case IPPROTO_AH:
		case IPPROTO_HOPOPTS:
		case IPPROTO_DSTOPTS: {
			/* get next header and header length */
			struct ip6_ext	opt6;

			if (!pf_pull_hdr(m, off, &opt6, sizeof(opt6),
			    NULL, &reason, pd.af)) {
				DPFPRINTF(PF_DEBUG_MISC,
				    ("pf: IPv6 short opt\n"));
				action = PF_DROP;
				log = 1;
				goto done;
			}
			if (pd.proto == IPPROTO_AH)
				off += (opt6.ip6e_len + 2) * 4;
			else
				off += (opt6.ip6e_len + 1) * 8;
			pd.proto = opt6.ip6e_nxt;
			/* go to the next header */
			break;
		}
		default:
			terminal++;
			break;
		}
	} while (!terminal);

	/* if there's no routing header, use unmodified mbuf for checksumming */
	if (!n)
		n = m;
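	/*
	 * Note: nothing in this function ever points n at a separate copy,
	 * so at "done" n is either NULL (early exit) or equal to m, and the
	 * m_freem(n) there is effectively a no-op (m_freem() tolerates
	 * NULL); the cleanup is retained as a defensive measure.
	 */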

	switch (pd.proto) {

	case IPPROTO_TCP: {
		struct tcphdr	th;

		pd.hdr.tcp = &th;
		if (!pf_pull_hdr(m, off, &th, sizeof(th),
		    &action, &reason, AF_INET6)) {
			log = action != PF_PASS;
			goto done;
		}
		pd.p_len = pd.tot_len - off - (th.th_off << 2);
		action = pf_normalize_tcp(dir, kif, m, 0, off, h, &pd);
		if (action == PF_DROP)
			goto done;
		action = pf_test_state_tcp(&s, dir, kif, m, off, h, &pd,
		    &reason);
		if (action == PF_PASS) {
			if (pfsync_update_state_ptr != NULL)
				pfsync_update_state_ptr(s);
			r = s->rule.ptr;
			a = s->anchor.ptr;
			log = s->log;
		} else if (s == NULL)
			action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
			    &a, &ruleset, inp);
		break;
	}

	case IPPROTO_UDP: {
		struct udphdr	uh;

		pd.hdr.udp = &uh;
		if (!pf_pull_hdr(m, off, &uh, sizeof(uh),
		    &action, &reason, AF_INET6)) {
			log = action != PF_PASS;
			goto done;
		}
		if (uh.uh_dport == 0 ||
		    ntohs(uh.uh_ulen) > m->m_pkthdr.len - off ||
		    ntohs(uh.uh_ulen) < sizeof(struct udphdr)) {
			action = PF_DROP;
			REASON_SET(&reason, PFRES_SHORT);
			goto done;
		}
		action = pf_test_state_udp(&s, dir, kif, m, off, h, &pd);
		if (action == PF_PASS) {
			if (pfsync_update_state_ptr != NULL)
				pfsync_update_state_ptr(s);
			r = s->rule.ptr;
			a = s->anchor.ptr;
			log = s->log;
		} else if (s == NULL)
			action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
			    &a, &ruleset, inp);
		break;
	}

	case IPPROTO_ICMP: {
		action = PF_DROP;
		DPFPRINTF(PF_DEBUG_MISC,
		    ("pf: dropping IPv6 packet with ICMPv4 payload\n"));
		goto done;
	}

	case IPPROTO_ICMPV6: {
		struct icmp6_hdr	ih;

		pd.hdr.icmp6 = &ih;
		if (!pf_pull_hdr(m, off, &ih, sizeof(ih),
		    &action, &reason, AF_INET6)) {
			log = action != PF_PASS;
			goto done;
		}
		action = pf_test_state_icmp(&s, dir, kif,
		    m, off, h, &pd, &reason);
		if (action == PF_PASS) {
			if (pfsync_update_state_ptr != NULL)
				pfsync_update_state_ptr(s);
			r = s->rule.ptr;
			a = s->anchor.ptr;
			log = s->log;
		} else if (s == NULL)
			action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
			    &a, &ruleset, inp);
		break;
	}

	default:
		action = pf_test_state_other(&s, dir, kif, m, &pd);
		if (action == PF_PASS) {
			if (pfsync_update_state_ptr != NULL)
				pfsync_update_state_ptr(s);
			r = s->rule.ptr;
			a = s->anchor.ptr;
			log = s->log;
		} else if (s == NULL)
			action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
			    &a, &ruleset, inp);
		break;
	}

done:
	PF_RULES_RUNLOCK();
	if (n != m) {
		m_freem(n);
		n = NULL;
	}

	/* handle dangerous IPv6 extension headers. */
	if (action == PF_PASS && rh_cnt &&
	    !((s && s->state_flags & PFSTATE_ALLOWOPTS) || r->allow_opts)) {
		action = PF_DROP;
		REASON_SET(&reason, PFRES_IPOPTIONS);
		log = 1;
		DPFPRINTF(PF_DEBUG_MISC,
		    ("pf: dropping packet with dangerous v6 headers\n"));
	}

	if (s && s->tag > 0 && pf_tag_packet(m, &pd, s->tag)) {
		action = PF_DROP;
		REASON_SET(&reason, PFRES_MEMORY);
	}
	if (r->rtableid >= 0)
		M_SETFIB(m, r->rtableid);

#ifdef ALTQ
	if (action == PF_PASS && r->qid) {
		if (pd.pf_mtag == NULL &&
		    ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) {
			action = PF_DROP;
			REASON_SET(&reason, PFRES_MEMORY);
		} else {
			if (pd.tos & IPTOS_LOWDELAY)
				pd.pf_mtag->qid = r->pqid;
			else
				pd.pf_mtag->qid = r->qid;
			/* add hints for ecn */
			pd.pf_mtag->hdr = h;
		}
	}
#endif /* ALTQ */

	if (dir == PF_IN && action == PF_PASS && (pd.proto == IPPROTO_TCP ||
	    pd.proto == IPPROTO_UDP) && s != NULL && s->nat_rule.ptr != NULL &&
	    (s->nat_rule.ptr->action == PF_RDR ||
	    s->nat_rule.ptr->action == PF_BINAT) &&
	    IN6_IS_ADDR_LOOPBACK(&pd.dst->v6))
		m->m_flags |= M_SKIP_FIREWALL;

	/* XXX: Anybody working on it?! */
	if (r->divert.port)
		printf("pf: divert(9) is not supported for IPv6\n");

	if (log) {
		struct pf_rule *lr;

		if (s != NULL && s->nat_rule.ptr != NULL &&
		    s->nat_rule.ptr->log & PF_LOG_ALL)
			lr = s->nat_rule.ptr;
		else
			lr = r;
		PFLOG_PACKET(kif, m, AF_INET6, dir, reason, lr, a, ruleset,
		    &pd, (s == NULL));
	}

	kif->pfik_bytes[1][dir == PF_OUT][action != PF_PASS] += pd.tot_len;
	kif->pfik_packets[1][dir == PF_OUT][action != PF_PASS]++;

	if (action == PF_PASS || r->action == PF_DROP) {
		dirndx = (dir == PF_OUT);
		r->packets[dirndx]++;
		r->bytes[dirndx] += pd.tot_len;
		if (a != NULL) {
			a->packets[dirndx]++;
			a->bytes[dirndx] += pd.tot_len;
		}
		if (s != NULL) {
			if (s->nat_rule.ptr != NULL) {
				s->nat_rule.ptr->packets[dirndx]++;
				s->nat_rule.ptr->bytes[dirndx] += pd.tot_len;
			}
			if (s->src_node != NULL) {
				s->src_node->packets[dirndx]++;
				s->src_node->bytes[dirndx] += pd.tot_len;
			}
			if (s->nat_src_node != NULL) {
				s->nat_src_node->packets[dirndx]++;
				s->nat_src_node->bytes[dirndx] += pd.tot_len;
			}
			dirndx = (dir == s->direction) ? 0 : 1;
			s->packets[dirndx]++;
			s->bytes[dirndx] += pd.tot_len;
		}
		tr = r;
		nr = (s != NULL) ? s->nat_rule.ptr : pd.nat_rule;
		if (nr != NULL && r == &V_pf_default_rule)
			tr = nr;
		if (tr->src.addr.type == PF_ADDR_TABLE)
			pfr_update_stats(tr->src.addr.p.tbl,
			    (s == NULL) ? pd.src :
			    &s->key[(s->direction == PF_IN)]->addr[0],
			    pd.af, pd.tot_len, dir == PF_OUT,
			    r->action == PF_PASS, tr->src.neg);
		if (tr->dst.addr.type == PF_ADDR_TABLE)
			pfr_update_stats(tr->dst.addr.p.tbl,
			    (s == NULL) ? pd.dst :
			    &s->key[(s->direction == PF_IN)]->addr[1],
			    pd.af, pd.tot_len, dir == PF_OUT,
			    r->action == PF_PASS, tr->dst.neg);
	}

	switch (action) {
	case PF_SYNPROXY_DROP:
		m_freem(*m0);
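		/* FALLTHROUGH */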
	case PF_DEFER:
		*m0 = NULL;
		action = PF_PASS;
		break;
	default:
		/* pf_route6() returns unlocked. */
		if (r->rt) {
			pf_route6(m0, r, dir, kif->pfik_ifp, s, &pd);
			return (action);
		}
		break;
	}

	if (s)
		PF_STATE_UNLOCK(s);

	return (action);
}
#endif /* INET6 */
