/*-
 * Copyright (c) 2001 Daniel Hartmeier
 * Copyright (c) 2002 - 2008 Henning Brauer
 * Copyright (c) 2012 Gleb Smirnoff <glebius@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *    - Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    - Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following
 *      disclaimer in the documentation and/or other materials provided
 *      with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Effort sponsored in part by the Defense Advanced Research Projects
 * Agency (DARPA) and Air Force Research Laboratory, Air Force
 * Materiel Command, USAF, under agreement number F30602-01-2-0537.
 *
 *	$OpenBSD: pf.c,v 1.634 2009/02/27 12:37:45 henning Exp $
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/11/sys/netpfil/pf/pf.c 332513 2018-04-15 15:22:28Z kp $");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_bpf.h"
#include "opt_pf.h"

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/endian.h>
#include <sys/hash.h>
#include <sys/interrupt.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/limits.h>
#include <sys/mbuf.h>
#include <sys/md5.h>
#include <sys/random.h>
#include <sys/refcount.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/taskqueue.h>
#include <sys/ucred.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/if_types.h>
#include <net/if_vlan_var.h>
#include <net/route.h>
#include <net/radix_mpath.h>
#include <net/vnet.h>

#include <net/pfil.h>
#include <net/pfvar.h>
#include <net/if_pflog.h>
#include <net/if_pfsync.h>

#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/in_fib.h>
#include <netinet/ip.h>
#include <netinet/ip_fw.h>
#include <netinet/ip_icmp.h>
#include <netinet/icmp_var.h>
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/udp.h>
#include <netinet/udp_var.h>

#include <netpfil/ipfw/ip_fw_private.h> /* XXX: only for DIR_IN/DIR_OUT */

#ifdef INET6
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet6/nd6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/in6_fib.h>
#include <netinet6/scope6_var.h>
#endif /* INET6 */

#include <machine/in_cksum.h>
#include <security/mac/mac_framework.h>

#define	DPFPRINTF(n, x)	if (V_pf_status.debug >= (n)) printf x

/*
 * Global variables
 */

/* state tables */
VNET_DEFINE(struct pf_altqqueue,	 pf_altqs[2]);
VNET_DEFINE(struct pf_palist,		 pf_pabuf);
VNET_DEFINE(struct pf_altqqueue *,	 pf_altqs_active);
VNET_DEFINE(struct pf_altqqueue *,	 pf_altqs_inactive);
VNET_DEFINE(struct pf_kstatus,		 pf_status);

VNET_DEFINE(u_int32_t,			 ticket_altqs_active);
VNET_DEFINE(u_int32_t,			 ticket_altqs_inactive);
VNET_DEFINE(int,			 altqs_inactive_open);
VNET_DEFINE(u_int32_t,			 ticket_pabuf);

VNET_DEFINE(MD5_CTX,			 pf_tcp_secret_ctx);
#define	V_pf_tcp_secret_ctx		 VNET(pf_tcp_secret_ctx)
VNET_DEFINE(u_char,			 pf_tcp_secret[16]);
#define	V_pf_tcp_secret			 VNET(pf_tcp_secret)
VNET_DEFINE(int,			 pf_tcp_secret_init);
#define	V_pf_tcp_secret_init		 VNET(pf_tcp_secret_init)
VNET_DEFINE(int,			 pf_tcp_iss_off);
#define	V_pf_tcp_iss_off		 VNET(pf_tcp_iss_off)
VNET_DECLARE(int,			 pf_vnet_active);
#define	V_pf_vnet_active		 VNET(pf_vnet_active)

/*
 * Queue for pf_intr() sends.
 */
static MALLOC_DEFINE(M_PFTEMP, "pf_temp", "pf(4) temporary allocations");
struct pf_send_entry {
	STAILQ_ENTRY(pf_send_entry)	pfse_next;
	struct mbuf			*pfse_m;
	enum {
		PFSE_IP,
		PFSE_IP6,
		PFSE_ICMP,
		PFSE_ICMP6,
	}				pfse_type;
	struct {
		int		type;
		int		code;
		int		mtu;
	} icmpopts;
};

STAILQ_HEAD(pf_send_head, pf_send_entry);
static VNET_DEFINE(struct pf_send_head, pf_sendqueue);
#define	V_pf_sendqueue	VNET(pf_sendqueue)

static struct mtx pf_sendqueue_mtx;
MTX_SYSINIT(pf_sendqueue_mtx, &pf_sendqueue_mtx, "pf send queue", MTX_DEF);
#define	PF_SENDQ_LOCK()		mtx_lock(&pf_sendqueue_mtx)
#define	PF_SENDQ_UNLOCK()	mtx_unlock(&pf_sendqueue_mtx)

/*
 * Queue for pf_overload_task() tasks.
 */
struct pf_overload_entry {
	SLIST_ENTRY(pf_overload_entry)	next;
	struct pf_addr  		addr;
	sa_family_t			af;
	uint8_t				dir;
	struct pf_rule  		*rule;
};

SLIST_HEAD(pf_overload_head, pf_overload_entry);
static VNET_DEFINE(struct pf_overload_head, pf_overloadqueue);
#define V_pf_overloadqueue	VNET(pf_overloadqueue)
static VNET_DEFINE(struct task, pf_overloadtask);
#define	V_pf_overloadtask	VNET(pf_overloadtask)

static struct mtx pf_overloadqueue_mtx;
MTX_SYSINIT(pf_overloadqueue_mtx, &pf_overloadqueue_mtx,
    "pf overload/flush queue", MTX_DEF);
#define	PF_OVERLOADQ_LOCK()	mtx_lock(&pf_overloadqueue_mtx)
#define	PF_OVERLOADQ_UNLOCK()	mtx_unlock(&pf_overloadqueue_mtx)

VNET_DEFINE(struct pf_rulequeue, pf_unlinked_rules);
struct mtx pf_unlnkdrules_mtx;
MTX_SYSINIT(pf_unlnkdrules_mtx, &pf_unlnkdrules_mtx, "pf unlinked rules",
    MTX_DEF);

static VNET_DEFINE(uma_zone_t,	pf_sources_z);
#define	V_pf_sources_z	VNET(pf_sources_z)
uma_zone_t		pf_mtag_z;
VNET_DEFINE(uma_zone_t,	 pf_state_z);
VNET_DEFINE(uma_zone_t,	 pf_state_key_z);

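/*
 * State IDs are 64-bit values: the top PFID_CPUBITS bits encode the CPU
 * that allocated the ID, and the remaining bits come from that CPU's
 * private counter in pf_stateid[], so IDs stay unique without any
 * interlocking between CPUs.
 */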
VNET_DEFINE(uint64_t, pf_stateid[MAXCPU]);
#define	PFID_CPUBITS	8
#define	PFID_CPUSHIFT	(sizeof(uint64_t) * NBBY - PFID_CPUBITS)
#define	PFID_CPUMASK	((uint64_t)((1 << PFID_CPUBITS) - 1) << PFID_CPUSHIFT)
#define	PFID_MAXID	(~PFID_CPUMASK)
CTASSERT((1 << PFID_CPUBITS) >= MAXCPU);

static void		 pf_src_tree_remove_state(struct pf_state *);
static void		 pf_init_threshold(struct pf_threshold *, u_int32_t,
			    u_int32_t);
static void		 pf_add_threshold(struct pf_threshold *);
static int		 pf_check_threshold(struct pf_threshold *);

static void		 pf_change_ap(struct mbuf *, struct pf_addr *, u_int16_t *,
			    u_int16_t *, u_int16_t *, struct pf_addr *,
			    u_int16_t, u_int8_t, sa_family_t);
static int		 pf_modulate_sack(struct mbuf *, int, struct pf_pdesc *,
			    struct tcphdr *, struct pf_state_peer *);
static void		 pf_change_icmp(struct pf_addr *, u_int16_t *,
			    struct pf_addr *, struct pf_addr *, u_int16_t,
			    u_int16_t *, u_int16_t *, u_int16_t *,
			    u_int16_t *, u_int8_t, sa_family_t);
static void		 pf_send_tcp(struct mbuf *,
			    const struct pf_rule *, sa_family_t,
			    const struct pf_addr *, const struct pf_addr *,
			    u_int16_t, u_int16_t, u_int32_t, u_int32_t,
			    u_int8_t, u_int16_t, u_int16_t, u_int8_t, int,
			    u_int16_t, struct ifnet *);
static void		 pf_send_icmp(struct mbuf *, u_int8_t, u_int8_t,
			    sa_family_t, struct pf_rule *);
static void		 pf_detach_state(struct pf_state *);
static int		 pf_state_key_attach(struct pf_state_key *,
			    struct pf_state_key *, struct pf_state *);
static void		 pf_state_key_detach(struct pf_state *, int);
static int		 pf_state_key_ctor(void *, int, void *, int);
static u_int32_t	 pf_tcp_iss(struct pf_pdesc *);
static int		 pf_test_rule(struct pf_rule **, struct pf_state **,
			    int, struct pfi_kif *, struct mbuf *, int,
			    struct pf_pdesc *, struct pf_rule **,
			    struct pf_ruleset **, struct inpcb *);
static int		 pf_create_state(struct pf_rule *, struct pf_rule *,
			    struct pf_rule *, struct pf_pdesc *,
			    struct pf_src_node *, struct pf_state_key *,
			    struct pf_state_key *, struct mbuf *, int,
			    u_int16_t, u_int16_t, int *, struct pfi_kif *,
			    struct pf_state **, int, u_int16_t, u_int16_t,
			    int);
static int		 pf_test_fragment(struct pf_rule **, int,
			    struct pfi_kif *, struct mbuf *, void *,
			    struct pf_pdesc *, struct pf_rule **,
			    struct pf_ruleset **);
static int		 pf_tcp_track_full(struct pf_state_peer *,
			    struct pf_state_peer *, struct pf_state **,
			    struct pfi_kif *, struct mbuf *, int,
			    struct pf_pdesc *, u_short *, int *);
static int		 pf_tcp_track_sloppy(struct pf_state_peer *,
			    struct pf_state_peer *, struct pf_state **,
			    struct pf_pdesc *, u_short *);
static int		 pf_test_state_tcp(struct pf_state **, int,
			    struct pfi_kif *, struct mbuf *, int,
			    void *, struct pf_pdesc *, u_short *);
static int		 pf_test_state_udp(struct pf_state **, int,
			    struct pfi_kif *, struct mbuf *, int,
			    void *, struct pf_pdesc *);
static int		 pf_test_state_icmp(struct pf_state **, int,
			    struct pfi_kif *, struct mbuf *, int,
			    void *, struct pf_pdesc *, u_short *);
static int		 pf_test_state_other(struct pf_state **, int,
			    struct pfi_kif *, struct mbuf *, struct pf_pdesc *);
static u_int8_t		 pf_get_wscale(struct mbuf *, int, u_int16_t,
			    sa_family_t);
static u_int16_t	 pf_get_mss(struct mbuf *, int, u_int16_t,
			    sa_family_t);
static u_int16_t	 pf_calc_mss(struct pf_addr *, sa_family_t,
				int, u_int16_t);
static int		 pf_check_proto_cksum(struct mbuf *, int, int,
			    u_int8_t, sa_family_t);
static void		 pf_print_state_parts(struct pf_state *,
			    struct pf_state_key *, struct pf_state_key *);
static int		 pf_addr_wrap_neq(struct pf_addr_wrap *,
			    struct pf_addr_wrap *);
static struct pf_state	*pf_find_state(struct pfi_kif *,
			    struct pf_state_key_cmp *, u_int);
static int		 pf_src_connlimit(struct pf_state **);
static void		 pf_overload_task(void *v, int pending);
static int		 pf_insert_src_node(struct pf_src_node **,
			    struct pf_rule *, struct pf_addr *, sa_family_t);
static u_int		 pf_purge_expired_states(u_int, int);
static void		 pf_purge_unlinked_rules(void);
static int		 pf_mtag_uminit(void *, int, int);
static void		 pf_mtag_free(struct m_tag *);
#ifdef INET
static void		 pf_route(struct mbuf **, struct pf_rule *, int,
			    struct ifnet *, struct pf_state *,
			    struct pf_pdesc *);
#endif /* INET */
#ifdef INET6
static void		 pf_change_a6(struct pf_addr *, u_int16_t *,
			    struct pf_addr *, u_int8_t);
static void		 pf_route6(struct mbuf **, struct pf_rule *, int,
			    struct ifnet *, struct pf_state *,
			    struct pf_pdesc *);
#endif /* INET6 */

int in4_cksum(struct mbuf *m, u_int8_t nxt, int off, int len);

extern int pf_end_threads;

VNET_DEFINE(struct pf_limit, pf_limits[PF_LIMIT_MAX]);

#define	PACKET_LOOPED(pd)	((pd)->pf_mtag &&			\
				 (pd)->pf_mtag->flags & PF_PACKET_LOOPED)

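/*
 * STATE_LOOKUP(), summarized: drop the packet if no state matches the
 * key; pass it if it has already been through pf once (PF_PACKET_LOOPED);
 * and on the outbound side, pass packets owned by a state whose
 * route-to/reply-to rule will re-send them on an interface other than
 * the one currently being filtered.
 */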
#define	STATE_LOOKUP(i, k, d, s, pd)					\
	do {								\
		(s) = pf_find_state((i), (k), (d));			\
		if ((s) == NULL)					\
			return (PF_DROP);				\
		if (PACKET_LOOPED(pd))					\
			return (PF_PASS);				\
		if ((d) == PF_OUT &&					\
		    (((s)->rule.ptr->rt == PF_ROUTETO &&		\
		    (s)->rule.ptr->direction == PF_OUT) ||		\
		    ((s)->rule.ptr->rt == PF_REPLYTO &&			\
		    (s)->rule.ptr->direction == PF_IN)) &&		\
		    (s)->rt_kif != NULL &&				\
		    (s)->rt_kif != (i))					\
			return (PF_PASS);				\
	} while (0)

#define	BOUND_IFACE(r, k) \
	((r)->rule_flag & PFRULE_IFBOUND) ? (k) : V_pfi_all

#define	STATE_INC_COUNTERS(s)						\
	do {								\
		counter_u64_add(s->rule.ptr->states_cur, 1);		\
		counter_u64_add(s->rule.ptr->states_tot, 1);		\
		if (s->anchor.ptr != NULL) {				\
			counter_u64_add(s->anchor.ptr->states_cur, 1);	\
			counter_u64_add(s->anchor.ptr->states_tot, 1);	\
		}							\
		if (s->nat_rule.ptr != NULL) {				\
			counter_u64_add(s->nat_rule.ptr->states_cur, 1);\
			counter_u64_add(s->nat_rule.ptr->states_tot, 1);\
		}							\
	} while (0)

#define	STATE_DEC_COUNTERS(s)						\
	do {								\
		if (s->nat_rule.ptr != NULL)				\
			counter_u64_add(s->nat_rule.ptr->states_cur, -1);\
		if (s->anchor.ptr != NULL)				\
			counter_u64_add(s->anchor.ptr->states_cur, -1);	\
		counter_u64_add(s->rule.ptr->states_cur, -1);		\
	} while (0)

static MALLOC_DEFINE(M_PFHASH, "pf_hash", "pf(4) hash header structures");
VNET_DEFINE(struct pf_keyhash *, pf_keyhash);
VNET_DEFINE(struct pf_idhash *, pf_idhash);
VNET_DEFINE(struct pf_srchash *, pf_srchash);

SYSCTL_NODE(_net, OID_AUTO, pf, CTLFLAG_RW, 0, "pf(4)");

u_long	pf_hashmask;
u_long	pf_srchashmask;
static u_long	pf_hashsize;
static u_long	pf_srchashsize;

SYSCTL_ULONG(_net_pf, OID_AUTO, states_hashsize, CTLFLAG_RDTUN,
    &pf_hashsize, 0, "Size of pf(4) states hashtable");
SYSCTL_ULONG(_net_pf, OID_AUTO, source_nodes_hashsize, CTLFLAG_RDTUN,
    &pf_srchashsize, 0, "Size of pf(4) source nodes hashtable");

VNET_DEFINE(void *, pf_swi_cookie);

VNET_DEFINE(uint32_t, pf_hashseed);
#define	V_pf_hashseed	VNET(pf_hashseed)

int
pf_addr_cmp(struct pf_addr *a, struct pf_addr *b, sa_family_t af)
{

	switch (af) {
#ifdef INET
	case AF_INET:
		if (a->addr32[0] > b->addr32[0])
			return (1);
		if (a->addr32[0] < b->addr32[0])
			return (-1);
		break;
#endif /* INET */
#ifdef INET6
	case AF_INET6:
		if (a->addr32[3] > b->addr32[3])
			return (1);
		if (a->addr32[3] < b->addr32[3])
			return (-1);
		if (a->addr32[2] > b->addr32[2])
			return (1);
		if (a->addr32[2] < b->addr32[2])
			return (-1);
		if (a->addr32[1] > b->addr32[1])
			return (1);
		if (a->addr32[1] < b->addr32[1])
			return (-1);
		if (a->addr32[0] > b->addr32[0])
			return (1);
		if (a->addr32[0] < b->addr32[0])
			return (-1);
		break;
#endif /* INET6 */
	default:
		panic("%s: unknown address family %u", __func__, af);
	}
	return (0);
}

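/*
 * The hash functions below hash the leading, fully comparable part of
 * their argument as an array of uint32_t with a per-boot random seed.
 * Since both table sizes are forced to powers of two in pf_initialize(),
 * masking the result with pf_hashmask / pf_srchashmask yields a valid
 * slot index.
 */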
static __inline uint32_t
pf_hashkey(struct pf_state_key *sk)
{
	uint32_t h;

	h = murmur3_32_hash32((uint32_t *)sk,
	    sizeof(struct pf_state_key_cmp)/sizeof(uint32_t),
	    V_pf_hashseed);

	return (h & pf_hashmask);
}

static __inline uint32_t
pf_hashsrc(struct pf_addr *addr, sa_family_t af)
{
	uint32_t h;

	switch (af) {
	case AF_INET:
		h = murmur3_32_hash32((uint32_t *)&addr->v4,
		    sizeof(addr->v4)/sizeof(uint32_t), V_pf_hashseed);
		break;
	case AF_INET6:
		h = murmur3_32_hash32((uint32_t *)&addr->v6,
		    sizeof(addr->v6)/sizeof(uint32_t), V_pf_hashseed);
		break;
	default:
		panic("%s: unknown address family %u", __func__, af);
	}

	return (h & pf_srchashmask);
}

#ifdef ALTQ
static int
pf_state_hash(struct pf_state *s)
{
	u_int32_t hv = (intptr_t)s / sizeof(*s);

	hv ^= crc32(&s->src, sizeof(s->src));
	hv ^= crc32(&s->dst, sizeof(s->dst));
	if (hv == 0)
		hv = 1;
	return (hv);
}
#endif

#ifdef INET6
void
pf_addrcpy(struct pf_addr *dst, struct pf_addr *src, sa_family_t af)
{
	switch (af) {
#ifdef INET
	case AF_INET:
		dst->addr32[0] = src->addr32[0];
		break;
#endif /* INET */
	case AF_INET6:
		dst->addr32[0] = src->addr32[0];
		dst->addr32[1] = src->addr32[1];
		dst->addr32[2] = src->addr32[2];
		dst->addr32[3] = src->addr32[3];
		break;
	}
}
#endif /* INET6 */

static void
pf_init_threshold(struct pf_threshold *threshold,
    u_int32_t limit, u_int32_t seconds)
{
	threshold->limit = limit * PF_THRESHOLD_MULT;
	threshold->seconds = seconds;
	threshold->count = 0;
	threshold->last = time_uptime;
}

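/*
 * The rate-tracking counter below is kept in fixed point, scaled by
 * PF_THRESHOLD_MULT, and decays linearly with elapsed time: e.g. for a
 * "10 per 5 seconds" threshold, two elapsed seconds remove 2/5 of the
 * accumulated count before the new event adds one (scaled) unit.
 */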
static void
pf_add_threshold(struct pf_threshold *threshold)
{
	u_int32_t t = time_uptime, diff = t - threshold->last;

	if (diff >= threshold->seconds)
		threshold->count = 0;
	else
		threshold->count -= threshold->count * diff /
		    threshold->seconds;
	threshold->count += PF_THRESHOLD_MULT;
	threshold->last = t;
}

static int
pf_check_threshold(struct pf_threshold *threshold)
{
	return (threshold->count > threshold->limit);
}

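/*
 * pf_src_connlimit() enforces max-src-conn and max-src-conn-rate for
 * the state's source node. On a violation the state is marked for
 * immediate purge and, if the rule has an overload table, the offending
 * address is queued for pf_overload_task() to block (and maybe flush).
 */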
static int
pf_src_connlimit(struct pf_state **state)
{
	struct pf_overload_entry *pfoe;
	int bad = 0;

	PF_STATE_LOCK_ASSERT(*state);

	(*state)->src_node->conn++;
	(*state)->src.tcp_est = 1;
	pf_add_threshold(&(*state)->src_node->conn_rate);

	if ((*state)->rule.ptr->max_src_conn &&
	    (*state)->rule.ptr->max_src_conn <
	    (*state)->src_node->conn) {
		counter_u64_add(V_pf_status.lcounters[LCNT_SRCCONN], 1);
		bad++;
	}

	if ((*state)->rule.ptr->max_src_conn_rate.limit &&
	    pf_check_threshold(&(*state)->src_node->conn_rate)) {
		counter_u64_add(V_pf_status.lcounters[LCNT_SRCCONNRATE], 1);
		bad++;
	}

	if (!bad)
		return (0);

	/* Kill this state. */
	(*state)->timeout = PFTM_PURGE;
	(*state)->src.state = (*state)->dst.state = TCPS_CLOSED;

	if ((*state)->rule.ptr->overload_tbl == NULL)
		return (1);

	/* Schedule overloading and flushing task. */
	pfoe = malloc(sizeof(*pfoe), M_PFTEMP, M_NOWAIT);
	if (pfoe == NULL)
		return (1);	/* too bad :( */

	bcopy(&(*state)->src_node->addr, &pfoe->addr, sizeof(pfoe->addr));
	pfoe->af = (*state)->key[PF_SK_WIRE]->af;
	pfoe->rule = (*state)->rule.ptr;
	pfoe->dir = (*state)->direction;
	PF_OVERLOADQ_LOCK();
	SLIST_INSERT_HEAD(&V_pf_overloadqueue, pfoe, next);
	PF_OVERLOADQ_UNLOCK();
	taskqueue_enqueue(taskqueue_swi, &V_pf_overloadtask);

	return (1);
}

static void
pf_overload_task(void *v, int pending)
{
	struct pf_overload_head queue;
	struct pfr_addr p;
	struct pf_overload_entry *pfoe, *pfoe1;
	uint32_t killed = 0;

	CURVNET_SET((struct vnet *)v);

	PF_OVERLOADQ_LOCK();
	queue = V_pf_overloadqueue;
	SLIST_INIT(&V_pf_overloadqueue);
	PF_OVERLOADQ_UNLOCK();

	bzero(&p, sizeof(p));
	SLIST_FOREACH(pfoe, &queue, next) {
		counter_u64_add(V_pf_status.lcounters[LCNT_OVERLOAD_TABLE], 1);
		if (V_pf_status.debug >= PF_DEBUG_MISC) {
			printf("%s: blocking address ", __func__);
			pf_print_host(&pfoe->addr, 0, pfoe->af);
			printf("\n");
		}

		p.pfra_af = pfoe->af;
		switch (pfoe->af) {
#ifdef INET
		case AF_INET:
			p.pfra_net = 32;
			p.pfra_ip4addr = pfoe->addr.v4;
			break;
#endif
#ifdef INET6
		case AF_INET6:
			p.pfra_net = 128;
			p.pfra_ip6addr = pfoe->addr.v6;
			break;
#endif
		}

		PF_RULES_WLOCK();
		pfr_insert_kentry(pfoe->rule->overload_tbl, &p, time_second);
		PF_RULES_WUNLOCK();
	}

	/*
	 * Remove the entries that don't need flushing.
	 */
	SLIST_FOREACH_SAFE(pfoe, &queue, next, pfoe1)
		if (pfoe->rule->flush == 0) {
			SLIST_REMOVE(&queue, pfoe, pf_overload_entry, next);
			free(pfoe, M_PFTEMP);
		} else
			counter_u64_add(
			    V_pf_status.lcounters[LCNT_OVERLOAD_FLUSH], 1);

	/* If nothing to flush, return. */
	if (SLIST_EMPTY(&queue)) {
		CURVNET_RESTORE();
		return;
	}

	for (int i = 0; i <= pf_hashmask; i++) {
		struct pf_idhash *ih = &V_pf_idhash[i];
		struct pf_state_key *sk;
		struct pf_state *s;

		PF_HASHROW_LOCK(ih);
		LIST_FOREACH(s, &ih->states, entry) {
		    sk = s->key[PF_SK_WIRE];
		    SLIST_FOREACH(pfoe, &queue, next)
			if (sk->af == pfoe->af &&
			    ((pfoe->rule->flush & PF_FLUSH_GLOBAL) ||
			    pfoe->rule == s->rule.ptr) &&
			    ((pfoe->dir == PF_OUT &&
			    PF_AEQ(&pfoe->addr, &sk->addr[1], sk->af)) ||
			    (pfoe->dir == PF_IN &&
			    PF_AEQ(&pfoe->addr, &sk->addr[0], sk->af)))) {
				s->timeout = PFTM_PURGE;
				s->src.state = s->dst.state = TCPS_CLOSED;
				killed++;
			}
		}
		PF_HASHROW_UNLOCK(ih);
	}
	SLIST_FOREACH_SAFE(pfoe, &queue, next, pfoe1)
		free(pfoe, M_PFTEMP);
	if (V_pf_status.debug >= PF_DEBUG_MISC)
		printf("%s: %u states killed", __func__, killed);

	CURVNET_RESTORE();
}

/*
 * Can return locked on failure, so that we can consistently
 * allocate and insert a new one.
 */
struct pf_src_node *
pf_find_src_node(struct pf_addr *src, struct pf_rule *rule, sa_family_t af,
	int returnlocked)
{
	struct pf_srchash *sh;
	struct pf_src_node *n;

	counter_u64_add(V_pf_status.scounters[SCNT_SRC_NODE_SEARCH], 1);

	sh = &V_pf_srchash[pf_hashsrc(src, af)];
	PF_HASHROW_LOCK(sh);
	LIST_FOREACH(n, &sh->nodes, entry)
		if (n->rule.ptr == rule && n->af == af &&
		    ((af == AF_INET && n->addr.v4.s_addr == src->v4.s_addr) ||
		    (af == AF_INET6 && bcmp(&n->addr, src, sizeof(*src)) == 0)))
			break;
	if (n != NULL) {
		n->states++;
		PF_HASHROW_UNLOCK(sh);
	} else if (returnlocked == 0)
		PF_HASHROW_UNLOCK(sh);

	return (n);
}

static int
pf_insert_src_node(struct pf_src_node **sn, struct pf_rule *rule,
    struct pf_addr *src, sa_family_t af)
{

	KASSERT((rule->rule_flag & PFRULE_RULESRCTRACK ||
	    rule->rpool.opts & PF_POOL_STICKYADDR),
	    ("%s for non-tracking rule %p", __func__, rule));

	if (*sn == NULL)
		*sn = pf_find_src_node(src, rule, af, 1);

	if (*sn == NULL) {
		struct pf_srchash *sh = &V_pf_srchash[pf_hashsrc(src, af)];

		PF_HASHROW_ASSERT(sh);

		if (!rule->max_src_nodes ||
		    counter_u64_fetch(rule->src_nodes) < rule->max_src_nodes)
			(*sn) = uma_zalloc(V_pf_sources_z, M_NOWAIT | M_ZERO);
		else
			counter_u64_add(V_pf_status.lcounters[LCNT_SRCNODES],
			    1);
		if ((*sn) == NULL) {
			PF_HASHROW_UNLOCK(sh);
			return (-1);
		}

		pf_init_threshold(&(*sn)->conn_rate,
		    rule->max_src_conn_rate.limit,
		    rule->max_src_conn_rate.seconds);

		(*sn)->af = af;
		(*sn)->rule.ptr = rule;
		PF_ACPY(&(*sn)->addr, src, af);
		LIST_INSERT_HEAD(&sh->nodes, *sn, entry);
		(*sn)->creation = time_uptime;
		(*sn)->ruletype = rule->action;
		(*sn)->states = 1;
		if ((*sn)->rule.ptr != NULL)
			counter_u64_add((*sn)->rule.ptr->src_nodes, 1);
		PF_HASHROW_UNLOCK(sh);
		counter_u64_add(V_pf_status.scounters[SCNT_SRC_NODE_INSERT], 1);
	} else {
		if (rule->max_src_states &&
		    (*sn)->states >= rule->max_src_states) {
			counter_u64_add(V_pf_status.lcounters[LCNT_SRCSTATES],
			    1);
			return (-1);
		}
	}
	return (0);
}

void
pf_unlink_src_node(struct pf_src_node *src)
{

	PF_HASHROW_ASSERT(&V_pf_srchash[pf_hashsrc(&src->addr, src->af)]);
	LIST_REMOVE(src, entry);
	if (src->rule.ptr)
		counter_u64_add(src->rule.ptr->src_nodes, -1);
}

u_int
pf_free_src_nodes(struct pf_src_node_list *head)
{
	struct pf_src_node *sn, *tmp;
	u_int count = 0;

	LIST_FOREACH_SAFE(sn, head, entry, tmp) {
		uma_zfree(V_pf_sources_z, sn);
		count++;
	}

	counter_u64_add(V_pf_status.scounters[SCNT_SRC_NODE_REMOVALS], count);

	return (count);
}

void
pf_mtag_initialize()
{

	pf_mtag_z = uma_zcreate("pf mtags", sizeof(struct m_tag) +
	    sizeof(struct pf_mtag), NULL, NULL, pf_mtag_uminit, NULL,
	    UMA_ALIGN_PTR, 0);
}

/* Per-vnet data storage structures initialization. */
void
pf_initialize()
{
	struct pf_keyhash	*kh;
	struct pf_idhash	*ih;
	struct pf_srchash	*sh;
	u_int i;

	if (pf_hashsize == 0 || !powerof2(pf_hashsize))
		pf_hashsize = PF_HASHSIZ;
	if (pf_srchashsize == 0 || !powerof2(pf_srchashsize))
		pf_srchashsize = PF_SRCHASHSIZ;

	V_pf_hashseed = arc4random();

	/* States and state keys storage. */
	V_pf_state_z = uma_zcreate("pf states", sizeof(struct pf_state),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	V_pf_limits[PF_LIMIT_STATES].zone = V_pf_state_z;
	uma_zone_set_max(V_pf_state_z, PFSTATE_HIWAT);
	uma_zone_set_warning(V_pf_state_z, "PF states limit reached");

	V_pf_state_key_z = uma_zcreate("pf state keys",
	    sizeof(struct pf_state_key), pf_state_key_ctor, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, 0);

	V_pf_keyhash = mallocarray(pf_hashsize, sizeof(struct pf_keyhash),
	    M_PFHASH, M_NOWAIT | M_ZERO);
	V_pf_idhash = mallocarray(pf_hashsize, sizeof(struct pf_idhash),
	    M_PFHASH, M_NOWAIT | M_ZERO);
	if (V_pf_keyhash == NULL || V_pf_idhash == NULL) {
		printf("pf: Unable to allocate memory for "
		    "state_hashsize %lu.\n", pf_hashsize);

		free(V_pf_keyhash, M_PFHASH);
		free(V_pf_idhash, M_PFHASH);

		pf_hashsize = PF_HASHSIZ;
		V_pf_keyhash = mallocarray(pf_hashsize,
		    sizeof(struct pf_keyhash), M_PFHASH, M_WAITOK | M_ZERO);
		V_pf_idhash = mallocarray(pf_hashsize,
		    sizeof(struct pf_idhash), M_PFHASH, M_WAITOK | M_ZERO);
	}

	pf_hashmask = pf_hashsize - 1;
	for (i = 0, kh = V_pf_keyhash, ih = V_pf_idhash; i <= pf_hashmask;
	    i++, kh++, ih++) {
		mtx_init(&kh->lock, "pf_keyhash", NULL, MTX_DEF | MTX_DUPOK);
		mtx_init(&ih->lock, "pf_idhash", NULL, MTX_DEF);
	}

	/* Source nodes. */
	V_pf_sources_z = uma_zcreate("pf source nodes",
	    sizeof(struct pf_src_node), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
	    0);
	V_pf_limits[PF_LIMIT_SRC_NODES].zone = V_pf_sources_z;
	uma_zone_set_max(V_pf_sources_z, PFSNODE_HIWAT);
	uma_zone_set_warning(V_pf_sources_z, "PF source nodes limit reached");

	V_pf_srchash = mallocarray(pf_srchashsize,
	    sizeof(struct pf_srchash), M_PFHASH, M_NOWAIT | M_ZERO);
	if (V_pf_srchash == NULL) {
		printf("pf: Unable to allocate memory for "
		    "source_hashsize %lu.\n", pf_srchashsize);

		pf_srchashsize = PF_SRCHASHSIZ;
		V_pf_srchash = mallocarray(pf_srchashsize,
		    sizeof(struct pf_srchash), M_PFHASH, M_WAITOK | M_ZERO);
	}

	pf_srchashmask = pf_srchashsize - 1;
	for (i = 0, sh = V_pf_srchash; i <= pf_srchashmask; i++, sh++)
		mtx_init(&sh->lock, "pf_srchash", NULL, MTX_DEF);

	/* ALTQ */
	TAILQ_INIT(&V_pf_altqs[0]);
	TAILQ_INIT(&V_pf_altqs[1]);
	TAILQ_INIT(&V_pf_pabuf);
	V_pf_altqs_active = &V_pf_altqs[0];
	V_pf_altqs_inactive = &V_pf_altqs[1];

	/* Send & overload+flush queues. */
	STAILQ_INIT(&V_pf_sendqueue);
	SLIST_INIT(&V_pf_overloadqueue);
	TASK_INIT(&V_pf_overloadtask, 0, pf_overload_task, curvnet);

	/* Rules that are unlinked but may still be referenced. */
	TAILQ_INIT(&V_pf_unlinked_rules);
}

void
pf_mtag_cleanup()
{

	uma_zdestroy(pf_mtag_z);
}

void
pf_cleanup()
{
	struct pf_keyhash	*kh;
	struct pf_idhash	*ih;
	struct pf_srchash	*sh;
	struct pf_send_entry	*pfse, *next;
	u_int i;

	for (i = 0, kh = V_pf_keyhash, ih = V_pf_idhash; i <= pf_hashmask;
	    i++, kh++, ih++) {
		KASSERT(LIST_EMPTY(&kh->keys), ("%s: key hash not empty",
		    __func__));
		KASSERT(LIST_EMPTY(&ih->states), ("%s: id hash not empty",
		    __func__));
		mtx_destroy(&kh->lock);
		mtx_destroy(&ih->lock);
	}
	free(V_pf_keyhash, M_PFHASH);
	free(V_pf_idhash, M_PFHASH);

	for (i = 0, sh = V_pf_srchash; i <= pf_srchashmask; i++, sh++) {
		KASSERT(LIST_EMPTY(&sh->nodes),
		    ("%s: source node hash not empty", __func__));
		mtx_destroy(&sh->lock);
	}
	free(V_pf_srchash, M_PFHASH);

	STAILQ_FOREACH_SAFE(pfse, &V_pf_sendqueue, pfse_next, next) {
		m_freem(pfse->pfse_m);
		free(pfse, M_PFTEMP);
	}

	uma_zdestroy(V_pf_sources_z);
	uma_zdestroy(V_pf_state_z);
	uma_zdestroy(V_pf_state_key_z);
}

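/*
 * UMA item initializer: pre-set the constant m_tag header fields once
 * per allocated item, so that pf_get_mtag() only needs to zero the
 * pf_mtag payload that follows the header.
 */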
static int
pf_mtag_uminit(void *mem, int size, int how)
{
	struct m_tag *t;

	t = (struct m_tag *)mem;
	t->m_tag_cookie = MTAG_ABI_COMPAT;
	t->m_tag_id = PACKET_TAG_PF;
	t->m_tag_len = sizeof(struct pf_mtag);
	t->m_tag_free = pf_mtag_free;

	return (0);
}

static void
pf_mtag_free(struct m_tag *t)
{

	uma_zfree(pf_mtag_z, t);
}

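/*
 * Return the mbuf's pf(4) tag, allocating one if none is attached yet.
 * The struct pf_mtag payload is allocated together with, and placed
 * directly behind, the struct m_tag header, hence the (mtag + 1)
 * pointer arithmetic.
 */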
struct pf_mtag *
pf_get_mtag(struct mbuf *m)
{
	struct m_tag *mtag;

	if ((mtag = m_tag_find(m, PACKET_TAG_PF, NULL)) != NULL)
		return ((struct pf_mtag *)(mtag + 1));

	mtag = uma_zalloc(pf_mtag_z, M_NOWAIT);
	if (mtag == NULL)
		return (NULL);
	bzero(mtag + 1, sizeof(struct pf_mtag));
	m_tag_prepend(m, mtag);

	return ((struct pf_mtag *)(mtag + 1));
}

static int
pf_state_key_attach(struct pf_state_key *skw, struct pf_state_key *sks,
    struct pf_state *s)
{
	struct pf_keyhash	*khs, *khw, *kh;
	struct pf_state_key	*sk, *cur;
	struct pf_state		*si, *olds = NULL;
	int idx;

	KASSERT(s->refs == 0, ("%s: state not pristine", __func__));
	KASSERT(s->key[PF_SK_WIRE] == NULL, ("%s: state has key", __func__));
	KASSERT(s->key[PF_SK_STACK] == NULL, ("%s: state has key", __func__));

	/*
	 * We need to lock hash slots of both keys. To avoid deadlock
	 * we always lock the slot with lower address first. Unlock order
	 * isn't important.
	 *
	 * We also need to lock ID hash slot before dropping key
	 * locks. On success we return with ID hash slot locked.
	 */

	if (skw == sks) {
		khs = khw = &V_pf_keyhash[pf_hashkey(skw)];
		PF_HASHROW_LOCK(khs);
	} else {
		khs = &V_pf_keyhash[pf_hashkey(sks)];
		khw = &V_pf_keyhash[pf_hashkey(skw)];
		if (khs == khw) {
			PF_HASHROW_LOCK(khs);
		} else if (khs < khw) {
			PF_HASHROW_LOCK(khs);
			PF_HASHROW_LOCK(khw);
		} else {
			PF_HASHROW_LOCK(khw);
			PF_HASHROW_LOCK(khs);
		}
	}

#define	KEYS_UNLOCK()	do {			\
	if (khs != khw) {			\
		PF_HASHROW_UNLOCK(khs);		\
		PF_HASHROW_UNLOCK(khw);		\
	} else					\
		PF_HASHROW_UNLOCK(khs);		\
} while (0)

	/*
	 * First run: start with wire key.
	 */
	sk = skw;
	kh = khw;
	idx = PF_SK_WIRE;

keyattach:
	LIST_FOREACH(cur, &kh->keys, entry)
		if (bcmp(cur, sk, sizeof(struct pf_state_key_cmp)) == 0)
			break;

	if (cur != NULL) {
		/* Key exists. Check for same kif, if none, add to key. */
		TAILQ_FOREACH(si, &cur->states[idx], key_list[idx]) {
			struct pf_idhash *ih = &V_pf_idhash[PF_IDHASH(si)];

			PF_HASHROW_LOCK(ih);
			if (si->kif == s->kif &&
			    si->direction == s->direction) {
				if (sk->proto == IPPROTO_TCP &&
				    si->src.state >= TCPS_FIN_WAIT_2 &&
				    si->dst.state >= TCPS_FIN_WAIT_2) {
					/*
					 * New state matches an old >FIN_WAIT_2
					 * state. We can't drop key hash locks,
					 * thus we can't unlink it properly.
					 *
					 * As a workaround we drop it into
					 * TCPS_CLOSED state, schedule purge
					 * ASAP and push it into the very end
					 * of the slot TAILQ, so that it won't
					 * conflict with our new state.
					 */
					si->src.state = si->dst.state =
					    TCPS_CLOSED;
					si->timeout = PFTM_PURGE;
					olds = si;
				} else {
					if (V_pf_status.debug >= PF_DEBUG_MISC) {
						printf("pf: %s key attach "
						    "failed on %s: ",
						    (idx == PF_SK_WIRE) ?
						    "wire" : "stack",
						    s->kif->pfik_name);
						pf_print_state_parts(s,
						    (idx == PF_SK_WIRE) ?
						    sk : NULL,
						    (idx == PF_SK_STACK) ?
						    sk : NULL);
						printf(", existing: ");
						pf_print_state_parts(si,
						    (idx == PF_SK_WIRE) ?
						    sk : NULL,
						    (idx == PF_SK_STACK) ?
						    sk : NULL);
						printf("\n");
					}
					PF_HASHROW_UNLOCK(ih);
					KEYS_UNLOCK();
					uma_zfree(V_pf_state_key_z, sk);
					if (idx == PF_SK_STACK)
						pf_detach_state(s);
					return (EEXIST); /* collision! */
				}
			}
			PF_HASHROW_UNLOCK(ih);
		}
		uma_zfree(V_pf_state_key_z, sk);
		s->key[idx] = cur;
	} else {
		LIST_INSERT_HEAD(&kh->keys, sk, entry);
		s->key[idx] = sk;
	}

stateattach:
	/* List is sorted, if-bound states before floating. */
	if (s->kif == V_pfi_all)
		TAILQ_INSERT_TAIL(&s->key[idx]->states[idx], s, key_list[idx]);
	else
		TAILQ_INSERT_HEAD(&s->key[idx]->states[idx], s, key_list[idx]);

	if (olds) {
		TAILQ_REMOVE(&s->key[idx]->states[idx], olds, key_list[idx]);
		TAILQ_INSERT_TAIL(&s->key[idx]->states[idx], olds,
		    key_list[idx]);
		olds = NULL;
	}

	/*
	 * Attach done. Now decide whether (and how) a second
	 * key should be attached.
	 */
	if (sks == skw) {
		s->key[PF_SK_STACK] = s->key[PF_SK_WIRE];
		idx = PF_SK_STACK;
		sks = NULL;
		goto stateattach;
	} else if (sks != NULL) {
		/*
		 * Continue attaching with stack key.
		 */
		sk = sks;
		kh = khs;
		idx = PF_SK_STACK;
		sks = NULL;
		goto keyattach;
	}

	PF_STATE_LOCK(s);
	KEYS_UNLOCK();

	KASSERT(s->key[PF_SK_WIRE] != NULL && s->key[PF_SK_STACK] != NULL,
	    ("%s failure", __func__));

	return (0);
#undef	KEYS_UNLOCK
}

static void
pf_detach_state(struct pf_state *s)
{
	struct pf_state_key *sks = s->key[PF_SK_STACK];
	struct pf_keyhash *kh;

	if (sks != NULL) {
		kh = &V_pf_keyhash[pf_hashkey(sks)];
		PF_HASHROW_LOCK(kh);
		if (s->key[PF_SK_STACK] != NULL)
			pf_state_key_detach(s, PF_SK_STACK);
		/*
		 * If both indices point to the same key, we are done.
		 */
		if (sks == s->key[PF_SK_WIRE]) {
			pf_state_key_detach(s, PF_SK_WIRE);
			PF_HASHROW_UNLOCK(kh);
			return;
		}
		PF_HASHROW_UNLOCK(kh);
	}

	if (s->key[PF_SK_WIRE] != NULL) {
		kh = &V_pf_keyhash[pf_hashkey(s->key[PF_SK_WIRE])];
		PF_HASHROW_LOCK(kh);
		if (s->key[PF_SK_WIRE] != NULL)
			pf_state_key_detach(s, PF_SK_WIRE);
		PF_HASHROW_UNLOCK(kh);
	}
}

static void
pf_state_key_detach(struct pf_state *s, int idx)
{
	struct pf_state_key *sk = s->key[idx];
#ifdef INVARIANTS
	struct pf_keyhash *kh = &V_pf_keyhash[pf_hashkey(sk)];

	PF_HASHROW_ASSERT(kh);
#endif
	TAILQ_REMOVE(&sk->states[idx], s, key_list[idx]);
	s->key[idx] = NULL;

	if (TAILQ_EMPTY(&sk->states[0]) && TAILQ_EMPTY(&sk->states[1])) {
		LIST_REMOVE(sk, entry);
		uma_zfree(V_pf_state_key_z, sk);
	}
}

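/*
 * UMA zone ctor: only the comparable prefix (struct pf_state_key_cmp)
 * needs to be cleared and the per-key state lists reset; the hash list
 * linkage is (re)written whenever the key is inserted.
 */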
static int
pf_state_key_ctor(void *mem, int size, void *arg, int flags)
{
	struct pf_state_key *sk = mem;

	bzero(sk, sizeof(struct pf_state_key_cmp));
	TAILQ_INIT(&sk->states[PF_SK_WIRE]);
	TAILQ_INIT(&sk->states[PF_SK_STACK]);

	return (0);
}

struct pf_state_key *
pf_state_key_setup(struct pf_pdesc *pd, struct pf_addr *saddr,
	struct pf_addr *daddr, u_int16_t sport, u_int16_t dport)
{
	struct pf_state_key *sk;

	sk = uma_zalloc(V_pf_state_key_z, M_NOWAIT);
	if (sk == NULL)
		return (NULL);

	PF_ACPY(&sk->addr[pd->sidx], saddr, pd->af);
	PF_ACPY(&sk->addr[pd->didx], daddr, pd->af);
	sk->port[pd->sidx] = sport;
	sk->port[pd->didx] = dport;
	sk->proto = pd->proto;
	sk->af = pd->af;

	return (sk);
}

struct pf_state_key *
pf_state_key_clone(struct pf_state_key *orig)
{
	struct pf_state_key *sk;

	sk = uma_zalloc(V_pf_state_key_z, M_NOWAIT);
	if (sk == NULL)
		return (NULL);

	bcopy(orig, sk, sizeof(struct pf_state_key_cmp));

	return (sk);
}

int
pf_state_insert(struct pfi_kif *kif, struct pf_state_key *skw,
    struct pf_state_key *sks, struct pf_state *s)
{
	struct pf_idhash *ih;
	struct pf_state *cur;
	int error;

	KASSERT(TAILQ_EMPTY(&sks->states[0]) && TAILQ_EMPTY(&sks->states[1]),
	    ("%s: sks not pristine", __func__));
	KASSERT(TAILQ_EMPTY(&skw->states[0]) && TAILQ_EMPTY(&skw->states[1]),
	    ("%s: skw not pristine", __func__));
	KASSERT(s->refs == 0, ("%s: state not pristine", __func__));

	s->kif = kif;

	if (s->id == 0 && s->creatorid == 0) {
		/* XXX: should be atomic, but probability of collision low */
		if ((s->id = V_pf_stateid[curcpu]++) == PFID_MAXID)
			V_pf_stateid[curcpu] = 1;
		s->id |= (uint64_t)curcpu << PFID_CPUSHIFT;
		s->id = htobe64(s->id);
		s->creatorid = V_pf_status.hostid;
	}

	/* Returns with ID locked on success. */
	if ((error = pf_state_key_attach(skw, sks, s)) != 0)
		return (error);

	ih = &V_pf_idhash[PF_IDHASH(s)];
	PF_HASHROW_ASSERT(ih);
	LIST_FOREACH(cur, &ih->states, entry)
		if (cur->id == s->id && cur->creatorid == s->creatorid)
			break;

	if (cur != NULL) {
		PF_HASHROW_UNLOCK(ih);
		if (V_pf_status.debug >= PF_DEBUG_MISC) {
			printf("pf: state ID collision: "
			    "id: %016llx creatorid: %08x\n",
			    (unsigned long long)be64toh(s->id),
			    ntohl(s->creatorid));
		}
		pf_detach_state(s);
		return (EEXIST);
	}
	LIST_INSERT_HEAD(&ih->states, s, entry);
	/* One for keys, one for ID hash. */
	refcount_init(&s->refs, 2);

	counter_u64_add(V_pf_status.fcounters[FCNT_STATE_INSERT], 1);
	if (pfsync_insert_state_ptr != NULL)
		pfsync_insert_state_ptr(s);

	/* Returns locked. */
	return (0);
}

/*
 * Find state by ID: returns with locked row on success.
 */
struct pf_state *
pf_find_state_byid(uint64_t id, uint32_t creatorid)
{
	struct pf_idhash *ih;
	struct pf_state *s;

	counter_u64_add(V_pf_status.fcounters[FCNT_STATE_SEARCH], 1);

	ih = &V_pf_idhash[(be64toh(id) % (pf_hashmask + 1))];

	PF_HASHROW_LOCK(ih);
	LIST_FOREACH(s, &ih->states, entry)
		if (s->id == id && s->creatorid == creatorid)
			break;

	if (s == NULL)
		PF_HASHROW_UNLOCK(ih);

	return (s);
}

/*
 * Find state by key.
 * Returns with ID hash slot locked on success.
 */
static struct pf_state *
pf_find_state(struct pfi_kif *kif, struct pf_state_key_cmp *key, u_int dir)
{
	struct pf_keyhash	*kh;
	struct pf_state_key	*sk;
	struct pf_state		*s;
	int idx;

	counter_u64_add(V_pf_status.fcounters[FCNT_STATE_SEARCH], 1);

	kh = &V_pf_keyhash[pf_hashkey((struct pf_state_key *)key)];

	PF_HASHROW_LOCK(kh);
	LIST_FOREACH(sk, &kh->keys, entry)
		if (bcmp(sk, key, sizeof(struct pf_state_key_cmp)) == 0)
			break;
	if (sk == NULL) {
		PF_HASHROW_UNLOCK(kh);
		return (NULL);
	}

	idx = (dir == PF_IN ? PF_SK_WIRE : PF_SK_STACK);

	/* List is sorted, if-bound states before floating ones. */
	TAILQ_FOREACH(s, &sk->states[idx], key_list[idx])
		if (s->kif == V_pfi_all || s->kif == kif) {
			PF_STATE_LOCK(s);
			PF_HASHROW_UNLOCK(kh);
			if (s->timeout >= PFTM_MAX) {
				/*
				 * State is either being processed by
				 * pf_unlink_state() in another thread, or
				 * is scheduled for immediate expiry.
				 */
				PF_STATE_UNLOCK(s);
				return (NULL);
			}
			return (s);
		}
	PF_HASHROW_UNLOCK(kh);

	return (NULL);
}

struct pf_state *
pf_find_state_all(struct pf_state_key_cmp *key, u_int dir, int *more)
{
	struct pf_keyhash	*kh;
	struct pf_state_key	*sk;
	struct pf_state		*s, *ret = NULL;
	int			 idx, inout = 0;

	counter_u64_add(V_pf_status.fcounters[FCNT_STATE_SEARCH], 1);

	kh = &V_pf_keyhash[pf_hashkey((struct pf_state_key *)key)];

	PF_HASHROW_LOCK(kh);
	LIST_FOREACH(sk, &kh->keys, entry)
		if (bcmp(sk, key, sizeof(struct pf_state_key_cmp)) == 0)
			break;
	if (sk == NULL) {
		PF_HASHROW_UNLOCK(kh);
		return (NULL);
	}
	switch (dir) {
	case PF_IN:
		idx = PF_SK_WIRE;
		break;
	case PF_OUT:
		idx = PF_SK_STACK;
		break;
	case PF_INOUT:
		idx = PF_SK_WIRE;
		inout = 1;
		break;
	default:
		panic("%s: dir %u", __func__, dir);
	}
second_run:
	TAILQ_FOREACH(s, &sk->states[idx], key_list[idx]) {
		if (more == NULL) {
			PF_HASHROW_UNLOCK(kh);
			return (s);
		}

		if (ret)
			(*more)++;
		else
			ret = s;
	}
	if (inout == 1) {
		inout = 0;
		idx = PF_SK_STACK;
		goto second_run;
	}
	PF_HASHROW_UNLOCK(kh);

	return (ret);
}

/* END state table stuff */

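/*
 * pf_send() only enqueues the packet and kicks the software interrupt;
 * the actual ip_output()/ip6_output() call happens later, in pf_intr(),
 * decoupled from the context that decided to send the packet.
 */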
static void
pf_send(struct pf_send_entry *pfse)
{

	PF_SENDQ_LOCK();
	STAILQ_INSERT_TAIL(&V_pf_sendqueue, pfse, pfse_next);
	PF_SENDQ_UNLOCK();
	swi_sched(V_pf_swi_cookie, 0);
}

void
pf_intr(void *v)
{
	struct pf_send_head queue;
	struct pf_send_entry *pfse, *next;

	CURVNET_SET((struct vnet *)v);

	PF_SENDQ_LOCK();
	queue = V_pf_sendqueue;
	STAILQ_INIT(&V_pf_sendqueue);
	PF_SENDQ_UNLOCK();

	STAILQ_FOREACH_SAFE(pfse, &queue, pfse_next, next) {
		switch (pfse->pfse_type) {
#ifdef INET
		case PFSE_IP:
			ip_output(pfse->pfse_m, NULL, NULL, 0, NULL, NULL);
			break;
		case PFSE_ICMP:
			icmp_error(pfse->pfse_m, pfse->icmpopts.type,
			    pfse->icmpopts.code, 0, pfse->icmpopts.mtu);
			break;
#endif /* INET */
#ifdef INET6
		case PFSE_IP6:
			ip6_output(pfse->pfse_m, NULL, NULL, 0, NULL, NULL,
			    NULL);
			break;
		case PFSE_ICMP6:
			icmp6_error(pfse->pfse_m, pfse->icmpopts.type,
			    pfse->icmpopts.code, pfse->icmpopts.mtu);
			break;
#endif /* INET6 */
		default:
			panic("%s: unknown type", __func__);
		}
		free(pfse, M_PFTEMP);
	}
	CURVNET_RESTORE();
}

void
pf_purge_thread(void *unused __unused)
{
	VNET_ITERATOR_DECL(vnet_iter);
	u_int idx = 0;

	for (;;) {
		PF_RULES_RLOCK();
		rw_sleep(pf_purge_thread, &pf_rules_lock, 0, "pftm", hz / 10);
		PF_RULES_RUNLOCK();

		VNET_LIST_RLOCK();
		VNET_FOREACH(vnet_iter) {
			CURVNET_SET(vnet_iter);

		if (pf_end_threads) {
			pf_end_threads++;
			wakeup(pf_purge_thread);
			kproc_exit(0);
		}

		/* Wait until V_pf_default_rule.timeout is initialized. */
		if (V_pf_vnet_active == 0) {
			CURVNET_RESTORE();
			continue;
		}

		/* Process 1/interval fraction of the state table every run. */
		idx = pf_purge_expired_states(idx, pf_hashmask /
			    (V_pf_default_rule.timeout[PFTM_INTERVAL] * 10));

		/* Purge other expired types every PFTM_INTERVAL seconds. */
		if (idx == 0) {
			/*
			 * Order is important:
			 * - states and src nodes reference rules
			 * - states and rules reference kifs
			 */
			pf_purge_expired_fragments();
			pf_purge_expired_src_nodes();
			pf_purge_unlinked_rules();
			pfi_kif_purge();
		}
		CURVNET_RESTORE();
		}
		VNET_LIST_RUNLOCK();
	}
	/* not reached */
}

void
pf_unload_vnet_purge(void)
{

	/*
	 * To clean up all kifs and rules we need two runs: the
	 * first one clears the reference flags, and because
	 * pf_purge_expired_states() won't raise them again, the
	 * second run can free everything.
	 */
	pf_purge_unlinked_rules();
	pfi_kif_purge();

	/*
	 * Now purge everything.
	 */
	pf_purge_expired_states(0, pf_hashmask);
	pf_purge_expired_fragments();
	pf_purge_expired_src_nodes();

	/*
	 * Now all kifs & rules should be unreferenced,
	 * and can therefore be freed successfully.
	 */
	pf_purge_unlinked_rules();
	pfi_kif_purge();
}

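/*
 * Example of the adaptive scaling below: with adaptive.start 6000,
 * adaptive.end 12000 and 9000 states allocated, the configured timeout
 * is scaled by (12000 - 9000) / (12000 - 6000) = 1/2; once the state
 * count reaches adaptive.end, states expire immediately.
 */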
u_int32_t
pf_state_expires(const struct pf_state *state)
{
	u_int32_t	timeout;
	u_int32_t	start;
	u_int32_t	end;
	u_int32_t	states;

	/* handle all PFTM_* > PFTM_MAX here */
	if (state->timeout == PFTM_PURGE)
		return (time_uptime);
	KASSERT(state->timeout != PFTM_UNLINKED,
	    ("pf_state_expires: timeout == PFTM_UNLINKED"));
	KASSERT((state->timeout < PFTM_MAX),
	    ("pf_state_expires: timeout > PFTM_MAX"));
	timeout = state->rule.ptr->timeout[state->timeout];
	if (!timeout)
		timeout = V_pf_default_rule.timeout[state->timeout];
	start = state->rule.ptr->timeout[PFTM_ADAPTIVE_START];
	if (start) {
		end = state->rule.ptr->timeout[PFTM_ADAPTIVE_END];
		states = counter_u64_fetch(state->rule.ptr->states_cur);
	} else {
		start = V_pf_default_rule.timeout[PFTM_ADAPTIVE_START];
		end = V_pf_default_rule.timeout[PFTM_ADAPTIVE_END];
		states = V_pf_status.states;
	}
	if (end && states > start && start < end) {
		if (states < end)
			return (state->expire + timeout * (end - states) /
			    (end - start));
		else
			return (time_uptime);
	}
	return (state->expire + timeout);
}

void
pf_purge_expired_src_nodes()
{
	struct pf_src_node_list	 freelist;
	struct pf_srchash	*sh;
	struct pf_src_node	*cur, *next;
	int i;

	LIST_INIT(&freelist);
	for (i = 0, sh = V_pf_srchash; i <= pf_srchashmask; i++, sh++) {
	    PF_HASHROW_LOCK(sh);
	    LIST_FOREACH_SAFE(cur, &sh->nodes, entry, next)
		if (cur->states == 0 && cur->expire <= time_uptime) {
			pf_unlink_src_node(cur);
			LIST_INSERT_HEAD(&freelist, cur, entry);
		} else if (cur->rule.ptr != NULL)
			cur->rule.ptr->rule_flag |= PFRULE_REFS;
	    PF_HASHROW_UNLOCK(sh);
	}

	pf_free_src_nodes(&freelist);

	V_pf_status.src_nodes = uma_zone_get_cur(V_pf_sources_z);
}

static void
pf_src_tree_remove_state(struct pf_state *s)
{
	struct pf_src_node *sn;
	struct pf_srchash *sh;
	uint32_t timeout;

	timeout = s->rule.ptr->timeout[PFTM_SRC_NODE] ?
	    s->rule.ptr->timeout[PFTM_SRC_NODE] :
	    V_pf_default_rule.timeout[PFTM_SRC_NODE];

	if (s->src_node != NULL) {
		sn = s->src_node;
		sh = &V_pf_srchash[pf_hashsrc(&sn->addr, sn->af)];
		PF_HASHROW_LOCK(sh);
		if (s->src.tcp_est)
			--sn->conn;
		if (--sn->states == 0)
			sn->expire = time_uptime + timeout;
		PF_HASHROW_UNLOCK(sh);
	}
	if (s->nat_src_node != s->src_node && s->nat_src_node != NULL) {
		sn = s->nat_src_node;
		sh = &V_pf_srchash[pf_hashsrc(&sn->addr, sn->af)];
		PF_HASHROW_LOCK(sh);
		if (--sn->states == 0)
			sn->expire = time_uptime + timeout;
		PF_HASHROW_UNLOCK(sh);
	}
	s->src_node = s->nat_src_node = NULL;
}

/*
 * Unlink and potentially free a state. This function may be
 * called with the ID hash row locked, but it always returns
 * unlocked, since it needs to go through key hash locking.
 */
int
pf_unlink_state(struct pf_state *s, u_int flags)
{
	struct pf_idhash *ih = &V_pf_idhash[PF_IDHASH(s)];

	if ((flags & PF_ENTER_LOCKED) == 0)
		PF_HASHROW_LOCK(ih);
	else
		PF_HASHROW_ASSERT(ih);

	if (s->timeout == PFTM_UNLINKED) {
		/*
		 * State is being processed
		 * by pf_unlink_state() in
		 * another thread.
		 */
		PF_HASHROW_UNLOCK(ih);
		return (0);	/* XXXGL: undefined actually */
	}

	if (s->src.state == PF_TCPS_PROXY_DST) {
		/* XXX wire key the right one? */
		pf_send_tcp(NULL, s->rule.ptr, s->key[PF_SK_WIRE]->af,
		    &s->key[PF_SK_WIRE]->addr[1],
		    &s->key[PF_SK_WIRE]->addr[0],
		    s->key[PF_SK_WIRE]->port[1],
		    s->key[PF_SK_WIRE]->port[0],
		    s->src.seqhi, s->src.seqlo + 1,
		    TH_RST|TH_ACK, 0, 0, 0, 1, s->tag, NULL);
	}

	LIST_REMOVE(s, entry);
	pf_src_tree_remove_state(s);

	if (pfsync_delete_state_ptr != NULL)
		pfsync_delete_state_ptr(s);

	STATE_DEC_COUNTERS(s);

	s->timeout = PFTM_UNLINKED;

	PF_HASHROW_UNLOCK(ih);

	pf_detach_state(s);
	refcount_release(&s->refs);

	return (pf_release_state(s));
}

void
pf_free_state(struct pf_state *cur)
{

	KASSERT(cur->refs == 0, ("%s: %p has refs", __func__, cur));
	KASSERT(cur->timeout == PFTM_UNLINKED, ("%s: timeout %u", __func__,
	    cur->timeout));

	pf_normalize_tcp_cleanup(cur);
	uma_zfree(V_pf_state_z, cur);
	counter_u64_add(V_pf_status.fcounters[FCNT_STATE_REMOVALS], 1);
}

/*
 * Called only from pf_purge_thread(), thus serialized.
 */
static u_int
pf_purge_expired_states(u_int i, int maxcheck)
{
	struct pf_idhash *ih;
	struct pf_state *s;

	V_pf_status.states = uma_zone_get_cur(V_pf_state_z);

	/*
	 * Go through hash and unlink states that expire now.
	 */
	while (maxcheck > 0) {

		ih = &V_pf_idhash[i];
relock:
		PF_HASHROW_LOCK(ih);
		LIST_FOREACH(s, &ih->states, entry) {
			if (pf_state_expires(s) <= time_uptime) {
				V_pf_status.states -=
				    pf_unlink_state(s, PF_ENTER_LOCKED);
				goto relock;
			}
			s->rule.ptr->rule_flag |= PFRULE_REFS;
			if (s->nat_rule.ptr != NULL)
				s->nat_rule.ptr->rule_flag |= PFRULE_REFS;
			if (s->anchor.ptr != NULL)
				s->anchor.ptr->rule_flag |= PFRULE_REFS;
			s->kif->pfik_flags |= PFI_IFLAG_REFS;
			if (s->rt_kif)
				s->rt_kif->pfik_flags |= PFI_IFLAG_REFS;
		}
		PF_HASHROW_UNLOCK(ih);

		/* Return when we hit end of hash. */
		if (++i > pf_hashmask) {
			V_pf_status.states = uma_zone_get_cur(V_pf_state_z);
			return (0);
		}

		maxcheck--;
	}

	V_pf_status.states = uma_zone_get_cur(V_pf_state_z);

	return (i);
}

static void
pf_purge_unlinked_rules()
{
	struct pf_rulequeue tmpq;
	struct pf_rule *r, *r1;

	/*
	 * If we have an overload task pending, we'd better skip
	 * purging this time. There is a tiny probability that the
	 * overload task references an already unlinked rule.
	 */
	PF_OVERLOADQ_LOCK();
	if (!SLIST_EMPTY(&V_pf_overloadqueue)) {
		PF_OVERLOADQ_UNLOCK();
		return;
	}
	PF_OVERLOADQ_UNLOCK();

	/*
	 * Do naive mark-and-sweep garbage collecting of old rules.
	 * Reference flag is raised by pf_purge_expired_states()
	 * and pf_purge_expired_src_nodes().
	 *
	 * To avoid LOR between PF_UNLNKDRULES_LOCK/PF_RULES_WLOCK,
	 * use a temporary queue.
	 */
	TAILQ_INIT(&tmpq);
	PF_UNLNKDRULES_LOCK();
	TAILQ_FOREACH_SAFE(r, &V_pf_unlinked_rules, entries, r1) {
		if (!(r->rule_flag & PFRULE_REFS)) {
			TAILQ_REMOVE(&V_pf_unlinked_rules, r, entries);
			TAILQ_INSERT_TAIL(&tmpq, r, entries);
		} else
			r->rule_flag &= ~PFRULE_REFS;
	}
	PF_UNLNKDRULES_UNLOCK();

	if (!TAILQ_EMPTY(&tmpq)) {
		PF_RULES_WLOCK();
		TAILQ_FOREACH_SAFE(r, &tmpq, entries, r1) {
			TAILQ_REMOVE(&tmpq, r, entries);
			pf_free_rule(r);
		}
		PF_RULES_WUNLOCK();
	}
}

void
pf_print_host(struct pf_addr *addr, u_int16_t p, sa_family_t af)
{
	switch (af) {
#ifdef INET
	case AF_INET: {
		u_int32_t a = ntohl(addr->addr32[0]);
		printf("%u.%u.%u.%u", (a>>24)&255, (a>>16)&255,
		    (a>>8)&255, a&255);
		if (p) {
			p = ntohs(p);
			printf(":%u", p);
		}
		break;
	}
#endif /* INET */
#ifdef INET6
	case AF_INET6: {
		u_int16_t b;
		u_int8_t i, curstart, curend, maxstart, maxend;
		curstart = curend = maxstart = maxend = 255;
		for (i = 0; i < 8; i++) {
			if (!addr->addr16[i]) {
				if (curstart == 255)
					curstart = i;
				curend = i;
			} else {
				if ((curend - curstart) >
				    (maxend - maxstart)) {
					maxstart = curstart;
					maxend = curend;
				}
				curstart = curend = 255;
			}
		}
		if ((curend - curstart) >
		    (maxend - maxstart)) {
			maxstart = curstart;
			maxend = curend;
		}
		for (i = 0; i < 8; i++) {
			if (i >= maxstart && i <= maxend) {
				if (i == 0)
					printf(":");
				if (i == maxend)
					printf(":");
			} else {
				b = ntohs(addr->addr16[i]);
				printf("%x", b);
				if (i < 7)
					printf(":");
			}
		}
		if (p) {
			p = ntohs(p);
			printf("[%u]", p);
		}
		break;
	}
#endif /* INET6 */
	}
}

void
pf_print_state(struct pf_state *s)
{
	pf_print_state_parts(s, NULL, NULL);
}

static void
pf_print_state_parts(struct pf_state *s,
    struct pf_state_key *skwp, struct pf_state_key *sksp)
{
	struct pf_state_key *skw, *sks;
	u_int8_t proto, dir;

	/* Do our best to fill these, but they're skipped if NULL */
	skw = skwp ? skwp : (s ? s->key[PF_SK_WIRE] : NULL);
	sks = sksp ? sksp : (s ? s->key[PF_SK_STACK] : NULL);
	proto = skw ? skw->proto : (sks ? sks->proto : 0);
	dir = s ? s->direction : 0;

	switch (proto) {
	case IPPROTO_IPV4:
		printf("IPv4");
		break;
	case IPPROTO_IPV6:
		printf("IPv6");
		break;
	case IPPROTO_TCP:
		printf("TCP");
		break;
	case IPPROTO_UDP:
		printf("UDP");
		break;
	case IPPROTO_ICMP:
		printf("ICMP");
		break;
	case IPPROTO_ICMPV6:
		printf("ICMPv6");
		break;
	default:
		printf("%u", proto);
		break;
	}
	switch (dir) {
	case PF_IN:
		printf(" in");
		break;
	case PF_OUT:
		printf(" out");
		break;
	}
	if (skw) {
		printf(" wire: ");
		pf_print_host(&skw->addr[0], skw->port[0], skw->af);
		printf(" ");
		pf_print_host(&skw->addr[1], skw->port[1], skw->af);
	}
	if (sks) {
		printf(" stack: ");
		if (sks != skw) {
			pf_print_host(&sks->addr[0], sks->port[0], sks->af);
			printf(" ");
			pf_print_host(&sks->addr[1], sks->port[1], sks->af);
		} else
			printf("-");
	}
	if (s) {
		if (proto == IPPROTO_TCP) {
			printf(" [lo=%u high=%u win=%u modulator=%u",
			    s->src.seqlo, s->src.seqhi,
			    s->src.max_win, s->src.seqdiff);
			if (s->src.wscale && s->dst.wscale)
				printf(" wscale=%u",
				    s->src.wscale & PF_WSCALE_MASK);
			printf("]");
			printf(" [lo=%u high=%u win=%u modulator=%u",
			    s->dst.seqlo, s->dst.seqhi,
			    s->dst.max_win, s->dst.seqdiff);
			if (s->src.wscale && s->dst.wscale)
				printf(" wscale=%u",
				s->dst.wscale & PF_WSCALE_MASK);
			printf("]");
		}
		printf(" %u:%u", s->src.state, s->dst.state);
	}
}

void
pf_print_flags(u_int8_t f)
{
	if (f)
		printf(" ");
	if (f & TH_FIN)
		printf("F");
	if (f & TH_SYN)
		printf("S");
	if (f & TH_RST)
		printf("R");
	if (f & TH_PUSH)
		printf("P");
	if (f & TH_ACK)
		printf("A");
	if (f & TH_URG)
		printf("U");
	if (f & TH_ECE)
		printf("E");
	if (f & TH_CWR)
		printf("W");
}

1961#define	PF_SET_SKIP_STEPS(i)					\
1962	do {							\
1963		while (head[i] != cur) {			\
1964			head[i]->skip[i].ptr = cur;		\
1965			head[i] = TAILQ_NEXT(head[i], entries);	\
1966		}						\
1967	} while (0)
1968
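/*
 * Compute the skip steps for a rule list.  Runs of consecutive rules that
 * test the same value for a field (interface, direction, af, proto,
 * src/dst address, src/dst port) are chained together so that, when that
 * test fails, evaluation can jump over the entire run instead of
 * re-testing every rule.  Illustrative sketch, not part of the build:
 * given the hypothetical ruleset
 *
 *	pass in on em0 proto tcp to port 22
 *	pass in on em0 proto tcp to port 80
 *	pass in on em1 proto udp to port 53
 *
 * a packet arriving on em1 fails the "on em0" test at rule 1 and follows
 * skip[PF_SKIP_IFP] straight to rule 3, never evaluating rule 2.
 */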
1969void
1970pf_calc_skip_steps(struct pf_rulequeue *rules)
1971{
1972	struct pf_rule *cur, *prev, *head[PF_SKIP_COUNT];
1973	int i;
1974
1975	cur = TAILQ_FIRST(rules);
1976	prev = cur;
1977	for (i = 0; i < PF_SKIP_COUNT; ++i)
1978		head[i] = cur;
1979	while (cur != NULL) {
1980
1981		if (cur->kif != prev->kif || cur->ifnot != prev->ifnot)
1982			PF_SET_SKIP_STEPS(PF_SKIP_IFP);
1983		if (cur->direction != prev->direction)
1984			PF_SET_SKIP_STEPS(PF_SKIP_DIR);
1985		if (cur->af != prev->af)
1986			PF_SET_SKIP_STEPS(PF_SKIP_AF);
1987		if (cur->proto != prev->proto)
1988			PF_SET_SKIP_STEPS(PF_SKIP_PROTO);
1989		if (cur->src.neg != prev->src.neg ||
1990		    pf_addr_wrap_neq(&cur->src.addr, &prev->src.addr))
1991			PF_SET_SKIP_STEPS(PF_SKIP_SRC_ADDR);
1992		if (cur->src.port[0] != prev->src.port[0] ||
1993		    cur->src.port[1] != prev->src.port[1] ||
1994		    cur->src.port_op != prev->src.port_op)
1995			PF_SET_SKIP_STEPS(PF_SKIP_SRC_PORT);
1996		if (cur->dst.neg != prev->dst.neg ||
1997		    pf_addr_wrap_neq(&cur->dst.addr, &prev->dst.addr))
1998			PF_SET_SKIP_STEPS(PF_SKIP_DST_ADDR);
1999		if (cur->dst.port[0] != prev->dst.port[0] ||
2000		    cur->dst.port[1] != prev->dst.port[1] ||
2001		    cur->dst.port_op != prev->dst.port_op)
2002			PF_SET_SKIP_STEPS(PF_SKIP_DST_PORT);
2003
2004		prev = cur;
2005		cur = TAILQ_NEXT(cur, entries);
2006	}
2007	for (i = 0; i < PF_SKIP_COUNT; ++i)
2008		PF_SET_SKIP_STEPS(i);
2009}
2010
2011static int
2012pf_addr_wrap_neq(struct pf_addr_wrap *aw1, struct pf_addr_wrap *aw2)
2013{
2014	if (aw1->type != aw2->type)
2015		return (1);
2016	switch (aw1->type) {
2017	case PF_ADDR_ADDRMASK:
2018	case PF_ADDR_RANGE:
2019		if (PF_ANEQ(&aw1->v.a.addr, &aw2->v.a.addr, AF_INET6))
2020			return (1);
2021		if (PF_ANEQ(&aw1->v.a.mask, &aw2->v.a.mask, AF_INET6))
2022			return (1);
2023		return (0);
2024	case PF_ADDR_DYNIFTL:
2025		return (aw1->p.dyn->pfid_kt != aw2->p.dyn->pfid_kt);
2026	case PF_ADDR_NOROUTE:
2027	case PF_ADDR_URPFFAILED:
2028		return (0);
2029	case PF_ADDR_TABLE:
2030		return (aw1->p.tbl != aw2->p.tbl);
2031	default:
2032		printf("invalid address type: %d\n", aw1->type);
2033		return (1);
2034	}
2035}
2036
2037/**
2038 * Checksum updates are a little complicated because the checksum in the TCP/UDP
2039 * header isn't always a full checksum. In some cases (e.g. on output, with
2040 * checksum offload pending) it's a pseudo-header checksum, i.e. a partial
2041 * checksum over only the src/dst IP addresses, protocol number and length.
2042 *
2043 * That leaves us with the following cases:
2044 *  * Input or forwarding: checksum offload is not pending, the checksum fields
2045 *  	hold full checksums, so we must update them whenever we change anything.
2046 *  * Output (i.e. the checksum is a pseudo-header checksum):
2047 *  	x The field being updated is a src/dst address or affects the length of
2048 *  	the packet. We must update the pseudo-header checksum (note that this
2049 *  	checksum is not ones' complement).
2050 *  	x Some other field is being modified (e.g. src/dst port numbers): we
2051 *  	don't have to update anything.
2052 **/
2053u_int16_t
2054pf_cksum_fixup(u_int16_t cksum, u_int16_t old, u_int16_t new, u_int8_t udp)
2055{
2056	u_int32_t	l;
2057
2058	if (udp && !cksum)
2059		return (0x0000);
2060	l = cksum + old - new;
2061	l = (l >> 16) + (l & 65535);
2062	l = l & 65535;
2063	if (udp && !l)
2064		return (0xFFFF);
2065	return (l);
2066}
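/*
 * Illustrative example, not part of the build: incrementally patch a
 * checksum after rewriting one 16-bit word, in the style of RFC 1624.
 * If NAT rewrites a TCP source port from 1000 to 2000:
 *
 *	th->th_sum = pf_cksum_fixup(th->th_sum, htons(1000), htons(2000), 0);
 *
 * The old word is added back, the new word subtracted and the carries
 * folded into the low 16 bits, so the packet never has to be re-summed.
 * The udp argument preserves the UDP convention that a transmitted
 * checksum of zero means "no checksum".
 */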
2067
2068u_int16_t
2069pf_proto_cksum_fixup(struct mbuf *m, u_int16_t cksum, u_int16_t old,
2070        u_int16_t new, u_int8_t udp)
2071{
2072	if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6))
2073		return (cksum);
2074
2075	return (pf_cksum_fixup(cksum, old, new, udp));
2076}
2077
2078static void
2079pf_change_ap(struct mbuf *m, struct pf_addr *a, u_int16_t *p, u_int16_t *ic,
2080        u_int16_t *pc, struct pf_addr *an, u_int16_t pn, u_int8_t u,
2081        sa_family_t af)
2082{
2083	struct pf_addr	ao;
2084	u_int16_t	po = *p;
2085
2086	PF_ACPY(&ao, a, af);
2087	PF_ACPY(a, an, af);
2088
2089	if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6))
2090		*pc = ~*pc;
2091
2092	*p = pn;
2093
2094	switch (af) {
2095#ifdef INET
2096	case AF_INET:
2097		*ic = pf_cksum_fixup(pf_cksum_fixup(*ic,
2098		    ao.addr16[0], an->addr16[0], 0),
2099		    ao.addr16[1], an->addr16[1], 0);
2101
2102		*pc = pf_cksum_fixup(pf_cksum_fixup(*pc,
2103		    ao.addr16[0], an->addr16[0], u),
2104		    ao.addr16[1], an->addr16[1], u);
2105
2106		*pc = pf_proto_cksum_fixup(m, *pc, po, pn, u);
2107		break;
2108#endif /* INET */
2109#ifdef INET6
2110	case AF_INET6:
2111		*pc = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
2112		    pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
2113		    pf_cksum_fixup(pf_cksum_fixup(*pc,
2114		    ao.addr16[0], an->addr16[0], u),
2115		    ao.addr16[1], an->addr16[1], u),
2116		    ao.addr16[2], an->addr16[2], u),
2117		    ao.addr16[3], an->addr16[3], u),
2118		    ao.addr16[4], an->addr16[4], u),
2119		    ao.addr16[5], an->addr16[5], u),
2120		    ao.addr16[6], an->addr16[6], u),
2121		    ao.addr16[7], an->addr16[7], u);
2122
2123		*pc = pf_proto_cksum_fixup(m, *pc, po, pn, u);
2124		break;
2125#endif /* INET6 */
2126	}
2127
2128	if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA |
2129	    CSUM_DELAY_DATA_IPV6)) {
2130		*pc = ~*pc;
2131		if (! *pc)
2132			*pc = 0xffff;
2133	}
2134}
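/*
 * Note on the inversions above: with checksum offload pending, the field
 * holds the not-yet-complemented pseudo-header sum, while pf_cksum_fixup()
 * expects the complemented form found in a completed packet.  Inverting
 * around the fixups bridges the two, and a resulting 0x0000 is mapped to
 * 0xffff because an all-zero checksum field has special meaning for UDP.
 */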
2135
2136/* Changes a u_int32_t.  Uses a void * so there are no alignment restrictions. */
2137void
2138pf_change_a(void *a, u_int16_t *c, u_int32_t an, u_int8_t u)
2139{
2140	u_int32_t	ao;
2141
2142	memcpy(&ao, a, sizeof(ao));
2143	memcpy(a, &an, sizeof(u_int32_t));
2144	*c = pf_cksum_fixup(pf_cksum_fixup(*c, ao / 65536, an / 65536, u),
2145	    ao % 65536, an % 65536, u);
2146}
2147
2148void
2149pf_change_proto_a(struct mbuf *m, void *a, u_int16_t *c, u_int32_t an, u_int8_t udp)
2150{
2151	u_int32_t	ao;
2152
2153	memcpy(&ao, a, sizeof(ao));
2154	memcpy(a, &an, sizeof(u_int32_t));
2155
2156	*c = pf_proto_cksum_fixup(m,
2157	    pf_proto_cksum_fixup(m, *c, ao / 65536, an / 65536, udp),
2158	    ao % 65536, an % 65536, udp);
2159}
2160
2161#ifdef INET6
2162static void
2163pf_change_a6(struct pf_addr *a, u_int16_t *c, struct pf_addr *an, u_int8_t u)
2164{
2165	struct pf_addr	ao;
2166
2167	PF_ACPY(&ao, a, AF_INET6);
2168	PF_ACPY(a, an, AF_INET6);
2169
2170	*c = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
2171	    pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
2172	    pf_cksum_fixup(pf_cksum_fixup(*c,
2173	    ao.addr16[0], an->addr16[0], u),
2174	    ao.addr16[1], an->addr16[1], u),
2175	    ao.addr16[2], an->addr16[2], u),
2176	    ao.addr16[3], an->addr16[3], u),
2177	    ao.addr16[4], an->addr16[4], u),
2178	    ao.addr16[5], an->addr16[5], u),
2179	    ao.addr16[6], an->addr16[6], u),
2180	    ao.addr16[7], an->addr16[7], u);
2181}
2182#endif /* INET6 */
2183
2184static void
2185pf_change_icmp(struct pf_addr *ia, u_int16_t *ip, struct pf_addr *oa,
2186    struct pf_addr *na, u_int16_t np, u_int16_t *pc, u_int16_t *h2c,
2187    u_int16_t *ic, u_int16_t *hc, u_int8_t u, sa_family_t af)
2188{
2189	struct pf_addr	oia, ooa;
2190
2191	PF_ACPY(&oia, ia, af);
2192	if (oa)
2193		PF_ACPY(&ooa, oa, af);
2194
2195	/* Change inner protocol port, fix inner protocol checksum. */
2196	if (ip != NULL) {
2197		u_int16_t	oip = *ip;
2198		u_int32_t	opc;
2199
2200		if (pc != NULL)
2201			opc = *pc;
2202		*ip = np;
2203		if (pc != NULL)
2204			*pc = pf_cksum_fixup(*pc, oip, *ip, u);
2205		*ic = pf_cksum_fixup(*ic, oip, *ip, 0);
2206		if (pc != NULL)
2207			*ic = pf_cksum_fixup(*ic, opc, *pc, 0);
2208	}
2209	/* Change inner ip address, fix inner ip and icmp checksums. */
2210	PF_ACPY(ia, na, af);
2211	switch (af) {
2212#ifdef INET
2213	case AF_INET: {
2214		u_int32_t	 oh2c = *h2c;
2215
2216		*h2c = pf_cksum_fixup(pf_cksum_fixup(*h2c,
2217		    oia.addr16[0], ia->addr16[0], 0),
2218		    oia.addr16[1], ia->addr16[1], 0);
2219		*ic = pf_cksum_fixup(pf_cksum_fixup(*ic,
2220		    oia.addr16[0], ia->addr16[0], 0),
2221		    oia.addr16[1], ia->addr16[1], 0);
2222		*ic = pf_cksum_fixup(*ic, oh2c, *h2c, 0);
2223		break;
2224	}
2225#endif /* INET */
2226#ifdef INET6
2227	case AF_INET6:
2228		*ic = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
2229		    pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
2230		    pf_cksum_fixup(pf_cksum_fixup(*ic,
2231		    oia.addr16[0], ia->addr16[0], u),
2232		    oia.addr16[1], ia->addr16[1], u),
2233		    oia.addr16[2], ia->addr16[2], u),
2234		    oia.addr16[3], ia->addr16[3], u),
2235		    oia.addr16[4], ia->addr16[4], u),
2236		    oia.addr16[5], ia->addr16[5], u),
2237		    oia.addr16[6], ia->addr16[6], u),
2238		    oia.addr16[7], ia->addr16[7], u);
2239		break;
2240#endif /* INET6 */
2241	}
2242	/* Outer ip address, fix outer ip or icmpv6 checksum, if necessary. */
2243	if (oa) {
2244		PF_ACPY(oa, na, af);
2245		switch (af) {
2246#ifdef INET
2247		case AF_INET:
2248			*hc = pf_cksum_fixup(pf_cksum_fixup(*hc,
2249			    ooa.addr16[0], oa->addr16[0], 0),
2250			    ooa.addr16[1], oa->addr16[1], 0);
2251			break;
2252#endif /* INET */
2253#ifdef INET6
2254		case AF_INET6:
2255			*ic = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
2256			    pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
2257			    pf_cksum_fixup(pf_cksum_fixup(*ic,
2258			    ooa.addr16[0], oa->addr16[0], u),
2259			    ooa.addr16[1], oa->addr16[1], u),
2260			    ooa.addr16[2], oa->addr16[2], u),
2261			    ooa.addr16[3], oa->addr16[3], u),
2262			    ooa.addr16[4], oa->addr16[4], u),
2263			    ooa.addr16[5], oa->addr16[5], u),
2264			    ooa.addr16[6], oa->addr16[6], u),
2265			    ooa.addr16[7], oa->addr16[7], u);
2266			break;
2267#endif /* INET6 */
2268		}
2269	}
2270}
2271
2273/*
2274 * Need to modulate the sequence numbers in the TCP SACK option
2275 * (credits to Krzysztof Pfaff for report and patch)
2276 */
2277static int
2278pf_modulate_sack(struct mbuf *m, int off, struct pf_pdesc *pd,
2279    struct tcphdr *th, struct pf_state_peer *dst)
2280{
2281	int hlen = (th->th_off << 2) - sizeof(*th), thoptlen = hlen;
2282	u_int8_t opts[TCP_MAXOLEN], *opt = opts;
2283	int copyback = 0, i, olen;
2284	struct sackblk sack;
2285
2286#define	TCPOLEN_SACKLEN	(TCPOLEN_SACK + 2)
2287	if (hlen < TCPOLEN_SACKLEN ||
2288	    !pf_pull_hdr(m, off + sizeof(*th), opts, hlen, NULL, NULL, pd->af))
2289		return (0);
2290
2291	while (hlen >= TCPOLEN_SACKLEN) {
2292		olen = opt[1];
2293		switch (*opt) {
2294		case TCPOPT_EOL:	/* FALLTHROUGH */
2295		case TCPOPT_NOP:
2296			opt++;
2297			hlen--;
2298			break;
2299		case TCPOPT_SACK:
2300			if (olen > hlen)
2301				olen = hlen;
2302			if (olen >= TCPOLEN_SACKLEN) {
2303				for (i = 2; i + TCPOLEN_SACK <= olen;
2304				    i += TCPOLEN_SACK) {
2305					memcpy(&sack, &opt[i], sizeof(sack));
2306					pf_change_proto_a(m, &sack.start, &th->th_sum,
2307					    htonl(ntohl(sack.start) - dst->seqdiff), 0);
2308					pf_change_proto_a(m, &sack.end, &th->th_sum,
2309					    htonl(ntohl(sack.end) - dst->seqdiff), 0);
2310					memcpy(&opt[i], &sack, sizeof(sack));
2311				}
2312				copyback = 1;
2313			}
2314			/* FALLTHROUGH */
2315		default:
2316			if (olen < 2)
2317				olen = 2;
2318			hlen -= olen;
2319			opt += olen;
2320		}
2321	}
2322
2323	if (copyback)
2324		m_copyback(m, off + sizeof(*th), thoptlen, (caddr_t)opts);
2325	return (copyback);
2326}
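/*
 * Illustrative example, not part of the build: with dst->seqdiff == 1000,
 * a SACK block that should acknowledge the peer's bytes [5000, 6000)
 * arrives carrying the modulated values [6000, 7000); pf_modulate_sack()
 * rewrites each edge back by the same difference,
 *
 *	sack.start = htonl(ntohl(sack.start) - dst->seqdiff);
 *	sack.end   = htonl(ntohl(sack.end) - dst->seqdiff);
 *
 * patching the TCP checksum along the way via pf_change_proto_a().
 */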
2327
2328static void
2329pf_send_tcp(struct mbuf *replyto, const struct pf_rule *r, sa_family_t af,
2330    const struct pf_addr *saddr, const struct pf_addr *daddr,
2331    u_int16_t sport, u_int16_t dport, u_int32_t seq, u_int32_t ack,
2332    u_int8_t flags, u_int16_t win, u_int16_t mss, u_int8_t ttl, int tag,
2333    u_int16_t rtag, struct ifnet *ifp)
2334{
2335	struct pf_send_entry *pfse;
2336	struct mbuf	*m;
2337	int		 len, tlen;
2338#ifdef INET
2339	struct ip	*h = NULL;
2340#endif /* INET */
2341#ifdef INET6
2342	struct ip6_hdr	*h6 = NULL;
2343#endif /* INET6 */
2344	struct tcphdr	*th;
2345	char		*opt;
2346	struct pf_mtag  *pf_mtag;
2347
2348	len = 0;
2349	th = NULL;
2350
2351	/* maximum segment size tcp option */
2352	tlen = sizeof(struct tcphdr);
2353	if (mss)
2354		tlen += 4;
2355
2356	switch (af) {
2357#ifdef INET
2358	case AF_INET:
2359		len = sizeof(struct ip) + tlen;
2360		break;
2361#endif /* INET */
2362#ifdef INET6
2363	case AF_INET6:
2364		len = sizeof(struct ip6_hdr) + tlen;
2365		break;
2366#endif /* INET6 */
2367	default:
2368		panic("%s: unsupported af %d", __func__, af);
2369	}
2370
2371	/* Allocate outgoing queue entry, mbuf and mbuf tag. */
2372	pfse = malloc(sizeof(*pfse), M_PFTEMP, M_NOWAIT);
2373	if (pfse == NULL)
2374		return;
2375	m = m_gethdr(M_NOWAIT, MT_DATA);
2376	if (m == NULL) {
2377		free(pfse, M_PFTEMP);
2378		return;
2379	}
2380#ifdef MAC
2381	mac_netinet_firewall_send(m);
2382#endif
2383	if ((pf_mtag = pf_get_mtag(m)) == NULL) {
2384		free(pfse, M_PFTEMP);
2385		m_freem(m);
2386		return;
2387	}
2388	if (tag)
2389		m->m_flags |= M_SKIP_FIREWALL;
2390	pf_mtag->tag = rtag;
2391
2392	if (r != NULL && r->rtableid >= 0)
2393		M_SETFIB(m, r->rtableid);
2394
2395#ifdef ALTQ
2396	if (r != NULL && r->qid) {
2397		pf_mtag->qid = r->qid;
2398
2399		/* add hints for ecn */
2400		pf_mtag->hdr = mtod(m, struct ip *);
2401	}
2402#endif /* ALTQ */
2403	m->m_data += max_linkhdr;
2404	m->m_pkthdr.len = m->m_len = len;
2405	m->m_pkthdr.rcvif = NULL;
2406	bzero(m->m_data, len);
2407	switch (af) {
2408#ifdef INET
2409	case AF_INET:
2410		h = mtod(m, struct ip *);
2411
2412		/* IP header fields included in the TCP checksum */
2413		h->ip_p = IPPROTO_TCP;
2414		h->ip_len = htons(tlen);
2415		h->ip_src.s_addr = saddr->v4.s_addr;
2416		h->ip_dst.s_addr = daddr->v4.s_addr;
2417
2418		th = (struct tcphdr *)((caddr_t)h + sizeof(struct ip));
2419		break;
2420#endif /* INET */
2421#ifdef INET6
2422	case AF_INET6:
2423		h6 = mtod(m, struct ip6_hdr *);
2424
2425		/* IP header fields included in the TCP checksum */
2426		h6->ip6_nxt = IPPROTO_TCP;
2427		h6->ip6_plen = htons(tlen);
2428		memcpy(&h6->ip6_src, &saddr->v6, sizeof(struct in6_addr));
2429		memcpy(&h6->ip6_dst, &daddr->v6, sizeof(struct in6_addr));
2430
2431		th = (struct tcphdr *)((caddr_t)h6 + sizeof(struct ip6_hdr));
2432		break;
2433#endif /* INET6 */
2434	}
2435
2436	/* TCP header */
2437	th->th_sport = sport;
2438	th->th_dport = dport;
2439	th->th_seq = htonl(seq);
2440	th->th_ack = htonl(ack);
2441	th->th_off = tlen >> 2;
2442	th->th_flags = flags;
2443	th->th_win = htons(win);
2444
2445	if (mss) {
2446		opt = (char *)(th + 1);
2447		opt[0] = TCPOPT_MAXSEG;
2448		opt[1] = 4;
2449		HTONS(mss);
2450		bcopy((caddr_t)&mss, (caddr_t)(opt + 2), 2);
2451	}
2452
2453	switch (af) {
2454#ifdef INET
2455	case AF_INET:
2456		/* TCP checksum */
2457		th->th_sum = in_cksum(m, len);
2458
2459		/* Finish the IP header */
2460		h->ip_v = 4;
2461		h->ip_hl = sizeof(*h) >> 2;
2462		h->ip_tos = IPTOS_LOWDELAY;
2463		h->ip_off = htons(V_path_mtu_discovery ? IP_DF : 0);
2464		h->ip_len = htons(len);
2465		h->ip_ttl = ttl ? ttl : V_ip_defttl;
2466		h->ip_sum = 0;
2467
2468		pfse->pfse_type = PFSE_IP;
2469		break;
2470#endif /* INET */
2471#ifdef INET6
2472	case AF_INET6:
2473		/* TCP checksum */
2474		th->th_sum = in6_cksum(m, IPPROTO_TCP,
2475		    sizeof(struct ip6_hdr), tlen);
2476
2477		h6->ip6_vfc |= IPV6_VERSION;
2478		h6->ip6_hlim = IPV6_DEFHLIM;
2479
2480		pfse->pfse_type = PFSE_IP6;
2481		break;
2482#endif /* INET6 */
2483	}
2484	pfse->pfse_m = m;
2485	pf_send(pfse);
2486}
2487
2488static int
2489pf_ieee8021q_setpcp(struct mbuf *m, u_int8_t prio)
2490{
2491	struct m_tag *mtag;
2492
2493	KASSERT(prio <= PF_PRIO_MAX,
2494	    ("%s with invalid pcp", __func__));
2495
2496	mtag = m_tag_locate(m, MTAG_8021Q, MTAG_8021Q_PCP_OUT, NULL);
2497	if (mtag == NULL) {
2498		mtag = m_tag_alloc(MTAG_8021Q, MTAG_8021Q_PCP_OUT,
2499		    sizeof(uint8_t), M_NOWAIT);
2500		if (mtag == NULL)
2501			return (ENOMEM);
2502		m_tag_prepend(m, mtag);
2503	}
2504
2505	*(uint8_t *)(mtag + 1) = prio;
2506	return (0);
2507}
2508
2509static int
2510pf_match_ieee8021q_pcp(u_int8_t prio, struct mbuf *m)
2511{
2512	struct m_tag *mtag;
2513	u_int8_t mpcp;
2514
2515	mtag = m_tag_locate(m, MTAG_8021Q, MTAG_8021Q_PCP_IN, NULL);
2516	if (mtag == NULL)
2517		return (0);
2518
2519	if (prio == PF_PRIO_ZERO)
2520		prio = 0;
2521
2522	mpcp = *(uint8_t *)(mtag + 1);
2523
2524	return (mpcp == prio);
2525}
2526
2527static void
2528pf_send_icmp(struct mbuf *m, u_int8_t type, u_int8_t code, sa_family_t af,
2529    struct pf_rule *r)
2530{
2531	struct pf_send_entry *pfse;
2532	struct mbuf *m0;
2533	struct pf_mtag *pf_mtag;
2534
2535	/* Allocate outgoing queue entry, mbuf and mbuf tag. */
2536	pfse = malloc(sizeof(*pfse), M_PFTEMP, M_NOWAIT);
2537	if (pfse == NULL)
2538		return;
2539
2540	if ((m0 = m_copypacket(m, M_NOWAIT)) == NULL) {
2541		free(pfse, M_PFTEMP);
2542		return;
2543	}
2544
2545	if ((pf_mtag = pf_get_mtag(m0)) == NULL) {
2546		free(pfse, M_PFTEMP);
2547		return;
2548	}
2549	/* XXX: revisit */
2550	m0->m_flags |= M_SKIP_FIREWALL;
2551
2552	if (r->rtableid >= 0)
2553		M_SETFIB(m0, r->rtableid);
2554
2555#ifdef ALTQ
2556	if (r->qid) {
2557		pf_mtag->qid = r->qid;
2558		/* add hints for ecn */
2559		pf_mtag->hdr = mtod(m0, struct ip *);
2560	}
2561#endif /* ALTQ */
2562
2563	switch (af) {
2564#ifdef INET
2565	case AF_INET:
2566		pfse->pfse_type = PFSE_ICMP;
2567		break;
2568#endif /* INET */
2569#ifdef INET6
2570	case AF_INET6:
2571		pfse->pfse_type = PFSE_ICMP6;
2572		break;
2573#endif /* INET6 */
2574	}
2575	pfse->pfse_m = m0;
2576	pfse->icmpopts.type = type;
2577	pfse->icmpopts.code = code;
2578	pf_send(pfse);
2579}
2580
2581/*
2582 * Compare the addresses a and b under mask m and return 1 on a match,
2583 * 0 otherwise.  If n is 0, a match means the masked addresses are equal;
2584 * if n is nonzero, the sense is inverted and a match means they differ.
2585 */
2586int
2587pf_match_addr(u_int8_t n, struct pf_addr *a, struct pf_addr *m,
2588    struct pf_addr *b, sa_family_t af)
2589{
2590	int	match = 0;
2591
2592	switch (af) {
2593#ifdef INET
2594	case AF_INET:
2595		if ((a->addr32[0] & m->addr32[0]) ==
2596		    (b->addr32[0] & m->addr32[0]))
2597			match++;
2598		break;
2599#endif /* INET */
2600#ifdef INET6
2601	case AF_INET6:
2602		if (((a->addr32[0] & m->addr32[0]) ==
2603		     (b->addr32[0] & m->addr32[0])) &&
2604		    ((a->addr32[1] & m->addr32[1]) ==
2605		     (b->addr32[1] & m->addr32[1])) &&
2606		    ((a->addr32[2] & m->addr32[2]) ==
2607		     (b->addr32[2] & m->addr32[2])) &&
2608		    ((a->addr32[3] & m->addr32[3]) ==
2609		     (b->addr32[3] & m->addr32[3])))
2610			match++;
2611		break;
2612#endif /* INET6 */
2613	}
2614	if (match) {
2615		if (n)
2616			return (0);
2617		else
2618			return (1);
2619	} else {
2620		if (n)
2621			return (1);
2622		else
2623			return (0);
2624	}
2625}
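/*
 * Illustrative example, not part of the build: with af == AF_INET,
 * a == 10.0.0.5, b == 10.0.0.0 and m == 255.255.255.0, the masked compare
 * succeeds, so pf_match_addr() returns 1 for a plain rule address (n == 0)
 * and 0 for a negated one (n != 0).
 */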
2626
2627/*
2628 * Return 1 if b <= a <= e, otherwise return 0.
2629 */
2630int
2631pf_match_addr_range(struct pf_addr *b, struct pf_addr *e,
2632    struct pf_addr *a, sa_family_t af)
2633{
2634	switch (af) {
2635#ifdef INET
2636	case AF_INET:
2637		if ((ntohl(a->addr32[0]) < ntohl(b->addr32[0])) ||
2638		    (ntohl(a->addr32[0]) > ntohl(e->addr32[0])))
2639			return (0);
2640		break;
2641#endif /* INET */
2642#ifdef INET6
2643	case AF_INET6: {
2644		int	i;
2645
2646		/* check a >= b */
2647		for (i = 0; i < 4; ++i)
2648			if (ntohl(a->addr32[i]) > ntohl(b->addr32[i]))
2649				break;
2650			else if (ntohl(a->addr32[i]) < ntohl(b->addr32[i]))
2651				return (0);
2652		/* check a <= e */
2653		for (i = 0; i < 4; ++i)
2654			if (ntohl(a->addr32[i]) < ntohl(e->addr32[i]))
2655				break;
2656			else if (ntohl(a->addr32[i]) > ntohl(e->addr32[i]))
2657				return (0);
2658		break;
2659	}
2660#endif /* INET6 */
2661	}
2662	return (1);
2663}
2664
2665static int
2666pf_match(u_int8_t op, u_int32_t a1, u_int32_t a2, u_int32_t p)
2667{
2668	switch (op) {
2669	case PF_OP_IRG:
2670		return ((p > a1) && (p < a2));
2671	case PF_OP_XRG:
2672		return ((p < a1) || (p > a2));
2673	case PF_OP_RRG:
2674		return ((p >= a1) && (p <= a2));
2675	case PF_OP_EQ:
2676		return (p == a1);
2677	case PF_OP_NE:
2678		return (p != a1);
2679	case PF_OP_LT:
2680		return (p < a1);
2681	case PF_OP_LE:
2682		return (p <= a1);
2683	case PF_OP_GT:
2684		return (p > a1);
2685	case PF_OP_GE:
2686		return (p >= a1);
2687	}
2688	return (0); /* never reached */
2689}
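/*
 * pf_match() implements the pf.conf comparison operators; for example,
 * "port 2000 >< 3000" compiles to op == PF_OP_IRG with a1 == 2000 and
 * a2 == 3000, matching only values strictly inside the range.
 */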
2690
2691int
2692pf_match_port(u_int8_t op, u_int16_t a1, u_int16_t a2, u_int16_t p)
2693{
2694	NTOHS(a1);
2695	NTOHS(a2);
2696	NTOHS(p);
2697	return (pf_match(op, a1, a2, p));
2698}
2699
2700static int
2701pf_match_uid(u_int8_t op, uid_t a1, uid_t a2, uid_t u)
2702{
2703	if (u == UID_MAX && op != PF_OP_EQ && op != PF_OP_NE)
2704		return (0);
2705	return (pf_match(op, a1, a2, u));
2706}
2707
2708static int
2709pf_match_gid(u_int8_t op, gid_t a1, gid_t a2, gid_t g)
2710{
2711	if (g == GID_MAX && op != PF_OP_EQ && op != PF_OP_NE)
2712		return (0);
2713	return (pf_match(op, a1, a2, g));
2714}
2715
2716int
2717pf_match_tag(struct mbuf *m, struct pf_rule *r, int *tag, int mtag)
2718{
2719	if (*tag == -1)
2720		*tag = mtag;
2721
2722	return ((!r->match_tag_not && r->match_tag == *tag) ||
2723	    (r->match_tag_not && r->match_tag != *tag));
2724}
2725
2726int
2727pf_tag_packet(struct mbuf *m, struct pf_pdesc *pd, int tag)
2728{
2729
2730	KASSERT(tag > 0, ("%s: tag %d", __func__, tag));
2731
2732	if (pd->pf_mtag == NULL && ((pd->pf_mtag = pf_get_mtag(m)) == NULL))
2733		return (ENOMEM);
2734
2735	pd->pf_mtag->tag = tag;
2736
2737	return (0);
2738}
2739
2740#define	PF_ANCHOR_STACKSIZE	32
2741struct pf_anchor_stackframe {
2742	struct pf_ruleset	*rs;
2743	struct pf_rule		*r;	/* XXX: + match bit */
2744	struct pf_anchor	*child;
2745};
2746
2747/*
2748 * XXX: We rely on malloc(9) returning pointer-aligned addresses.
2749 */
2750#define	PF_ANCHORSTACK_MATCH	0x00000001
2751#define	PF_ANCHORSTACK_MASK	(PF_ANCHORSTACK_MATCH)
2752
2753#define	PF_ANCHOR_MATCH(f)	((uintptr_t)(f)->r & PF_ANCHORSTACK_MATCH)
2754#define	PF_ANCHOR_RULE(f)	(struct pf_rule *)			\
2755				((uintptr_t)(f)->r & ~PF_ANCHORSTACK_MASK)
2756#define	PF_ANCHOR_SET_MATCH(f)	do { (f)->r = (void *) 			\
2757				((uintptr_t)(f)->r | PF_ANCHORSTACK_MATCH);  \
2758} while (0)
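/*
 * Illustrative sketch, not part of the build: the match flag lives in the
 * otherwise-zero low bit of the rule pointer stored in a stack frame:
 *
 *	f->r = r;			// e.g. 0x...a38, bit 0 clear
 *	PF_ANCHOR_SET_MATCH(f);		// now  0x...a39, bit 0 set
 *	r = PF_ANCHOR_RULE(f);		// masks the flag off again
 */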
2759
2760void
2761pf_step_into_anchor(struct pf_anchor_stackframe *stack, int *depth,
2762    struct pf_ruleset **rs, int n, struct pf_rule **r, struct pf_rule **a,
2763    int *match)
2764{
2765	struct pf_anchor_stackframe	*f;
2766
2767	PF_RULES_RASSERT();
2768
2769	if (match)
2770		*match = 0;
2771	if (*depth >= PF_ANCHOR_STACKSIZE) {
2772		printf("%s: anchor stack overflow on %s\n",
2773		    __func__, (*r)->anchor->name);
2774		*r = TAILQ_NEXT(*r, entries);
2775		return;
2776	} else if (*depth == 0 && a != NULL)
2777		*a = *r;
2778	f = stack + (*depth)++;
2779	f->rs = *rs;
2780	f->r = *r;
2781	if ((*r)->anchor_wildcard) {
2782		struct pf_anchor_node *parent = &(*r)->anchor->children;
2783
2784		if ((f->child = RB_MIN(pf_anchor_node, parent)) == NULL) {
2785			*r = NULL;
2786			return;
2787		}
2788		*rs = &f->child->ruleset;
2789	} else {
2790		f->child = NULL;
2791		*rs = &(*r)->anchor->ruleset;
2792	}
2793	*r = TAILQ_FIRST((*rs)->rules[n].active.ptr);
2794}
2795
2796int
2797pf_step_out_of_anchor(struct pf_anchor_stackframe *stack, int *depth,
2798    struct pf_ruleset **rs, int n, struct pf_rule **r, struct pf_rule **a,
2799    int *match)
2800{
2801	struct pf_anchor_stackframe	*f;
2802	struct pf_rule *fr;
2803	int quick = 0;
2804
2805	PF_RULES_RASSERT();
2806
2807	do {
2808		if (*depth <= 0)
2809			break;
2810		f = stack + *depth - 1;
2811		fr = PF_ANCHOR_RULE(f);
2812		if (f->child != NULL) {
2813			struct pf_anchor_node *parent;
2814
2815			/*
2816			 * This block iterates over the
2817			 * children of a wildcard anchor.
2818			 */
2819			parent = &fr->anchor->children;
2820			if (match != NULL && *match) {
2821				/*
2822				 * If any of "*" matched, then
2823				 * "foo/ *" matched, mark frame
2824				 * appropriately.
2825				 */
2826				PF_ANCHOR_SET_MATCH(f);
2827				*match = 0;
2828			}
2829			f->child = RB_NEXT(pf_anchor_node, parent, f->child);
2830			if (f->child != NULL) {
2831				*rs = &f->child->ruleset;
2832				*r = TAILQ_FIRST((*rs)->rules[n].active.ptr);
2833				if (*r == NULL)
2834					continue;
2835				else
2836					break;
2837			}
2838		}
2839		(*depth)--;
2840		if (*depth == 0 && a != NULL)
2841			*a = NULL;
2842		*rs = f->rs;
2843		if (PF_ANCHOR_MATCH(f) || (match != NULL && *match))
2844			quick = fr->quick;
2845		*r = TAILQ_NEXT(fr, entries);
2846	} while (*r == NULL);
2847
2848	return (quick);
2849}
2850
2851#ifdef INET6
2852void
2853pf_poolmask(struct pf_addr *naddr, struct pf_addr *raddr,
2854    struct pf_addr *rmask, struct pf_addr *saddr, sa_family_t af)
2855{
2856	switch (af) {
2857#ifdef INET
2858	case AF_INET:
2859		naddr->addr32[0] = (raddr->addr32[0] & rmask->addr32[0]) |
2860		    ((rmask->addr32[0] ^ 0xffffffff) & saddr->addr32[0]);
2861		break;
2862#endif /* INET */
2863	case AF_INET6:
2864		naddr->addr32[0] = (raddr->addr32[0] & rmask->addr32[0]) |
2865		    ((rmask->addr32[0] ^ 0xffffffff) & saddr->addr32[0]);
2866		naddr->addr32[1] = (raddr->addr32[1] & rmask->addr32[1]) |
2867		    ((rmask->addr32[1] ^ 0xffffffff) & saddr->addr32[1]);
2868		naddr->addr32[2] = (raddr->addr32[2] & rmask->addr32[2]) |
2869		    ((rmask->addr32[2] ^ 0xffffffff) & saddr->addr32[2]);
2870		naddr->addr32[3] = (raddr->addr32[3] & rmask->addr32[3]) |
2871		    ((rmask->addr32[3] ^ 0xffffffff) & saddr->addr32[3]);
2872		break;
2873	}
2874}
2875
2876void
2877pf_addr_inc(struct pf_addr *addr, sa_family_t af)
2878{
2879	switch (af) {
2880#ifdef INET
2881	case AF_INET:
2882		addr->addr32[0] = htonl(ntohl(addr->addr32[0]) + 1);
2883		break;
2884#endif /* INET */
2885	case AF_INET6:
2886		if (addr->addr32[3] == 0xffffffff) {
2887			addr->addr32[3] = 0;
2888			if (addr->addr32[2] == 0xffffffff) {
2889				addr->addr32[2] = 0;
2890				if (addr->addr32[1] == 0xffffffff) {
2891					addr->addr32[1] = 0;
2892					addr->addr32[0] =
2893					    htonl(ntohl(addr->addr32[0]) + 1);
2894				} else
2895					addr->addr32[1] =
2896					    htonl(ntohl(addr->addr32[1]) + 1);
2897			} else
2898				addr->addr32[2] =
2899				    htonl(ntohl(addr->addr32[2]) + 1);
2900		} else
2901			addr->addr32[3] =
2902			    htonl(ntohl(addr->addr32[3]) + 1);
2903		break;
2904	}
2905}
2906#endif /* INET6 */
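/*
 * Illustrative example, not part of the build: pf_addr_inc() ripples the
 * carry across the four 32-bit words, so incrementing
 * 2001:db8::ffff:ffff:ffff:ffff yields 2001:db8:0:1::.
 */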
2907
2908int
2909pf_socket_lookup(int direction, struct pf_pdesc *pd, struct mbuf *m)
2910{
2911	struct pf_addr		*saddr, *daddr;
2912	u_int16_t		 sport, dport;
2913	struct inpcbinfo	*pi;
2914	struct inpcb		*inp;
2915
2916	pd->lookup.uid = UID_MAX;
2917	pd->lookup.gid = GID_MAX;
2918
2919	switch (pd->proto) {
2920	case IPPROTO_TCP:
2921		if (pd->hdr.tcp == NULL)
2922			return (-1);
2923		sport = pd->hdr.tcp->th_sport;
2924		dport = pd->hdr.tcp->th_dport;
2925		pi = &V_tcbinfo;
2926		break;
2927	case IPPROTO_UDP:
2928		if (pd->hdr.udp == NULL)
2929			return (-1);
2930		sport = pd->hdr.udp->uh_sport;
2931		dport = pd->hdr.udp->uh_dport;
2932		pi = &V_udbinfo;
2933		break;
2934	default:
2935		return (-1);
2936	}
2937	if (direction == PF_IN) {
2938		saddr = pd->src;
2939		daddr = pd->dst;
2940	} else {
2941		u_int16_t	p;
2942
2943		p = sport;
2944		sport = dport;
2945		dport = p;
2946		saddr = pd->dst;
2947		daddr = pd->src;
2948	}
2949	switch (pd->af) {
2950#ifdef INET
2951	case AF_INET:
2952		inp = in_pcblookup_mbuf(pi, saddr->v4, sport, daddr->v4,
2953		    dport, INPLOOKUP_RLOCKPCB, NULL, m);
2954		if (inp == NULL) {
2955			inp = in_pcblookup_mbuf(pi, saddr->v4, sport,
2956			   daddr->v4, dport, INPLOOKUP_WILDCARD |
2957			   INPLOOKUP_RLOCKPCB, NULL, m);
2958			if (inp == NULL)
2959				return (-1);
2960		}
2961		break;
2962#endif /* INET */
2963#ifdef INET6
2964	case AF_INET6:
2965		inp = in6_pcblookup_mbuf(pi, &saddr->v6, sport, &daddr->v6,
2966		    dport, INPLOOKUP_RLOCKPCB, NULL, m);
2967		if (inp == NULL) {
2968			inp = in6_pcblookup_mbuf(pi, &saddr->v6, sport,
2969			    &daddr->v6, dport, INPLOOKUP_WILDCARD |
2970			    INPLOOKUP_RLOCKPCB, NULL, m);
2971			if (inp == NULL)
2972				return (-1);
2973		}
2974		break;
2975#endif /* INET6 */
2976
2977	default:
2978		return (-1);
2979	}
2980	INP_RLOCK_ASSERT(inp);
2981	pd->lookup.uid = inp->inp_cred->cr_uid;
2982	pd->lookup.gid = inp->inp_cred->cr_groups[0];
2983	INP_RUNLOCK(inp);
2984
2985	return (1);
2986}
2987
2988static u_int8_t
2989pf_get_wscale(struct mbuf *m, int off, u_int16_t th_off, sa_family_t af)
2990{
2991	int		 hlen;
2992	u_int8_t	 hdr[60];
2993	u_int8_t	*opt, optlen;
2994	u_int8_t	 wscale = 0;
2995
2996	hlen = th_off << 2;		/* hlen <= sizeof(hdr) */
2997	if (hlen <= sizeof(struct tcphdr))
2998		return (0);
2999	if (!pf_pull_hdr(m, off, hdr, hlen, NULL, NULL, af))
3000		return (0);
3001	opt = hdr + sizeof(struct tcphdr);
3002	hlen -= sizeof(struct tcphdr);
3003	while (hlen >= 3) {
3004		switch (*opt) {
3005		case TCPOPT_EOL:
3006		case TCPOPT_NOP:
3007			++opt;
3008			--hlen;
3009			break;
3010		case TCPOPT_WINDOW:
3011			wscale = opt[2];
3012			if (wscale > TCP_MAX_WINSHIFT)
3013				wscale = TCP_MAX_WINSHIFT;
3014			wscale |= PF_WSCALE_FLAG;
3015			/* FALLTHROUGH */
3016		default:
3017			optlen = opt[1];
3018			if (optlen < 2)
3019				optlen = 2;
3020			hlen -= optlen;
3021			opt += optlen;
3022			break;
3023		}
3024	}
3025	return (wscale);
3026}
3027
3028static u_int16_t
3029pf_get_mss(struct mbuf *m, int off, u_int16_t th_off, sa_family_t af)
3030{
3031	int		 hlen;
3032	u_int8_t	 hdr[60];
3033	u_int8_t	*opt, optlen;
3034	u_int16_t	 mss = V_tcp_mssdflt;
3035
3036	hlen = th_off << 2;	/* hlen <= sizeof(hdr) */
3037	if (hlen <= sizeof(struct tcphdr))
3038		return (0);
3039	if (!pf_pull_hdr(m, off, hdr, hlen, NULL, NULL, af))
3040		return (0);
3041	opt = hdr + sizeof(struct tcphdr);
3042	hlen -= sizeof(struct tcphdr);
3043	while (hlen >= TCPOLEN_MAXSEG) {
3044		switch (*opt) {
3045		case TCPOPT_EOL:
3046		case TCPOPT_NOP:
3047			++opt;
3048			--hlen;
3049			break;
3050		case TCPOPT_MAXSEG:
3051			bcopy((caddr_t)(opt + 2), (caddr_t)&mss, 2);
3052			NTOHS(mss);
3053			/* FALLTHROUGH */
3054		default:
3055			optlen = opt[1];
3056			if (optlen < 2)
3057				optlen = 2;
3058			hlen -= optlen;
3059			opt += optlen;
3060			break;
3061		}
3062	}
3063	return (mss);
3064}
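/*
 * Both option walkers above follow the TCP option encoding: EOL and NOP
 * are single bytes, every other option is { kind, len, value[len - 2] }.
 * Illustrative layout, not part of the build, for a SYN carrying MSS 1460
 * and a window scale of 7:
 *
 *	02 04 05 b4	maximum segment size (kind 2, len 4, 0x05b4 == 1460)
 *	01		no-operation
 *	03 03 07	window scale (kind 3, len 3, shift 7)
 */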
3065
3066static u_int16_t
3067pf_calc_mss(struct pf_addr *addr, sa_family_t af, int rtableid, u_int16_t offer)
3068{
3069#ifdef INET
3070	struct nhop4_basic	nh4;
3071#endif /* INET */
3072#ifdef INET6
3073	struct nhop6_basic	nh6;
3074	struct in6_addr		dst6;
3075	uint32_t		scopeid;
3076#endif /* INET6 */
3077	int			 hlen = 0;
3078	uint16_t		 mss = 0;
3079
3080	switch (af) {
3081#ifdef INET
3082	case AF_INET:
3083		hlen = sizeof(struct ip);
3084		if (fib4_lookup_nh_basic(rtableid, addr->v4, 0, 0, &nh4) == 0)
3085			mss = nh4.nh_mtu - hlen - sizeof(struct tcphdr);
3086		break;
3087#endif /* INET */
3088#ifdef INET6
3089	case AF_INET6:
3090		hlen = sizeof(struct ip6_hdr);
3091		in6_splitscope(&addr->v6, &dst6, &scopeid);
3092		if (fib6_lookup_nh_basic(rtableid, &dst6, scopeid, 0, 0, &nh6) == 0)
3093			mss = nh6.nh_mtu - hlen - sizeof(struct tcphdr);
3094		break;
3095#endif /* INET6 */
3096	}
3097
3098	mss = max(V_tcp_mssdflt, mss);
3099	mss = min(mss, offer);
3100	mss = max(mss, 64);		/* sanity - at least max opt space */
3101	return (mss);
3102}
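/*
 * Illustrative example, not part of the build: for an IPv4 next hop with
 * an MTU of 1500, pf_calc_mss() derives 1500 - 20 - 20 == 1460, raises it
 * to at least V_tcp_mssdflt and caps it at the peer's offer, so a peer
 * offering 1400 is answered with 1400.
 */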
3103
3104static u_int32_t
3105pf_tcp_iss(struct pf_pdesc *pd)
3106{
3107	MD5_CTX ctx;
3108	u_int32_t digest[4];
3109
3110	if (V_pf_tcp_secret_init == 0) {
3111		read_random(&V_pf_tcp_secret, sizeof(V_pf_tcp_secret));
3112		MD5Init(&V_pf_tcp_secret_ctx);
3113		MD5Update(&V_pf_tcp_secret_ctx, V_pf_tcp_secret,
3114		    sizeof(V_pf_tcp_secret));
3115		V_pf_tcp_secret_init = 1;
3116	}
3117
3118	ctx = V_pf_tcp_secret_ctx;
3119
3120	MD5Update(&ctx, (char *)&pd->hdr.tcp->th_sport, sizeof(u_short));
3121	MD5Update(&ctx, (char *)&pd->hdr.tcp->th_dport, sizeof(u_short));
3122	if (pd->af == AF_INET6) {
3123		MD5Update(&ctx, (char *)&pd->src->v6, sizeof(struct in6_addr));
3124		MD5Update(&ctx, (char *)&pd->dst->v6, sizeof(struct in6_addr));
3125	} else {
3126		MD5Update(&ctx, (char *)&pd->src->v4, sizeof(struct in_addr));
3127		MD5Update(&ctx, (char *)&pd->dst->v4, sizeof(struct in_addr));
3128	}
3129	MD5Final((u_char *)digest, &ctx);
3130	V_pf_tcp_iss_off += 4096;
3131#define	ISN_RANDOM_INCREMENT (4096 - 1)
3132	return (digest[0] + (arc4random() & ISN_RANDOM_INCREMENT) +
3133	    V_pf_tcp_iss_off);
3134#undef	ISN_RANDOM_INCREMENT
3135}
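/*
 * The ISN above is an MD5 digest of a lazily initialized random secret
 * and the connection 4-tuple, plus a random increment below 4096 and an
 * offset that advances by 4096 per call, so synproxy initial sequence
 * numbers are unpredictable yet keep moving forward.
 */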
3136
3137static int
3138pf_test_rule(struct pf_rule **rm, struct pf_state **sm, int direction,
3139    struct pfi_kif *kif, struct mbuf *m, int off, struct pf_pdesc *pd,
3140    struct pf_rule **am, struct pf_ruleset **rsm, struct inpcb *inp)
3141{
3142	struct pf_rule		*nr = NULL;
3143	struct pf_addr		* const saddr = pd->src;
3144	struct pf_addr		* const daddr = pd->dst;
3145	sa_family_t		 af = pd->af;
3146	struct pf_rule		*r, *a = NULL;
3147	struct pf_ruleset	*ruleset = NULL;
3148	struct pf_src_node	*nsn = NULL;
3149	struct tcphdr		*th = pd->hdr.tcp;
3150	struct pf_state_key	*sk = NULL, *nk = NULL;
3151	u_short			 reason;
3152	int			 rewrite = 0, hdrlen = 0;
3153	int			 tag = -1, rtableid = -1;
3154	int			 asd = 0;
3155	int			 match = 0;
3156	int			 state_icmp = 0;
3157	u_int16_t		 sport = 0, dport = 0;
3158	u_int16_t		 bproto_sum = 0, bip_sum = 0;
3159	u_int8_t		 icmptype = 0, icmpcode = 0;
3160	struct pf_anchor_stackframe	anchor_stack[PF_ANCHOR_STACKSIZE];
3161
3162	PF_RULES_RASSERT();
3163
3164	if (inp != NULL) {
3165		INP_LOCK_ASSERT(inp);
3166		pd->lookup.uid = inp->inp_cred->cr_uid;
3167		pd->lookup.gid = inp->inp_cred->cr_groups[0];
3168		pd->lookup.done = 1;
3169	}
3170
3171	switch (pd->proto) {
3172	case IPPROTO_TCP:
3173		sport = th->th_sport;
3174		dport = th->th_dport;
3175		hdrlen = sizeof(*th);
3176		break;
3177	case IPPROTO_UDP:
3178		sport = pd->hdr.udp->uh_sport;
3179		dport = pd->hdr.udp->uh_dport;
3180		hdrlen = sizeof(*pd->hdr.udp);
3181		break;
3182#ifdef INET
3183	case IPPROTO_ICMP:
3184		if (pd->af != AF_INET)
3185			break;
3186		sport = dport = pd->hdr.icmp->icmp_id;
3187		hdrlen = sizeof(*pd->hdr.icmp);
3188		icmptype = pd->hdr.icmp->icmp_type;
3189		icmpcode = pd->hdr.icmp->icmp_code;
3190
3191		if (icmptype == ICMP_UNREACH ||
3192		    icmptype == ICMP_SOURCEQUENCH ||
3193		    icmptype == ICMP_REDIRECT ||
3194		    icmptype == ICMP_TIMXCEED ||
3195		    icmptype == ICMP_PARAMPROB)
3196			state_icmp++;
3197		break;
3198#endif /* INET */
3199#ifdef INET6
3200	case IPPROTO_ICMPV6:
3201		if (af != AF_INET6)
3202			break;
3203		sport = dport = pd->hdr.icmp6->icmp6_id;
3204		hdrlen = sizeof(*pd->hdr.icmp6);
3205		icmptype = pd->hdr.icmp6->icmp6_type;
3206		icmpcode = pd->hdr.icmp6->icmp6_code;
3207
3208		if (icmptype == ICMP6_DST_UNREACH ||
3209		    icmptype == ICMP6_PACKET_TOO_BIG ||
3210		    icmptype == ICMP6_TIME_EXCEEDED ||
3211		    icmptype == ICMP6_PARAM_PROB)
3212			state_icmp++;
3213		break;
3214#endif /* INET6 */
3215	default:
3216		sport = dport = hdrlen = 0;
3217		break;
3218	}
3219
3220	r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr);
3221
3222	/* check packet for BINAT/NAT/RDR */
3223	if ((nr = pf_get_translation(pd, m, off, direction, kif, &nsn, &sk,
3224	    &nk, saddr, daddr, sport, dport, anchor_stack)) != NULL) {
3225		KASSERT(sk != NULL, ("%s: null sk", __func__));
3226		KASSERT(nk != NULL, ("%s: null nk", __func__));
3227
3228		if (pd->ip_sum)
3229			bip_sum = *pd->ip_sum;
3230
3231		switch (pd->proto) {
3232		case IPPROTO_TCP:
3233			bproto_sum = th->th_sum;
3234			pd->proto_sum = &th->th_sum;
3235
3236			if (PF_ANEQ(saddr, &nk->addr[pd->sidx], af) ||
3237			    nk->port[pd->sidx] != sport) {
3238				pf_change_ap(m, saddr, &th->th_sport, pd->ip_sum,
3239				    &th->th_sum, &nk->addr[pd->sidx],
3240				    nk->port[pd->sidx], 0, af);
3241				pd->sport = &th->th_sport;
3242				sport = th->th_sport;
3243			}
3244
3245			if (PF_ANEQ(daddr, &nk->addr[pd->didx], af) ||
3246			    nk->port[pd->didx] != dport) {
3247				pf_change_ap(m, daddr, &th->th_dport, pd->ip_sum,
3248				    &th->th_sum, &nk->addr[pd->didx],
3249				    nk->port[pd->didx], 0, af);
3250				dport = th->th_dport;
3251				pd->dport = &th->th_dport;
3252			}
3253			rewrite++;
3254			break;
3255		case IPPROTO_UDP:
3256			bproto_sum = pd->hdr.udp->uh_sum;
3257			pd->proto_sum = &pd->hdr.udp->uh_sum;
3258
3259			if (PF_ANEQ(saddr, &nk->addr[pd->sidx], af) ||
3260			    nk->port[pd->sidx] != sport) {
3261				pf_change_ap(m, saddr, &pd->hdr.udp->uh_sport,
3262				    pd->ip_sum, &pd->hdr.udp->uh_sum,
3263				    &nk->addr[pd->sidx],
3264				    nk->port[pd->sidx], 1, af);
3265				sport = pd->hdr.udp->uh_sport;
3266				pd->sport = &pd->hdr.udp->uh_sport;
3267			}
3268
3269			if (PF_ANEQ(daddr, &nk->addr[pd->didx], af) ||
3270			    nk->port[pd->didx] != dport) {
3271				pf_change_ap(m, daddr, &pd->hdr.udp->uh_dport,
3272				    pd->ip_sum, &pd->hdr.udp->uh_sum,
3273				    &nk->addr[pd->didx],
3274				    nk->port[pd->didx], 1, af);
3275				dport = pd->hdr.udp->uh_dport;
3276				pd->dport = &pd->hdr.udp->uh_dport;
3277			}
3278			rewrite++;
3279			break;
3280#ifdef INET
3281		case IPPROTO_ICMP:
3282			nk->port[0] = nk->port[1];
3283			if (PF_ANEQ(saddr, &nk->addr[pd->sidx], AF_INET))
3284				pf_change_a(&saddr->v4.s_addr, pd->ip_sum,
3285				    nk->addr[pd->sidx].v4.s_addr, 0);
3286
3287			if (PF_ANEQ(daddr, &nk->addr[pd->didx], AF_INET))
3288				pf_change_a(&daddr->v4.s_addr, pd->ip_sum,
3289				    nk->addr[pd->didx].v4.s_addr, 0);
3290
3291			if (nk->port[1] != pd->hdr.icmp->icmp_id) {
3292				pd->hdr.icmp->icmp_cksum = pf_cksum_fixup(
3293				    pd->hdr.icmp->icmp_cksum, sport,
3294				    nk->port[1], 0);
3295				pd->hdr.icmp->icmp_id = nk->port[1];
3296				pd->sport = &pd->hdr.icmp->icmp_id;
3297			}
3298			m_copyback(m, off, ICMP_MINLEN, (caddr_t)pd->hdr.icmp);
3299			break;
3300#endif /* INET */
3301#ifdef INET6
3302		case IPPROTO_ICMPV6:
3303			nk->port[0] = nk->port[1];
3304			if (PF_ANEQ(saddr, &nk->addr[pd->sidx], AF_INET6))
3305				pf_change_a6(saddr, &pd->hdr.icmp6->icmp6_cksum,
3306				    &nk->addr[pd->sidx], 0);
3307
3308			if (PF_ANEQ(daddr, &nk->addr[pd->didx], AF_INET6))
3309				pf_change_a6(daddr, &pd->hdr.icmp6->icmp6_cksum,
3310				    &nk->addr[pd->didx], 0);
3311			rewrite++;
3312			break;
3313#endif /* INET6 */
3314		default:
3315			switch (af) {
3316#ifdef INET
3317			case AF_INET:
3318				if (PF_ANEQ(saddr,
3319				    &nk->addr[pd->sidx], AF_INET))
3320					pf_change_a(&saddr->v4.s_addr,
3321					    pd->ip_sum,
3322					    nk->addr[pd->sidx].v4.s_addr, 0);
3323
3324				if (PF_ANEQ(daddr,
3325				    &nk->addr[pd->didx], AF_INET))
3326					pf_change_a(&daddr->v4.s_addr,
3327					    pd->ip_sum,
3328					    nk->addr[pd->didx].v4.s_addr, 0);
3329				break;
3330#endif /* INET */
3331#ifdef INET6
3332			case AF_INET6:
3333				if (PF_ANEQ(saddr,
3334				    &nk->addr[pd->sidx], AF_INET6))
3335					PF_ACPY(saddr, &nk->addr[pd->sidx], af);
3336
3337				if (PF_ANEQ(daddr,
3338				    &nk->addr[pd->didx], AF_INET6))
3339					PF_ACPY(daddr, &nk->addr[pd->didx], af);
3340				break;
3341#endif /* INET6 */
3342			}
3343			break;
3344		}
3345		if (nr->natpass)
3346			r = NULL;
3347		pd->nat_rule = nr;
3348	}
3349
3350	while (r != NULL) {
3351		r->evaluations++;
3352		if (pfi_kif_match(r->kif, kif) == r->ifnot)
3353			r = r->skip[PF_SKIP_IFP].ptr;
3354		else if (r->direction && r->direction != direction)
3355			r = r->skip[PF_SKIP_DIR].ptr;
3356		else if (r->af && r->af != af)
3357			r = r->skip[PF_SKIP_AF].ptr;
3358		else if (r->proto && r->proto != pd->proto)
3359			r = r->skip[PF_SKIP_PROTO].ptr;
3360		else if (PF_MISMATCHAW(&r->src.addr, saddr, af,
3361		    r->src.neg, kif, M_GETFIB(m)))
3362			r = r->skip[PF_SKIP_SRC_ADDR].ptr;
3363		/* tcp/udp only. port_op always 0 in other cases */
3364		else if (r->src.port_op && !pf_match_port(r->src.port_op,
3365		    r->src.port[0], r->src.port[1], sport))
3366			r = r->skip[PF_SKIP_SRC_PORT].ptr;
3367		else if (PF_MISMATCHAW(&r->dst.addr, daddr, af,
3368		    r->dst.neg, NULL, M_GETFIB(m)))
3369			r = r->skip[PF_SKIP_DST_ADDR].ptr;
3370		/* tcp/udp only. port_op always 0 in other cases */
3371		else if (r->dst.port_op && !pf_match_port(r->dst.port_op,
3372		    r->dst.port[0], r->dst.port[1], dport))
3373			r = r->skip[PF_SKIP_DST_PORT].ptr;
3374		/* icmp only. type always 0 in other cases */
3375		else if (r->type && r->type != icmptype + 1)
3376			r = TAILQ_NEXT(r, entries);
3377		/* icmp only. code always 0 in other cases */
3378		else if (r->code && r->code != icmpcode + 1)
3379			r = TAILQ_NEXT(r, entries);
3380		else if (r->tos && !(r->tos == pd->tos))
3381			r = TAILQ_NEXT(r, entries);
3382		else if (r->rule_flag & PFRULE_FRAGMENT)
3383			r = TAILQ_NEXT(r, entries);
3384		else if (pd->proto == IPPROTO_TCP &&
3385		    (r->flagset & th->th_flags) != r->flags)
3386			r = TAILQ_NEXT(r, entries);
3387		/* tcp/udp only. uid.op always 0 in other cases */
3388		else if (r->uid.op && (pd->lookup.done || (pd->lookup.done =
3389		    pf_socket_lookup(direction, pd, m), 1)) &&
3390		    !pf_match_uid(r->uid.op, r->uid.uid[0], r->uid.uid[1],
3391		    pd->lookup.uid))
3392			r = TAILQ_NEXT(r, entries);
3393		/* tcp/udp only. gid.op always 0 in other cases */
3394		else if (r->gid.op && (pd->lookup.done || (pd->lookup.done =
3395		    pf_socket_lookup(direction, pd, m), 1)) &&
3396		    !pf_match_gid(r->gid.op, r->gid.gid[0], r->gid.gid[1],
3397		    pd->lookup.gid))
3398			r = TAILQ_NEXT(r, entries);
3399		else if (r->prio &&
3400		    !pf_match_ieee8021q_pcp(r->prio, m))
3401			r = TAILQ_NEXT(r, entries);
3402		else if (r->prob &&
3403		    r->prob <= arc4random())
3404			r = TAILQ_NEXT(r, entries);
3405		else if (r->match_tag && !pf_match_tag(m, r, &tag,
3406		    pd->pf_mtag ? pd->pf_mtag->tag : 0))
3407			r = TAILQ_NEXT(r, entries);
3408		else if (r->os_fingerprint != PF_OSFP_ANY &&
3409		    (pd->proto != IPPROTO_TCP || !pf_osfp_match(
3410		    pf_osfp_fingerprint(pd, m, off, th),
3411		    r->os_fingerprint)))
3412			r = TAILQ_NEXT(r, entries);
3413		else {
3414			if (r->tag)
3415				tag = r->tag;
3416			if (r->rtableid >= 0)
3417				rtableid = r->rtableid;
3418			if (r->anchor == NULL) {
3419				match = 1;
3420				*rm = r;
3421				*am = a;
3422				*rsm = ruleset;
3423				if ((*rm)->quick)
3424					break;
3425				r = TAILQ_NEXT(r, entries);
3426			} else
3427				pf_step_into_anchor(anchor_stack, &asd,
3428				    &ruleset, PF_RULESET_FILTER, &r, &a,
3429				    &match);
3430		}
3431		if (r == NULL && pf_step_out_of_anchor(anchor_stack, &asd,
3432		    &ruleset, PF_RULESET_FILTER, &r, &a, &match))
3433			break;
3434	}
3435	r = *rm;
3436	a = *am;
3437	ruleset = *rsm;
3438
3439	REASON_SET(&reason, PFRES_MATCH);
3440
3441	if (r->log || (nr != NULL && nr->log)) {
3442		if (rewrite)
3443			m_copyback(m, off, hdrlen, pd->hdr.any);
3444		PFLOG_PACKET(kif, m, af, direction, reason, r->log ? r : nr, a,
3445		    ruleset, pd, 1);
3446	}
3447
3448	if ((r->action == PF_DROP) &&
3449	    ((r->rule_flag & PFRULE_RETURNRST) ||
3450	    (r->rule_flag & PFRULE_RETURNICMP) ||
3451	    (r->rule_flag & PFRULE_RETURN))) {
3452		/* undo NAT changes, if they have taken place */
3453		if (nr != NULL) {
3454			PF_ACPY(saddr, &sk->addr[pd->sidx], af);
3455			PF_ACPY(daddr, &sk->addr[pd->didx], af);
3456			if (pd->sport)
3457				*pd->sport = sk->port[pd->sidx];
3458			if (pd->dport)
3459				*pd->dport = sk->port[pd->didx];
3460			if (pd->proto_sum)
3461				*pd->proto_sum = bproto_sum;
3462			if (pd->ip_sum)
3463				*pd->ip_sum = bip_sum;
3464			m_copyback(m, off, hdrlen, pd->hdr.any);
3465		}
3466		if (pd->proto == IPPROTO_TCP &&
3467		    ((r->rule_flag & PFRULE_RETURNRST) ||
3468		    (r->rule_flag & PFRULE_RETURN)) &&
3469		    !(th->th_flags & TH_RST)) {
3470			u_int32_t	 ack = ntohl(th->th_seq) + pd->p_len;
3471			int		 len = 0;
3472#ifdef INET
3473			struct ip	*h4;
3474#endif
3475#ifdef INET6
3476			struct ip6_hdr	*h6;
3477#endif
3478
3479			switch (af) {
3480#ifdef INET
3481			case AF_INET:
3482				h4 = mtod(m, struct ip *);
3483				len = ntohs(h4->ip_len) - off;
3484				break;
3485#endif
3486#ifdef INET6
3487			case AF_INET6:
3488				h6 = mtod(m, struct ip6_hdr *);
3489				len = ntohs(h6->ip6_plen) - (off - sizeof(*h6));
3490				break;
3491#endif
3492			}
3493
3494			if (pf_check_proto_cksum(m, off, len, IPPROTO_TCP, af))
3495				REASON_SET(&reason, PFRES_PROTCKSUM);
3496			else {
3497				if (th->th_flags & TH_SYN)
3498					ack++;
3499				if (th->th_flags & TH_FIN)
3500					ack++;
3501				pf_send_tcp(m, r, af, pd->dst,
3502				    pd->src, th->th_dport, th->th_sport,
3503				    ntohl(th->th_ack), ack, TH_RST|TH_ACK, 0, 0,
3504				    r->return_ttl, 1, 0, kif->pfik_ifp);
3505			}
3506		} else if (pd->proto != IPPROTO_ICMP && af == AF_INET &&
3507		    r->return_icmp)
3508			pf_send_icmp(m, r->return_icmp >> 8,
3509			    r->return_icmp & 255, af, r);
3510		else if (pd->proto != IPPROTO_ICMPV6 && af == AF_INET6 &&
3511		    r->return_icmp6)
3512			pf_send_icmp(m, r->return_icmp6 >> 8,
3513			    r->return_icmp6 & 255, af, r);
3514	}
3515
3516	if (r->action == PF_DROP)
3517		goto cleanup;
3518
3519	if (tag > 0 && pf_tag_packet(m, pd, tag)) {
3520		REASON_SET(&reason, PFRES_MEMORY);
3521		goto cleanup;
3522	}
3523	if (rtableid >= 0)
3524		M_SETFIB(m, rtableid);
3525
3526	if (!state_icmp && (r->keep_state || nr != NULL ||
3527	    (pd->flags & PFDESC_TCP_NORM))) {
3528		int action;
3529		action = pf_create_state(r, nr, a, pd, nsn, nk, sk, m, off,
3530		    sport, dport, &rewrite, kif, sm, tag, bproto_sum, bip_sum,
3531		    hdrlen);
3532		if (action != PF_PASS)
3533			return (action);
3534	} else {
3535		if (sk != NULL)
3536			uma_zfree(V_pf_state_key_z, sk);
3537		if (nk != NULL)
3538			uma_zfree(V_pf_state_key_z, nk);
3539	}
3540
3541	/* copy back packet headers if we performed NAT operations */
3542	if (rewrite)
3543		m_copyback(m, off, hdrlen, pd->hdr.any);
3544
3545	if (*sm != NULL && !((*sm)->state_flags & PFSTATE_NOSYNC) &&
3546	    direction == PF_OUT &&
3547	    pfsync_defer_ptr != NULL && pfsync_defer_ptr(*sm, m))
3548		/*
3549		 * We want the state created, but we don't
3550		 * want to send this packet yet, in case a
3551		 * partner firewall must learn about it first
3552		 * to allow replies through.
3553		 */
3554		return (PF_DEFER);
3555
3556	return (PF_PASS);
3557
3558cleanup:
3559	if (sk != NULL)
3560		uma_zfree(V_pf_state_key_z, sk);
3561	if (nk != NULL)
3562		uma_zfree(V_pf_state_key_z, nk);
3563	return (PF_DROP);
3564}
3565
3566static int
3567pf_create_state(struct pf_rule *r, struct pf_rule *nr, struct pf_rule *a,
3568    struct pf_pdesc *pd, struct pf_src_node *nsn, struct pf_state_key *nk,
3569    struct pf_state_key *sk, struct mbuf *m, int off, u_int16_t sport,
3570    u_int16_t dport, int *rewrite, struct pfi_kif *kif, struct pf_state **sm,
3571    int tag, u_int16_t bproto_sum, u_int16_t bip_sum, int hdrlen)
3572{
3573	struct pf_state		*s = NULL;
3574	struct pf_src_node	*sn = NULL;
3575	struct tcphdr		*th = pd->hdr.tcp;
3576	u_int16_t		 mss = V_tcp_mssdflt;
3577	u_short			 reason;
3578
3579	/* check maximums */
3580	if (r->max_states &&
3581	    (counter_u64_fetch(r->states_cur) >= r->max_states)) {
3582		counter_u64_add(V_pf_status.lcounters[LCNT_STATES], 1);
3583		REASON_SET(&reason, PFRES_MAXSTATES);
3584		goto csfailed;
3585	}
3586	/* src node for filter rule */
3587	if ((r->rule_flag & PFRULE_SRCTRACK ||
3588	    r->rpool.opts & PF_POOL_STICKYADDR) &&
3589	    pf_insert_src_node(&sn, r, pd->src, pd->af) != 0) {
3590		REASON_SET(&reason, PFRES_SRCLIMIT);
3591		goto csfailed;
3592	}
3593	/* src node for translation rule */
3594	if (nr != NULL && (nr->rpool.opts & PF_POOL_STICKYADDR) &&
3595	    pf_insert_src_node(&nsn, nr, &sk->addr[pd->sidx], pd->af)) {
3596		REASON_SET(&reason, PFRES_SRCLIMIT);
3597		goto csfailed;
3598	}
3599	s = uma_zalloc(V_pf_state_z, M_NOWAIT | M_ZERO);
3600	if (s == NULL) {
3601		REASON_SET(&reason, PFRES_MEMORY);
3602		goto csfailed;
3603	}
3604	s->rule.ptr = r;
3605	s->nat_rule.ptr = nr;
3606	s->anchor.ptr = a;
3607	STATE_INC_COUNTERS(s);
3608	if (r->allow_opts)
3609		s->state_flags |= PFSTATE_ALLOWOPTS;
3610	if (r->rule_flag & PFRULE_STATESLOPPY)
3611		s->state_flags |= PFSTATE_SLOPPY;
3612	s->log = r->log & PF_LOG_ALL;
3613	s->sync_state = PFSYNC_S_NONE;
3614	if (nr != NULL)
3615		s->log |= nr->log & PF_LOG_ALL;
3616	switch (pd->proto) {
3617	case IPPROTO_TCP:
3618		s->src.seqlo = ntohl(th->th_seq);
3619		s->src.seqhi = s->src.seqlo + pd->p_len + 1;
3620		if ((th->th_flags & (TH_SYN|TH_ACK)) == TH_SYN &&
3621		    r->keep_state == PF_STATE_MODULATE) {
3622			/* Generate sequence number modulator */
3623			if ((s->src.seqdiff = pf_tcp_iss(pd) - s->src.seqlo) ==
3624			    0)
3625				s->src.seqdiff = 1;
3626			pf_change_proto_a(m, &th->th_seq, &th->th_sum,
3627			    htonl(s->src.seqlo + s->src.seqdiff), 0);
3628			*rewrite = 1;
3629		} else
3630			s->src.seqdiff = 0;
3631		if (th->th_flags & TH_SYN) {
3632			s->src.seqhi++;
3633			s->src.wscale = pf_get_wscale(m, off,
3634			    th->th_off, pd->af);
3635		}
3636		s->src.max_win = MAX(ntohs(th->th_win), 1);
3637		if (s->src.wscale & PF_WSCALE_MASK) {
3638			/* Remove scale factor from initial window */
3639			int win = s->src.max_win;
3640			win += 1 << (s->src.wscale & PF_WSCALE_MASK);
3641			s->src.max_win = (win - 1) >>
3642			    (s->src.wscale & PF_WSCALE_MASK);
3643		}
3644		if (th->th_flags & TH_FIN)
3645			s->src.seqhi++;
3646		s->dst.seqhi = 1;
3647		s->dst.max_win = 1;
3648		s->src.state = TCPS_SYN_SENT;
3649		s->dst.state = TCPS_CLOSED;
3650		s->timeout = PFTM_TCP_FIRST_PACKET;
3651		break;
3652	case IPPROTO_UDP:
3653		s->src.state = PFUDPS_SINGLE;
3654		s->dst.state = PFUDPS_NO_TRAFFIC;
3655		s->timeout = PFTM_UDP_FIRST_PACKET;
3656		break;
3657	case IPPROTO_ICMP:
3658#ifdef INET6
3659	case IPPROTO_ICMPV6:
3660#endif
3661		s->timeout = PFTM_ICMP_FIRST_PACKET;
3662		break;
3663	default:
3664		s->src.state = PFOTHERS_SINGLE;
3665		s->dst.state = PFOTHERS_NO_TRAFFIC;
3666		s->timeout = PFTM_OTHER_FIRST_PACKET;
3667	}
3668
3669	if (r->rt && r->rt != PF_FASTROUTE) {
3670		if (pf_map_addr(pd->af, r, pd->src, &s->rt_addr, NULL, &sn)) {
3671			REASON_SET(&reason, PFRES_MAPFAILED);
3672			pf_src_tree_remove_state(s);
3673			STATE_DEC_COUNTERS(s);
3674			uma_zfree(V_pf_state_z, s);
3675			goto csfailed;
3676		}
3677		s->rt_kif = r->rpool.cur->kif;
3678	}
3679
3680	s->creation = time_uptime;
3681	s->expire = time_uptime;
3682
3683	if (sn != NULL)
3684		s->src_node = sn;
3685	if (nsn != NULL) {
3686		/* XXX We only modify one side for now. */
3687		PF_ACPY(&nsn->raddr, &nk->addr[1], pd->af);
3688		s->nat_src_node = nsn;
3689	}
3690	if (pd->proto == IPPROTO_TCP) {
3691		if ((pd->flags & PFDESC_TCP_NORM) && pf_normalize_tcp_init(m,
3692		    off, pd, th, &s->src, &s->dst)) {
3693			REASON_SET(&reason, PFRES_MEMORY);
3694			pf_src_tree_remove_state(s);
3695			STATE_DEC_COUNTERS(s);
3696			uma_zfree(V_pf_state_z, s);
3697			return (PF_DROP);
3698		}
3699		if ((pd->flags & PFDESC_TCP_NORM) && s->src.scrub &&
3700		    pf_normalize_tcp_stateful(m, off, pd, &reason, th, s,
3701		    &s->src, &s->dst, rewrite)) {
3702			/* This really shouldn't happen!!! */
3703			DPFPRINTF(PF_DEBUG_URGENT,
3704			    ("pf_normalize_tcp_stateful failed on first pkt"));
3705			pf_normalize_tcp_cleanup(s);
3706			pf_src_tree_remove_state(s);
3707			STATE_DEC_COUNTERS(s);
3708			uma_zfree(V_pf_state_z, s);
3709			return (PF_DROP);
3710		}
3711	}
3712	s->direction = pd->dir;
3713
3714	/*
3715	 * sk/nk could already have been set up by pf_get_translation().
3716	 */
3717	if (nr == NULL) {
3718		KASSERT((sk == NULL && nk == NULL), ("%s: nr %p sk %p, nk %p",
3719		    __func__, nr, sk, nk));
3720		sk = pf_state_key_setup(pd, pd->src, pd->dst, sport, dport);
3721		if (sk == NULL)
3722			goto csfailed;
3723		nk = sk;
3724	} else
3725		KASSERT((sk != NULL && nk != NULL), ("%s: nr %p sk %p, nk %p",
3726		    __func__, nr, sk, nk));
3727
3728	/* Swap sk/nk for PF_OUT. */
3729	if (pf_state_insert(BOUND_IFACE(r, kif),
3730	    (pd->dir == PF_IN) ? sk : nk,
3731	    (pd->dir == PF_IN) ? nk : sk, s)) {
3732		if (pd->proto == IPPROTO_TCP)
3733			pf_normalize_tcp_cleanup(s);
3734		REASON_SET(&reason, PFRES_STATEINS);
3735		pf_src_tree_remove_state(s);
3736		STATE_DEC_COUNTERS(s);
3737		uma_zfree(V_pf_state_z, s);
3738		return (PF_DROP);
3739	} else
3740		*sm = s;
3741
3742	if (tag > 0)
3743		s->tag = tag;
3744	if (pd->proto == IPPROTO_TCP && (th->th_flags & (TH_SYN|TH_ACK)) ==
3745	    TH_SYN && r->keep_state == PF_STATE_SYNPROXY) {
3746		s->src.state = PF_TCPS_PROXY_SRC;
3747		/* undo NAT changes, if they have taken place */
3748		if (nr != NULL) {
3749			struct pf_state_key *skt = s->key[PF_SK_WIRE];
3750			if (pd->dir == PF_OUT)
3751				skt = s->key[PF_SK_STACK];
3752			PF_ACPY(pd->src, &skt->addr[pd->sidx], pd->af);
3753			PF_ACPY(pd->dst, &skt->addr[pd->didx], pd->af);
3754			if (pd->sport)
3755				*pd->sport = skt->port[pd->sidx];
3756			if (pd->dport)
3757				*pd->dport = skt->port[pd->didx];
3758			if (pd->proto_sum)
3759				*pd->proto_sum = bproto_sum;
3760			if (pd->ip_sum)
3761				*pd->ip_sum = bip_sum;
3762			m_copyback(m, off, hdrlen, pd->hdr.any);
3763		}
3764		s->src.seqhi = htonl(arc4random());
3765		/* Find mss option */
3766		int rtid = M_GETFIB(m);
3767		mss = pf_get_mss(m, off, th->th_off, pd->af);
3768		mss = pf_calc_mss(pd->src, pd->af, rtid, mss);
3769		mss = pf_calc_mss(pd->dst, pd->af, rtid, mss);
3770		s->src.mss = mss;
3771		pf_send_tcp(NULL, r, pd->af, pd->dst, pd->src, th->th_dport,
3772		    th->th_sport, s->src.seqhi, ntohl(th->th_seq) + 1,
3773		    TH_SYN|TH_ACK, 0, s->src.mss, 0, 1, 0, NULL);
3774		REASON_SET(&reason, PFRES_SYNPROXY);
3775		return (PF_SYNPROXY_DROP);
3776	}
3777
3778	return (PF_PASS);
3779
3780csfailed:
3781	if (sk != NULL)
3782		uma_zfree(V_pf_state_key_z, sk);
3783	if (nk != NULL)
3784		uma_zfree(V_pf_state_key_z, nk);
3785
3786	if (sn != NULL) {
3787		struct pf_srchash *sh;
3788
3789		sh = &V_pf_srchash[pf_hashsrc(&sn->addr, sn->af)];
3790		PF_HASHROW_LOCK(sh);
3791		if (--sn->states == 0 && sn->expire == 0) {
3792			pf_unlink_src_node(sn);
3793			uma_zfree(V_pf_sources_z, sn);
3794			counter_u64_add(
3795			    V_pf_status.scounters[SCNT_SRC_NODE_REMOVALS], 1);
3796		}
3797		PF_HASHROW_UNLOCK(sh);
3798	}
3799
3800	if (nsn != sn && nsn != NULL) {
3801		struct pf_srchash *sh;
3802
3803		sh = &V_pf_srchash[pf_hashsrc(&nsn->addr, nsn->af)];
3804		PF_HASHROW_LOCK(sh);
3805		if (--nsn->states == 0 && nsn->expire == 0) {
3806			pf_unlink_src_node(nsn);
3807			uma_zfree(V_pf_sources_z, nsn);
3808			counter_u64_add(
3809			    V_pf_status.scounters[SCNT_SRC_NODE_REMOVALS], 1);
3810		}
3811		PF_HASHROW_UNLOCK(sh);
3812	}
3813
3814	return (PF_DROP);
3815}
3816
3817static int
3818pf_test_fragment(struct pf_rule **rm, int direction, struct pfi_kif *kif,
3819    struct mbuf *m, void *h, struct pf_pdesc *pd, struct pf_rule **am,
3820    struct pf_ruleset **rsm)
3821{
3822	struct pf_rule		*r, *a = NULL;
3823	struct pf_ruleset	*ruleset = NULL;
3824	sa_family_t		 af = pd->af;
3825	u_short			 reason;
3826	int			 tag = -1;
3827	int			 asd = 0;
3828	int			 match = 0;
3829	struct pf_anchor_stackframe	anchor_stack[PF_ANCHOR_STACKSIZE];
3830
3831	PF_RULES_RASSERT();
3832
3833	r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr);
3834	while (r != NULL) {
3835		r->evaluations++;
3836		if (pfi_kif_match(r->kif, kif) == r->ifnot)
3837			r = r->skip[PF_SKIP_IFP].ptr;
3838		else if (r->direction && r->direction != direction)
3839			r = r->skip[PF_SKIP_DIR].ptr;
3840		else if (r->af && r->af != af)
3841			r = r->skip[PF_SKIP_AF].ptr;
3842		else if (r->proto && r->proto != pd->proto)
3843			r = r->skip[PF_SKIP_PROTO].ptr;
3844		else if (PF_MISMATCHAW(&r->src.addr, pd->src, af,
3845		    r->src.neg, kif, M_GETFIB(m)))
3846			r = r->skip[PF_SKIP_SRC_ADDR].ptr;
3847		else if (PF_MISMATCHAW(&r->dst.addr, pd->dst, af,
3848		    r->dst.neg, NULL, M_GETFIB(m)))
3849			r = r->skip[PF_SKIP_DST_ADDR].ptr;
3850		else if (r->tos && !(r->tos == pd->tos))
3851			r = TAILQ_NEXT(r, entries);
3852		else if (r->os_fingerprint != PF_OSFP_ANY)
3853			r = TAILQ_NEXT(r, entries);
3854		else if (pd->proto == IPPROTO_UDP &&
3855		    (r->src.port_op || r->dst.port_op))
3856			r = TAILQ_NEXT(r, entries);
3857		else if (pd->proto == IPPROTO_TCP &&
3858		    (r->src.port_op || r->dst.port_op || r->flagset))
3859			r = TAILQ_NEXT(r, entries);
3860		else if ((pd->proto == IPPROTO_ICMP ||
3861		    pd->proto == IPPROTO_ICMPV6) &&
3862		    (r->type || r->code))
3863			r = TAILQ_NEXT(r, entries);
3864		else if (r->prio &&
3865		    !pf_match_ieee8021q_pcp(r->prio, m))
3866			r = TAILQ_NEXT(r, entries);
3867		else if (r->prob && r->prob <=
3868		    (arc4random() % (UINT_MAX - 1) + 1))
3869			r = TAILQ_NEXT(r, entries);
3870		else if (r->match_tag && !pf_match_tag(m, r, &tag,
3871		    pd->pf_mtag ? pd->pf_mtag->tag : 0))
3872			r = TAILQ_NEXT(r, entries);
3873		else {
3874			if (r->anchor == NULL) {
3875				match = 1;
3876				*rm = r;
3877				*am = a;
3878				*rsm = ruleset;
3879				if ((*rm)->quick)
3880					break;
3881				r = TAILQ_NEXT(r, entries);
3882			} else
3883				pf_step_into_anchor(anchor_stack, &asd,
3884				    &ruleset, PF_RULESET_FILTER, &r, &a,
3885				    &match);
3886		}
3887		if (r == NULL && pf_step_out_of_anchor(anchor_stack, &asd,
3888		    &ruleset, PF_RULESET_FILTER, &r, &a, &match))
3889			break;
3890	}
3891	r = *rm;
3892	a = *am;
3893	ruleset = *rsm;
3894
3895	REASON_SET(&reason, PFRES_MATCH);
3896
3897	if (r->log)
3898		PFLOG_PACKET(kif, m, af, direction, reason, r, a, ruleset, pd,
3899		    1);
3900
3901	if (r->action != PF_PASS)
3902		return (PF_DROP);
3903
3904	if (tag > 0 && pf_tag_packet(m, pd, tag)) {
3905		REASON_SET(&reason, PFRES_MEMORY);
3906		return (PF_DROP);
3907	}
3908
3909	return (PF_PASS);
3910}
3911
3912static int
3913pf_tcp_track_full(struct pf_state_peer *src, struct pf_state_peer *dst,
3914	struct pf_state **state, struct pfi_kif *kif, struct mbuf *m, int off,
3915	struct pf_pdesc *pd, u_short *reason, int *copyback)
3916{
3917	struct tcphdr		*th = pd->hdr.tcp;
3918	u_int16_t		 win = ntohs(th->th_win);
3919	u_int32_t		 ack, end, seq, orig_seq;
3920	u_int8_t		 sws, dws;
3921	int			 ackskew;
3922
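	/*
	 * Apply the negotiated window scale factors only once both sides
	 * have advertised one; the window field of a SYN segment is
	 * never scaled (RFC 1323), hence the TH_SYN exclusion below.
	 */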
3923	if (src->wscale && dst->wscale && !(th->th_flags & TH_SYN)) {
3924		sws = src->wscale & PF_WSCALE_MASK;
3925		dws = dst->wscale & PF_WSCALE_MASK;
3926	} else
3927		sws = dws = 0;
3928
3929	/*
3930	 * Sequence tracking algorithm from Guido van Rooij's paper:
3931	 *   http://www.madison-gurkha.com/publications/tcp_filtering/
3932	 *	tcp_filtering.ps
3933	 */
3934
3935	orig_seq = seq = ntohl(th->th_seq);
3936	if (src->seqlo == 0) {
3937		/* First packet from this end. Set its state */
3938
3939		if ((pd->flags & PFDESC_TCP_NORM || dst->scrub) &&
3940		    src->scrub == NULL) {
3941			if (pf_normalize_tcp_init(m, off, pd, th, src, dst)) {
3942				REASON_SET(reason, PFRES_MEMORY);
3943				return (PF_DROP);
3944			}
3945		}
3946
3947		/* Deferred generation of sequence number modulator */
3948		if (dst->seqdiff && !src->seqdiff) {
3949			/* use random iss for the TCP server */
3950			while ((src->seqdiff = arc4random() - seq) == 0)
3951				;
3952			ack = ntohl(th->th_ack) - dst->seqdiff;
3953			pf_change_proto_a(m, &th->th_seq, &th->th_sum, htonl(seq +
3954			    src->seqdiff), 0);
3955			pf_change_proto_a(m, &th->th_ack, &th->th_sum, htonl(ack), 0);
3956			*copyback = 1;
3957		} else {
3958			ack = ntohl(th->th_ack);
3959		}
3960
3961		end = seq + pd->p_len;
3962		if (th->th_flags & TH_SYN) {
3963			end++;
3964			if (dst->wscale & PF_WSCALE_FLAG) {
3965				src->wscale = pf_get_wscale(m, off, th->th_off,
3966				    pd->af);
3967				if (src->wscale & PF_WSCALE_FLAG) {
3968					/* Remove scale factor from initial
3969					 * window */
3970					sws = src->wscale & PF_WSCALE_MASK;
3971					win = ((u_int32_t)win + (1 << sws) - 1)
3972					    >> sws;
3973					dws = dst->wscale & PF_WSCALE_MASK;
3974				} else {
3975					/* fixup other window */
3976					dst->max_win <<= dst->wscale &
3977					    PF_WSCALE_MASK;
3978					/* in case of a retrans SYN|ACK */
3979					dst->wscale = 0;
3980				}
3981			}
3982		}
3983		if (th->th_flags & TH_FIN)
3984			end++;
3985
3986		src->seqlo = seq;
3987		if (src->state < TCPS_SYN_SENT)
3988			src->state = TCPS_SYN_SENT;
3989
3990		/*
3991		 * May need to slide the window (seqhi may have been set by
3992		 * the crappy stack check or if we picked up the connection
3993		 * after establishment)
3994		 */
3995		if (src->seqhi == 1 ||
3996		    SEQ_GEQ(end + MAX(1, dst->max_win << dws), src->seqhi))
3997			src->seqhi = end + MAX(1, dst->max_win << dws);
3998		if (win > src->max_win)
3999			src->max_win = win;
4000
4001	} else {
4002		ack = ntohl(th->th_ack) - dst->seqdiff;
4003		if (src->seqdiff) {
4004			/* Modulate sequence numbers */
4005			pf_change_proto_a(m, &th->th_seq, &th->th_sum, htonl(seq +
4006			    src->seqdiff), 0);
4007			pf_change_proto_a(m, &th->th_ack, &th->th_sum, htonl(ack), 0);
4008			*copyback = 1;
4009		}
4010		end = seq + pd->p_len;
4011		if (th->th_flags & TH_SYN)
4012			end++;
4013		if (th->th_flags & TH_FIN)
4014			end++;
4015	}
4016
4017	if ((th->th_flags & TH_ACK) == 0) {
4018		/* Let it pass through the ack skew check */
4019		ack = dst->seqlo;
4020	} else if ((ack == 0 &&
4021	    (th->th_flags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) ||
4022	    /* broken tcp stacks do not set ack */
4023	    (dst->state < TCPS_SYN_SENT)) {
4024		/*
		 * Many stacks (ours included) will set the ACK number in a
		 * FIN|ACK if the SYN times out -- no sequence to ACK.
4027		 */
4028		ack = dst->seqlo;
4029	}
4030
4031	if (seq == end) {
		/* Ease sequencing restrictions on packets carrying no data */
4033		seq = src->seqlo;
4034		end = seq;
4035	}
4036
4037	ackskew = dst->seqlo - ack;
4038
4040	/*
4041	 * Need to demodulate the sequence numbers in any TCP SACK options
4042	 * (Selective ACK). We could optionally validate the SACK values
4043	 * against the current ACK window, either forwards or backwards, but
4044	 * I'm not confident that SACK has been implemented properly
4045	 * everywhere. It wouldn't surprise me if several stacks accidentally
4046	 * SACK too far backwards of previously ACKed data. There really aren't
4047	 * any security implications of bad SACKing unless the target stack
4048	 * doesn't validate the option length correctly. Someone trying to
4049	 * spoof into a TCP connection won't bother blindly sending SACK
4050	 * options anyway.
4051	 */
4052	if (dst->seqdiff && (th->th_off << 2) > sizeof(struct tcphdr)) {
4053		if (pf_modulate_sack(m, off, pd, th, dst))
4054			*copyback = 1;
4055	}
4056
4058#define	MAXACKWINDOW (0xffff + 1500)	/* 1500 is an arbitrary fudge factor */
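	/*
	 * Illustrative magnitude: at the maximum TCP window scale of 14
	 * (RFC 1323), MAXACKWINDOW << sws is (0xffff + 1500) << 14, so
	 * the ackskew checks below tolerate roughly 1GB of skew.
	 */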
4059	if (SEQ_GEQ(src->seqhi, end) &&
4060	    /* Last octet inside other's window space */
4061	    SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) &&
4062	    /* Retrans: not more than one window back */
4063	    (ackskew >= -MAXACKWINDOW) &&
4064	    /* Acking not more than one reassembled fragment backwards */
4065	    (ackskew <= (MAXACKWINDOW << sws)) &&
4066	    /* Acking not more than one window forward */
4067	    ((th->th_flags & TH_RST) == 0 || orig_seq == src->seqlo ||
4068	    (orig_seq == src->seqlo + 1) || (orig_seq + 1 == src->seqlo) ||
4069	    (pd->flags & PFDESC_IP_REAS) == 0)) {
4070	    /* Require an exact/+1 sequence match on resets when possible */
4071
4072		if (dst->scrub || src->scrub) {
4073			if (pf_normalize_tcp_stateful(m, off, pd, reason, th,
4074			    *state, src, dst, copyback))
4075				return (PF_DROP);
4076		}
4077
4078		/* update max window */
4079		if (src->max_win < win)
4080			src->max_win = win;
4081		/* synchronize sequencing */
4082		if (SEQ_GT(end, src->seqlo))
4083			src->seqlo = end;
4084		/* slide the window of what the other end can send */
4085		if (SEQ_GEQ(ack + (win << sws), dst->seqhi))
4086			dst->seqhi = ack + MAX((win << sws), 1);
4087
4089		/* update states */
4090		if (th->th_flags & TH_SYN)
4091			if (src->state < TCPS_SYN_SENT)
4092				src->state = TCPS_SYN_SENT;
4093		if (th->th_flags & TH_FIN)
4094			if (src->state < TCPS_CLOSING)
4095				src->state = TCPS_CLOSING;
4096		if (th->th_flags & TH_ACK) {
4097			if (dst->state == TCPS_SYN_SENT) {
4098				dst->state = TCPS_ESTABLISHED;
4099				if (src->state == TCPS_ESTABLISHED &&
4100				    (*state)->src_node != NULL &&
4101				    pf_src_connlimit(state)) {
4102					REASON_SET(reason, PFRES_SRCLIMIT);
4103					return (PF_DROP);
4104				}
4105			} else if (dst->state == TCPS_CLOSING)
4106				dst->state = TCPS_FIN_WAIT_2;
4107		}
4108		if (th->th_flags & TH_RST)
4109			src->state = dst->state = TCPS_TIME_WAIT;
4110
4111		/* update expire time */
4112		(*state)->expire = time_uptime;
4113		if (src->state >= TCPS_FIN_WAIT_2 &&
4114		    dst->state >= TCPS_FIN_WAIT_2)
4115			(*state)->timeout = PFTM_TCP_CLOSED;
4116		else if (src->state >= TCPS_CLOSING &&
4117		    dst->state >= TCPS_CLOSING)
4118			(*state)->timeout = PFTM_TCP_FIN_WAIT;
4119		else if (src->state < TCPS_ESTABLISHED ||
4120		    dst->state < TCPS_ESTABLISHED)
4121			(*state)->timeout = PFTM_TCP_OPENING;
4122		else if (src->state >= TCPS_CLOSING ||
4123		    dst->state >= TCPS_CLOSING)
4124			(*state)->timeout = PFTM_TCP_CLOSING;
4125		else
4126			(*state)->timeout = PFTM_TCP_ESTABLISHED;
4127
4128		/* Fall through to PASS packet */
4129
4130	} else if ((dst->state < TCPS_SYN_SENT ||
4131		dst->state >= TCPS_FIN_WAIT_2 ||
4132		src->state >= TCPS_FIN_WAIT_2) &&
4133	    SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) &&
4134	    /* Within a window forward of the originating packet */
4135	    SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW)) {
4136	    /* Within a window backward of the originating packet */
4137
4138		/*
4139		 * This currently handles three situations:
4140		 *  1) Stupid stacks will shotgun SYNs before their peer
4141		 *     replies.
4142		 *  2) When PF catches an already established stream (the
4143		 *     firewall rebooted, the state table was flushed, routes
4144		 *     changed...)
4145		 *  3) Packets get funky immediately after the connection
4146		 *     closes (this should catch Solaris spurious ACK|FINs
4147		 *     that web servers like to spew after a close)
4148		 *
4149		 * This must be a little more careful than the above code
4150		 * since packet floods will also be caught here. We don't
4151		 * update the TTL here to mitigate the damage of a packet
4152		 * flood and so the same code can handle awkward establishment
4153		 * and a loosened connection close.
4154		 * In the establishment case, a correct peer response will
4155		 * validate the connection, go through the normal state code
4156		 * and keep updating the state TTL.
4157		 */
4158
4159		if (V_pf_status.debug >= PF_DEBUG_MISC) {
4160			printf("pf: loose state match: ");
4161			pf_print_state(*state);
4162			pf_print_flags(th->th_flags);
4163			printf(" seq=%u (%u) ack=%u len=%u ackskew=%d "
4164			    "pkts=%llu:%llu dir=%s,%s\n", seq, orig_seq, ack,
4165			    pd->p_len, ackskew, (unsigned long long)(*state)->packets[0],
4166			    (unsigned long long)(*state)->packets[1],
4167			    pd->dir == PF_IN ? "in" : "out",
4168			    pd->dir == (*state)->direction ? "fwd" : "rev");
4169		}
4170
4171		if (dst->scrub || src->scrub) {
4172			if (pf_normalize_tcp_stateful(m, off, pd, reason, th,
4173			    *state, src, dst, copyback))
4174				return (PF_DROP);
4175		}
4176
4177		/* update max window */
4178		if (src->max_win < win)
4179			src->max_win = win;
4180		/* synchronize sequencing */
4181		if (SEQ_GT(end, src->seqlo))
4182			src->seqlo = end;
4183		/* slide the window of what the other end can send */
4184		if (SEQ_GEQ(ack + (win << sws), dst->seqhi))
4185			dst->seqhi = ack + MAX((win << sws), 1);
4186
4187		/*
4188		 * Cannot set dst->seqhi here since this could be a shotgunned
4189		 * SYN and not an already established connection.
4190		 */
4191
4192		if (th->th_flags & TH_FIN)
4193			if (src->state < TCPS_CLOSING)
4194				src->state = TCPS_CLOSING;
4195		if (th->th_flags & TH_RST)
4196			src->state = dst->state = TCPS_TIME_WAIT;
4197
4198		/* Fall through to PASS packet */
4199
4200	} else {
4201		if ((*state)->dst.state == TCPS_SYN_SENT &&
4202		    (*state)->src.state == TCPS_SYN_SENT) {
4203			/* Send RST for state mismatches during handshake */
4204			if (!(th->th_flags & TH_RST))
4205				pf_send_tcp(NULL, (*state)->rule.ptr, pd->af,
4206				    pd->dst, pd->src, th->th_dport,
4207				    th->th_sport, ntohl(th->th_ack), 0,
4208				    TH_RST, 0, 0,
4209				    (*state)->rule.ptr->return_ttl, 1, 0,
4210				    kif->pfik_ifp);
4211			src->seqlo = 0;
4212			src->seqhi = 1;
4213			src->max_win = 1;
4214		} else if (V_pf_status.debug >= PF_DEBUG_MISC) {
4215			printf("pf: BAD state: ");
4216			pf_print_state(*state);
4217			pf_print_flags(th->th_flags);
4218			printf(" seq=%u (%u) ack=%u len=%u ackskew=%d "
4219			    "pkts=%llu:%llu dir=%s,%s\n",
4220			    seq, orig_seq, ack, pd->p_len, ackskew,
4221			    (unsigned long long)(*state)->packets[0],
4222			    (unsigned long long)(*state)->packets[1],
4223			    pd->dir == PF_IN ? "in" : "out",
4224			    pd->dir == (*state)->direction ? "fwd" : "rev");
4225			printf("pf: State failure on: %c %c %c %c | %c %c\n",
4226			    SEQ_GEQ(src->seqhi, end) ? ' ' : '1',
4227			    SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) ?
4228			    ' ': '2',
4229			    (ackskew >= -MAXACKWINDOW) ? ' ' : '3',
4230			    (ackskew <= (MAXACKWINDOW << sws)) ? ' ' : '4',
4231			    SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) ?' ' :'5',
4232			    SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW) ?' ' :'6');
4233		}
4234		REASON_SET(reason, PFRES_BADSTATE);
4235		return (PF_DROP);
4236	}
4237
4238	return (PF_PASS);
4239}
4240
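/*
 * Relaxed tracker for states created by rules with the "sloppy" option:
 * only the coarse TCP state machine is driven, no sequence number
 * windows are enforced.
 */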
4241static int
4242pf_tcp_track_sloppy(struct pf_state_peer *src, struct pf_state_peer *dst,
4243	struct pf_state **state, struct pf_pdesc *pd, u_short *reason)
4244{
4245	struct tcphdr		*th = pd->hdr.tcp;
4246
4247	if (th->th_flags & TH_SYN)
4248		if (src->state < TCPS_SYN_SENT)
4249			src->state = TCPS_SYN_SENT;
4250	if (th->th_flags & TH_FIN)
4251		if (src->state < TCPS_CLOSING)
4252			src->state = TCPS_CLOSING;
4253	if (th->th_flags & TH_ACK) {
4254		if (dst->state == TCPS_SYN_SENT) {
4255			dst->state = TCPS_ESTABLISHED;
4256			if (src->state == TCPS_ESTABLISHED &&
4257			    (*state)->src_node != NULL &&
4258			    pf_src_connlimit(state)) {
4259				REASON_SET(reason, PFRES_SRCLIMIT);
4260				return (PF_DROP);
4261			}
4262		} else if (dst->state == TCPS_CLOSING) {
4263			dst->state = TCPS_FIN_WAIT_2;
4264		} else if (src->state == TCPS_SYN_SENT &&
4265		    dst->state < TCPS_SYN_SENT) {
4266			/*
4267			 * Handle a special sloppy case where we only see one
			 * half of the connection. If there is an ACK after
4269			 * the initial SYN without ever seeing a packet from
4270			 * the destination, set the connection to established.
4271			 */
4272			dst->state = src->state = TCPS_ESTABLISHED;
4273			if ((*state)->src_node != NULL &&
4274			    pf_src_connlimit(state)) {
4275				REASON_SET(reason, PFRES_SRCLIMIT);
4276				return (PF_DROP);
4277			}
4278		} else if (src->state == TCPS_CLOSING &&
4279		    dst->state == TCPS_ESTABLISHED &&
4280		    dst->seqlo == 0) {
4281			/*
4282			 * Handle the closing of half connections where we
4283			 * don't see the full bidirectional FIN/ACK+ACK
4284			 * handshake.
4285			 */
4286			dst->state = TCPS_CLOSING;
4287		}
4288	}
4289	if (th->th_flags & TH_RST)
4290		src->state = dst->state = TCPS_TIME_WAIT;
4291
4292	/* update expire time */
4293	(*state)->expire = time_uptime;
4294	if (src->state >= TCPS_FIN_WAIT_2 &&
4295	    dst->state >= TCPS_FIN_WAIT_2)
4296		(*state)->timeout = PFTM_TCP_CLOSED;
4297	else if (src->state >= TCPS_CLOSING &&
4298	    dst->state >= TCPS_CLOSING)
4299		(*state)->timeout = PFTM_TCP_FIN_WAIT;
4300	else if (src->state < TCPS_ESTABLISHED ||
4301	    dst->state < TCPS_ESTABLISHED)
4302		(*state)->timeout = PFTM_TCP_OPENING;
4303	else if (src->state >= TCPS_CLOSING ||
4304	    dst->state >= TCPS_CLOSING)
4305		(*state)->timeout = PFTM_TCP_CLOSING;
4306	else
4307		(*state)->timeout = PFTM_TCP_ESTABLISHED;
4308
4309	return (PF_PASS);
4310}
4311
4312static int
4313pf_test_state_tcp(struct pf_state **state, int direction, struct pfi_kif *kif,
4314    struct mbuf *m, int off, void *h, struct pf_pdesc *pd,
4315    u_short *reason)
4316{
4317	struct pf_state_key_cmp	 key;
4318	struct tcphdr		*th = pd->hdr.tcp;
4319	int			 copyback = 0;
4320	struct pf_state_peer	*src, *dst;
4321	struct pf_state_key	*sk;
4322
4323	bzero(&key, sizeof(key));
4324	key.af = pd->af;
4325	key.proto = IPPROTO_TCP;
4326	if (direction == PF_IN)	{	/* wire side, straight */
4327		PF_ACPY(&key.addr[0], pd->src, key.af);
4328		PF_ACPY(&key.addr[1], pd->dst, key.af);
4329		key.port[0] = th->th_sport;
4330		key.port[1] = th->th_dport;
4331	} else {			/* stack side, reverse */
4332		PF_ACPY(&key.addr[1], pd->src, key.af);
4333		PF_ACPY(&key.addr[0], pd->dst, key.af);
4334		key.port[1] = th->th_sport;
4335		key.port[0] = th->th_dport;
4336	}
4337
4338	STATE_LOOKUP(kif, &key, direction, *state, pd);
4339
4340	if (direction == (*state)->direction) {
4341		src = &(*state)->src;
4342		dst = &(*state)->dst;
4343	} else {
4344		src = &(*state)->dst;
4345		dst = &(*state)->src;
4346	}
4347
4348	sk = (*state)->key[pd->didx];
4349
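	/*
	 * synproxy: in PF_TCPS_PROXY_SRC pf completes the three-way
	 * handshake with the client on the server's behalf.  Once the
	 * client's final ACK arrives, the state moves to
	 * PF_TCPS_PROXY_DST, where pf opens the real connection to the
	 * server and then splices the two half-connections together
	 * through the seqdiff sequence number modulators.
	 */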
4350	if ((*state)->src.state == PF_TCPS_PROXY_SRC) {
4351		if (direction != (*state)->direction) {
4352			REASON_SET(reason, PFRES_SYNPROXY);
4353			return (PF_SYNPROXY_DROP);
4354		}
4355		if (th->th_flags & TH_SYN) {
4356			if (ntohl(th->th_seq) != (*state)->src.seqlo) {
4357				REASON_SET(reason, PFRES_SYNPROXY);
4358				return (PF_DROP);
4359			}
4360			pf_send_tcp(NULL, (*state)->rule.ptr, pd->af, pd->dst,
4361			    pd->src, th->th_dport, th->th_sport,
4362			    (*state)->src.seqhi, ntohl(th->th_seq) + 1,
4363			    TH_SYN|TH_ACK, 0, (*state)->src.mss, 0, 1, 0, NULL);
4364			REASON_SET(reason, PFRES_SYNPROXY);
4365			return (PF_SYNPROXY_DROP);
4366		} else if (!(th->th_flags & TH_ACK) ||
4367		    (ntohl(th->th_ack) != (*state)->src.seqhi + 1) ||
4368		    (ntohl(th->th_seq) != (*state)->src.seqlo + 1)) {
4369			REASON_SET(reason, PFRES_SYNPROXY);
4370			return (PF_DROP);
4371		} else if ((*state)->src_node != NULL &&
4372		    pf_src_connlimit(state)) {
4373			REASON_SET(reason, PFRES_SRCLIMIT);
4374			return (PF_DROP);
4375		} else
4376			(*state)->src.state = PF_TCPS_PROXY_DST;
4377	}
4378	if ((*state)->src.state == PF_TCPS_PROXY_DST) {
4379		if (direction == (*state)->direction) {
4380			if (((th->th_flags & (TH_SYN|TH_ACK)) != TH_ACK) ||
4381			    (ntohl(th->th_ack) != (*state)->src.seqhi + 1) ||
4382			    (ntohl(th->th_seq) != (*state)->src.seqlo + 1)) {
4383				REASON_SET(reason, PFRES_SYNPROXY);
4384				return (PF_DROP);
4385			}
4386			(*state)->src.max_win = MAX(ntohs(th->th_win), 1);
4387			if ((*state)->dst.seqhi == 1)
4388				(*state)->dst.seqhi = htonl(arc4random());
4389			pf_send_tcp(NULL, (*state)->rule.ptr, pd->af,
4390			    &sk->addr[pd->sidx], &sk->addr[pd->didx],
4391			    sk->port[pd->sidx], sk->port[pd->didx],
4392			    (*state)->dst.seqhi, 0, TH_SYN, 0,
4393			    (*state)->src.mss, 0, 0, (*state)->tag, NULL);
4394			REASON_SET(reason, PFRES_SYNPROXY);
4395			return (PF_SYNPROXY_DROP);
4396		} else if (((th->th_flags & (TH_SYN|TH_ACK)) !=
4397		    (TH_SYN|TH_ACK)) ||
4398		    (ntohl(th->th_ack) != (*state)->dst.seqhi + 1)) {
4399			REASON_SET(reason, PFRES_SYNPROXY);
4400			return (PF_DROP);
4401		} else {
4402			(*state)->dst.max_win = MAX(ntohs(th->th_win), 1);
4403			(*state)->dst.seqlo = ntohl(th->th_seq);
4404			pf_send_tcp(NULL, (*state)->rule.ptr, pd->af, pd->dst,
4405			    pd->src, th->th_dport, th->th_sport,
4406			    ntohl(th->th_ack), ntohl(th->th_seq) + 1,
4407			    TH_ACK, (*state)->src.max_win, 0, 0, 0,
4408			    (*state)->tag, NULL);
4409			pf_send_tcp(NULL, (*state)->rule.ptr, pd->af,
4410			    &sk->addr[pd->sidx], &sk->addr[pd->didx],
4411			    sk->port[pd->sidx], sk->port[pd->didx],
4412			    (*state)->src.seqhi + 1, (*state)->src.seqlo + 1,
4413			    TH_ACK, (*state)->dst.max_win, 0, 0, 1, 0, NULL);
4414			(*state)->src.seqdiff = (*state)->dst.seqhi -
4415			    (*state)->src.seqlo;
4416			(*state)->dst.seqdiff = (*state)->src.seqhi -
4417			    (*state)->dst.seqlo;
4418			(*state)->src.seqhi = (*state)->src.seqlo +
4419			    (*state)->dst.max_win;
4420			(*state)->dst.seqhi = (*state)->dst.seqlo +
4421			    (*state)->src.max_win;
4422			(*state)->src.wscale = (*state)->dst.wscale = 0;
4423			(*state)->src.state = (*state)->dst.state =
4424			    TCPS_ESTABLISHED;
4425			REASON_SET(reason, PFRES_SYNPROXY);
4426			return (PF_SYNPROXY_DROP);
4427		}
4428	}
4429
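	/*
	 * A fresh SYN on a fully closed state means the endpoints are
	 * reusing the port pair: unlink the old state so that a
	 * retransmitted SYN can establish a new one.
	 */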
4430	if (((th->th_flags & (TH_SYN|TH_ACK)) == TH_SYN) &&
4431	    dst->state >= TCPS_FIN_WAIT_2 &&
4432	    src->state >= TCPS_FIN_WAIT_2) {
4433		if (V_pf_status.debug >= PF_DEBUG_MISC) {
4434			printf("pf: state reuse ");
4435			pf_print_state(*state);
4436			pf_print_flags(th->th_flags);
4437			printf("\n");
4438		}
4439		/* XXX make sure it's the same direction ?? */
4440		(*state)->src.state = (*state)->dst.state = TCPS_CLOSED;
4441		pf_unlink_state(*state, PF_ENTER_LOCKED);
4442		*state = NULL;
4443		return (PF_DROP);
4444	}
4445
4446	if ((*state)->state_flags & PFSTATE_SLOPPY) {
4447		if (pf_tcp_track_sloppy(src, dst, state, pd, reason) == PF_DROP)
4448			return (PF_DROP);
4449	} else {
4450		if (pf_tcp_track_full(src, dst, state, kif, m, off, pd, reason,
4451		    &copyback) == PF_DROP)
4452			return (PF_DROP);
4453	}
4454
4455	/* translate source/destination address, if necessary */
4456	if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
4457		struct pf_state_key *nk = (*state)->key[pd->didx];
4458
4459		if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], pd->af) ||
4460		    nk->port[pd->sidx] != th->th_sport)
4461			pf_change_ap(m, pd->src, &th->th_sport,
4462			    pd->ip_sum, &th->th_sum, &nk->addr[pd->sidx],
4463			    nk->port[pd->sidx], 0, pd->af);
4464
4465		if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], pd->af) ||
4466		    nk->port[pd->didx] != th->th_dport)
4467			pf_change_ap(m, pd->dst, &th->th_dport,
4468			    pd->ip_sum, &th->th_sum, &nk->addr[pd->didx],
4469			    nk->port[pd->didx], 0, pd->af);
4470		copyback = 1;
4471	}
4472
4473	/* Copyback sequence modulation or stateful scrub changes if needed */
4474	if (copyback)
4475		m_copyback(m, off, sizeof(*th), (caddr_t)th);
4476
4477	return (PF_PASS);
4478}
4479
4480static int
4481pf_test_state_udp(struct pf_state **state, int direction, struct pfi_kif *kif,
4482    struct mbuf *m, int off, void *h, struct pf_pdesc *pd)
4483{
4484	struct pf_state_peer	*src, *dst;
4485	struct pf_state_key_cmp	 key;
4486	struct udphdr		*uh = pd->hdr.udp;
4487
4488	bzero(&key, sizeof(key));
4489	key.af = pd->af;
4490	key.proto = IPPROTO_UDP;
4491	if (direction == PF_IN)	{	/* wire side, straight */
4492		PF_ACPY(&key.addr[0], pd->src, key.af);
4493		PF_ACPY(&key.addr[1], pd->dst, key.af);
4494		key.port[0] = uh->uh_sport;
4495		key.port[1] = uh->uh_dport;
4496	} else {			/* stack side, reverse */
4497		PF_ACPY(&key.addr[1], pd->src, key.af);
4498		PF_ACPY(&key.addr[0], pd->dst, key.af);
4499		key.port[1] = uh->uh_sport;
4500		key.port[0] = uh->uh_dport;
4501	}
4502
4503	STATE_LOOKUP(kif, &key, direction, *state, pd);
4504
4505	if (direction == (*state)->direction) {
4506		src = &(*state)->src;
4507		dst = &(*state)->dst;
4508	} else {
4509		src = &(*state)->dst;
4510		dst = &(*state)->src;
4511	}
4512
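	/*
	 * UDP is connectionless, so pf only records whether traffic has
	 * been seen from one peer (PFUDPS_SINGLE) or from both
	 * (PFUDPS_MULTIPLE); a reply promotes the state and earns the
	 * longer PFTM_UDP_MULTIPLE timeout below.
	 */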
4513	/* update states */
4514	if (src->state < PFUDPS_SINGLE)
4515		src->state = PFUDPS_SINGLE;
4516	if (dst->state == PFUDPS_SINGLE)
4517		dst->state = PFUDPS_MULTIPLE;
4518
4519	/* update expire time */
4520	(*state)->expire = time_uptime;
4521	if (src->state == PFUDPS_MULTIPLE && dst->state == PFUDPS_MULTIPLE)
4522		(*state)->timeout = PFTM_UDP_MULTIPLE;
4523	else
4524		(*state)->timeout = PFTM_UDP_SINGLE;
4525
4526	/* translate source/destination address, if necessary */
4527	if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
4528		struct pf_state_key *nk = (*state)->key[pd->didx];
4529
4530		if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], pd->af) ||
4531		    nk->port[pd->sidx] != uh->uh_sport)
4532			pf_change_ap(m, pd->src, &uh->uh_sport, pd->ip_sum,
4533			    &uh->uh_sum, &nk->addr[pd->sidx],
4534			    nk->port[pd->sidx], 1, pd->af);
4535
4536		if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], pd->af) ||
4537		    nk->port[pd->didx] != uh->uh_dport)
4538			pf_change_ap(m, pd->dst, &uh->uh_dport, pd->ip_sum,
4539			    &uh->uh_sum, &nk->addr[pd->didx],
4540			    nk->port[pd->didx], 1, pd->af);
4541		m_copyback(m, off, sizeof(*uh), (caddr_t)uh);
4542	}
4543
4544	return (PF_PASS);
4545}
4546
4547static int
4548pf_test_state_icmp(struct pf_state **state, int direction, struct pfi_kif *kif,
4549    struct mbuf *m, int off, void *h, struct pf_pdesc *pd, u_short *reason)
4550{
4551	struct pf_addr  *saddr = pd->src, *daddr = pd->dst;
4552	u_int16_t	 icmpid = 0, *icmpsum;
4553	u_int8_t	 icmptype;
4554	int		 state_icmp = 0;
4555	struct pf_state_key_cmp key;
4556
4557	bzero(&key, sizeof(key));
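
	/*
	 * Classify the message: ICMP queries and replies carry their own
	 * state, keyed on the ICMP id, while error messages must be
	 * matched against the state of the embedded packet they quote.
	 */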
4558	switch (pd->proto) {
4559#ifdef INET
4560	case IPPROTO_ICMP:
4561		icmptype = pd->hdr.icmp->icmp_type;
4562		icmpid = pd->hdr.icmp->icmp_id;
4563		icmpsum = &pd->hdr.icmp->icmp_cksum;
4564
4565		if (icmptype == ICMP_UNREACH ||
4566		    icmptype == ICMP_SOURCEQUENCH ||
4567		    icmptype == ICMP_REDIRECT ||
4568		    icmptype == ICMP_TIMXCEED ||
4569		    icmptype == ICMP_PARAMPROB)
4570			state_icmp++;
4571		break;
4572#endif /* INET */
4573#ifdef INET6
4574	case IPPROTO_ICMPV6:
4575		icmptype = pd->hdr.icmp6->icmp6_type;
4576		icmpid = pd->hdr.icmp6->icmp6_id;
4577		icmpsum = &pd->hdr.icmp6->icmp6_cksum;
4578
4579		if (icmptype == ICMP6_DST_UNREACH ||
4580		    icmptype == ICMP6_PACKET_TOO_BIG ||
4581		    icmptype == ICMP6_TIME_EXCEEDED ||
4582		    icmptype == ICMP6_PARAM_PROB)
4583			state_icmp++;
4584		break;
4585#endif /* INET6 */
4586	}
4587
4588	if (!state_icmp) {
4589
4590		/*
4591		 * ICMP query/reply message not related to a TCP/UDP packet.
4592		 * Search for an ICMP state.
4593		 */
4594		key.af = pd->af;
4595		key.proto = pd->proto;
4596		key.port[0] = key.port[1] = icmpid;
4597		if (direction == PF_IN)	{	/* wire side, straight */
4598			PF_ACPY(&key.addr[0], pd->src, key.af);
4599			PF_ACPY(&key.addr[1], pd->dst, key.af);
4600		} else {			/* stack side, reverse */
4601			PF_ACPY(&key.addr[1], pd->src, key.af);
4602			PF_ACPY(&key.addr[0], pd->dst, key.af);
4603		}
4604
4605		STATE_LOOKUP(kif, &key, direction, *state, pd);
4606
4607		(*state)->expire = time_uptime;
4608		(*state)->timeout = PFTM_ICMP_ERROR_REPLY;
4609
4610		/* translate source/destination address, if necessary */
4611		if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
4612			struct pf_state_key *nk = (*state)->key[pd->didx];
4613
4614			switch (pd->af) {
4615#ifdef INET
4616			case AF_INET:
4617				if (PF_ANEQ(pd->src,
4618				    &nk->addr[pd->sidx], AF_INET))
4619					pf_change_a(&saddr->v4.s_addr,
4620					    pd->ip_sum,
4621					    nk->addr[pd->sidx].v4.s_addr, 0);
4622
4623				if (PF_ANEQ(pd->dst, &nk->addr[pd->didx],
4624				    AF_INET))
4625					pf_change_a(&daddr->v4.s_addr,
4626					    pd->ip_sum,
4627					    nk->addr[pd->didx].v4.s_addr, 0);
4628
4629				if (nk->port[0] !=
4630				    pd->hdr.icmp->icmp_id) {
4631					pd->hdr.icmp->icmp_cksum =
4632					    pf_cksum_fixup(
4633					    pd->hdr.icmp->icmp_cksum, icmpid,
4634					    nk->port[pd->sidx], 0);
4635					pd->hdr.icmp->icmp_id =
4636					    nk->port[pd->sidx];
4637				}
4638
4639				m_copyback(m, off, ICMP_MINLEN,
4640				    (caddr_t )pd->hdr.icmp);
4641				break;
4642#endif /* INET */
4643#ifdef INET6
4644			case AF_INET6:
4645				if (PF_ANEQ(pd->src,
4646				    &nk->addr[pd->sidx], AF_INET6))
4647					pf_change_a6(saddr,
4648					    &pd->hdr.icmp6->icmp6_cksum,
4649					    &nk->addr[pd->sidx], 0);
4650
4651				if (PF_ANEQ(pd->dst,
4652				    &nk->addr[pd->didx], AF_INET6))
4653					pf_change_a6(daddr,
4654					    &pd->hdr.icmp6->icmp6_cksum,
4655					    &nk->addr[pd->didx], 0);
4656
4657				m_copyback(m, off, sizeof(struct icmp6_hdr),
4658				    (caddr_t )pd->hdr.icmp6);
4659				break;
4660#endif /* INET6 */
4661			}
4662		}
4663		return (PF_PASS);
4664
4665	} else {
4666		/*
4667		 * ICMP error message in response to a TCP/UDP packet.
4668		 * Extract the inner TCP/UDP header and search for that state.
4669		 */
4670
4671		struct pf_pdesc	pd2;
4672		bzero(&pd2, sizeof pd2);
4673#ifdef INET
4674		struct ip	h2;
4675#endif /* INET */
4676#ifdef INET6
4677		struct ip6_hdr	h2_6;
4678		int		terminal = 0;
4679#endif /* INET6 */
4680		int		ipoff2 = 0;
4681		int		off2 = 0;
4682
4683		pd2.af = pd->af;
4684		/* Payload packet is from the opposite direction. */
4685		pd2.sidx = (direction == PF_IN) ? 1 : 0;
4686		pd2.didx = (direction == PF_IN) ? 0 : 1;
4687		switch (pd->af) {
4688#ifdef INET
4689		case AF_INET:
4690			/* offset of h2 in mbuf chain */
4691			ipoff2 = off + ICMP_MINLEN;
4692
4693			if (!pf_pull_hdr(m, ipoff2, &h2, sizeof(h2),
4694			    NULL, reason, pd2.af)) {
4695				DPFPRINTF(PF_DEBUG_MISC,
4696				    ("pf: ICMP error message too short "
4697				    "(ip)\n"));
4698				return (PF_DROP);
4699			}
4700			/*
4701			 * ICMP error messages don't refer to non-first
4702			 * fragments
4703			 */
4704			if (h2.ip_off & htons(IP_OFFMASK)) {
4705				REASON_SET(reason, PFRES_FRAG);
4706				return (PF_DROP);
4707			}
4708
4709			/* offset of protocol header that follows h2 */
4710			off2 = ipoff2 + (h2.ip_hl << 2);
4711
4712			pd2.proto = h2.ip_p;
4713			pd2.src = (struct pf_addr *)&h2.ip_src;
4714			pd2.dst = (struct pf_addr *)&h2.ip_dst;
4715			pd2.ip_sum = &h2.ip_sum;
4716			break;
4717#endif /* INET */
4718#ifdef INET6
4719		case AF_INET6:
4720			ipoff2 = off + sizeof(struct icmp6_hdr);
4721
4722			if (!pf_pull_hdr(m, ipoff2, &h2_6, sizeof(h2_6),
4723			    NULL, reason, pd2.af)) {
4724				DPFPRINTF(PF_DEBUG_MISC,
4725				    ("pf: ICMP error message too short "
4726				    "(ip6)\n"));
4727				return (PF_DROP);
4728			}
4729			pd2.proto = h2_6.ip6_nxt;
4730			pd2.src = (struct pf_addr *)&h2_6.ip6_src;
4731			pd2.dst = (struct pf_addr *)&h2_6.ip6_dst;
4732			pd2.ip_sum = NULL;
4733			off2 = ipoff2 + sizeof(h2_6);
4734			do {
4735				switch (pd2.proto) {
4736				case IPPROTO_FRAGMENT:
4737					/*
					 * ICMPv6 error messages don't
					 * refer to non-first fragments
4740					 */
4741					REASON_SET(reason, PFRES_FRAG);
4742					return (PF_DROP);
4743				case IPPROTO_AH:
4744				case IPPROTO_HOPOPTS:
4745				case IPPROTO_ROUTING:
4746				case IPPROTO_DSTOPTS: {
4747					/* get next header and header length */
4748					struct ip6_ext opt6;
4749
4750					if (!pf_pull_hdr(m, off2, &opt6,
4751					    sizeof(opt6), NULL, reason,
4752					    pd2.af)) {
4753						DPFPRINTF(PF_DEBUG_MISC,
4754						    ("pf: ICMPv6 short opt\n"));
4755						return (PF_DROP);
4756					}
4757					if (pd2.proto == IPPROTO_AH)
4758						off2 += (opt6.ip6e_len + 2) * 4;
4759					else
4760						off2 += (opt6.ip6e_len + 1) * 8;
4761					pd2.proto = opt6.ip6e_nxt;
					/* go to the next header */
4763					break;
4764				}
4765				default:
4766					terminal++;
4767					break;
4768				}
4769			} while (!terminal);
4770			break;
4771#endif /* INET6 */
4772		}
4773
4774		switch (pd2.proto) {
4775		case IPPROTO_TCP: {
4776			struct tcphdr		 th;
4777			u_int32_t		 seq;
4778			struct pf_state_peer	*src, *dst;
4779			u_int8_t		 dws;
4780			int			 copyback = 0;
4781
4782			/*
			 * Only the first 8 bytes of the TCP header can be
			 * expected; don't access any TCP header fields after
			 * th_seq, so an ackskew test is not possible.
4786			 */
4787			if (!pf_pull_hdr(m, off2, &th, 8, NULL, reason,
4788			    pd2.af)) {
4789				DPFPRINTF(PF_DEBUG_MISC,
4790				    ("pf: ICMP error message too short "
4791				    "(tcp)\n"));
4792				return (PF_DROP);
4793			}
4794
4795			key.af = pd2.af;
4796			key.proto = IPPROTO_TCP;
4797			PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
4798			PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
4799			key.port[pd2.sidx] = th.th_sport;
4800			key.port[pd2.didx] = th.th_dport;
4801
4802			STATE_LOOKUP(kif, &key, direction, *state, pd);
4803
4804			if (direction == (*state)->direction) {
4805				src = &(*state)->dst;
4806				dst = &(*state)->src;
4807			} else {
4808				src = &(*state)->src;
4809				dst = &(*state)->dst;
4810			}
4811
4812			if (src->wscale && dst->wscale)
4813				dws = dst->wscale & PF_WSCALE_MASK;
4814			else
4815				dws = 0;
4816
4817			/* Demodulate sequence number */
4818			seq = ntohl(th.th_seq) - src->seqdiff;
4819			if (src->seqdiff) {
4820				pf_change_a(&th.th_seq, icmpsum,
4821				    htonl(seq), 0);
4822				copyback = 1;
4823			}
4824
4825			if (!((*state)->state_flags & PFSTATE_SLOPPY) &&
4826			    (!SEQ_GEQ(src->seqhi, seq) ||
4827			    !SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)))) {
4828				if (V_pf_status.debug >= PF_DEBUG_MISC) {
4829					printf("pf: BAD ICMP %d:%d ",
4830					    icmptype, pd->hdr.icmp->icmp_code);
4831					pf_print_host(pd->src, 0, pd->af);
4832					printf(" -> ");
4833					pf_print_host(pd->dst, 0, pd->af);
4834					printf(" state: ");
4835					pf_print_state(*state);
4836					printf(" seq=%u\n", seq);
4837				}
4838				REASON_SET(reason, PFRES_BADSTATE);
4839				return (PF_DROP);
4840			} else {
4841				if (V_pf_status.debug >= PF_DEBUG_MISC) {
4842					printf("pf: OK ICMP %d:%d ",
4843					    icmptype, pd->hdr.icmp->icmp_code);
4844					pf_print_host(pd->src, 0, pd->af);
4845					printf(" -> ");
4846					pf_print_host(pd->dst, 0, pd->af);
4847					printf(" state: ");
4848					pf_print_state(*state);
4849					printf(" seq=%u\n", seq);
4850				}
4851			}
4852
4853			/* translate source/destination address, if necessary */
4854			if ((*state)->key[PF_SK_WIRE] !=
4855			    (*state)->key[PF_SK_STACK]) {
4856				struct pf_state_key *nk =
4857				    (*state)->key[pd->didx];
4858
4859				if (PF_ANEQ(pd2.src,
4860				    &nk->addr[pd2.sidx], pd2.af) ||
4861				    nk->port[pd2.sidx] != th.th_sport)
4862					pf_change_icmp(pd2.src, &th.th_sport,
4863					    daddr, &nk->addr[pd2.sidx],
4864					    nk->port[pd2.sidx], NULL,
4865					    pd2.ip_sum, icmpsum,
4866					    pd->ip_sum, 0, pd2.af);
4867
4868				if (PF_ANEQ(pd2.dst,
4869				    &nk->addr[pd2.didx], pd2.af) ||
4870				    nk->port[pd2.didx] != th.th_dport)
4871					pf_change_icmp(pd2.dst, &th.th_dport,
4872					    saddr, &nk->addr[pd2.didx],
4873					    nk->port[pd2.didx], NULL,
4874					    pd2.ip_sum, icmpsum,
4875					    pd->ip_sum, 0, pd2.af);
4876				copyback = 1;
4877			}
4878
4879			if (copyback) {
4880				switch (pd2.af) {
4881#ifdef INET
4882				case AF_INET:
4883					m_copyback(m, off, ICMP_MINLEN,
4884					    (caddr_t )pd->hdr.icmp);
4885					m_copyback(m, ipoff2, sizeof(h2),
4886					    (caddr_t )&h2);
4887					break;
4888#endif /* INET */
4889#ifdef INET6
4890				case AF_INET6:
4891					m_copyback(m, off,
4892					    sizeof(struct icmp6_hdr),
4893					    (caddr_t )pd->hdr.icmp6);
4894					m_copyback(m, ipoff2, sizeof(h2_6),
4895					    (caddr_t )&h2_6);
4896					break;
4897#endif /* INET6 */
4898				}
4899				m_copyback(m, off2, 8, (caddr_t)&th);
4900			}
4901
4902			return (PF_PASS);
4904		}
4905		case IPPROTO_UDP: {
4906			struct udphdr		uh;
4907
4908			if (!pf_pull_hdr(m, off2, &uh, sizeof(uh),
4909			    NULL, reason, pd2.af)) {
4910				DPFPRINTF(PF_DEBUG_MISC,
4911				    ("pf: ICMP error message too short "
4912				    "(udp)\n"));
4913				return (PF_DROP);
4914			}
4915
4916			key.af = pd2.af;
4917			key.proto = IPPROTO_UDP;
4918			PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
4919			PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
4920			key.port[pd2.sidx] = uh.uh_sport;
4921			key.port[pd2.didx] = uh.uh_dport;
4922
4923			STATE_LOOKUP(kif, &key, direction, *state, pd);
4924
4925			/* translate source/destination address, if necessary */
4926			if ((*state)->key[PF_SK_WIRE] !=
4927			    (*state)->key[PF_SK_STACK]) {
4928				struct pf_state_key *nk =
4929				    (*state)->key[pd->didx];
4930
4931				if (PF_ANEQ(pd2.src,
4932				    &nk->addr[pd2.sidx], pd2.af) ||
4933				    nk->port[pd2.sidx] != uh.uh_sport)
4934					pf_change_icmp(pd2.src, &uh.uh_sport,
4935					    daddr, &nk->addr[pd2.sidx],
4936					    nk->port[pd2.sidx], &uh.uh_sum,
4937					    pd2.ip_sum, icmpsum,
4938					    pd->ip_sum, 1, pd2.af);
4939
4940				if (PF_ANEQ(pd2.dst,
4941				    &nk->addr[pd2.didx], pd2.af) ||
4942				    nk->port[pd2.didx] != uh.uh_dport)
4943					pf_change_icmp(pd2.dst, &uh.uh_dport,
4944					    saddr, &nk->addr[pd2.didx],
4945					    nk->port[pd2.didx], &uh.uh_sum,
4946					    pd2.ip_sum, icmpsum,
4947					    pd->ip_sum, 1, pd2.af);
4948
4949				switch (pd2.af) {
4950#ifdef INET
4951				case AF_INET:
4952					m_copyback(m, off, ICMP_MINLEN,
4953					    (caddr_t )pd->hdr.icmp);
4954					m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2);
4955					break;
4956#endif /* INET */
4957#ifdef INET6
4958				case AF_INET6:
4959					m_copyback(m, off,
4960					    sizeof(struct icmp6_hdr),
4961					    (caddr_t )pd->hdr.icmp6);
4962					m_copyback(m, ipoff2, sizeof(h2_6),
4963					    (caddr_t )&h2_6);
4964					break;
4965#endif /* INET6 */
4966				}
4967				m_copyback(m, off2, sizeof(uh), (caddr_t)&uh);
4968			}
4969			return (PF_PASS);
4971		}
4972#ifdef INET
4973		case IPPROTO_ICMP: {
4974			struct icmp		iih;
4975
4976			if (!pf_pull_hdr(m, off2, &iih, ICMP_MINLEN,
4977			    NULL, reason, pd2.af)) {
4978				DPFPRINTF(PF_DEBUG_MISC,
4979				    ("pf: ICMP error message too short i"
4980				    "(icmp)\n"));
4981				return (PF_DROP);
4982			}
4983
4984			key.af = pd2.af;
4985			key.proto = IPPROTO_ICMP;
4986			PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
4987			PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
4988			key.port[0] = key.port[1] = iih.icmp_id;
4989
4990			STATE_LOOKUP(kif, &key, direction, *state, pd);
4991
4992			/* translate source/destination address, if necessary */
4993			if ((*state)->key[PF_SK_WIRE] !=
4994			    (*state)->key[PF_SK_STACK]) {
4995				struct pf_state_key *nk =
4996				    (*state)->key[pd->didx];
4997
4998				if (PF_ANEQ(pd2.src,
4999				    &nk->addr[pd2.sidx], pd2.af) ||
5000				    nk->port[pd2.sidx] != iih.icmp_id)
5001					pf_change_icmp(pd2.src, &iih.icmp_id,
5002					    daddr, &nk->addr[pd2.sidx],
5003					    nk->port[pd2.sidx], NULL,
5004					    pd2.ip_sum, icmpsum,
5005					    pd->ip_sum, 0, AF_INET);
5006
5007				if (PF_ANEQ(pd2.dst,
5008				    &nk->addr[pd2.didx], pd2.af) ||
5009				    nk->port[pd2.didx] != iih.icmp_id)
5010					pf_change_icmp(pd2.dst, &iih.icmp_id,
5011					    saddr, &nk->addr[pd2.didx],
5012					    nk->port[pd2.didx], NULL,
5013					    pd2.ip_sum, icmpsum,
5014					    pd->ip_sum, 0, AF_INET);
5015
5016				m_copyback(m, off, ICMP_MINLEN, (caddr_t)pd->hdr.icmp);
5017				m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2);
5018				m_copyback(m, off2, ICMP_MINLEN, (caddr_t)&iih);
5019			}
5020			return (PF_PASS);
5022		}
5023#endif /* INET */
5024#ifdef INET6
5025		case IPPROTO_ICMPV6: {
5026			struct icmp6_hdr	iih;
5027
5028			if (!pf_pull_hdr(m, off2, &iih,
5029			    sizeof(struct icmp6_hdr), NULL, reason, pd2.af)) {
5030				DPFPRINTF(PF_DEBUG_MISC,
5031				    ("pf: ICMP error message too short "
5032				    "(icmp6)\n"));
5033				return (PF_DROP);
5034			}
5035
5036			key.af = pd2.af;
5037			key.proto = IPPROTO_ICMPV6;
5038			PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
5039			PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
5040			key.port[0] = key.port[1] = iih.icmp6_id;
5041
5042			STATE_LOOKUP(kif, &key, direction, *state, pd);
5043
5044			/* translate source/destination address, if necessary */
5045			if ((*state)->key[PF_SK_WIRE] !=
5046			    (*state)->key[PF_SK_STACK]) {
5047				struct pf_state_key *nk =
5048				    (*state)->key[pd->didx];
5049
5050				if (PF_ANEQ(pd2.src,
5051				    &nk->addr[pd2.sidx], pd2.af) ||
5052				    nk->port[pd2.sidx] != iih.icmp6_id)
5053					pf_change_icmp(pd2.src, &iih.icmp6_id,
5054					    daddr, &nk->addr[pd2.sidx],
5055					    nk->port[pd2.sidx], NULL,
5056					    pd2.ip_sum, icmpsum,
5057					    pd->ip_sum, 0, AF_INET6);
5058
5059				if (PF_ANEQ(pd2.dst,
5060				    &nk->addr[pd2.didx], pd2.af) ||
5061				    nk->port[pd2.didx] != iih.icmp6_id)
5062					pf_change_icmp(pd2.dst, &iih.icmp6_id,
5063					    saddr, &nk->addr[pd2.didx],
5064					    nk->port[pd2.didx], NULL,
5065					    pd2.ip_sum, icmpsum,
5066					    pd->ip_sum, 0, AF_INET6);
5067
5068				m_copyback(m, off, sizeof(struct icmp6_hdr),
5069				    (caddr_t)pd->hdr.icmp6);
5070				m_copyback(m, ipoff2, sizeof(h2_6), (caddr_t)&h2_6);
5071				m_copyback(m, off2, sizeof(struct icmp6_hdr),
5072				    (caddr_t)&iih);
5073			}
5074			return (PF_PASS);
5076		}
5077#endif /* INET6 */
5078		default: {
5079			key.af = pd2.af;
5080			key.proto = pd2.proto;
5081			PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
5082			PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
5083			key.port[0] = key.port[1] = 0;
5084
5085			STATE_LOOKUP(kif, &key, direction, *state, pd);
5086
5087			/* translate source/destination address, if necessary */
5088			if ((*state)->key[PF_SK_WIRE] !=
5089			    (*state)->key[PF_SK_STACK]) {
5090				struct pf_state_key *nk =
5091				    (*state)->key[pd->didx];
5092
5093				if (PF_ANEQ(pd2.src,
5094				    &nk->addr[pd2.sidx], pd2.af))
5095					pf_change_icmp(pd2.src, NULL, daddr,
5096					    &nk->addr[pd2.sidx], 0, NULL,
5097					    pd2.ip_sum, icmpsum,
5098					    pd->ip_sum, 0, pd2.af);
5099
5100				if (PF_ANEQ(pd2.dst,
5101				    &nk->addr[pd2.didx], pd2.af))
5102					pf_change_icmp(pd2.dst, NULL, saddr,
5103					    &nk->addr[pd2.didx], 0, NULL,
5104					    pd2.ip_sum, icmpsum,
5105					    pd->ip_sum, 0, pd2.af);
5106
5107				switch (pd2.af) {
5108#ifdef INET
5109				case AF_INET:
5110					m_copyback(m, off, ICMP_MINLEN,
5111					    (caddr_t)pd->hdr.icmp);
5112					m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2);
5113					break;
5114#endif /* INET */
5115#ifdef INET6
5116				case AF_INET6:
5117					m_copyback(m, off,
5118					    sizeof(struct icmp6_hdr),
5119					    (caddr_t )pd->hdr.icmp6);
5120					m_copyback(m, ipoff2, sizeof(h2_6),
5121					    (caddr_t )&h2_6);
5122					break;
5123#endif /* INET6 */
5124				}
5125			}
5126			return (PF_PASS);
5128		}
5129		}
5130	}
5131}
5132
5133static int
5134pf_test_state_other(struct pf_state **state, int direction, struct pfi_kif *kif,
5135    struct mbuf *m, struct pf_pdesc *pd)
5136{
5137	struct pf_state_peer	*src, *dst;
5138	struct pf_state_key_cmp	 key;
5139
5140	bzero(&key, sizeof(key));
5141	key.af = pd->af;
5142	key.proto = pd->proto;
5143	if (direction == PF_IN)	{
5144		PF_ACPY(&key.addr[0], pd->src, key.af);
5145		PF_ACPY(&key.addr[1], pd->dst, key.af);
5146		key.port[0] = key.port[1] = 0;
5147	} else {
5148		PF_ACPY(&key.addr[1], pd->src, key.af);
5149		PF_ACPY(&key.addr[0], pd->dst, key.af);
5150		key.port[1] = key.port[0] = 0;
5151	}
5152
5153	STATE_LOOKUP(kif, &key, direction, *state, pd);
5154
5155	if (direction == (*state)->direction) {
5156		src = &(*state)->src;
5157		dst = &(*state)->dst;
5158	} else {
5159		src = &(*state)->dst;
5160		dst = &(*state)->src;
5161	}
5162
5163	/* update states */
5164	if (src->state < PFOTHERS_SINGLE)
5165		src->state = PFOTHERS_SINGLE;
5166	if (dst->state == PFOTHERS_SINGLE)
5167		dst->state = PFOTHERS_MULTIPLE;
5168
5169	/* update expire time */
5170	(*state)->expire = time_uptime;
5171	if (src->state == PFOTHERS_MULTIPLE && dst->state == PFOTHERS_MULTIPLE)
5172		(*state)->timeout = PFTM_OTHER_MULTIPLE;
5173	else
5174		(*state)->timeout = PFTM_OTHER_SINGLE;
5175
5176	/* translate source/destination address, if necessary */
5177	if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
5178		struct pf_state_key *nk = (*state)->key[pd->didx];
5179
5180		KASSERT(nk, ("%s: nk is null", __func__));
5181		KASSERT(pd, ("%s: pd is null", __func__));
5182		KASSERT(pd->src, ("%s: pd->src is null", __func__));
5183		KASSERT(pd->dst, ("%s: pd->dst is null", __func__));
5184		switch (pd->af) {
5185#ifdef INET
5186		case AF_INET:
5187			if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], AF_INET))
5188				pf_change_a(&pd->src->v4.s_addr,
5189				    pd->ip_sum,
5190				    nk->addr[pd->sidx].v4.s_addr,
5191				    0);
5192
			if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], AF_INET))
				pf_change_a(&pd->dst->v4.s_addr,
				    pd->ip_sum,
				    nk->addr[pd->didx].v4.s_addr,
				    0);

			break;
5201#endif /* INET */
5202#ifdef INET6
5203		case AF_INET6:
			if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], AF_INET6))
				PF_ACPY(pd->src, &nk->addr[pd->sidx], pd->af);

			if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], AF_INET6))
5208				PF_ACPY(pd->dst, &nk->addr[pd->didx], pd->af);
5209#endif /* INET6 */
5210		}
5211	}
5212	return (PF_PASS);
5213}
5214
5215/*
5216 * ipoff and off are measured from the start of the mbuf chain.
5217 * h must be at "ipoff" on the mbuf chain.
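 * Returns p with len bytes of header copied out of the chain, or NULL
 * with *actionp and *reasonp set when the requested header lies in a
 * non-first fragment or extends past the end of the packet.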
5218 */
5219void *
5220pf_pull_hdr(struct mbuf *m, int off, void *p, int len,
5221    u_short *actionp, u_short *reasonp, sa_family_t af)
5222{
5223	switch (af) {
5224#ifdef INET
5225	case AF_INET: {
5226		struct ip	*h = mtod(m, struct ip *);
5227		u_int16_t	 fragoff = (ntohs(h->ip_off) & IP_OFFMASK) << 3;
5228
5229		if (fragoff) {
5230			if (fragoff >= len)
5231				ACTION_SET(actionp, PF_PASS);
5232			else {
5233				ACTION_SET(actionp, PF_DROP);
5234				REASON_SET(reasonp, PFRES_FRAG);
5235			}
5236			return (NULL);
5237		}
5238		if (m->m_pkthdr.len < off + len ||
5239		    ntohs(h->ip_len) < off + len) {
5240			ACTION_SET(actionp, PF_DROP);
5241			REASON_SET(reasonp, PFRES_SHORT);
5242			return (NULL);
5243		}
5244		break;
5245	}
5246#endif /* INET */
5247#ifdef INET6
5248	case AF_INET6: {
5249		struct ip6_hdr	*h = mtod(m, struct ip6_hdr *);
5250
5251		if (m->m_pkthdr.len < off + len ||
5252		    (ntohs(h->ip6_plen) + sizeof(struct ip6_hdr)) <
5253		    (unsigned)(off + len)) {
5254			ACTION_SET(actionp, PF_DROP);
5255			REASON_SET(reasonp, PFRES_SHORT);
5256			return (NULL);
5257		}
5258		break;
5259	}
5260#endif /* INET6 */
5261	}
5262	m_copydata(m, off, len, p);
5263	return (p);
5264}
5265
5266#ifdef RADIX_MPATH
5267static int
5268pf_routable_oldmpath(struct pf_addr *addr, sa_family_t af, struct pfi_kif *kif,
5269    int rtableid)
5270{
5271	struct radix_node_head	*rnh;
5272	struct sockaddr_in	*dst;
5273	int			 ret = 1;
5274	int			 check_mpath;
5275#ifdef INET6
5276	struct sockaddr_in6	*dst6;
5277	struct route_in6	 ro;
5278#else
5279	struct route		 ro;
5280#endif
5281	struct radix_node	*rn;
5282	struct rtentry		*rt;
5283	struct ifnet		*ifp;
5284
5285	check_mpath = 0;
5286	/* XXX: stick to table 0 for now */
5287	rnh = rt_tables_get_rnh(0, af);
5288	if (rnh != NULL && rn_mpath_capable(rnh))
5289		check_mpath = 1;
5290	bzero(&ro, sizeof(ro));
5291	switch (af) {
5292	case AF_INET:
5293		dst = satosin(&ro.ro_dst);
5294		dst->sin_family = AF_INET;
5295		dst->sin_len = sizeof(*dst);
5296		dst->sin_addr = addr->v4;
5297		break;
5298#ifdef INET6
5299	case AF_INET6:
5300		/*
5301		 * Skip check for addresses with embedded interface scope,
5302		 * as they would always match anyway.
5303		 */
5304		if (IN6_IS_SCOPE_EMBED(&addr->v6))
5305			goto out;
5306		dst6 = (struct sockaddr_in6 *)&ro.ro_dst;
5307		dst6->sin6_family = AF_INET6;
5308		dst6->sin6_len = sizeof(*dst6);
5309		dst6->sin6_addr = addr->v6;
5310		break;
5311#endif /* INET6 */
5312	default:
5313		return (0);
5314	}
5315
5316	/* Skip checks for ipsec interfaces */
5317	if (kif != NULL && kif->pfik_ifp->if_type == IFT_ENC)
5318		goto out;
5319
5320	switch (af) {
5321#ifdef INET6
5322	case AF_INET6:
5323		in6_rtalloc_ign(&ro, 0, rtableid);
5324		break;
5325#endif
5326#ifdef INET
5327	case AF_INET:
5328		in_rtalloc_ign((struct route *)&ro, 0, rtableid);
5329		break;
5330#endif
5331	}
5332
5333	if (ro.ro_rt != NULL) {
5334		/* No interface given, this is a no-route check */
5335		if (kif == NULL)
5336			goto out;
5337
5338		if (kif->pfik_ifp == NULL) {
5339			ret = 0;
5340			goto out;
5341		}
5342
5343		/* Perform uRPF check if passed input interface */
5344		ret = 0;
5345		rn = (struct radix_node *)ro.ro_rt;
5346		do {
5347			rt = (struct rtentry *)rn;
5348			ifp = rt->rt_ifp;
5349
5350			if (kif->pfik_ifp == ifp)
5351				ret = 1;
5352			rn = rn_mpath_next(rn);
5353		} while (check_mpath == 1 && rn != NULL && ret == 0);
5354	} else
5355		ret = 0;
5356out:
5357	if (ro.ro_rt != NULL)
5358		RTFREE(ro.ro_rt);
5359	return (ret);
5360}
5361#endif
5362
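/*
 * Check that addr is routable and, when an input interface is given,
 * that the route's egress interface matches it (a loose uRPF check).
 */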
5363int
5364pf_routable(struct pf_addr *addr, sa_family_t af, struct pfi_kif *kif,
5365    int rtableid)
5366{
5367#ifdef INET
5368	struct nhop4_basic	nh4;
5369#endif
5370#ifdef INET6
5371	struct nhop6_basic	nh6;
5372#endif
5373	struct ifnet		*ifp;
5374#ifdef RADIX_MPATH
5375	struct radix_node_head	*rnh;
5376
5377	/* XXX: stick to table 0 for now */
5378	rnh = rt_tables_get_rnh(0, af);
5379	if (rnh != NULL && rn_mpath_capable(rnh))
5380		return (pf_routable_oldmpath(addr, af, kif, rtableid));
5381#endif
5382	/*
5383	 * Skip check for addresses with embedded interface scope,
5384	 * as they would always match anyway.
5385	 */
5386	if (af == AF_INET6 && IN6_IS_SCOPE_EMBED(&addr->v6))
5387		return (1);
5388
5389	if (af != AF_INET && af != AF_INET6)
5390		return (0);
5391
5392	/* Skip checks for ipsec interfaces */
5393	if (kif != NULL && kif->pfik_ifp->if_type == IFT_ENC)
5394		return (1);
5395
5396	ifp = NULL;
5397
5398	switch (af) {
5399#ifdef INET6
5400	case AF_INET6:
		if (fib6_lookup_nh_basic(rtableid, &addr->v6, 0, 0, 0,
		    &nh6) != 0)
5402			return (0);
5403		ifp = nh6.nh_ifp;
5404		break;
5405#endif
5406#ifdef INET
5407	case AF_INET:
5408		if (fib4_lookup_nh_basic(rtableid, addr->v4, 0, 0, &nh4) != 0)
5409			return (0);
5410		ifp = nh4.nh_ifp;
5411		break;
5412#endif
5413	}
5414
5415	/* No interface given, this is a no-route check */
5416	if (kif == NULL)
5417		return (1);
5418
5419	if (kif->pfik_ifp == NULL)
5420		return (0);
5421
5422	/* Perform uRPF check if passed input interface */
5423	if (kif->pfik_ifp == ifp)
5424		return (1);
5425	return (0);
5426}
5427
5428#ifdef INET
5429static void
5430pf_route(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp,
5431    struct pf_state *s, struct pf_pdesc *pd)
5432{
5433	struct mbuf		*m0, *m1;
5434	struct sockaddr_in	dst;
5435	struct ip		*ip;
5436	struct ifnet		*ifp = NULL;
5437	struct pf_addr		 naddr;
5438	struct pf_src_node	*sn = NULL;
5439	int			 error = 0;
5440	uint16_t		 ip_len, ip_off;
5441
5442	KASSERT(m && *m && r && oifp, ("%s: invalid parameters", __func__));
5443	KASSERT(dir == PF_IN || dir == PF_OUT, ("%s: invalid direction",
5444	    __func__));
5445
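	/* Guard against route-to loops: give up after a few passes. */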
5446	if ((pd->pf_mtag == NULL &&
5447	    ((pd->pf_mtag = pf_get_mtag(*m)) == NULL)) ||
5448	    pd->pf_mtag->routed++ > 3) {
5449		m0 = *m;
5450		*m = NULL;
5451		goto bad_locked;
5452	}
5453
5454	if (r->rt == PF_DUPTO) {
5455		if ((m0 = m_dup(*m, M_NOWAIT)) == NULL) {
5456			if (s)
5457				PF_STATE_UNLOCK(s);
5458			return;
5459		}
5460	} else {
5461		if ((r->rt == PF_REPLYTO) == (r->direction == dir)) {
5462			if (s)
5463				PF_STATE_UNLOCK(s);
5464			return;
5465		}
5466		m0 = *m;
5467	}
5468
5469	ip = mtod(m0, struct ip *);
5470
5471	bzero(&dst, sizeof(dst));
5472	dst.sin_family = AF_INET;
5473	dst.sin_len = sizeof(dst);
5474	dst.sin_addr = ip->ip_dst;
5475
5476	if (r->rt == PF_FASTROUTE) {
5477		struct nhop4_basic nh4;
5478
5479		if (s)
5480			PF_STATE_UNLOCK(s);
5481
5482		if (fib4_lookup_nh_basic(M_GETFIB(m0), ip->ip_dst, 0,
5483		    m0->m_pkthdr.flowid, &nh4) != 0) {
5484			KMOD_IPSTAT_INC(ips_noroute);
5485			error = EHOSTUNREACH;
5486			goto bad;
5487		}
5488
5489		ifp = nh4.nh_ifp;
5490		dst.sin_addr = nh4.nh_addr;
5491	} else {
5492		if (TAILQ_EMPTY(&r->rpool.list)) {
5493			DPFPRINTF(PF_DEBUG_URGENT,
5494			    ("%s: TAILQ_EMPTY(&r->rpool.list)\n", __func__));
5495			goto bad_locked;
5496		}
5497		if (s == NULL) {
5498			pf_map_addr(AF_INET, r, (struct pf_addr *)&ip->ip_src,
5499			    &naddr, NULL, &sn);
5500			if (!PF_AZERO(&naddr, AF_INET))
5501				dst.sin_addr.s_addr = naddr.v4.s_addr;
5502			ifp = r->rpool.cur->kif ?
5503			    r->rpool.cur->kif->pfik_ifp : NULL;
5504		} else {
5505			if (!PF_AZERO(&s->rt_addr, AF_INET))
5506				dst.sin_addr.s_addr =
5507				    s->rt_addr.v4.s_addr;
5508			ifp = s->rt_kif ? s->rt_kif->pfik_ifp : NULL;
5509			PF_STATE_UNLOCK(s);
5510		}
5511	}
5512	if (ifp == NULL)
5513		goto bad;
5514
5515	if (oifp != ifp) {
5516		if (pf_test(PF_OUT, 0, ifp, &m0, NULL) != PF_PASS)
5517			goto bad;
5518		else if (m0 == NULL)
5519			goto done;
5520		if (m0->m_len < sizeof(struct ip)) {
5521			DPFPRINTF(PF_DEBUG_URGENT,
5522			    ("%s: m0->m_len < sizeof(struct ip)\n", __func__));
5523			goto bad;
5524		}
5525		ip = mtod(m0, struct ip *);
5526	}
5527
5528	if (ifp->if_flags & IFF_LOOPBACK)
5529		m0->m_flags |= M_SKIP_FIREWALL;
5530
5531	ip_len = ntohs(ip->ip_len);
5532	ip_off = ntohs(ip->ip_off);
5533
5534	/* Copied from FreeBSD 10.0-CURRENT ip_output. */
5535	m0->m_pkthdr.csum_flags |= CSUM_IP;
5536	if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA & ~ifp->if_hwassist) {
5537		in_delayed_cksum(m0);
5538		m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
5539	}
5540#ifdef SCTP
5541	if (m0->m_pkthdr.csum_flags & CSUM_SCTP & ~ifp->if_hwassist) {
		sctp_delayed_cksum(m0, (uint32_t)(ip->ip_hl << 2));
5543		m0->m_pkthdr.csum_flags &= ~CSUM_SCTP;
5544	}
5545#endif
5546
5547	/*
5548	 * If small enough for interface, or the interface will take
5549	 * care of the fragmentation for us, we can just send directly.
5550	 */
5551	if (ip_len <= ifp->if_mtu ||
5552	    (m0->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0) {
5553		ip->ip_sum = 0;
5554		if (m0->m_pkthdr.csum_flags & CSUM_IP & ~ifp->if_hwassist) {
5555			ip->ip_sum = in_cksum(m0, ip->ip_hl << 2);
5556			m0->m_pkthdr.csum_flags &= ~CSUM_IP;
5557		}
5558		m_clrprotoflags(m0);	/* Avoid confusing lower layers. */
5559		error = (*ifp->if_output)(ifp, m0, sintosa(&dst), NULL);
5560		goto done;
5561	}
5562
5563	/* Balk when DF bit is set or the interface didn't support TSO. */
5564	if ((ip_off & IP_DF) || (m0->m_pkthdr.csum_flags & CSUM_TSO)) {
5565		error = EMSGSIZE;
5566		KMOD_IPSTAT_INC(ips_cantfrag);
5567		if (r->rt != PF_DUPTO) {
5568			icmp_error(m0, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG, 0,
5569			    ifp->if_mtu);
5570			goto done;
5571		} else
5572			goto bad;
5573	}
5574
5575	error = ip_fragment(ip, &m0, ifp->if_mtu, ifp->if_hwassist);
5576	if (error)
5577		goto bad;
5578
5579	for (; m0; m0 = m1) {
5580		m1 = m0->m_nextpkt;
5581		m0->m_nextpkt = NULL;
5582		if (error == 0) {
5583			m_clrprotoflags(m0);
5584			error = (*ifp->if_output)(ifp, m0, sintosa(&dst), NULL);
5585		} else
5586			m_freem(m0);
5587	}
5588
5589	if (error == 0)
5590		KMOD_IPSTAT_INC(ips_fragmented);
5591
5592done:
5593	if (r->rt != PF_DUPTO)
5594		*m = NULL;
5595	return;
5596
5597bad_locked:
5598	if (s)
5599		PF_STATE_UNLOCK(s);
5600bad:
5601	m_freem(m0);
5602	goto done;
5603}
5604#endif /* INET */
5605
5606#ifdef INET6
5607static void
5608pf_route6(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp,
5609    struct pf_state *s, struct pf_pdesc *pd)
5610{
5611	struct mbuf		*m0;
5612	struct sockaddr_in6	dst;
5613	struct ip6_hdr		*ip6;
5614	struct ifnet		*ifp = NULL;
5615	struct pf_addr		 naddr;
5616	struct pf_src_node	*sn = NULL;
5617
5618	KASSERT(m && *m && r && oifp, ("%s: invalid parameters", __func__));
5619	KASSERT(dir == PF_IN || dir == PF_OUT, ("%s: invalid direction",
5620	    __func__));
5621
5622	if ((pd->pf_mtag == NULL &&
5623	    ((pd->pf_mtag = pf_get_mtag(*m)) == NULL)) ||
5624	    pd->pf_mtag->routed++ > 3) {
5625		m0 = *m;
5626		*m = NULL;
5627		goto bad_locked;
5628	}
5629
5630	if (r->rt == PF_DUPTO) {
5631		if ((m0 = m_dup(*m, M_NOWAIT)) == NULL) {
5632			if (s)
5633				PF_STATE_UNLOCK(s);
5634			return;
5635		}
5636	} else {
5637		if ((r->rt == PF_REPLYTO) == (r->direction == dir)) {
5638			if (s)
5639				PF_STATE_UNLOCK(s);
5640			return;
5641		}
5642		m0 = *m;
5643	}
5644
5645	ip6 = mtod(m0, struct ip6_hdr *);
5646
5647	bzero(&dst, sizeof(dst));
5648	dst.sin6_family = AF_INET6;
5649	dst.sin6_len = sizeof(dst);
5650	dst.sin6_addr = ip6->ip6_dst;
5651
5652	/* Cheat. XXX why only in the v6 case??? */
5653	if (r->rt == PF_FASTROUTE) {
5654		if (s)
5655			PF_STATE_UNLOCK(s);
5656		m0->m_flags |= M_SKIP_FIREWALL;
5657		ip6_output(m0, NULL, NULL, 0, NULL, NULL, NULL);
5658		*m = NULL;
5659		return;
5660	}
5661
5662	if (TAILQ_EMPTY(&r->rpool.list)) {
5663		DPFPRINTF(PF_DEBUG_URGENT,
5664		    ("%s: TAILQ_EMPTY(&r->rpool.list)\n", __func__));
5665		goto bad_locked;
5666	}
5667	if (s == NULL) {
5668		pf_map_addr(AF_INET6, r, (struct pf_addr *)&ip6->ip6_src,
5669		    &naddr, NULL, &sn);
5670		if (!PF_AZERO(&naddr, AF_INET6))
5671			PF_ACPY((struct pf_addr *)&dst.sin6_addr,
5672			    &naddr, AF_INET6);
5673		ifp = r->rpool.cur->kif ? r->rpool.cur->kif->pfik_ifp : NULL;
5674	} else {
5675		if (!PF_AZERO(&s->rt_addr, AF_INET6))
5676			PF_ACPY((struct pf_addr *)&dst.sin6_addr,
5677			    &s->rt_addr, AF_INET6);
5678		ifp = s->rt_kif ? s->rt_kif->pfik_ifp : NULL;
5679	}
5680
5681	if (s)
5682		PF_STATE_UNLOCK(s);
5683
5684	if (ifp == NULL)
5685		goto bad;
5686
5687	if (oifp != ifp) {
5688		if (pf_test6(PF_OUT, PFIL_FWD, ifp, &m0, NULL) != PF_PASS)
5689			goto bad;
5690		else if (m0 == NULL)
5691			goto done;
5692		if (m0->m_len < sizeof(struct ip6_hdr)) {
5693			DPFPRINTF(PF_DEBUG_URGENT,
5694			    ("%s: m0->m_len < sizeof(struct ip6_hdr)\n",
5695			    __func__));
5696			goto bad;
5697		}
5698		ip6 = mtod(m0, struct ip6_hdr *);
5699	}
5700
5701	if (ifp->if_flags & IFF_LOOPBACK)
5702		m0->m_flags |= M_SKIP_FIREWALL;
5703
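	/* Finish delayed v6 checksums that the interface will not offload. */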
5704	if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6 &
5705	    ~ifp->if_hwassist) {
5706		uint32_t plen = m0->m_pkthdr.len - sizeof(*ip6);
5707		in6_delayed_cksum(m0, plen, sizeof(struct ip6_hdr));
5708		m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA_IPV6;
5709	}
5710
5711	/*
5712	 * If the packet is too large for the outgoing interface,
5713	 * send back an icmp6 error.
5714	 */
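	/* KAME-style scoped addresses embed the zone id in the address. */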
5715	if (IN6_IS_SCOPE_EMBED(&dst.sin6_addr))
5716		dst.sin6_addr.s6_addr16[1] = htons(ifp->if_index);
5717	if ((u_long)m0->m_pkthdr.len <= ifp->if_mtu)
5718		nd6_output_ifp(ifp, ifp, m0, &dst, NULL);
5719	else {
5720		in6_ifstat_inc(ifp, ifs6_in_toobig);
5721		if (r->rt != PF_DUPTO)
5722			icmp6_error(m0, ICMP6_PACKET_TOO_BIG, 0, ifp->if_mtu);
5723		else
5724			goto bad;
5725	}
5726
5727done:
5728	if (r->rt != PF_DUPTO)
5729		*m = NULL;
5730	return;
5731
5732bad_locked:
5733	if (s)
5734		PF_STATE_UNLOCK(s);
5735bad:
5736	m_freem(m0);
5737	goto done;
5738}
5739#endif /* INET6 */
5740
5741/*
5742 * FreeBSD supports cksum offload for the following drivers:
5743 *  em(4), fxp(4), ixgb(4), lge(4), ndis(4), nge(4), re(4),
5744 *  ti(4), txp(4), xl(4)
5745 *
5746 * CSUM_DATA_VALID | CSUM_PSEUDO_HDR :
5747 *  the network driver performed the cksum, including the pseudo header;
5748 *  only csum_data needs to be verified.
5749 * CSUM_DATA_VALID :
5750 *  the network driver performed the cksum, but the pseudo header still
5751 *  has to be folded into the partial csum_data (i.e. no H/W support for
5752 *  the pseudo header, for instance hme(4), sk(4) and possibly gem(4)).
5753 *
5754 * After validating the cksum of the packet, set both CSUM_DATA_VALID and
5755 * CSUM_PSEUDO_HDR so that the upper TCP/UDP layer does not recompute
5756 * the cksum.
5757 * Also, set csum_data to 0xffff to force cksum validation.
5758 */
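
/*
 * Illustrative sketch only, never compiled: roughly how an upper-layer
 * input path consumes the flags set by pf_check_proto_cksum() below,
 * assuming local "m", "ip" and "len" variables like the ones used in
 * that function.  With csum_data forced to 0xffff the xor yields 0,
 * i.e. "checksum good", and no software in_cksum() pass is made.
 */
#if 0
	if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
		u_int16_t sum;

		if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
			sum = m->m_pkthdr.csum_data;
		else
			sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
			    htonl(len + m->m_pkthdr.csum_data + IPPROTO_TCP));
		if ((sum ^ 0xffff) == 0) {
			/* Verified by hardware; skip software in_cksum(). */
		}
	}
#endif
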
5759static int
5760pf_check_proto_cksum(struct mbuf *m, int off, int len, u_int8_t p, sa_family_t af)
5761{
5762	u_int16_t sum = 0;
5763	int hw_assist = 0;
5764	struct ip *ip;
5765
5766	if (off < sizeof(struct ip) || len < sizeof(struct udphdr))
5767		return (1);
5768	if (m->m_pkthdr.len < off + len)
5769		return (1);
5770
5771	switch (p) {
5772	case IPPROTO_TCP:
5773		if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
5774			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) {
5775				sum = m->m_pkthdr.csum_data;
5776			} else {
5777				ip = mtod(m, struct ip *);
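				/*
				 * Fold the pseudo header into the partial
				 * data cksum computed by the driver.
				 */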
5778				sum = in_pseudo(ip->ip_src.s_addr,
5779				    ip->ip_dst.s_addr, htonl((u_short)len +
5780				    m->m_pkthdr.csum_data + IPPROTO_TCP));
5781			}
5782			sum ^= 0xffff;
5783			++hw_assist;
5784		}
5785		break;
5786	case IPPROTO_UDP:
5787		if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
5788			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) {
5789				sum = m->m_pkthdr.csum_data;
5790			} else {
5791				ip = mtod(m, struct ip *);
5792				sum = in_pseudo(ip->ip_src.s_addr,
5793				    ip->ip_dst.s_addr, htonl((u_short)len +
5794				    m->m_pkthdr.csum_data + IPPROTO_UDP));
5795			}
5796			sum ^= 0xffff;
5797			++hw_assist;
5798		}
5799		break;
5800	case IPPROTO_ICMP:
5801#ifdef INET6
5802	case IPPROTO_ICMPV6:
5803#endif /* INET6 */
5804		break;
5805	default:
5806		return (1);
5807	}
5808
5809	if (!hw_assist) {
5810		switch (af) {
5811		case AF_INET:
5812			if (p == IPPROTO_ICMP) {
5813				if (m->m_len < off)
5814					return (1);
5815				m->m_data += off;
5816				m->m_len -= off;
5817				sum = in_cksum(m, len);
5818				m->m_data -= off;
5819				m->m_len += off;
5820			} else {
5821				if (m->m_len < sizeof(struct ip))
5822					return (1);
5823				sum = in4_cksum(m, p, off, len);
5824			}
5825			break;
5826#ifdef INET6
5827		case AF_INET6:
5828			if (m->m_len < sizeof(struct ip6_hdr))
5829				return (1);
5830			sum = in6_cksum(m, p, off, len);
5831			break;
5832#endif /* INET6 */
5833		default:
5834			return (1);
5835		}
5836	}
5837	if (sum) {
5838		switch (p) {
5839		case IPPROTO_TCP:
5840		    {
5841			KMOD_TCPSTAT_INC(tcps_rcvbadsum);
5842			break;
5843		    }
5844		case IPPROTO_UDP:
5845		    {
5846			KMOD_UDPSTAT_INC(udps_badsum);
5847			break;
5848		    }
5849#ifdef INET
5850		case IPPROTO_ICMP:
5851		    {
5852			KMOD_ICMPSTAT_INC(icps_checksum);
5853			break;
5854		    }
5855#endif
5856#ifdef INET6
5857		case IPPROTO_ICMPV6:
5858		    {
5859			KMOD_ICMP6STAT_INC(icp6s_checksum);
5860			break;
5861		    }
5862#endif /* INET6 */
5863		}
5864		return (1);
5865	} else {
5866		if (p == IPPROTO_TCP || p == IPPROTO_UDP) {
5867			m->m_pkthdr.csum_flags |=
5868			    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
5869			m->m_pkthdr.csum_data = 0xffff;
5870		}
5871	}
5872	return (0);
5873}
5874
5876#ifdef INET
5877int
5878pf_test(int dir, int pflags, struct ifnet *ifp, struct mbuf **m0, struct inpcb *inp)
5879{
5880	struct pfi_kif		*kif;
5881	u_short			 action, reason = 0, log = 0;
5882	struct mbuf		*m = *m0;
5883	struct ip		*h = NULL;
5884	struct m_tag		*ipfwtag;
5885	struct pf_rule		*a = NULL, *r = &V_pf_default_rule, *tr, *nr;
5886	struct pf_state		*s = NULL;
5887	struct pf_ruleset	*ruleset = NULL;
5888	struct pf_pdesc		 pd;
5889	int			 off, dirndx, pqid = 0;
5890
5891	M_ASSERTPKTHDR(m);
5892
5893	if (!V_pf_status.running)
5894		return (PF_PASS);
5895
5896	memset(&pd, 0, sizeof(pd));
5897
5898	kif = (struct pfi_kif *)ifp->if_pf_kif;
5899
5900	if (kif == NULL) {
5901		DPFPRINTF(PF_DEBUG_URGENT,
5902		    ("pf_test: kif == NULL, if_xname %s\n", ifp->if_xname));
5903		return (PF_DROP);
5904	}
5905	if (kif->pfik_flags & PFI_IFLAG_SKIP)
5906		return (PF_PASS);
5907
5908	if (m->m_flags & M_SKIP_FIREWALL)
5909		return (PF_PASS);
5910
5911	pd.pf_mtag = pf_find_mtag(m);
5912
5913	PF_RULES_RLOCK();
5914
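	/*
	 * If divert(4) re-injected this packet, ipfw's rule tag is still
	 * attached; mark the packet as looped so it is not diverted again.
	 */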
5915	if (ip_divert_ptr != NULL &&
5916	    ((ipfwtag = m_tag_locate(m, MTAG_IPFW_RULE, 0, NULL)) != NULL)) {
5917		struct ipfw_rule_ref *rr = (struct ipfw_rule_ref *)(ipfwtag+1);
5918		if (rr->info & IPFW_IS_DIVERT && rr->rulenum == 0) {
5919			if (pd.pf_mtag == NULL &&
5920			    ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) {
5921				action = PF_DROP;
5922				goto done;
5923			}
5924			pd.pf_mtag->flags |= PF_PACKET_LOOPED;
5925			m_tag_delete(m, ipfwtag);
5926		}
5927		if (pd.pf_mtag && pd.pf_mtag->flags & PF_FASTFWD_OURS_PRESENT) {
5928			m->m_flags |= M_FASTFWD_OURS;
5929			pd.pf_mtag->flags &= ~PF_FASTFWD_OURS_PRESENT;
5930		}
5931	} else if (pf_normalize_ip(m0, dir, kif, &reason, &pd) != PF_PASS) {
5932		/* We do IP header normalization and packet reassembly here */
5933		action = PF_DROP;
5934		goto done;
5935	}
5936	m = *m0;	/* pf_normalize messes with m0 */
5937	h = mtod(m, struct ip *);
5938
5939	off = h->ip_hl << 2;
5940	if (off < (int)sizeof(struct ip)) {
5941		action = PF_DROP;
5942		REASON_SET(&reason, PFRES_SHORT);
5943		log = 1;
5944		goto done;
5945	}
5946
5947	pd.src = (struct pf_addr *)&h->ip_src;
5948	pd.dst = (struct pf_addr *)&h->ip_dst;
5949	pd.sport = pd.dport = NULL;
5950	pd.ip_sum = &h->ip_sum;
5951	pd.proto_sum = NULL;
5952	pd.proto = h->ip_p;
5953	pd.dir = dir;
5954	pd.sidx = (dir == PF_IN) ? 0 : 1;
5955	pd.didx = (dir == PF_IN) ? 1 : 0;
5956	pd.af = AF_INET;
5957	pd.tos = h->ip_tos;
5958	pd.tot_len = ntohs(h->ip_len);
5959
5960	/* handle fragments that didn't get reassembled by normalization */
5961	if (h->ip_off & htons(IP_MF | IP_OFFMASK)) {
5962		action = pf_test_fragment(&r, dir, kif, m, h,
5963		    &pd, &a, &ruleset);
5964		goto done;
5965	}
5966
5967	switch (h->ip_p) {
5968
5969	case IPPROTO_TCP: {
5970		struct tcphdr	th;
5971
5972		pd.hdr.tcp = &th;
5973		if (!pf_pull_hdr(m, off, &th, sizeof(th),
5974		    &action, &reason, AF_INET)) {
5975			log = action != PF_PASS;
5976			goto done;
5977		}
5978		pd.p_len = pd.tot_len - off - (th.th_off << 2);
5979		if ((th.th_flags & TH_ACK) && pd.p_len == 0)
5980			pqid = 1;
5981		action = pf_normalize_tcp(dir, kif, m, 0, off, h, &pd);
5982		if (action == PF_DROP)
5983			goto done;
5984		action = pf_test_state_tcp(&s, dir, kif, m, off, h, &pd,
5985		    &reason);
5986		if (action == PF_PASS) {
5987			if (pfsync_update_state_ptr != NULL)
5988				pfsync_update_state_ptr(s);
5989			r = s->rule.ptr;
5990			a = s->anchor.ptr;
5991			log = s->log;
5992		} else if (s == NULL)
5993			action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
5994			    &a, &ruleset, inp);
5995		break;
5996	}
5997
5998	case IPPROTO_UDP: {
5999		struct udphdr	uh;
6000
6001		pd.hdr.udp = &uh;
6002		if (!pf_pull_hdr(m, off, &uh, sizeof(uh),
6003		    &action, &reason, AF_INET)) {
6004			log = action != PF_PASS;
6005			goto done;
6006		}
6007		if (uh.uh_dport == 0 ||
6008		    ntohs(uh.uh_ulen) > m->m_pkthdr.len - off ||
6009		    ntohs(uh.uh_ulen) < sizeof(struct udphdr)) {
6010			action = PF_DROP;
6011			REASON_SET(&reason, PFRES_SHORT);
6012			goto done;
6013		}
6014		action = pf_test_state_udp(&s, dir, kif, m, off, h, &pd);
6015		if (action == PF_PASS) {
6016			if (pfsync_update_state_ptr != NULL)
6017				pfsync_update_state_ptr(s);
6018			r = s->rule.ptr;
6019			a = s->anchor.ptr;
6020			log = s->log;
6021		} else if (s == NULL)
6022			action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
6023			    &a, &ruleset, inp);
6024		break;
6025	}
6026
6027	case IPPROTO_ICMP: {
6028		struct icmp	ih;
6029
6030		pd.hdr.icmp = &ih;
6031		if (!pf_pull_hdr(m, off, &ih, ICMP_MINLEN,
6032		    &action, &reason, AF_INET)) {
6033			log = action != PF_PASS;
6034			goto done;
6035		}
6036		action = pf_test_state_icmp(&s, dir, kif, m, off, h, &pd,
6037		    &reason);
6038		if (action == PF_PASS) {
6039			if (pfsync_update_state_ptr != NULL)
6040				pfsync_update_state_ptr(s);
6041			r = s->rule.ptr;
6042			a = s->anchor.ptr;
6043			log = s->log;
6044		} else if (s == NULL)
6045			action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
6046			    &a, &ruleset, inp);
6047		break;
6048	}
6049
6050#ifdef INET6
6051	case IPPROTO_ICMPV6: {
6052		action = PF_DROP;
6053		DPFPRINTF(PF_DEBUG_MISC,
6054		    ("pf: dropping IPv4 packet with ICMPv6 payload\n"));
6055		goto done;
6056	}
6057#endif
6058
6059	default:
6060		action = pf_test_state_other(&s, dir, kif, m, &pd);
6061		if (action == PF_PASS) {
6062			if (pfsync_update_state_ptr != NULL)
6063				pfsync_update_state_ptr(s);
6064			r = s->rule.ptr;
6065			a = s->anchor.ptr;
6066			log = s->log;
6067		} else if (s == NULL)
6068			action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
6069			    &a, &ruleset, inp);
6070		break;
6071	}
6072
6073done:
6074	PF_RULES_RUNLOCK();
6075	if (action == PF_PASS && h->ip_hl > 5 &&
6076	    !((s && s->state_flags & PFSTATE_ALLOWOPTS) || r->allow_opts)) {
6077		action = PF_DROP;
6078		REASON_SET(&reason, PFRES_IPOPTIONS);
6079		log = r->log;
6080		DPFPRINTF(PF_DEBUG_MISC,
6081		    ("pf: dropping packet with ip options\n"));
6082	}
6083
6084	if (s && s->tag > 0 && pf_tag_packet(m, &pd, s->tag)) {
6085		action = PF_DROP;
6086		REASON_SET(&reason, PFRES_MEMORY);
6087	}
6088	if (r->rtableid >= 0)
6089		M_SETFIB(m, r->rtableid);
6090
6091	if (r->scrub_flags & PFSTATE_SETPRIO) {
6092		if (pd.tos & IPTOS_LOWDELAY)
6093			pqid = 1;
6094		if (pf_ieee8021q_setpcp(m, r->set_prio[pqid])) {
6095			action = PF_DROP;
6096			REASON_SET(&reason, PFRES_MEMORY);
6097			log = 1;
6098			DPFPRINTF(PF_DEBUG_MISC,
6099			    ("pf: failed to allocate 802.1q mtag\n"));
6100		}
6101	}
6102
6103#ifdef ALTQ
6104	if (action == PF_PASS && r->qid) {
6105		if (pd.pf_mtag == NULL &&
6106		    ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) {
6107			action = PF_DROP;
6108			REASON_SET(&reason, PFRES_MEMORY);
6109		} else {
6110			if (s != NULL)
6111				pd.pf_mtag->qid_hash = pf_state_hash(s);
6112			if (pqid || (pd.tos & IPTOS_LOWDELAY))
6113				pd.pf_mtag->qid = r->pqid;
6114			else
6115				pd.pf_mtag->qid = r->qid;
6116			/* Add hints for ECN. */
6117			pd.pf_mtag->hdr = h;
6118		}
6119
6120	}
6121#endif /* ALTQ */
6122
6123	/*
6124	 * Connections redirected to loopback should not match sockets
6125	 * bound specifically to loopback, due to security implications;
6126	 * see tcp_input() and in_pcblookup_listen().
6127	 */
6128	if (dir == PF_IN && action == PF_PASS && (pd.proto == IPPROTO_TCP ||
6129	    pd.proto == IPPROTO_UDP) && s != NULL && s->nat_rule.ptr != NULL &&
6130	    (s->nat_rule.ptr->action == PF_RDR ||
6131	    s->nat_rule.ptr->action == PF_BINAT) &&
6132	    (ntohl(pd.dst->v4.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)
6133		m->m_flags |= M_SKIP_FIREWALL;
6134
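	/* Hand the packet to divert(4) through ipfw's rule-tag interface. */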
6135	if (action == PF_PASS && r->divert.port && ip_divert_ptr != NULL &&
6136	    !PACKET_LOOPED(&pd)) {
6137
6138		ipfwtag = m_tag_alloc(MTAG_IPFW_RULE, 0,
6139		    sizeof(struct ipfw_rule_ref), M_NOWAIT | M_ZERO);
6140		if (ipfwtag != NULL) {
6141			((struct ipfw_rule_ref *)(ipfwtag+1))->info =
6142			    ntohs(r->divert.port);
6143			((struct ipfw_rule_ref *)(ipfwtag+1))->rulenum = dir;
6144
6145			if (s)
6146				PF_STATE_UNLOCK(s);
6147
6148			m_tag_prepend(m, ipfwtag);
6149			if (m->m_flags & M_FASTFWD_OURS) {
6150				if (pd.pf_mtag == NULL &&
6151				    ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) {
6152					action = PF_DROP;
6153					REASON_SET(&reason, PFRES_MEMORY);
6154					log = 1;
6155					DPFPRINTF(PF_DEBUG_MISC,
6156					    ("pf: failed to allocate tag\n"));
6157				} else {
6158					pd.pf_mtag->flags |=
6159					    PF_FASTFWD_OURS_PRESENT;
6160					m->m_flags &= ~M_FASTFWD_OURS;
6161				}
6162			}
6163			ip_divert_ptr(*m0, dir == PF_IN ? DIR_IN : DIR_OUT);
6164			*m0 = NULL;
6165
6166			return (action);
6167		} else {
6168			/* XXX: ipfw has the same behaviour! */
6169			action = PF_DROP;
6170			REASON_SET(&reason, PFRES_MEMORY);
6171			log = 1;
6172			DPFPRINTF(PF_DEBUG_MISC,
6173			    ("pf: failed to allocate divert tag\n"));
6174		}
6175	}
6176
6177	if (log) {
6178		struct pf_rule *lr;
6179
6180		if (s != NULL && s->nat_rule.ptr != NULL &&
6181		    s->nat_rule.ptr->log & PF_LOG_ALL)
6182			lr = s->nat_rule.ptr;
6183		else
6184			lr = r;
6185		PFLOG_PACKET(kif, m, AF_INET, dir, reason, lr, a, ruleset, &pd,
6186		    (s == NULL));
6187	}
6188
6189	kif->pfik_bytes[0][dir == PF_OUT][action != PF_PASS] += pd.tot_len;
6190	kif->pfik_packets[0][dir == PF_OUT][action != PF_PASS]++;
6191
6192	if (action == PF_PASS || r->action == PF_DROP) {
6193		dirndx = (dir == PF_OUT);
6194		r->packets[dirndx]++;
6195		r->bytes[dirndx] += pd.tot_len;
6196		if (a != NULL) {
6197			a->packets[dirndx]++;
6198			a->bytes[dirndx] += pd.tot_len;
6199		}
6200		if (s != NULL) {
6201			if (s->nat_rule.ptr != NULL) {
6202				s->nat_rule.ptr->packets[dirndx]++;
6203				s->nat_rule.ptr->bytes[dirndx] += pd.tot_len;
6204			}
6205			if (s->src_node != NULL) {
6206				s->src_node->packets[dirndx]++;
6207				s->src_node->bytes[dirndx] += pd.tot_len;
6208			}
6209			if (s->nat_src_node != NULL) {
6210				s->nat_src_node->packets[dirndx]++;
6211				s->nat_src_node->bytes[dirndx] += pd.tot_len;
6212			}
6213			dirndx = (dir == s->direction) ? 0 : 1;
6214			s->packets[dirndx]++;
6215			s->bytes[dirndx] += pd.tot_len;
6216		}
6217		tr = r;
6218		nr = (s != NULL) ? s->nat_rule.ptr : pd.nat_rule;
6219		if (nr != NULL && r == &V_pf_default_rule)
6220			tr = nr;
6221		if (tr->src.addr.type == PF_ADDR_TABLE)
6222			pfr_update_stats(tr->src.addr.p.tbl,
6223			    (s == NULL) ? pd.src :
6224			    &s->key[(s->direction == PF_IN)]->
6225				addr[(s->direction == PF_OUT)],
6226			    pd.af, pd.tot_len, dir == PF_OUT,
6227			    r->action == PF_PASS, tr->src.neg);
6228		if (tr->dst.addr.type == PF_ADDR_TABLE)
6229			pfr_update_stats(tr->dst.addr.p.tbl,
6230			    (s == NULL) ? pd.dst :
6231			    &s->key[(s->direction == PF_IN)]->
6232				addr[(s->direction == PF_IN)],
6233			    pd.af, pd.tot_len, dir == PF_OUT,
6234			    r->action == PF_PASS, tr->dst.neg);
6235	}
6236
6237	switch (action) {
6238	case PF_SYNPROXY_DROP:
6239		m_freem(*m0);
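		/* FALLTHROUGH */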
6240	case PF_DEFER:
6241		*m0 = NULL;
6242		action = PF_PASS;
6243		break;
6244	case PF_DROP:
6245		m_freem(*m0);
6246		*m0 = NULL;
6247		break;
6248	default:
6249		/* pf_route() returns unlocked. */
6250		if (r->rt) {
6251			pf_route(m0, r, dir, kif->pfik_ifp, s, &pd);
6252			return (action);
6253		}
6254		break;
6255	}
6256	if (s)
6257		PF_STATE_UNLOCK(s);
6258
6259	return (action);
6260}
6261#endif /* INET */
6262
6263#ifdef INET6
6264int
6265pf_test6(int dir, int pflags, struct ifnet *ifp, struct mbuf **m0, struct inpcb *inp)
6266{
6267	struct pfi_kif		*kif;
6268	u_short			 action, reason = 0, log = 0;
6269	struct mbuf		*m = *m0, *n = NULL;
6270	struct m_tag		*mtag;
6271	struct ip6_hdr		*h = NULL;
6272	struct pf_rule		*a = NULL, *r = &V_pf_default_rule, *tr, *nr;
6273	struct pf_state		*s = NULL;
6274	struct pf_ruleset	*ruleset = NULL;
6275	struct pf_pdesc		 pd;
6276	int			 off, terminal = 0, dirndx, rh_cnt = 0, pqid = 0;
6277
6278	M_ASSERTPKTHDR(m);
6279
6280	if (!V_pf_status.running)
6281		return (PF_PASS);
6282
6283	memset(&pd, 0, sizeof(pd));
6284	pd.pf_mtag = pf_find_mtag(m);
6285
6286	if (pd.pf_mtag && pd.pf_mtag->flags & PF_TAG_GENERATED)
6287		return (PF_PASS);
6288
6289	kif = (struct pfi_kif *)ifp->if_pf_kif;
6290	if (kif == NULL) {
6291		DPFPRINTF(PF_DEBUG_URGENT,
6292		    ("pf_test6: kif == NULL, if_xname %s\n", ifp->if_xname));
6293		return (PF_DROP);
6294	}
6295	if (kif->pfik_flags & PFI_IFLAG_SKIP)
6296		return (PF_PASS);
6297
6298	if (m->m_flags & M_SKIP_FIREWALL)
6299		return (PF_PASS);
6300
6301	PF_RULES_RLOCK();
6302
6303	/* We do IP header normalization and packet reassembly here */
6304	if (pf_normalize_ip6(m0, dir, kif, &reason, &pd) != PF_PASS) {
6305		action = PF_DROP;
6306		goto done;
6307	}
6308	m = *m0;	/* pf_normalize messes with m0 */
6309	h = mtod(m, struct ip6_hdr *);
6310
6311#if 1
6312	/*
6313	 * We do not support jumbograms yet.  If we keep going, a zero
6314	 * ip6_plen will do something bad, so drop the packet for now.
6315	 */
6316	if (ntohs(h->ip6_plen) == 0) {
6317		action = PF_DROP;
6318		REASON_SET(&reason, PFRES_NORM);	/*XXX*/
6319		goto done;
6320	}
6321#endif
6322
6323	pd.src = (struct pf_addr *)&h->ip6_src;
6324	pd.dst = (struct pf_addr *)&h->ip6_dst;
6325	pd.sport = pd.dport = NULL;
6326	pd.ip_sum = NULL;
6327	pd.proto_sum = NULL;
6328	pd.dir = dir;
6329	pd.sidx = (dir == PF_IN) ? 0 : 1;
6330	pd.didx = (dir == PF_IN) ? 1 : 0;
6331	pd.af = AF_INET6;
6332	pd.tos = 0;
6333	pd.tot_len = ntohs(h->ip6_plen) + sizeof(struct ip6_hdr);
6334
6335	off = ((caddr_t)h - m->m_data) + sizeof(struct ip6_hdr);
6336	pd.proto = h->ip6_nxt;
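	/* Walk the extension header chain to the upper-layer protocol. */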
6337	do {
6338		switch (pd.proto) {
6339		case IPPROTO_FRAGMENT:
6340			action = pf_test_fragment(&r, dir, kif, m, h,
6341			    &pd, &a, &ruleset);
6342			if (action == PF_DROP)
6343				REASON_SET(&reason, PFRES_FRAG);
6344			goto done;
6345		case IPPROTO_ROUTING: {
6346			struct ip6_rthdr rthdr;
6347
6348			if (rh_cnt++) {
6349				DPFPRINTF(PF_DEBUG_MISC,
6350				    ("pf: IPv6 more than one rthdr\n"));
6351				action = PF_DROP;
6352				REASON_SET(&reason, PFRES_IPOPTIONS);
6353				log = 1;
6354				goto done;
6355			}
6356			if (!pf_pull_hdr(m, off, &rthdr, sizeof(rthdr), NULL,
6357			    &reason, pd.af)) {
6358				DPFPRINTF(PF_DEBUG_MISC,
6359				    ("pf: IPv6 short rthdr\n"));
6360				action = PF_DROP;
6361				REASON_SET(&reason, PFRES_SHORT);
6362				log = 1;
6363				goto done;
6364			}
6365			if (rthdr.ip6r_type == IPV6_RTHDR_TYPE_0) {
6366				DPFPRINTF(PF_DEBUG_MISC,
6367				    ("pf: IPv6 rthdr0\n"));
6368				action = PF_DROP;
6369				REASON_SET(&reason, PFRES_IPOPTIONS);
6370				log = 1;
6371				goto done;
6372			}
6373			/* FALLTHROUGH */
6374		}
6375		case IPPROTO_AH:
6376		case IPPROTO_HOPOPTS:
6377		case IPPROTO_DSTOPTS: {
6378			/* get next header and header length */
6379			struct ip6_ext	opt6;
6380
6381			if (!pf_pull_hdr(m, off, &opt6, sizeof(opt6),
6382			    NULL, &reason, pd.af)) {
6383				DPFPRINTF(PF_DEBUG_MISC,
6384				    ("pf: IPv6 short opt\n"));
6385				action = PF_DROP;
6386				log = 1;
6387				goto done;
6388			}
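			/*
			 * AH's ip6e_len counts 32-bit words minus two; the
			 * other options count 8-byte units minus one.
			 */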
6389			if (pd.proto == IPPROTO_AH)
6390				off += (opt6.ip6e_len + 2) * 4;
6391			else
6392				off += (opt6.ip6e_len + 1) * 8;
6393			pd.proto = opt6.ip6e_nxt;
6394			/* goto the next header */
6395			break;
6396		}
6397		default:
6398			terminal++;
6399			break;
6400		}
6401	} while (!terminal);
6402
6403	/* if there's no routing header, use unmodified mbuf for checksumming */
6404	if (!n)
6405		n = m;
6406
6407	switch (pd.proto) {
6408
6409	case IPPROTO_TCP: {
6410		struct tcphdr	th;
6411
6412		pd.hdr.tcp = &th;
6413		if (!pf_pull_hdr(m, off, &th, sizeof(th),
6414		    &action, &reason, AF_INET6)) {
6415			log = action != PF_PASS;
6416			goto done;
6417		}
6418		pd.p_len = pd.tot_len - off - (th.th_off << 2);
6419		action = pf_normalize_tcp(dir, kif, m, 0, off, h, &pd);
6420		if (action == PF_DROP)
6421			goto done;
6422		action = pf_test_state_tcp(&s, dir, kif, m, off, h, &pd,
6423		    &reason);
6424		if (action == PF_PASS) {
6425			if (pfsync_update_state_ptr != NULL)
6426				pfsync_update_state_ptr(s);
6427			r = s->rule.ptr;
6428			a = s->anchor.ptr;
6429			log = s->log;
6430		} else if (s == NULL)
6431			action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
6432			    &a, &ruleset, inp);
6433		break;
6434	}
6435
6436	case IPPROTO_UDP: {
6437		struct udphdr	uh;
6438
6439		pd.hdr.udp = &uh;
6440		if (!pf_pull_hdr(m, off, &uh, sizeof(uh),
6441		    &action, &reason, AF_INET6)) {
6442			log = action != PF_PASS;
6443			goto done;
6444		}
6445		if (uh.uh_dport == 0 ||
6446		    ntohs(uh.uh_ulen) > m->m_pkthdr.len - off ||
6447		    ntohs(uh.uh_ulen) < sizeof(struct udphdr)) {
6448			action = PF_DROP;
6449			REASON_SET(&reason, PFRES_SHORT);
6450			goto done;
6451		}
6452		action = pf_test_state_udp(&s, dir, kif, m, off, h, &pd);
6453		if (action == PF_PASS) {
6454			if (pfsync_update_state_ptr != NULL)
6455				pfsync_update_state_ptr(s);
6456			r = s->rule.ptr;
6457			a = s->anchor.ptr;
6458			log = s->log;
6459		} else if (s == NULL)
6460			action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
6461			    &a, &ruleset, inp);
6462		break;
6463	}
6464
6465	case IPPROTO_ICMP: {
6466		action = PF_DROP;
6467		DPFPRINTF(PF_DEBUG_MISC,
6468		    ("pf: dropping IPv6 packet with ICMPv4 payload\n"));
6469		goto done;
6470	}
6471
6472	case IPPROTO_ICMPV6: {
6473		struct icmp6_hdr	ih;
6474
6475		pd.hdr.icmp6 = &ih;
6476		if (!pf_pull_hdr(m, off, &ih, sizeof(ih),
6477		    &action, &reason, AF_INET6)) {
6478			log = action != PF_PASS;
6479			goto done;
6480		}
6481		action = pf_test_state_icmp(&s, dir, kif,
6482		    m, off, h, &pd, &reason);
6483		if (action == PF_PASS) {
6484			if (pfsync_update_state_ptr != NULL)
6485				pfsync_update_state_ptr(s);
6486			r = s->rule.ptr;
6487			a = s->anchor.ptr;
6488			log = s->log;
6489		} else if (s == NULL)
6490			action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
6491			    &a, &ruleset, inp);
6492		break;
6493	}
6494
6495	default:
6496		action = pf_test_state_other(&s, dir, kif, m, &pd);
6497		if (action == PF_PASS) {
6498			if (pfsync_update_state_ptr != NULL)
6499				pfsync_update_state_ptr(s);
6500			r = s->rule.ptr;
6501			a = s->anchor.ptr;
6502			log = s->log;
6503		} else if (s == NULL)
6504			action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
6505			    &a, &ruleset, inp);
6506		break;
6507	}
6508
6509done:
6510	PF_RULES_RUNLOCK();
6511	if (n != m) {
6512		m_freem(n);
6513		n = NULL;
6514	}
6515
6516	/* handle dangerous IPv6 extension headers. */
6517	if (action == PF_PASS && rh_cnt &&
6518	    !((s && s->state_flags & PFSTATE_ALLOWOPTS) || r->allow_opts)) {
6519		action = PF_DROP;
6520		REASON_SET(&reason, PFRES_IPOPTIONS);
6521		log = r->log;
6522		DPFPRINTF(PF_DEBUG_MISC,
6523		    ("pf: dropping packet with dangerous v6 headers\n"));
6524	}
6525
6526	if (s && s->tag > 0 && pf_tag_packet(m, &pd, s->tag)) {
6527		action = PF_DROP;
6528		REASON_SET(&reason, PFRES_MEMORY);
6529	}
6530	if (r->rtableid >= 0)
6531		M_SETFIB(m, r->rtableid);
6532
6533	if (r->scrub_flags & PFSTATE_SETPRIO) {
6534		if (pd.tos & IPTOS_LOWDELAY)
6535			pqid = 1;
6536		if (pf_ieee8021q_setpcp(m, r->set_prio[pqid])) {
6537			action = PF_DROP;
6538			REASON_SET(&reason, PFRES_MEMORY);
6539			log = 1;
6540			DPFPRINTF(PF_DEBUG_MISC,
6541			    ("pf: failed to allocate 802.1q mtag\n"));
6542		}
6543	}
6544
6545#ifdef ALTQ
6546	if (action == PF_PASS && r->qid) {
6547		if (pd.pf_mtag == NULL &&
6548		    ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) {
6549			action = PF_DROP;
6550			REASON_SET(&reason, PFRES_MEMORY);
6551		} else {
6552			if (s != NULL)
6553				pd.pf_mtag->qid_hash = pf_state_hash(s);
6554			if (pd.tos & IPTOS_LOWDELAY)
6555				pd.pf_mtag->qid = r->pqid;
6556			else
6557				pd.pf_mtag->qid = r->qid;
6558			/* Add hints for ECN. */
6559			pd.pf_mtag->hdr = h;
6560		}
6561	}
6562#endif /* ALTQ */
6563
6564	if (dir == PF_IN && action == PF_PASS && (pd.proto == IPPROTO_TCP ||
6565	    pd.proto == IPPROTO_UDP) && s != NULL && s->nat_rule.ptr != NULL &&
6566	    (s->nat_rule.ptr->action == PF_RDR ||
6567	    s->nat_rule.ptr->action == PF_BINAT) &&
6568	    IN6_IS_ADDR_LOOPBACK(&pd.dst->v6))
6569		m->m_flags |= M_SKIP_FIREWALL;
6570
6571	/* XXX: Anybody working on it?! */
6572	if (r->divert.port)
6573		printf("pf: divert(4) is not supported for IPv6\n");
6574
6575	if (log) {
6576		struct pf_rule *lr;
6577
6578		if (s != NULL && s->nat_rule.ptr != NULL &&
6579		    s->nat_rule.ptr->log & PF_LOG_ALL)
6580			lr = s->nat_rule.ptr;
6581		else
6582			lr = r;
6583		PFLOG_PACKET(kif, m, AF_INET6, dir, reason, lr, a, ruleset,
6584		    &pd, (s == NULL));
6585	}
6586
6587	kif->pfik_bytes[1][dir == PF_OUT][action != PF_PASS] += pd.tot_len;
6588	kif->pfik_packets[1][dir == PF_OUT][action != PF_PASS]++;
6589
6590	if (action == PF_PASS || r->action == PF_DROP) {
6591		dirndx = (dir == PF_OUT);
6592		r->packets[dirndx]++;
6593		r->bytes[dirndx] += pd.tot_len;
6594		if (a != NULL) {
6595			a->packets[dirndx]++;
6596			a->bytes[dirndx] += pd.tot_len;
6597		}
6598		if (s != NULL) {
6599			if (s->nat_rule.ptr != NULL) {
6600				s->nat_rule.ptr->packets[dirndx]++;
6601				s->nat_rule.ptr->bytes[dirndx] += pd.tot_len;
6602			}
6603			if (s->src_node != NULL) {
6604				s->src_node->packets[dirndx]++;
6605				s->src_node->bytes[dirndx] += pd.tot_len;
6606			}
6607			if (s->nat_src_node != NULL) {
6608				s->nat_src_node->packets[dirndx]++;
6609				s->nat_src_node->bytes[dirndx] += pd.tot_len;
6610			}
6611			dirndx = (dir == s->direction) ? 0 : 1;
6612			s->packets[dirndx]++;
6613			s->bytes[dirndx] += pd.tot_len;
6614		}
6615		tr = r;
6616		nr = (s != NULL) ? s->nat_rule.ptr : pd.nat_rule;
6617		if (nr != NULL && r == &V_pf_default_rule)
6618			tr = nr;
6619		if (tr->src.addr.type == PF_ADDR_TABLE)
6620			pfr_update_stats(tr->src.addr.p.tbl,
6621			    (s == NULL) ? pd.src :
6622			    &s->key[(s->direction == PF_IN)]->addr[0],
6623			    pd.af, pd.tot_len, dir == PF_OUT,
6624			    r->action == PF_PASS, tr->src.neg);
6625		if (tr->dst.addr.type == PF_ADDR_TABLE)
6626			pfr_update_stats(tr->dst.addr.p.tbl,
6627			    (s == NULL) ? pd.dst :
6628			    &s->key[(s->direction == PF_IN)]->addr[1],
6629			    pd.af, pd.tot_len, dir == PF_OUT,
6630			    r->action == PF_PASS, tr->dst.neg);
6631	}
6632
6633	switch (action) {
6634	case PF_SYNPROXY_DROP:
6635		m_freem(*m0);
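		/* FALLTHROUGH */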
6636	case PF_DEFER:
6637		*m0 = NULL;
6638		action = PF_PASS;
6639		break;
6640	case PF_DROP:
6641		m_freem(*m0);
6642		*m0 = NULL;
6643		break;
6644	default:
6645		/* pf_route6() returns unlocked. */
6646		if (r->rt) {
6647			pf_route6(m0, r, dir, kif->pfik_ifp, s, &pd);
6648			return (action);
6649		}
6650		break;
6651	}
6652
6653	if (s)
6654		PF_STATE_UNLOCK(s);
6655
6656	/* If reassembled packet passed, create new fragments. */
6657	if (action == PF_PASS && *m0 && (pflags & PFIL_FWD) &&
6658	    (mtag = m_tag_find(m, PF_REASSEMBLED, NULL)) != NULL)
6659		action = pf_refragment6(ifp, m0, mtag);
6660
6661	return (action);
6662}
6663#endif /* INET6 */
6664