1/*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2001 Daniel Hartmeier
5 * Copyright (c) 2002 - 2008 Henning Brauer
6 * Copyright (c) 2012 Gleb Smirnoff <glebius@FreeBSD.org>
7 * All rights reserved.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 *
13 *    - Redistributions of source code must retain the above copyright
14 *      notice, this list of conditions and the following disclaimer.
15 *    - Redistributions in binary form must reproduce the above
16 *      copyright notice, this list of conditions and the following
17 *      disclaimer in the documentation and/or other materials provided
18 *      with the distribution.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
24 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
26 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
30 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 *
33 * Effort sponsored in part by the Defense Advanced Research Projects
34 * Agency (DARPA) and Air Force Research Laboratory, Air Force
35 * Materiel Command, USAF, under agreement number F30602-01-2-0537.
36 *
37 *	$OpenBSD: pf.c,v 1.634 2009/02/27 12:37:45 henning Exp $
38 */
39
40#include <sys/cdefs.h>
41#include "opt_bpf.h"
42#include "opt_inet.h"
43#include "opt_inet6.h"
44#include "opt_pf.h"
45#include "opt_sctp.h"
46
47#include <sys/param.h>
48#include <sys/bus.h>
49#include <sys/endian.h>
50#include <sys/gsb_crc32.h>
51#include <sys/hash.h>
52#include <sys/interrupt.h>
53#include <sys/kernel.h>
54#include <sys/kthread.h>
55#include <sys/limits.h>
56#include <sys/mbuf.h>
57#include <sys/md5.h>
58#include <sys/random.h>
59#include <sys/refcount.h>
60#include <sys/sdt.h>
61#include <sys/socket.h>
62#include <sys/sysctl.h>
63#include <sys/taskqueue.h>
64#include <sys/ucred.h>
65
66#include <net/if.h>
67#include <net/if_var.h>
68#include <net/if_private.h>
69#include <net/if_types.h>
70#include <net/if_vlan_var.h>
71#include <net/route.h>
72#include <net/route/nhop.h>
73#include <net/vnet.h>
74
75#include <net/pfil.h>
76#include <net/pfvar.h>
77#include <net/if_pflog.h>
78#include <net/if_pfsync.h>
79
80#include <netinet/in_pcb.h>
81#include <netinet/in_var.h>
82#include <netinet/in_fib.h>
83#include <netinet/ip.h>
84#include <netinet/ip_fw.h>
85#include <netinet/ip_icmp.h>
86#include <netinet/icmp_var.h>
87#include <netinet/ip_var.h>
88#include <netinet/tcp.h>
89#include <netinet/tcp_fsm.h>
90#include <netinet/tcp_seq.h>
91#include <netinet/tcp_timer.h>
92#include <netinet/tcp_var.h>
93#include <netinet/udp.h>
94#include <netinet/udp_var.h>
95
96/* dummynet */
97#include <netinet/ip_dummynet.h>
98#include <netinet/ip_fw.h>
99#include <netpfil/ipfw/dn_heap.h>
100#include <netpfil/ipfw/ip_fw_private.h>
101#include <netpfil/ipfw/ip_dn_private.h>
102
103#ifdef INET6
104#include <netinet/ip6.h>
105#include <netinet/icmp6.h>
106#include <netinet6/nd6.h>
107#include <netinet6/ip6_var.h>
108#include <netinet6/in6_pcb.h>
109#include <netinet6/in6_fib.h>
110#include <netinet6/scope6_var.h>
111#endif /* INET6 */
112
113#include <netinet/sctp_header.h>
114#include <netinet/sctp_crc32.h>
115
116#include <machine/in_cksum.h>
117#include <security/mac/mac_framework.h>
118
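/*
 * Debug logging helper: the parenthesized printf() argument list x is only
 * evaluated when the per-vnet debug level is at least n, e.g.
 * DPFPRINTF(PF_DEBUG_MISC, ("pf: dropping packet\n"));
 */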
#define	DPFPRINTF(n, x)						\
	do {							\
		if (V_pf_status.debug >= (n))			\
			printf x;				\
	} while (0)
120
121SDT_PROVIDER_DEFINE(pf);
122SDT_PROBE_DEFINE4(pf, ip, test, done, "int", "int", "struct pf_krule *",
123    "struct pf_kstate *");
124SDT_PROBE_DEFINE4(pf, ip, test6, done, "int", "int", "struct pf_krule *",
125    "struct pf_kstate *");
126SDT_PROBE_DEFINE5(pf, ip, state, lookup, "struct pfi_kkif *",
127    "struct pf_state_key_cmp *", "int", "struct pf_pdesc *",
128    "struct pf_kstate *");
129SDT_PROBE_DEFINE2(pf, ip, , bound_iface, "struct pf_kstate *",
130    "struct pfi_kkif *");
131SDT_PROBE_DEFINE4(pf, sctp, multihome, test, "struct pfi_kkif *",
132    "struct pf_krule *", "struct mbuf *", "int");
133SDT_PROBE_DEFINE2(pf, sctp, multihome, add, "uint32_t",
134    "struct pf_sctp_source *");
135SDT_PROBE_DEFINE3(pf, sctp, multihome, remove, "uint32_t",
136    "struct pf_kstate *", "struct pf_sctp_source *");
137
138SDT_PROBE_DEFINE3(pf, eth, test_rule, entry, "int", "struct ifnet *",
139    "struct mbuf *");
140SDT_PROBE_DEFINE2(pf, eth, test_rule, test, "int", "struct pf_keth_rule *");
141SDT_PROBE_DEFINE3(pf, eth, test_rule, mismatch,
142    "int", "struct pf_keth_rule *", "char *");
143SDT_PROBE_DEFINE2(pf, eth, test_rule, match, "int", "struct pf_keth_rule *");
144SDT_PROBE_DEFINE2(pf, eth, test_rule, final_match,
145    "int", "struct pf_keth_rule *");
146SDT_PROBE_DEFINE2(pf, purge, state, rowcount, "int", "size_t");
147
148/*
149 * Global variables
150 */
151
152/* state tables */
153VNET_DEFINE(struct pf_altqqueue,	 pf_altqs[4]);
154VNET_DEFINE(struct pf_kpalist,		 pf_pabuf);
155VNET_DEFINE(struct pf_altqqueue *,	 pf_altqs_active);
156VNET_DEFINE(struct pf_altqqueue *,	 pf_altq_ifs_active);
157VNET_DEFINE(struct pf_altqqueue *,	 pf_altqs_inactive);
158VNET_DEFINE(struct pf_altqqueue *,	 pf_altq_ifs_inactive);
159VNET_DEFINE(struct pf_kstatus,		 pf_status);
160
161VNET_DEFINE(u_int32_t,			 ticket_altqs_active);
162VNET_DEFINE(u_int32_t,			 ticket_altqs_inactive);
163VNET_DEFINE(int,			 altqs_inactive_open);
164VNET_DEFINE(u_int32_t,			 ticket_pabuf);
165
166VNET_DEFINE(MD5_CTX,			 pf_tcp_secret_ctx);
167#define	V_pf_tcp_secret_ctx		 VNET(pf_tcp_secret_ctx)
168VNET_DEFINE(u_char,			 pf_tcp_secret[16]);
169#define	V_pf_tcp_secret			 VNET(pf_tcp_secret)
170VNET_DEFINE(int,			 pf_tcp_secret_init);
171#define	V_pf_tcp_secret_init		 VNET(pf_tcp_secret_init)
172VNET_DEFINE(int,			 pf_tcp_iss_off);
173#define	V_pf_tcp_iss_off		 VNET(pf_tcp_iss_off)
174VNET_DECLARE(int,			 pf_vnet_active);
175#define	V_pf_vnet_active		 VNET(pf_vnet_active)
176
177VNET_DEFINE_STATIC(uint32_t, pf_purge_idx);
178#define V_pf_purge_idx	VNET(pf_purge_idx)
179
180#ifdef PF_WANT_32_TO_64_COUNTER
181VNET_DEFINE_STATIC(uint32_t, pf_counter_periodic_iter);
182#define	V_pf_counter_periodic_iter	VNET(pf_counter_periodic_iter)
183
184VNET_DEFINE(struct allrulelist_head, pf_allrulelist);
185VNET_DEFINE(size_t, pf_allrulecount);
186VNET_DEFINE(struct pf_krule *, pf_rulemarker);
187#endif
188
189struct pf_sctp_endpoint;
190RB_HEAD(pf_sctp_endpoints, pf_sctp_endpoint);
191struct pf_sctp_source {
192	sa_family_t			af;
193	struct pf_addr			addr;
194	TAILQ_ENTRY(pf_sctp_source)	entry;
195};
196TAILQ_HEAD(pf_sctp_sources, pf_sctp_source);
197struct pf_sctp_endpoint
198{
199	uint32_t		 v_tag;
200	struct pf_sctp_sources	 sources;
201	RB_ENTRY(pf_sctp_endpoint)	entry;
202};
203static int
204pf_sctp_endpoint_compare(struct pf_sctp_endpoint *a, struct pf_sctp_endpoint *b)
205{
	return ((a->v_tag > b->v_tag) - (a->v_tag < b->v_tag));
207}
208RB_PROTOTYPE(pf_sctp_endpoints, pf_sctp_endpoint, entry, pf_sctp_endpoint_compare);
209RB_GENERATE(pf_sctp_endpoints, pf_sctp_endpoint, entry, pf_sctp_endpoint_compare);
210VNET_DEFINE_STATIC(struct pf_sctp_endpoints, pf_sctp_endpoints);
211#define V_pf_sctp_endpoints	VNET(pf_sctp_endpoints)
212static struct mtx_padalign pf_sctp_endpoints_mtx;
213MTX_SYSINIT(pf_sctp_endpoints_mtx, &pf_sctp_endpoints_mtx, "SCTP endpoints", MTX_DEF);
214#define	PF_SCTP_ENDPOINTS_LOCK()	mtx_lock(&pf_sctp_endpoints_mtx)
215#define	PF_SCTP_ENDPOINTS_UNLOCK()	mtx_unlock(&pf_sctp_endpoints_mtx)
216
217/*
218 * Queue for pf_intr() sends.
219 */
220static MALLOC_DEFINE(M_PFTEMP, "pf_temp", "pf(4) temporary allocations");
221struct pf_send_entry {
222	STAILQ_ENTRY(pf_send_entry)	pfse_next;
223	struct mbuf			*pfse_m;
224	enum {
225		PFSE_IP,
226		PFSE_IP6,
227		PFSE_ICMP,
228		PFSE_ICMP6,
229	}				pfse_type;
230	struct {
231		int		type;
232		int		code;
233		int		mtu;
234	} icmpopts;
235};
236
237STAILQ_HEAD(pf_send_head, pf_send_entry);
238VNET_DEFINE_STATIC(struct pf_send_head, pf_sendqueue);
239#define	V_pf_sendqueue	VNET(pf_sendqueue)
240
241static struct mtx_padalign pf_sendqueue_mtx;
242MTX_SYSINIT(pf_sendqueue_mtx, &pf_sendqueue_mtx, "pf send queue", MTX_DEF);
243#define	PF_SENDQ_LOCK()		mtx_lock(&pf_sendqueue_mtx)
244#define	PF_SENDQ_UNLOCK()	mtx_unlock(&pf_sendqueue_mtx)
245
246/*
247 * Queue for pf_overload_task() tasks.
248 */
249struct pf_overload_entry {
250	SLIST_ENTRY(pf_overload_entry)	next;
251	struct pf_addr  		addr;
252	sa_family_t			af;
253	uint8_t				dir;
254	struct pf_krule  		*rule;
255};
256
257SLIST_HEAD(pf_overload_head, pf_overload_entry);
258VNET_DEFINE_STATIC(struct pf_overload_head, pf_overloadqueue);
259#define V_pf_overloadqueue	VNET(pf_overloadqueue)
260VNET_DEFINE_STATIC(struct task, pf_overloadtask);
261#define	V_pf_overloadtask	VNET(pf_overloadtask)
262
263static struct mtx_padalign pf_overloadqueue_mtx;
264MTX_SYSINIT(pf_overloadqueue_mtx, &pf_overloadqueue_mtx,
265    "pf overload/flush queue", MTX_DEF);
266#define	PF_OVERLOADQ_LOCK()	mtx_lock(&pf_overloadqueue_mtx)
267#define	PF_OVERLOADQ_UNLOCK()	mtx_unlock(&pf_overloadqueue_mtx)
268
269VNET_DEFINE(struct pf_krulequeue, pf_unlinked_rules);
270struct mtx_padalign pf_unlnkdrules_mtx;
271MTX_SYSINIT(pf_unlnkdrules_mtx, &pf_unlnkdrules_mtx, "pf unlinked rules",
272    MTX_DEF);
273
274struct sx pf_config_lock;
275SX_SYSINIT(pf_config_lock, &pf_config_lock, "pf config");
276
277struct mtx_padalign pf_table_stats_lock;
278MTX_SYSINIT(pf_table_stats_lock, &pf_table_stats_lock, "pf table stats",
279    MTX_DEF);
280
281VNET_DEFINE_STATIC(uma_zone_t,	pf_sources_z);
282#define	V_pf_sources_z	VNET(pf_sources_z)
283uma_zone_t		pf_mtag_z;
284VNET_DEFINE(uma_zone_t,	 pf_state_z);
285VNET_DEFINE(uma_zone_t,	 pf_state_key_z);
286
287VNET_DEFINE(struct unrhdr64, pf_stateid);
288
289static void		 pf_src_tree_remove_state(struct pf_kstate *);
290static void		 pf_init_threshold(struct pf_threshold *, u_int32_t,
291			    u_int32_t);
292static void		 pf_add_threshold(struct pf_threshold *);
293static int		 pf_check_threshold(struct pf_threshold *);
294
295static void		 pf_change_ap(struct mbuf *, struct pf_addr *, u_int16_t *,
296			    u_int16_t *, u_int16_t *, struct pf_addr *,
297			    u_int16_t, u_int8_t, sa_family_t);
298static int		 pf_modulate_sack(struct mbuf *, int, struct pf_pdesc *,
299			    struct tcphdr *, struct pf_state_peer *);
300static void		 pf_change_icmp(struct pf_addr *, u_int16_t *,
301			    struct pf_addr *, struct pf_addr *, u_int16_t,
302			    u_int16_t *, u_int16_t *, u_int16_t *,
303			    u_int16_t *, u_int8_t, sa_family_t);
304static void		 pf_send_icmp(struct mbuf *, u_int8_t, u_int8_t,
305			    sa_family_t, struct pf_krule *, int);
306static void		 pf_detach_state(struct pf_kstate *);
307static int		 pf_state_key_attach(struct pf_state_key *,
308			    struct pf_state_key *, struct pf_kstate *);
309static void		 pf_state_key_detach(struct pf_kstate *, int);
310static int		 pf_state_key_ctor(void *, int, void *, int);
311static u_int32_t	 pf_tcp_iss(struct pf_pdesc *);
312static __inline void	 pf_dummynet_flag_remove(struct mbuf *m,
313			    struct pf_mtag *pf_mtag);
314static int		 pf_dummynet(struct pf_pdesc *, struct pf_kstate *,
315			    struct pf_krule *, struct mbuf **);
316static int		 pf_dummynet_route(struct pf_pdesc *,
317			    struct pf_kstate *, struct pf_krule *,
318			    struct ifnet *, struct sockaddr *, struct mbuf **);
319static int		 pf_test_eth_rule(int, struct pfi_kkif *,
320			    struct mbuf **);
321static int		 pf_test_rule(struct pf_krule **, struct pf_kstate **,
322			    struct pfi_kkif *, struct mbuf *, int,
323			    struct pf_pdesc *, struct pf_krule **,
324			    struct pf_kruleset **, struct inpcb *);
325static int		 pf_create_state(struct pf_krule *, struct pf_krule *,
326			    struct pf_krule *, struct pf_pdesc *,
327			    struct pf_ksrc_node *, struct pf_state_key *,
328			    struct pf_state_key *, struct mbuf *, int,
329			    u_int16_t, u_int16_t, int *, struct pfi_kkif *,
330			    struct pf_kstate **, int, u_int16_t, u_int16_t,
331			    int, struct pf_krule_slist *);
332static int		 pf_test_fragment(struct pf_krule **, struct pfi_kkif *,
333			    struct mbuf *, void *, struct pf_pdesc *,
334			    struct pf_krule **, struct pf_kruleset **);
335static int		 pf_tcp_track_full(struct pf_kstate **,
336			    struct pfi_kkif *, struct mbuf *, int,
337			    struct pf_pdesc *, u_short *, int *);
338static int		 pf_tcp_track_sloppy(struct pf_kstate **,
339			    struct pf_pdesc *, u_short *);
340static int		 pf_test_state_tcp(struct pf_kstate **,
341			    struct pfi_kkif *, struct mbuf *, int,
342			    void *, struct pf_pdesc *, u_short *);
343static int		 pf_test_state_udp(struct pf_kstate **,
344			    struct pfi_kkif *, struct mbuf *, int,
345			    void *, struct pf_pdesc *);
346static int		 pf_test_state_icmp(struct pf_kstate **,
347			    struct pfi_kkif *, struct mbuf *, int,
348			    void *, struct pf_pdesc *, u_short *);
349static void		 pf_sctp_multihome_detach_addr(const struct pf_kstate *);
350static void		 pf_sctp_multihome_delayed(struct pf_pdesc *, int,
351			    struct pfi_kkif *, struct pf_kstate *, int);
352static int		 pf_test_state_sctp(struct pf_kstate **,
353			    struct pfi_kkif *, struct mbuf *, int,
354			    void *, struct pf_pdesc *, u_short *);
355static int		 pf_test_state_other(struct pf_kstate **,
356			    struct pfi_kkif *, struct mbuf *, struct pf_pdesc *);
357static u_int16_t	 pf_calc_mss(struct pf_addr *, sa_family_t,
358				int, u_int16_t);
359static int		 pf_check_proto_cksum(struct mbuf *, int, int,
360			    u_int8_t, sa_family_t);
361static void		 pf_print_state_parts(struct pf_kstate *,
362			    struct pf_state_key *, struct pf_state_key *);
363static void		 pf_patch_8(struct mbuf *, u_int16_t *, u_int8_t *, u_int8_t,
364			    bool, u_int8_t);
365static struct pf_kstate	*pf_find_state(struct pfi_kkif *,
366			    struct pf_state_key_cmp *, u_int);
367static int		 pf_src_connlimit(struct pf_kstate **);
368static void		 pf_overload_task(void *v, int pending);
369static u_short		 pf_insert_src_node(struct pf_ksrc_node **,
370			    struct pf_krule *, struct pf_addr *, sa_family_t);
371static u_int		 pf_purge_expired_states(u_int, int);
372static void		 pf_purge_unlinked_rules(void);
373static int		 pf_mtag_uminit(void *, int, int);
374static void		 pf_mtag_free(struct m_tag *);
375static void		 pf_packet_rework_nat(struct mbuf *, struct pf_pdesc *,
376			    int, struct pf_state_key *);
377#ifdef INET
378static void		 pf_route(struct mbuf **, struct pf_krule *,
379			    struct ifnet *, struct pf_kstate *,
380			    struct pf_pdesc *, struct inpcb *);
381#endif /* INET */
382#ifdef INET6
383static void		 pf_change_a6(struct pf_addr *, u_int16_t *,
384			    struct pf_addr *, u_int8_t);
385static void		 pf_route6(struct mbuf **, struct pf_krule *,
386			    struct ifnet *, struct pf_kstate *,
387			    struct pf_pdesc *, struct inpcb *);
388#endif /* INET6 */
389static __inline void pf_set_protostate(struct pf_kstate *, int, u_int8_t);
390
391int in4_cksum(struct mbuf *m, u_int8_t nxt, int off, int len);
392
393extern int pf_end_threads;
394extern struct proc *pf_purge_proc;
395
396VNET_DEFINE(struct pf_limit, pf_limits[PF_LIMIT_MAX]);
397
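/*
 * Undo the translation applied to a packet: outbound packets are rewound to
 * their stack-side key, inbound packets to their wire-side key, by reworking
 * the headers with pf_packet_rework_nat().
 */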
#define	PACKET_UNDO_NAT(_m, _pd, _off, _s)			\
	do {							\
		struct pf_state_key *nk;			\
		if ((_pd)->dir == PF_OUT)			\
			nk = (_s)->key[PF_SK_STACK];		\
		else						\
			nk = (_s)->key[PF_SK_WIRE];		\
		pf_packet_rework_nat(_m, _pd, _off, nk);	\
	} while (0)
407
408#define	PACKET_LOOPED(pd)	((pd)->pf_mtag &&			\
409				 (pd)->pf_mtag->flags & PF_MTAG_FLAG_PACKET_LOOPED)
410
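/*
 * Look up the state for the packet being processed and return early from the
 * caller: PF_DROP if no state is found, PF_PASS if the packet was already
 * filtered on its first pass and has been looped back to us.
 */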
411#define	STATE_LOOKUP(i, k, s, pd)					\
412	do {								\
413		(s) = pf_find_state((i), (k), (pd->dir));			\
414		SDT_PROBE5(pf, ip, state, lookup, i, k, (pd->dir), pd, (s));	\
415		if ((s) == NULL)					\
416			return (PF_DROP);				\
417		if (PACKET_LOOPED(pd))					\
418			return (PF_PASS);				\
419	} while (0)
420
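/*
 * Pick the interface a freshly created state is bound to.  States are
 * floating (V_pfi_all) unless the rule is if-bound; reply-to states stay
 * floating, states created on inbound packets keep the creating interface,
 * and only outbound route-to states bind to the route-to interface.
 */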
421static struct pfi_kkif *
422BOUND_IFACE(struct pf_kstate *st, struct pfi_kkif *k)
423{
424	SDT_PROBE2(pf, ip, , bound_iface, st, k);
425
426	/* Floating unless otherwise specified. */
427	if (! (st->rule.ptr->rule_flag & PFRULE_IFBOUND))
428		return (V_pfi_all);
429
	/*
	 * Initially set to all, because we don't know what interface we'll
	 * be sending this out of when we create the state.
	 */
434	if (st->rule.ptr->rt == PF_REPLYTO)
435		return (V_pfi_all);
436
437	/* Don't overrule the interface for states created on incoming packets. */
438	if (st->direction == PF_IN)
439		return (k);
440
441	/* No route-to, so don't overrule. */
442	if (st->rt != PF_ROUTETO)
443		return (k);
444
445	/* Bind to the route-to interface. */
446	return (st->rt_kif);
447}
448
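/*
 * Maintain the per-rule state counters when a state is created or destroyed:
 * the matching rule, its anchor, the NAT rule and every rule on the state's
 * match-rules list are updated.
 */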
449#define	STATE_INC_COUNTERS(s)						\
450	do {								\
451		struct pf_krule_item *mrm;				\
452		counter_u64_add(s->rule.ptr->states_cur, 1);		\
453		counter_u64_add(s->rule.ptr->states_tot, 1);		\
454		if (s->anchor.ptr != NULL) {				\
455			counter_u64_add(s->anchor.ptr->states_cur, 1);	\
456			counter_u64_add(s->anchor.ptr->states_tot, 1);	\
457		}							\
458		if (s->nat_rule.ptr != NULL) {				\
459			counter_u64_add(s->nat_rule.ptr->states_cur, 1);\
460			counter_u64_add(s->nat_rule.ptr->states_tot, 1);\
461		}							\
462		SLIST_FOREACH(mrm, &s->match_rules, entry) {		\
463			counter_u64_add(mrm->r->states_cur, 1);		\
464			counter_u64_add(mrm->r->states_tot, 1);		\
465		}							\
466	} while (0)
467
468#define	STATE_DEC_COUNTERS(s)						\
469	do {								\
470		struct pf_krule_item *mrm;				\
471		if (s->nat_rule.ptr != NULL)				\
472			counter_u64_add(s->nat_rule.ptr->states_cur, -1);\
473		if (s->anchor.ptr != NULL)				\
474			counter_u64_add(s->anchor.ptr->states_cur, -1);	\
475		counter_u64_add(s->rule.ptr->states_cur, -1);		\
476		SLIST_FOREACH(mrm, &s->match_rules, entry)		\
477			counter_u64_add(mrm->r->states_cur, -1);	\
478	} while (0)
479
480MALLOC_DEFINE(M_PFHASH, "pf_hash", "pf(4) hash header structures");
481MALLOC_DEFINE(M_PF_RULE_ITEM, "pf_krule_item", "pf(4) rule items");
482VNET_DEFINE(struct pf_keyhash *, pf_keyhash);
483VNET_DEFINE(struct pf_idhash *, pf_idhash);
484VNET_DEFINE(struct pf_srchash *, pf_srchash);
485
486SYSCTL_NODE(_net, OID_AUTO, pf, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
487    "pf(4)");
488
489u_long	pf_hashmask;
490u_long	pf_srchashmask;
491static u_long	pf_hashsize;
492static u_long	pf_srchashsize;
493u_long	pf_ioctl_maxcount = 65535;
494
495SYSCTL_ULONG(_net_pf, OID_AUTO, states_hashsize, CTLFLAG_RDTUN,
496    &pf_hashsize, 0, "Size of pf(4) states hashtable");
497SYSCTL_ULONG(_net_pf, OID_AUTO, source_nodes_hashsize, CTLFLAG_RDTUN,
498    &pf_srchashsize, 0, "Size of pf(4) source nodes hashtable");
499SYSCTL_ULONG(_net_pf, OID_AUTO, request_maxcount, CTLFLAG_RWTUN,
500    &pf_ioctl_maxcount, 0, "Maximum number of tables, addresses, ... in a single ioctl() call");
501
502VNET_DEFINE(void *, pf_swi_cookie);
503VNET_DEFINE(struct intr_event *, pf_swi_ie);
504
505VNET_DEFINE(uint32_t, pf_hashseed);
506#define	V_pf_hashseed	VNET(pf_hashseed)
507
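/*
 * Recalculate the SCTP checksum of the packet whose SCTP header starts at
 * offset off: the checksum field is zeroed first so it does not feed into
 * the new CRC32c value, which is then copied back into the mbuf.
 */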
508static void
509pf_sctp_checksum(struct mbuf *m, int off)
510{
511	uint32_t sum = 0;
512
513	/* Zero out the checksum, to enable recalculation. */
514	m_copyback(m, off + offsetof(struct sctphdr, checksum),
515	    sizeof(sum), (caddr_t)&sum);
516
517	sum = sctp_calculate_cksum(m, off);
518
519	m_copyback(m, off + offsetof(struct sctphdr, checksum),
520	    sizeof(sum), (caddr_t)&sum);
521}
522
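/*
 * memcmp()-style total order on pf addresses: returns 1, -1 or 0.  IPv6
 * addresses are compared one 32-bit word at a time starting from addr32[3],
 * so the ordering is consistent but not byte-lexicographic.
 */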
523int
524pf_addr_cmp(struct pf_addr *a, struct pf_addr *b, sa_family_t af)
525{
526
527	switch (af) {
528#ifdef INET
529	case AF_INET:
530		if (a->addr32[0] > b->addr32[0])
531			return (1);
532		if (a->addr32[0] < b->addr32[0])
533			return (-1);
534		break;
535#endif /* INET */
536#ifdef INET6
537	case AF_INET6:
538		if (a->addr32[3] > b->addr32[3])
539			return (1);
540		if (a->addr32[3] < b->addr32[3])
541			return (-1);
542		if (a->addr32[2] > b->addr32[2])
543			return (1);
544		if (a->addr32[2] < b->addr32[2])
545			return (-1);
546		if (a->addr32[1] > b->addr32[1])
547			return (1);
548		if (a->addr32[1] < b->addr32[1])
549			return (-1);
550		if (a->addr32[0] > b->addr32[0])
551			return (1);
552		if (a->addr32[0] < b->addr32[0])
553			return (-1);
554		break;
555#endif /* INET6 */
556	default:
557		panic("%s: unknown address family %u", __func__, af);
558	}
559	return (0);
560}
561
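/*
 * Rewrite the addresses and ports of the packet described by pd to those
 * recorded in the state key nk, fixing up the protocol and IP checksums.
 * This is the workhorse behind PACKET_UNDO_NAT().
 */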
562static void
563pf_packet_rework_nat(struct mbuf *m, struct pf_pdesc *pd, int off,
564	struct pf_state_key *nk)
565{
566
567	switch (pd->proto) {
568	case IPPROTO_TCP: {
569		struct tcphdr *th = &pd->hdr.tcp;
570
571		if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], pd->af))
572			pf_change_ap(m, pd->src, &th->th_sport, pd->ip_sum,
573			    &th->th_sum, &nk->addr[pd->sidx],
574			    nk->port[pd->sidx], 0, pd->af);
575		if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], pd->af))
576			pf_change_ap(m, pd->dst, &th->th_dport, pd->ip_sum,
577			    &th->th_sum, &nk->addr[pd->didx],
578			    nk->port[pd->didx], 0, pd->af);
579		m_copyback(m, off, sizeof(*th), (caddr_t)th);
580		break;
581	}
582	case IPPROTO_UDP: {
583		struct udphdr *uh = &pd->hdr.udp;
584
585		if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], pd->af))
586			pf_change_ap(m, pd->src, &uh->uh_sport, pd->ip_sum,
587			    &uh->uh_sum, &nk->addr[pd->sidx],
588			    nk->port[pd->sidx], 1, pd->af);
589		if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], pd->af))
590			pf_change_ap(m, pd->dst, &uh->uh_dport, pd->ip_sum,
591			    &uh->uh_sum, &nk->addr[pd->didx],
592			    nk->port[pd->didx], 1, pd->af);
593		m_copyback(m, off, sizeof(*uh), (caddr_t)uh);
594		break;
595	}
596	case IPPROTO_SCTP: {
597		struct sctphdr *sh = &pd->hdr.sctp;
598		uint16_t checksum = 0;
599
600		if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], pd->af)) {
601			pf_change_ap(m, pd->src, &sh->src_port, pd->ip_sum,
602			    &checksum, &nk->addr[pd->sidx],
603			    nk->port[pd->sidx], 1, pd->af);
604		}
605		if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], pd->af)) {
606			pf_change_ap(m, pd->dst, &sh->dest_port, pd->ip_sum,
607			    &checksum, &nk->addr[pd->didx],
608			    nk->port[pd->didx], 1, pd->af);
609		}
610
611		break;
612	}
613	case IPPROTO_ICMP: {
614		struct icmp *ih = &pd->hdr.icmp;
615
616		if (nk->port[pd->sidx] != ih->icmp_id) {
617			pd->hdr.icmp.icmp_cksum = pf_cksum_fixup(
618			    ih->icmp_cksum, ih->icmp_id,
619			    nk->port[pd->sidx], 0);
620			ih->icmp_id = nk->port[pd->sidx];
621			pd->sport = &ih->icmp_id;
622
623			m_copyback(m, off, ICMP_MINLEN, (caddr_t)ih);
624		}
625		/* FALLTHROUGH */
626	}
627	default:
628		if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], pd->af)) {
629			switch (pd->af) {
630			case AF_INET:
631				pf_change_a(&pd->src->v4.s_addr,
632				    pd->ip_sum, nk->addr[pd->sidx].v4.s_addr,
633				    0);
634				break;
635			case AF_INET6:
636				PF_ACPY(pd->src, &nk->addr[pd->sidx], pd->af);
637				break;
638			}
639		}
640		if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], pd->af)) {
641			switch (pd->af) {
642			case AF_INET:
643				pf_change_a(&pd->dst->v4.s_addr,
644				    pd->ip_sum, nk->addr[pd->didx].v4.s_addr,
645				    0);
646				break;
647			case AF_INET6:
648				PF_ACPY(pd->dst, &nk->addr[pd->didx], pd->af);
649				break;
650			}
651		}
652		break;
653	}
654}
655
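/*
 * Hash a state key into the state key table: only the comparable
 * pf_state_key_cmp prefix is fed to murmur3, seeded with the per-vnet
 * V_pf_hashseed.  pf_hashsrc() below does the same for a bare address,
 * masked to the source-node table size.
 */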
656static __inline uint32_t
657pf_hashkey(struct pf_state_key *sk)
658{
659	uint32_t h;
660
661	h = murmur3_32_hash32((uint32_t *)sk,
662	    sizeof(struct pf_state_key_cmp)/sizeof(uint32_t),
663	    V_pf_hashseed);
664
665	return (h & pf_hashmask);
666}
667
668static __inline uint32_t
669pf_hashsrc(struct pf_addr *addr, sa_family_t af)
670{
671	uint32_t h;
672
673	switch (af) {
674	case AF_INET:
675		h = murmur3_32_hash32((uint32_t *)&addr->v4,
676		    sizeof(addr->v4)/sizeof(uint32_t), V_pf_hashseed);
677		break;
678	case AF_INET6:
679		h = murmur3_32_hash32((uint32_t *)&addr->v6,
680		    sizeof(addr->v6)/sizeof(uint32_t), V_pf_hashseed);
681		break;
682	default:
683		panic("%s: unknown address family %u", __func__, af);
684	}
685
686	return (h & pf_srchashmask);
687}
688
689#ifdef ALTQ
690static int
691pf_state_hash(struct pf_kstate *s)
692{
693	u_int32_t hv = (intptr_t)s / sizeof(*s);
694
695	hv ^= crc32(&s->src, sizeof(s->src));
696	hv ^= crc32(&s->dst, sizeof(s->dst));
697	if (hv == 0)
698		hv = 1;
699	return (hv);
700}
701#endif
702
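/*
 * Change the tracked protocol state of one or both peers.  For TCP states
 * created by this host it also decrements the half-open counter once the
 * source peer leaves the half-open range, i.e. becomes either established
 * or closed.
 */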
703static __inline void
704pf_set_protostate(struct pf_kstate *s, int which, u_int8_t newstate)
705{
706	if (which == PF_PEER_DST || which == PF_PEER_BOTH)
707		s->dst.state = newstate;
708	if (which == PF_PEER_DST)
709		return;
710	if (s->src.state == newstate)
711		return;
712	if (s->creatorid == V_pf_status.hostid &&
713	    s->key[PF_SK_STACK] != NULL &&
714	    s->key[PF_SK_STACK]->proto == IPPROTO_TCP &&
715	    !(TCPS_HAVEESTABLISHED(s->src.state) ||
716	    s->src.state == TCPS_CLOSED) &&
717	    (TCPS_HAVEESTABLISHED(newstate) || newstate == TCPS_CLOSED))
718		atomic_add_32(&V_pf_status.states_halfopen, -1);
719
720	s->src.state = newstate;
721}
722
723#ifdef INET6
724void
725pf_addrcpy(struct pf_addr *dst, struct pf_addr *src, sa_family_t af)
726{
727	switch (af) {
728#ifdef INET
729	case AF_INET:
730		memcpy(&dst->v4, &src->v4, sizeof(dst->v4));
731		break;
732#endif /* INET */
733	case AF_INET6:
734		memcpy(&dst->v6, &src->v6, sizeof(dst->v6));
735		break;
736	}
737}
738#endif /* INET6 */
739
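/*
 * Source connection-rate thresholds are tracked in fixed point: both the
 * configured limit and each new connection are scaled by PF_THRESHOLD_MULT,
 * and the accumulated count decays linearly over the configured interval.
 * For example, with "max-src-conn-rate 10/5" the count must exceed
 * 10 * PF_THRESHOLD_MULT, i.e. more than ten connections within a five
 * second window, before pf_check_threshold() reports a violation.
 */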
740static void
741pf_init_threshold(struct pf_threshold *threshold,
742    u_int32_t limit, u_int32_t seconds)
743{
744	threshold->limit = limit * PF_THRESHOLD_MULT;
745	threshold->seconds = seconds;
746	threshold->count = 0;
747	threshold->last = time_uptime;
748}
749
750static void
751pf_add_threshold(struct pf_threshold *threshold)
752{
753	u_int32_t t = time_uptime, diff = t - threshold->last;
754
755	if (diff >= threshold->seconds)
756		threshold->count = 0;
757	else
758		threshold->count -= threshold->count * diff /
759		    threshold->seconds;
760	threshold->count += PF_THRESHOLD_MULT;
761	threshold->last = t;
762}
763
764static int
765pf_check_threshold(struct pf_threshold *threshold)
766{
767	return (threshold->count > threshold->limit);
768}
769
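/*
 * Charge an established connection against its source node's connection
 * count and rate.  If a limit is exceeded, kill the state and, when an
 * overload table is configured, queue the source address for the
 * overload/flush task.  Returns non-zero when the state has been scheduled
 * for purging.
 */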
770static int
771pf_src_connlimit(struct pf_kstate **state)
772{
773	struct pf_overload_entry *pfoe;
774	int bad = 0;
775
776	PF_STATE_LOCK_ASSERT(*state);
777	/*
778	 * XXXKS: The src node is accessed unlocked!
779	 * PF_SRC_NODE_LOCK_ASSERT((*state)->src_node);
780	 */
781
782	(*state)->src_node->conn++;
783	(*state)->src.tcp_est = 1;
784	pf_add_threshold(&(*state)->src_node->conn_rate);
785
786	if ((*state)->rule.ptr->max_src_conn &&
787	    (*state)->rule.ptr->max_src_conn <
788	    (*state)->src_node->conn) {
789		counter_u64_add(V_pf_status.lcounters[LCNT_SRCCONN], 1);
790		bad++;
791	}
792
793	if ((*state)->rule.ptr->max_src_conn_rate.limit &&
794	    pf_check_threshold(&(*state)->src_node->conn_rate)) {
795		counter_u64_add(V_pf_status.lcounters[LCNT_SRCCONNRATE], 1);
796		bad++;
797	}
798
799	if (!bad)
800		return (0);
801
802	/* Kill this state. */
803	(*state)->timeout = PFTM_PURGE;
804	pf_set_protostate(*state, PF_PEER_BOTH, TCPS_CLOSED);
805
806	if ((*state)->rule.ptr->overload_tbl == NULL)
807		return (1);
808
809	/* Schedule overloading and flushing task. */
810	pfoe = malloc(sizeof(*pfoe), M_PFTEMP, M_NOWAIT);
811	if (pfoe == NULL)
812		return (1);	/* too bad :( */
813
814	bcopy(&(*state)->src_node->addr, &pfoe->addr, sizeof(pfoe->addr));
815	pfoe->af = (*state)->key[PF_SK_WIRE]->af;
816	pfoe->rule = (*state)->rule.ptr;
817	pfoe->dir = (*state)->direction;
818	PF_OVERLOADQ_LOCK();
819	SLIST_INSERT_HEAD(&V_pf_overloadqueue, pfoe, next);
820	PF_OVERLOADQ_UNLOCK();
821	taskqueue_enqueue(taskqueue_swi, &V_pf_overloadtask);
822
823	return (1);
824}
825
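/*
 * Taskqueue handler that drains the overload queue: each queued source
 * address is inserted into its rule's overload table and, when the rule
 * requests flushing, all states created from that address (or all of the
 * rule's states with "flush global") are scheduled for purging.
 */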
826static void
827pf_overload_task(void *v, int pending)
828{
829	struct pf_overload_head queue;
830	struct pfr_addr p;
831	struct pf_overload_entry *pfoe, *pfoe1;
832	uint32_t killed = 0;
833
834	CURVNET_SET((struct vnet *)v);
835
836	PF_OVERLOADQ_LOCK();
837	queue = V_pf_overloadqueue;
838	SLIST_INIT(&V_pf_overloadqueue);
839	PF_OVERLOADQ_UNLOCK();
840
841	bzero(&p, sizeof(p));
842	SLIST_FOREACH(pfoe, &queue, next) {
843		counter_u64_add(V_pf_status.lcounters[LCNT_OVERLOAD_TABLE], 1);
844		if (V_pf_status.debug >= PF_DEBUG_MISC) {
845			printf("%s: blocking address ", __func__);
846			pf_print_host(&pfoe->addr, 0, pfoe->af);
847			printf("\n");
848		}
849
850		p.pfra_af = pfoe->af;
851		switch (pfoe->af) {
852#ifdef INET
853		case AF_INET:
854			p.pfra_net = 32;
855			p.pfra_ip4addr = pfoe->addr.v4;
856			break;
857#endif
858#ifdef INET6
859		case AF_INET6:
860			p.pfra_net = 128;
861			p.pfra_ip6addr = pfoe->addr.v6;
862			break;
863#endif
864		}
865
866		PF_RULES_WLOCK();
867		pfr_insert_kentry(pfoe->rule->overload_tbl, &p, time_second);
868		PF_RULES_WUNLOCK();
869	}
870
	/*
	 * Remove those entries that don't need flushing.
	 */
874	SLIST_FOREACH_SAFE(pfoe, &queue, next, pfoe1)
875		if (pfoe->rule->flush == 0) {
876			SLIST_REMOVE(&queue, pfoe, pf_overload_entry, next);
877			free(pfoe, M_PFTEMP);
878		} else
879			counter_u64_add(
880			    V_pf_status.lcounters[LCNT_OVERLOAD_FLUSH], 1);
881
882	/* If nothing to flush, return. */
883	if (SLIST_EMPTY(&queue)) {
884		CURVNET_RESTORE();
885		return;
886	}
887
888	for (int i = 0; i <= pf_hashmask; i++) {
889		struct pf_idhash *ih = &V_pf_idhash[i];
890		struct pf_state_key *sk;
891		struct pf_kstate *s;
892
893		PF_HASHROW_LOCK(ih);
894		LIST_FOREACH(s, &ih->states, entry) {
895		    sk = s->key[PF_SK_WIRE];
896		    SLIST_FOREACH(pfoe, &queue, next)
897			if (sk->af == pfoe->af &&
898			    ((pfoe->rule->flush & PF_FLUSH_GLOBAL) ||
899			    pfoe->rule == s->rule.ptr) &&
900			    ((pfoe->dir == PF_OUT &&
901			    PF_AEQ(&pfoe->addr, &sk->addr[1], sk->af)) ||
902			    (pfoe->dir == PF_IN &&
903			    PF_AEQ(&pfoe->addr, &sk->addr[0], sk->af)))) {
904				s->timeout = PFTM_PURGE;
905				pf_set_protostate(s, PF_PEER_BOTH, TCPS_CLOSED);
906				killed++;
907			}
908		}
909		PF_HASHROW_UNLOCK(ih);
910	}
911	SLIST_FOREACH_SAFE(pfoe, &queue, next, pfoe1)
912		free(pfoe, M_PFTEMP);
913	if (V_pf_status.debug >= PF_DEBUG_MISC)
		printf("%s: %u states killed\n", __func__, killed);
915
916	CURVNET_RESTORE();
917}
918
/*
 * Can return with the hash row locked when no matching node is found, so
 * that the caller can consistently allocate and insert a new one.
 */
923struct pf_ksrc_node *
924pf_find_src_node(struct pf_addr *src, struct pf_krule *rule, sa_family_t af,
925	struct pf_srchash **sh, bool returnlocked)
926{
927	struct pf_ksrc_node *n;
928
929	counter_u64_add(V_pf_status.scounters[SCNT_SRC_NODE_SEARCH], 1);
930
931	*sh = &V_pf_srchash[pf_hashsrc(src, af)];
932	PF_HASHROW_LOCK(*sh);
933	LIST_FOREACH(n, &(*sh)->nodes, entry)
934		if (n->rule.ptr == rule && n->af == af &&
935		    ((af == AF_INET && n->addr.v4.s_addr == src->v4.s_addr) ||
936		    (af == AF_INET6 && bcmp(&n->addr, src, sizeof(*src)) == 0)))
937			break;
938
939	if (n != NULL) {
940		n->states++;
941		PF_HASHROW_UNLOCK(*sh);
942	} else if (returnlocked == false)
943		PF_HASHROW_UNLOCK(*sh);
944
945	return (n);
946}
947
948static void
949pf_free_src_node(struct pf_ksrc_node *sn)
950{
951
952	for (int i = 0; i < 2; i++) {
953		counter_u64_free(sn->bytes[i]);
954		counter_u64_free(sn->packets[i]);
955	}
956	uma_zfree(V_pf_sources_z, sn);
957}
958
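/*
 * Look up, or create and insert, the source node for a source-tracking or
 * sticky-address rule.  Returns 0 with *sn set on success, or a PFRES_*
 * reason when the source-node or source-state limit is hit or memory runs
 * out.
 */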
959static u_short
960pf_insert_src_node(struct pf_ksrc_node **sn, struct pf_krule *rule,
961    struct pf_addr *src, sa_family_t af)
962{
963	u_short			 reason = 0;
964	struct pf_srchash	*sh = NULL;
965
966	KASSERT((rule->rule_flag & PFRULE_SRCTRACK ||
967	    rule->rpool.opts & PF_POOL_STICKYADDR),
968	    ("%s for non-tracking rule %p", __func__, rule));
969
970	if (*sn == NULL)
971		*sn = pf_find_src_node(src, rule, af, &sh, true);
972
973	if (*sn == NULL) {
974		PF_HASHROW_ASSERT(sh);
975
976		if (rule->max_src_nodes &&
977		    counter_u64_fetch(rule->src_nodes) >= rule->max_src_nodes) {
978			counter_u64_add(V_pf_status.lcounters[LCNT_SRCNODES], 1);
979			PF_HASHROW_UNLOCK(sh);
980			reason = PFRES_SRCLIMIT;
981			goto done;
982		}
983
984		(*sn) = uma_zalloc(V_pf_sources_z, M_NOWAIT | M_ZERO);
985		if ((*sn) == NULL) {
986			PF_HASHROW_UNLOCK(sh);
987			reason = PFRES_MEMORY;
988			goto done;
989		}
990
991		for (int i = 0; i < 2; i++) {
992			(*sn)->bytes[i] = counter_u64_alloc(M_NOWAIT);
993			(*sn)->packets[i] = counter_u64_alloc(M_NOWAIT);
994
995			if ((*sn)->bytes[i] == NULL || (*sn)->packets[i] == NULL) {
996				pf_free_src_node(*sn);
997				PF_HASHROW_UNLOCK(sh);
998				reason = PFRES_MEMORY;
999				goto done;
1000			}
1001		}
1002
1003		pf_init_threshold(&(*sn)->conn_rate,
1004		    rule->max_src_conn_rate.limit,
1005		    rule->max_src_conn_rate.seconds);
1006
1007		MPASS((*sn)->lock == NULL);
1008		(*sn)->lock = &sh->lock;
1009
1010		(*sn)->af = af;
1011		(*sn)->rule.ptr = rule;
1012		PF_ACPY(&(*sn)->addr, src, af);
1013		LIST_INSERT_HEAD(&sh->nodes, *sn, entry);
1014		(*sn)->creation = time_uptime;
1015		(*sn)->ruletype = rule->action;
1016		(*sn)->states = 1;
1017		if ((*sn)->rule.ptr != NULL)
1018			counter_u64_add((*sn)->rule.ptr->src_nodes, 1);
1019		PF_HASHROW_UNLOCK(sh);
1020		counter_u64_add(V_pf_status.scounters[SCNT_SRC_NODE_INSERT], 1);
1021	} else {
1022		if (rule->max_src_states &&
1023		    (*sn)->states >= rule->max_src_states) {
1024			counter_u64_add(V_pf_status.lcounters[LCNT_SRCSTATES],
1025			    1);
1026			reason = PFRES_SRCLIMIT;
1027			goto done;
1028		}
1029	}
1030done:
1031	return (reason);
1032}
1033
1034void
1035pf_unlink_src_node(struct pf_ksrc_node *src)
1036{
1037	PF_SRC_NODE_LOCK_ASSERT(src);
1038
1039	LIST_REMOVE(src, entry);
1040	if (src->rule.ptr)
1041		counter_u64_add(src->rule.ptr->src_nodes, -1);
1042}
1043
1044u_int
1045pf_free_src_nodes(struct pf_ksrc_node_list *head)
1046{
1047	struct pf_ksrc_node *sn, *tmp;
1048	u_int count = 0;
1049
1050	LIST_FOREACH_SAFE(sn, head, entry, tmp) {
1051		pf_free_src_node(sn);
1052		count++;
1053	}
1054
1055	counter_u64_add(V_pf_status.scounters[SCNT_SRC_NODE_REMOVALS], count);
1056
1057	return (count);
1058}
1059
1060void
1061pf_mtag_initialize(void)
1062{
1063
1064	pf_mtag_z = uma_zcreate("pf mtags", sizeof(struct m_tag) +
1065	    sizeof(struct pf_mtag), NULL, NULL, pf_mtag_uminit, NULL,
1066	    UMA_ALIGN_PTR, 0);
1067}
1068
1069/* Per-vnet data storage structures initialization. */
1070void
1071pf_initialize(void)
1072{
1073	struct pf_keyhash	*kh;
1074	struct pf_idhash	*ih;
1075	struct pf_srchash	*sh;
1076	u_int i;
1077
1078	if (pf_hashsize == 0 || !powerof2(pf_hashsize))
1079		pf_hashsize = PF_HASHSIZ;
1080	if (pf_srchashsize == 0 || !powerof2(pf_srchashsize))
1081		pf_srchashsize = PF_SRCHASHSIZ;
1082
1083	V_pf_hashseed = arc4random();
1084
1085	/* States and state keys storage. */
1086	V_pf_state_z = uma_zcreate("pf states", sizeof(struct pf_kstate),
1087	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
1088	V_pf_limits[PF_LIMIT_STATES].zone = V_pf_state_z;
1089	uma_zone_set_max(V_pf_state_z, PFSTATE_HIWAT);
1090	uma_zone_set_warning(V_pf_state_z, "PF states limit reached");
1091
1092	V_pf_state_key_z = uma_zcreate("pf state keys",
1093	    sizeof(struct pf_state_key), pf_state_key_ctor, NULL, NULL, NULL,
1094	    UMA_ALIGN_PTR, 0);
1095
1096	V_pf_keyhash = mallocarray(pf_hashsize, sizeof(struct pf_keyhash),
1097	    M_PFHASH, M_NOWAIT | M_ZERO);
1098	V_pf_idhash = mallocarray(pf_hashsize, sizeof(struct pf_idhash),
1099	    M_PFHASH, M_NOWAIT | M_ZERO);
1100	if (V_pf_keyhash == NULL || V_pf_idhash == NULL) {
1101		printf("pf: Unable to allocate memory for "
1102		    "state_hashsize %lu.\n", pf_hashsize);
1103
1104		free(V_pf_keyhash, M_PFHASH);
1105		free(V_pf_idhash, M_PFHASH);
1106
1107		pf_hashsize = PF_HASHSIZ;
1108		V_pf_keyhash = mallocarray(pf_hashsize,
1109		    sizeof(struct pf_keyhash), M_PFHASH, M_WAITOK | M_ZERO);
1110		V_pf_idhash = mallocarray(pf_hashsize,
1111		    sizeof(struct pf_idhash), M_PFHASH, M_WAITOK | M_ZERO);
1112	}
1113
1114	pf_hashmask = pf_hashsize - 1;
1115	for (i = 0, kh = V_pf_keyhash, ih = V_pf_idhash; i <= pf_hashmask;
1116	    i++, kh++, ih++) {
1117		mtx_init(&kh->lock, "pf_keyhash", NULL, MTX_DEF | MTX_DUPOK);
1118		mtx_init(&ih->lock, "pf_idhash", NULL, MTX_DEF);
1119	}
1120
1121	/* Source nodes. */
1122	V_pf_sources_z = uma_zcreate("pf source nodes",
1123	    sizeof(struct pf_ksrc_node), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
1124	    0);
1125	V_pf_limits[PF_LIMIT_SRC_NODES].zone = V_pf_sources_z;
1126	uma_zone_set_max(V_pf_sources_z, PFSNODE_HIWAT);
1127	uma_zone_set_warning(V_pf_sources_z, "PF source nodes limit reached");
1128
1129	V_pf_srchash = mallocarray(pf_srchashsize,
1130	    sizeof(struct pf_srchash), M_PFHASH, M_NOWAIT | M_ZERO);
1131	if (V_pf_srchash == NULL) {
1132		printf("pf: Unable to allocate memory for "
1133		    "source_hashsize %lu.\n", pf_srchashsize);
1134
1135		pf_srchashsize = PF_SRCHASHSIZ;
1136		V_pf_srchash = mallocarray(pf_srchashsize,
1137		    sizeof(struct pf_srchash), M_PFHASH, M_WAITOK | M_ZERO);
1138	}
1139
1140	pf_srchashmask = pf_srchashsize - 1;
1141	for (i = 0, sh = V_pf_srchash; i <= pf_srchashmask; i++, sh++)
1142		mtx_init(&sh->lock, "pf_srchash", NULL, MTX_DEF);
1143
1144	/* ALTQ */
1145	TAILQ_INIT(&V_pf_altqs[0]);
1146	TAILQ_INIT(&V_pf_altqs[1]);
1147	TAILQ_INIT(&V_pf_altqs[2]);
1148	TAILQ_INIT(&V_pf_altqs[3]);
1149	TAILQ_INIT(&V_pf_pabuf);
1150	V_pf_altqs_active = &V_pf_altqs[0];
1151	V_pf_altq_ifs_active = &V_pf_altqs[1];
1152	V_pf_altqs_inactive = &V_pf_altqs[2];
1153	V_pf_altq_ifs_inactive = &V_pf_altqs[3];
1154
1155	/* Send & overload+flush queues. */
1156	STAILQ_INIT(&V_pf_sendqueue);
1157	SLIST_INIT(&V_pf_overloadqueue);
1158	TASK_INIT(&V_pf_overloadtask, 0, pf_overload_task, curvnet);
1159
	/* Rules that are unlinked but may still be referenced. */
1161	TAILQ_INIT(&V_pf_unlinked_rules);
1162}
1163
1164void
1165pf_mtag_cleanup(void)
1166{
1167
1168	uma_zdestroy(pf_mtag_z);
1169}
1170
1171void
1172pf_cleanup(void)
1173{
1174	struct pf_keyhash	*kh;
1175	struct pf_idhash	*ih;
1176	struct pf_srchash	*sh;
1177	struct pf_send_entry	*pfse, *next;
1178	u_int i;
1179
1180	for (i = 0, kh = V_pf_keyhash, ih = V_pf_idhash; i <= pf_hashmask;
1181	    i++, kh++, ih++) {
1182		KASSERT(LIST_EMPTY(&kh->keys), ("%s: key hash not empty",
1183		    __func__));
1184		KASSERT(LIST_EMPTY(&ih->states), ("%s: id hash not empty",
1185		    __func__));
1186		mtx_destroy(&kh->lock);
1187		mtx_destroy(&ih->lock);
1188	}
1189	free(V_pf_keyhash, M_PFHASH);
1190	free(V_pf_idhash, M_PFHASH);
1191
1192	for (i = 0, sh = V_pf_srchash; i <= pf_srchashmask; i++, sh++) {
1193		KASSERT(LIST_EMPTY(&sh->nodes),
1194		    ("%s: source node hash not empty", __func__));
1195		mtx_destroy(&sh->lock);
1196	}
1197	free(V_pf_srchash, M_PFHASH);
1198
1199	STAILQ_FOREACH_SAFE(pfse, &V_pf_sendqueue, pfse_next, next) {
1200		m_freem(pfse->pfse_m);
1201		free(pfse, M_PFTEMP);
1202	}
1203	MPASS(RB_EMPTY(&V_pf_sctp_endpoints));
1204
1205	uma_zdestroy(V_pf_sources_z);
1206	uma_zdestroy(V_pf_state_z);
1207	uma_zdestroy(V_pf_state_key_z);
1208}
1209
1210static int
1211pf_mtag_uminit(void *mem, int size, int how)
1212{
1213	struct m_tag *t;
1214
1215	t = (struct m_tag *)mem;
1216	t->m_tag_cookie = MTAG_ABI_COMPAT;
1217	t->m_tag_id = PACKET_TAG_PF;
1218	t->m_tag_len = sizeof(struct pf_mtag);
1219	t->m_tag_free = pf_mtag_free;
1220
1221	return (0);
1222}
1223
1224static void
1225pf_mtag_free(struct m_tag *t)
1226{
1227
1228	uma_zfree(pf_mtag_z, t);
1229}
1230
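/*
 * Return the pf(4) mbuf tag of m, attaching a freshly zeroed one from
 * pf_mtag_z if the packet does not carry one yet; NULL is returned only
 * when that allocation fails.
 */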
1231struct pf_mtag *
1232pf_get_mtag(struct mbuf *m)
1233{
1234	struct m_tag *mtag;
1235
1236	if ((mtag = m_tag_find(m, PACKET_TAG_PF, NULL)) != NULL)
1237		return ((struct pf_mtag *)(mtag + 1));
1238
1239	mtag = uma_zalloc(pf_mtag_z, M_NOWAIT);
1240	if (mtag == NULL)
1241		return (NULL);
1242	bzero(mtag + 1, sizeof(struct pf_mtag));
1243	m_tag_prepend(m, mtag);
1244
1245	return ((struct pf_mtag *)(mtag + 1));
1246}
1247
1248static int
1249pf_state_key_attach(struct pf_state_key *skw, struct pf_state_key *sks,
1250    struct pf_kstate *s)
1251{
1252	struct pf_keyhash	*khs, *khw, *kh;
1253	struct pf_state_key	*sk, *cur;
1254	struct pf_kstate	*si, *olds = NULL;
1255	int idx;
1256
1257	NET_EPOCH_ASSERT();
1258	KASSERT(s->refs == 0, ("%s: state not pristine", __func__));
1259	KASSERT(s->key[PF_SK_WIRE] == NULL, ("%s: state has key", __func__));
1260	KASSERT(s->key[PF_SK_STACK] == NULL, ("%s: state has key", __func__));
1261
1262	/*
1263	 * We need to lock hash slots of both keys. To avoid deadlock
1264	 * we always lock the slot with lower address first. Unlock order
1265	 * isn't important.
1266	 *
1267	 * We also need to lock ID hash slot before dropping key
1268	 * locks. On success we return with ID hash slot locked.
1269	 */
1270
1271	if (skw == sks) {
1272		khs = khw = &V_pf_keyhash[pf_hashkey(skw)];
1273		PF_HASHROW_LOCK(khs);
1274	} else {
1275		khs = &V_pf_keyhash[pf_hashkey(sks)];
1276		khw = &V_pf_keyhash[pf_hashkey(skw)];
1277		if (khs == khw) {
1278			PF_HASHROW_LOCK(khs);
1279		} else if (khs < khw) {
1280			PF_HASHROW_LOCK(khs);
1281			PF_HASHROW_LOCK(khw);
1282		} else {
1283			PF_HASHROW_LOCK(khw);
1284			PF_HASHROW_LOCK(khs);
1285		}
1286	}
1287
1288#define	KEYS_UNLOCK()	do {			\
1289	if (khs != khw) {			\
1290		PF_HASHROW_UNLOCK(khs);		\
1291		PF_HASHROW_UNLOCK(khw);		\
1292	} else					\
1293		PF_HASHROW_UNLOCK(khs);		\
1294} while (0)
1295
1296	/*
1297	 * First run: start with wire key.
1298	 */
1299	sk = skw;
1300	kh = khw;
1301	idx = PF_SK_WIRE;
1302
1303	MPASS(s->lock == NULL);
1304	s->lock = &V_pf_idhash[PF_IDHASH(s)].lock;
1305
1306keyattach:
1307	LIST_FOREACH(cur, &kh->keys, entry)
1308		if (bcmp(cur, sk, sizeof(struct pf_state_key_cmp)) == 0)
1309			break;
1310
1311	if (cur != NULL) {
		/*
		 * Key exists. Check for a conflicting state on the same
		 * kif; if there is none, attach to this key.
		 */
1313		TAILQ_FOREACH(si, &cur->states[idx], key_list[idx]) {
1314			struct pf_idhash *ih = &V_pf_idhash[PF_IDHASH(si)];
1315
1316			PF_HASHROW_LOCK(ih);
1317			if (si->kif == s->kif &&
1318			    si->direction == s->direction) {
1319				if (sk->proto == IPPROTO_TCP &&
1320				    si->src.state >= TCPS_FIN_WAIT_2 &&
1321				    si->dst.state >= TCPS_FIN_WAIT_2) {
1322					/*
1323					 * New state matches an old >FIN_WAIT_2
1324					 * state. We can't drop key hash locks,
1325					 * thus we can't unlink it properly.
1326					 *
1327					 * As a workaround we drop it into
1328					 * TCPS_CLOSED state, schedule purge
1329					 * ASAP and push it into the very end
1330					 * of the slot TAILQ, so that it won't
1331					 * conflict with our new state.
1332					 */
1333					pf_set_protostate(si, PF_PEER_BOTH,
1334					    TCPS_CLOSED);
1335					si->timeout = PFTM_PURGE;
1336					olds = si;
1337				} else {
1338					if (V_pf_status.debug >= PF_DEBUG_MISC) {
1339						printf("pf: %s key attach "
1340						    "failed on %s: ",
1341						    (idx == PF_SK_WIRE) ?
1342						    "wire" : "stack",
1343						    s->kif->pfik_name);
1344						pf_print_state_parts(s,
1345						    (idx == PF_SK_WIRE) ?
1346						    sk : NULL,
1347						    (idx == PF_SK_STACK) ?
1348						    sk : NULL);
1349						printf(", existing: ");
1350						pf_print_state_parts(si,
1351						    (idx == PF_SK_WIRE) ?
1352						    sk : NULL,
1353						    (idx == PF_SK_STACK) ?
1354						    sk : NULL);
1355						printf("\n");
1356					}
1357					s->timeout = PFTM_UNLINKED;
1358					PF_HASHROW_UNLOCK(ih);
1359					KEYS_UNLOCK();
1360					uma_zfree(V_pf_state_key_z, sk);
1361					if (idx == PF_SK_STACK)
1362						pf_detach_state(s);
1363					return (EEXIST); /* collision! */
1364				}
1365			}
1366			PF_HASHROW_UNLOCK(ih);
1367		}
1368		uma_zfree(V_pf_state_key_z, sk);
1369		s->key[idx] = cur;
1370	} else {
1371		LIST_INSERT_HEAD(&kh->keys, sk, entry);
1372		s->key[idx] = sk;
1373	}
1374
1375stateattach:
1376	/* List is sorted, if-bound states before floating. */
1377	if (s->kif == V_pfi_all)
1378		TAILQ_INSERT_TAIL(&s->key[idx]->states[idx], s, key_list[idx]);
1379	else
1380		TAILQ_INSERT_HEAD(&s->key[idx]->states[idx], s, key_list[idx]);
1381
1382	if (olds) {
1383		TAILQ_REMOVE(&s->key[idx]->states[idx], olds, key_list[idx]);
1384		TAILQ_INSERT_TAIL(&s->key[idx]->states[idx], olds,
1385		    key_list[idx]);
1386		olds = NULL;
1387	}
1388
	/*
	 * Attach done. Now decide whether, and how, we should
	 * attach a second key.
	 */
1393	if (sks == skw) {
1394		s->key[PF_SK_STACK] = s->key[PF_SK_WIRE];
1395		idx = PF_SK_STACK;
1396		sks = NULL;
1397		goto stateattach;
1398	} else if (sks != NULL) {
1399		/*
1400		 * Continue attaching with stack key.
1401		 */
1402		sk = sks;
1403		kh = khs;
1404		idx = PF_SK_STACK;
1405		sks = NULL;
1406		goto keyattach;
1407	}
1408
1409	PF_STATE_LOCK(s);
1410	KEYS_UNLOCK();
1411
1412	KASSERT(s->key[PF_SK_WIRE] != NULL && s->key[PF_SK_STACK] != NULL,
1413	    ("%s failure", __func__));
1414
1415	return (0);
1416#undef	KEYS_UNLOCK
1417}
1418
1419static void
1420pf_detach_state(struct pf_kstate *s)
1421{
1422	struct pf_state_key *sks = s->key[PF_SK_STACK];
1423	struct pf_keyhash *kh;
1424
1425	NET_EPOCH_ASSERT();
1426	MPASS(s->timeout >= PFTM_MAX);
1427
1428	pf_sctp_multihome_detach_addr(s);
1429
1430	if ((s->state_flags & PFSTATE_PFLOW) && V_pflow_export_state_ptr)
1431		V_pflow_export_state_ptr(s);
1432
1433	if (sks != NULL) {
1434		kh = &V_pf_keyhash[pf_hashkey(sks)];
1435		PF_HASHROW_LOCK(kh);
1436		if (s->key[PF_SK_STACK] != NULL)
1437			pf_state_key_detach(s, PF_SK_STACK);
1438		/*
1439		 * If both point to same key, then we are done.
1440		 */
1441		if (sks == s->key[PF_SK_WIRE]) {
1442			pf_state_key_detach(s, PF_SK_WIRE);
1443			PF_HASHROW_UNLOCK(kh);
1444			return;
1445		}
1446		PF_HASHROW_UNLOCK(kh);
1447	}
1448
1449	if (s->key[PF_SK_WIRE] != NULL) {
1450		kh = &V_pf_keyhash[pf_hashkey(s->key[PF_SK_WIRE])];
1451		PF_HASHROW_LOCK(kh);
1452		if (s->key[PF_SK_WIRE] != NULL)
1453			pf_state_key_detach(s, PF_SK_WIRE);
1454		PF_HASHROW_UNLOCK(kh);
1455	}
1456}
1457
1458static void
1459pf_state_key_detach(struct pf_kstate *s, int idx)
1460{
1461	struct pf_state_key *sk = s->key[idx];
1462#ifdef INVARIANTS
1463	struct pf_keyhash *kh = &V_pf_keyhash[pf_hashkey(sk)];
1464
1465	PF_HASHROW_ASSERT(kh);
1466#endif
1467	TAILQ_REMOVE(&sk->states[idx], s, key_list[idx]);
1468	s->key[idx] = NULL;
1469
1470	if (TAILQ_EMPTY(&sk->states[0]) && TAILQ_EMPTY(&sk->states[1])) {
1471		LIST_REMOVE(sk, entry);
1472		uma_zfree(V_pf_state_key_z, sk);
1473	}
1474}
1475
1476static int
1477pf_state_key_ctor(void *mem, int size, void *arg, int flags)
1478{
1479	struct pf_state_key *sk = mem;
1480
1481	bzero(sk, sizeof(struct pf_state_key_cmp));
1482	TAILQ_INIT(&sk->states[PF_SK_WIRE]);
1483	TAILQ_INIT(&sk->states[PF_SK_STACK]);
1484
1485	return (0);
1486}
1487
1488struct pf_state_key *
1489pf_state_key_setup(struct pf_pdesc *pd, struct pf_addr *saddr,
1490	struct pf_addr *daddr, u_int16_t sport, u_int16_t dport)
1491{
1492	struct pf_state_key *sk;
1493
1494	sk = uma_zalloc(V_pf_state_key_z, M_NOWAIT);
1495	if (sk == NULL)
1496		return (NULL);
1497
1498	PF_ACPY(&sk->addr[pd->sidx], saddr, pd->af);
1499	PF_ACPY(&sk->addr[pd->didx], daddr, pd->af);
1500	sk->port[pd->sidx] = sport;
1501	sk->port[pd->didx] = dport;
1502	sk->proto = pd->proto;
1503	sk->af = pd->af;
1504
1505	return (sk);
1506}
1507
1508struct pf_state_key *
1509pf_state_key_clone(struct pf_state_key *orig)
1510{
1511	struct pf_state_key *sk;
1512
1513	sk = uma_zalloc(V_pf_state_key_z, M_NOWAIT);
1514	if (sk == NULL)
1515		return (NULL);
1516
1517	bcopy(orig, sk, sizeof(struct pf_state_key_cmp));
1518
1519	return (sk);
1520}
1521
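/*
 * Insert a fully set up state into the state table: attach both state keys,
 * assign an ID and creator ID unless the caller already provided them, and
 * link the state into its ID hash row, which is left locked on success.
 */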
1522int
1523pf_state_insert(struct pfi_kkif *kif, struct pfi_kkif *orig_kif,
1524    struct pf_state_key *skw, struct pf_state_key *sks, struct pf_kstate *s)
1525{
1526	struct pf_idhash *ih;
1527	struct pf_kstate *cur;
1528	int error;
1529
1530	NET_EPOCH_ASSERT();
1531
1532	KASSERT(TAILQ_EMPTY(&sks->states[0]) && TAILQ_EMPTY(&sks->states[1]),
1533	    ("%s: sks not pristine", __func__));
1534	KASSERT(TAILQ_EMPTY(&skw->states[0]) && TAILQ_EMPTY(&skw->states[1]),
1535	    ("%s: skw not pristine", __func__));
1536	KASSERT(s->refs == 0, ("%s: state not pristine", __func__));
1537
1538	s->kif = kif;
1539	s->orig_kif = orig_kif;
1540
1541	if (s->id == 0 && s->creatorid == 0) {
1542		s->id = alloc_unr64(&V_pf_stateid);
1543		s->id = htobe64(s->id);
1544		s->creatorid = V_pf_status.hostid;
1545	}
1546
1547	/* Returns with ID locked on success. */
1548	if ((error = pf_state_key_attach(skw, sks, s)) != 0)
1549		return (error);
1550
1551	ih = &V_pf_idhash[PF_IDHASH(s)];
1552	PF_HASHROW_ASSERT(ih);
1553	LIST_FOREACH(cur, &ih->states, entry)
1554		if (cur->id == s->id && cur->creatorid == s->creatorid)
1555			break;
1556
1557	if (cur != NULL) {
1558		s->timeout = PFTM_UNLINKED;
1559		PF_HASHROW_UNLOCK(ih);
1560		if (V_pf_status.debug >= PF_DEBUG_MISC) {
1561			printf("pf: state ID collision: "
1562			    "id: %016llx creatorid: %08x\n",
1563			    (unsigned long long)be64toh(s->id),
1564			    ntohl(s->creatorid));
1565		}
1566		pf_detach_state(s);
1567		return (EEXIST);
1568	}
1569	LIST_INSERT_HEAD(&ih->states, s, entry);
1570	/* One for keys, one for ID hash. */
1571	refcount_init(&s->refs, 2);
1572
1573	pf_counter_u64_add(&V_pf_status.fcounters[FCNT_STATE_INSERT], 1);
1574	if (V_pfsync_insert_state_ptr != NULL)
1575		V_pfsync_insert_state_ptr(s);
1576
1577	/* Returns locked. */
1578	return (0);
1579}
1580
1581/*
1582 * Find state by ID: returns with locked row on success.
1583 */
1584struct pf_kstate *
1585pf_find_state_byid(uint64_t id, uint32_t creatorid)
1586{
1587	struct pf_idhash *ih;
1588	struct pf_kstate *s;
1589
1590	pf_counter_u64_add(&V_pf_status.fcounters[FCNT_STATE_SEARCH], 1);
1591
1592	ih = &V_pf_idhash[(be64toh(id) % (pf_hashmask + 1))];
1593
1594	PF_HASHROW_LOCK(ih);
1595	LIST_FOREACH(s, &ih->states, entry)
1596		if (s->id == id && s->creatorid == creatorid)
1597			break;
1598
1599	if (s == NULL)
1600		PF_HASHROW_UNLOCK(ih);
1601
1602	return (s);
1603}
1604
1605/*
1606 * Find state by key.
1607 * Returns with ID hash slot locked on success.
1608 */
1609static struct pf_kstate *
1610pf_find_state(struct pfi_kkif *kif, struct pf_state_key_cmp *key, u_int dir)
1611{
1612	struct pf_keyhash	*kh;
1613	struct pf_state_key	*sk;
1614	struct pf_kstate	*s;
1615	int idx;
1616
1617	pf_counter_u64_add(&V_pf_status.fcounters[FCNT_STATE_SEARCH], 1);
1618
1619	kh = &V_pf_keyhash[pf_hashkey((struct pf_state_key *)key)];
1620
1621	PF_HASHROW_LOCK(kh);
1622	LIST_FOREACH(sk, &kh->keys, entry)
1623		if (bcmp(sk, key, sizeof(struct pf_state_key_cmp)) == 0)
1624			break;
1625	if (sk == NULL) {
1626		PF_HASHROW_UNLOCK(kh);
1627		return (NULL);
1628	}
1629
1630	idx = (dir == PF_IN ? PF_SK_WIRE : PF_SK_STACK);
1631
1632	/* List is sorted, if-bound states before floating ones. */
1633	TAILQ_FOREACH(s, &sk->states[idx], key_list[idx])
1634		if (s->kif == V_pfi_all || s->kif == kif || s->orig_kif == kif) {
1635			PF_STATE_LOCK(s);
1636			PF_HASHROW_UNLOCK(kh);
1637			if (__predict_false(s->timeout >= PFTM_MAX)) {
1638				/*
1639				 * State is either being processed by
				 * pf_unlink_state() in another thread, or
1641				 * is scheduled for immediate expiry.
1642				 */
1643				PF_STATE_UNLOCK(s);
1644				return (NULL);
1645			}
1646			return (s);
1647		}
1648	PF_HASHROW_UNLOCK(kh);
1649
1650	return (NULL);
1651}
1652
1653/*
1654 * Returns with ID hash slot locked on success.
1655 */
1656struct pf_kstate *
1657pf_find_state_all(struct pf_state_key_cmp *key, u_int dir, int *more)
1658{
1659	struct pf_keyhash	*kh;
1660	struct pf_state_key	*sk;
1661	struct pf_kstate	*s, *ret = NULL;
1662	int			 idx, inout = 0;
1663
1664	pf_counter_u64_add(&V_pf_status.fcounters[FCNT_STATE_SEARCH], 1);
1665
1666	kh = &V_pf_keyhash[pf_hashkey((struct pf_state_key *)key)];
1667
1668	PF_HASHROW_LOCK(kh);
1669	LIST_FOREACH(sk, &kh->keys, entry)
1670		if (bcmp(sk, key, sizeof(struct pf_state_key_cmp)) == 0)
1671			break;
1672	if (sk == NULL) {
1673		PF_HASHROW_UNLOCK(kh);
1674		return (NULL);
1675	}
1676	switch (dir) {
1677	case PF_IN:
1678		idx = PF_SK_WIRE;
1679		break;
1680	case PF_OUT:
1681		idx = PF_SK_STACK;
1682		break;
1683	case PF_INOUT:
1684		idx = PF_SK_WIRE;
1685		inout = 1;
1686		break;
1687	default:
1688		panic("%s: dir %u", __func__, dir);
1689	}
1690second_run:
1691	TAILQ_FOREACH(s, &sk->states[idx], key_list[idx]) {
1692		if (more == NULL) {
1693			PF_STATE_LOCK(s);
1694			PF_HASHROW_UNLOCK(kh);
1695			return (s);
1696		}
1697
1698		if (ret)
1699			(*more)++;
1700		else {
1701			ret = s;
1702			PF_STATE_LOCK(s);
1703		}
1704	}
1705	if (inout == 1) {
1706		inout = 0;
1707		idx = PF_SK_STACK;
1708		goto second_run;
1709	}
1710	PF_HASHROW_UNLOCK(kh);
1711
1712	return (ret);
1713}
1714
1715/*
1716 * FIXME
1717 * This routine is inefficient -- locks the state only to unlock immediately on
1718 * return.
1719 * It is racy -- after the state is unlocked nothing stops other threads from
1720 * removing it.
1721 */
1722bool
1723pf_find_state_all_exists(struct pf_state_key_cmp *key, u_int dir)
1724{
1725	struct pf_kstate *s;
1726
1727	s = pf_find_state_all(key, dir, NULL);
1728	if (s != NULL) {
1729		PF_STATE_UNLOCK(s);
1730		return (true);
1731	}
1732	return (false);
1733}
1734
1735/* END state table stuff */
1736
1737static void
1738pf_send(struct pf_send_entry *pfse)
1739{
1740
1741	PF_SENDQ_LOCK();
1742	STAILQ_INSERT_TAIL(&V_pf_sendqueue, pfse, pfse_next);
1743	PF_SENDQ_UNLOCK();
1744	swi_sched(V_pf_swi_cookie, 0);
1745}
1746
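/*
 * Decide whether a packet that pf is about to transmit is in fact destined
 * to a local address, in which case pf_intr() feeds it back into
 * ip_input()/ip6_input() instead of sending it out.
 */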
1747static bool
1748pf_isforlocal(struct mbuf *m, int af)
1749{
1750	switch (af) {
1751#ifdef INET
1752	case AF_INET: {
1753		struct ip *ip = mtod(m, struct ip *);
1754
1755		return (in_localip(ip->ip_dst));
1756	}
1757#endif
1758#ifdef INET6
1759	case AF_INET6: {
1760		struct ip6_hdr *ip6;
1761		struct in6_ifaddr *ia;
1762		ip6 = mtod(m, struct ip6_hdr *);
1763		ia = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */, false);
1764		if (ia == NULL)
1765			return (false);
1766		return (! (ia->ia6_flags & IN6_IFF_NOTREADY));
1767	}
1768#endif
1769	default:
1770		panic("Unsupported af %d", af);
1771	}
1772
1773	return (false);
1774}
1775
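/*
 * Software interrupt handler scheduled by pf_send(): drains the per-vnet
 * send queue and dispatches each entry to the IPv4/IPv6 input or output
 * path, or to icmp_error()/icmp6_error() for generated ICMP messages.
 */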
1776void
1777pf_intr(void *v)
1778{
1779	struct epoch_tracker et;
1780	struct pf_send_head queue;
1781	struct pf_send_entry *pfse, *next;
1782
1783	CURVNET_SET((struct vnet *)v);
1784
1785	PF_SENDQ_LOCK();
1786	queue = V_pf_sendqueue;
1787	STAILQ_INIT(&V_pf_sendqueue);
1788	PF_SENDQ_UNLOCK();
1789
1790	NET_EPOCH_ENTER(et);
1791
1792	STAILQ_FOREACH_SAFE(pfse, &queue, pfse_next, next) {
1793		switch (pfse->pfse_type) {
1794#ifdef INET
1795		case PFSE_IP: {
1796			if (pf_isforlocal(pfse->pfse_m, AF_INET)) {
1797				pfse->pfse_m->m_flags |= M_SKIP_FIREWALL;
1798				pfse->pfse_m->m_pkthdr.csum_flags |=
1799				    CSUM_IP_VALID | CSUM_IP_CHECKED;
1800				ip_input(pfse->pfse_m);
1801			} else {
1802				ip_output(pfse->pfse_m, NULL, NULL, 0, NULL,
1803				    NULL);
1804			}
1805			break;
1806		}
1807		case PFSE_ICMP:
1808			icmp_error(pfse->pfse_m, pfse->icmpopts.type,
1809			    pfse->icmpopts.code, 0, pfse->icmpopts.mtu);
1810			break;
1811#endif /* INET */
1812#ifdef INET6
1813		case PFSE_IP6:
1814			if (pf_isforlocal(pfse->pfse_m, AF_INET6)) {
1815				pfse->pfse_m->m_flags |= M_SKIP_FIREWALL;
1816				ip6_input(pfse->pfse_m);
1817			} else {
1818				ip6_output(pfse->pfse_m, NULL, NULL, 0, NULL,
1819				    NULL, NULL);
1820			}
1821			break;
1822		case PFSE_ICMP6:
1823			icmp6_error(pfse->pfse_m, pfse->icmpopts.type,
1824			    pfse->icmpopts.code, pfse->icmpopts.mtu);
1825			break;
1826#endif /* INET6 */
1827		default:
1828			panic("%s: unknown type", __func__);
1829		}
1830		free(pfse, M_PFTEMP);
1831	}
1832	NET_EPOCH_EXIT(et);
1833	CURVNET_RESTORE();
1834}
1835
1836#define	pf_purge_thread_period	(hz / 10)
1837
1838#ifdef PF_WANT_32_TO_64_COUNTER
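/*
 * Periodic maintenance for pf_counter_u64 counters on platforms that keep
 * them as 32-bit values (PF_WANT_32_TO_64_COUNTER): the 32-bit deltas are
 * folded into their 64-bit accumulators before they can wrap.  Status
 * counters are flushed on a short interval; kif and rule counters are
 * walked incrementally, roughly a tenth of the list (at least five entries)
 * per pass, with marker list entries remembering the position.
 */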
1839static void
1840pf_status_counter_u64_periodic(void)
1841{
1842
1843	PF_RULES_RASSERT();
1844
1845	if ((V_pf_counter_periodic_iter % (pf_purge_thread_period * 10 * 60)) != 0) {
1846		return;
1847	}
1848
1849	for (int i = 0; i < FCNT_MAX; i++) {
1850		pf_counter_u64_periodic(&V_pf_status.fcounters[i]);
1851	}
1852}
1853
1854static void
1855pf_kif_counter_u64_periodic(void)
1856{
1857	struct pfi_kkif *kif;
1858	size_t r, run;
1859
1860	PF_RULES_RASSERT();
1861
1862	if (__predict_false(V_pf_allkifcount == 0)) {
1863		return;
1864	}
1865
1866	if ((V_pf_counter_periodic_iter % (pf_purge_thread_period * 10 * 300)) != 0) {
1867		return;
1868	}
1869
1870	run = V_pf_allkifcount / 10;
1871	if (run < 5)
1872		run = 5;
1873
1874	for (r = 0; r < run; r++) {
1875		kif = LIST_NEXT(V_pf_kifmarker, pfik_allkiflist);
1876		if (kif == NULL) {
1877			LIST_REMOVE(V_pf_kifmarker, pfik_allkiflist);
1878			LIST_INSERT_HEAD(&V_pf_allkiflist, V_pf_kifmarker, pfik_allkiflist);
1879			break;
1880		}
1881
1882		LIST_REMOVE(V_pf_kifmarker, pfik_allkiflist);
1883		LIST_INSERT_AFTER(kif, V_pf_kifmarker, pfik_allkiflist);
1884
1885		for (int i = 0; i < 2; i++) {
1886			for (int j = 0; j < 2; j++) {
1887				for (int k = 0; k < 2; k++) {
1888					pf_counter_u64_periodic(&kif->pfik_packets[i][j][k]);
1889					pf_counter_u64_periodic(&kif->pfik_bytes[i][j][k]);
1890				}
1891			}
1892		}
1893	}
1894}
1895
1896static void
1897pf_rule_counter_u64_periodic(void)
1898{
1899	struct pf_krule *rule;
1900	size_t r, run;
1901
1902	PF_RULES_RASSERT();
1903
1904	if (__predict_false(V_pf_allrulecount == 0)) {
1905		return;
1906	}
1907
1908	if ((V_pf_counter_periodic_iter % (pf_purge_thread_period * 10 * 300)) != 0) {
1909		return;
1910	}
1911
1912	run = V_pf_allrulecount / 10;
1913	if (run < 5)
1914		run = 5;
1915
1916	for (r = 0; r < run; r++) {
1917		rule = LIST_NEXT(V_pf_rulemarker, allrulelist);
1918		if (rule == NULL) {
1919			LIST_REMOVE(V_pf_rulemarker, allrulelist);
1920			LIST_INSERT_HEAD(&V_pf_allrulelist, V_pf_rulemarker, allrulelist);
1921			break;
1922		}
1923
1924		LIST_REMOVE(V_pf_rulemarker, allrulelist);
1925		LIST_INSERT_AFTER(rule, V_pf_rulemarker, allrulelist);
1926
1927		pf_counter_u64_periodic(&rule->evaluations);
1928		for (int i = 0; i < 2; i++) {
1929			pf_counter_u64_periodic(&rule->packets[i]);
1930			pf_counter_u64_periodic(&rule->bytes[i]);
1931		}
1932	}
1933}
1934
1935static void
1936pf_counter_u64_periodic_main(void)
1937{
1938	PF_RULES_RLOCK_TRACKER;
1939
1940	V_pf_counter_periodic_iter++;
1941
1942	PF_RULES_RLOCK();
1943	pf_counter_u64_critical_enter();
1944	pf_status_counter_u64_periodic();
1945	pf_kif_counter_u64_periodic();
1946	pf_rule_counter_u64_periodic();
1947	pf_counter_u64_critical_exit();
1948	PF_RULES_RUNLOCK();
1949}
1950#else
1951#define	pf_counter_u64_periodic_main()	do { } while (0)
1952#endif
1953
1954void
1955pf_purge_thread(void *unused __unused)
1956{
1957	struct epoch_tracker	 et;
1958
1959	VNET_ITERATOR_DECL(vnet_iter);
1960
1961	sx_xlock(&pf_end_lock);
1962	while (pf_end_threads == 0) {
1963		sx_sleep(pf_purge_thread, &pf_end_lock, 0, "pftm", pf_purge_thread_period);
1964
1965		VNET_LIST_RLOCK();
1966		NET_EPOCH_ENTER(et);
1967		VNET_FOREACH(vnet_iter) {
1968			CURVNET_SET(vnet_iter);
1969
1970			/* Wait until V_pf_default_rule is initialized. */
1971			if (V_pf_vnet_active == 0) {
1972				CURVNET_RESTORE();
1973				continue;
1974			}
1975
1976			pf_counter_u64_periodic_main();
1977
			/*
			 * Process a 1/interval fraction of the state
			 * table on every run.
			 */
1982			V_pf_purge_idx =
1983			    pf_purge_expired_states(V_pf_purge_idx, pf_hashmask /
1984			    (V_pf_default_rule.timeout[PFTM_INTERVAL] * 10));
1985
1986			/*
1987			 * Purge other expired types every
1988			 * PFTM_INTERVAL seconds.
1989			 */
1990			if (V_pf_purge_idx == 0) {
1991				/*
1992				 * Order is important:
1993				 * - states and src nodes reference rules
1994				 * - states and rules reference kifs
1995				 */
1996				pf_purge_expired_fragments();
1997				pf_purge_expired_src_nodes();
1998				pf_purge_unlinked_rules();
1999				pfi_kkif_purge();
2000			}
2001			CURVNET_RESTORE();
2002		}
2003		NET_EPOCH_EXIT(et);
2004		VNET_LIST_RUNLOCK();
2005	}
2006
2007	pf_end_threads++;
2008	sx_xunlock(&pf_end_lock);
2009	kproc_exit(0);
2010}
2011
2012void
2013pf_unload_vnet_purge(void)
2014{
2015
	/*
	 * To clean up all kifs and rules we need two runs:
	 * the first one clears the reference flags, then
	 * pf_purge_expired_states() no longer raises them,
	 * and the second run frees.
	 */
2022	pf_purge_unlinked_rules();
2023	pfi_kkif_purge();
2024
2025	/*
2026	 * Now purge everything.
2027	 */
2028	pf_purge_expired_states(0, pf_hashmask);
2029	pf_purge_fragments(UINT_MAX);
2030	pf_purge_expired_src_nodes();
2031
2032	/*
2033	 * Now all kifs & rules should be unreferenced,
2034	 * thus should be successfully freed.
2035	 */
2036	pf_purge_unlinked_rules();
2037	pfi_kkif_purge();
2038}
2039
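/*
 * Return the uptime, in seconds, at which this state expires, applying the
 * rule's (or the default) timeout and the adaptive timeout scaling below.
 */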
2040u_int32_t
2041pf_state_expires(const struct pf_kstate *state)
2042{
2043	u_int32_t	timeout;
2044	u_int32_t	start;
2045	u_int32_t	end;
2046	u_int32_t	states;
2047
2048	/* handle all PFTM_* > PFTM_MAX here */
2049	if (state->timeout == PFTM_PURGE)
2050		return (time_uptime);
2051	KASSERT(state->timeout != PFTM_UNLINKED,
2052	    ("pf_state_expires: timeout == PFTM_UNLINKED"));
	KASSERT((state->timeout < PFTM_MAX),
	    ("pf_state_expires: timeout >= PFTM_MAX"));
2055	timeout = state->rule.ptr->timeout[state->timeout];
2056	if (!timeout)
2057		timeout = V_pf_default_rule.timeout[state->timeout];
2058	start = state->rule.ptr->timeout[PFTM_ADAPTIVE_START];
2059	if (start && state->rule.ptr != &V_pf_default_rule) {
2060		end = state->rule.ptr->timeout[PFTM_ADAPTIVE_END];
2061		states = counter_u64_fetch(state->rule.ptr->states_cur);
2062	} else {
2063		start = V_pf_default_rule.timeout[PFTM_ADAPTIVE_START];
2064		end = V_pf_default_rule.timeout[PFTM_ADAPTIVE_END];
2065		states = V_pf_status.states;
2066	}
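	/*
	 * Adaptive timeouts: once the number of states passes the
	 * PFTM_ADAPTIVE_START threshold, scale the timeout down linearly,
	 * reaching zero (immediate expiry) at PFTM_ADAPTIVE_END.
	 */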
2067	if (end && states > start && start < end) {
2068		if (states < end) {
2069			timeout = (u_int64_t)timeout * (end - states) /
2070			    (end - start);
2071			return ((state->expire / 1000) + timeout);
2072		}
2073		else
2074			return (time_uptime);
2075	}
2076	return ((state->expire / 1000) + timeout);
2077}
2078
2079void
2080pf_purge_expired_src_nodes(void)
2081{
2082	struct pf_ksrc_node_list	 freelist;
2083	struct pf_srchash	*sh;
2084	struct pf_ksrc_node	*cur, *next;
2085	int i;
2086
2087	LIST_INIT(&freelist);
2088	for (i = 0, sh = V_pf_srchash; i <= pf_srchashmask; i++, sh++) {
2089	    PF_HASHROW_LOCK(sh);
2090	    LIST_FOREACH_SAFE(cur, &sh->nodes, entry, next)
2091		if (cur->states == 0 && cur->expire <= time_uptime) {
2092			pf_unlink_src_node(cur);
2093			LIST_INSERT_HEAD(&freelist, cur, entry);
2094		} else if (cur->rule.ptr != NULL)
2095			cur->rule.ptr->rule_ref |= PFRULE_REFS;
2096	    PF_HASHROW_UNLOCK(sh);
2097	}
2098
2099	pf_free_src_nodes(&freelist);
2100
2101	V_pf_status.src_nodes = uma_zone_get_cur(V_pf_sources_z);
2102}
2103
2104static void
2105pf_src_tree_remove_state(struct pf_kstate *s)
2106{
2107	struct pf_ksrc_node *sn;
2108	uint32_t timeout;
2109
2110	timeout = s->rule.ptr->timeout[PFTM_SRC_NODE] ?
2111	    s->rule.ptr->timeout[PFTM_SRC_NODE] :
2112	    V_pf_default_rule.timeout[PFTM_SRC_NODE];
2113
2114	if (s->src_node != NULL) {
2115		sn = s->src_node;
2116		PF_SRC_NODE_LOCK(sn);
2117		if (s->src.tcp_est)
2118			--sn->conn;
2119		if (--sn->states == 0)
2120			sn->expire = time_uptime + timeout;
2121		PF_SRC_NODE_UNLOCK(sn);
2122	}
2123	if (s->nat_src_node != s->src_node && s->nat_src_node != NULL) {
2124		sn = s->nat_src_node;
2125		PF_SRC_NODE_LOCK(sn);
2126		if (--sn->states == 0)
2127			sn->expire = time_uptime + timeout;
2128		PF_SRC_NODE_UNLOCK(sn);
2129	}
2130	s->src_node = s->nat_src_node = NULL;
2131}
2132
/*
 * Unlink and potentially free a state.  The function may be
 * called with the ID hash row locked, but always returns
 * unlocked, since it needs to go through the key hash locking.
 */
2138int
2139pf_unlink_state(struct pf_kstate *s)
2140{
2141	struct pf_idhash *ih = &V_pf_idhash[PF_IDHASH(s)];
2142
2143	NET_EPOCH_ASSERT();
2144	PF_HASHROW_ASSERT(ih);
2145
2146	if (s->timeout == PFTM_UNLINKED) {
		/*
		 * The state is already being processed by
		 * pf_unlink_state() in another thread.
		 */
2152		PF_HASHROW_UNLOCK(ih);
2153		return (0);	/* XXXGL: undefined actually */
2154	}
2155
2156	if (s->src.state == PF_TCPS_PROXY_DST) {
2157		/* XXX wire key the right one? */
2158		pf_send_tcp(s->rule.ptr, s->key[PF_SK_WIRE]->af,
2159		    &s->key[PF_SK_WIRE]->addr[1],
2160		    &s->key[PF_SK_WIRE]->addr[0],
2161		    s->key[PF_SK_WIRE]->port[1],
2162		    s->key[PF_SK_WIRE]->port[0],
2163		    s->src.seqhi, s->src.seqlo + 1,
2164		    TH_RST|TH_ACK, 0, 0, 0, true, s->tag, 0, s->act.rtableid);
2165	}
2166
2167	LIST_REMOVE(s, entry);
2168	pf_src_tree_remove_state(s);
2169
2170	if (V_pfsync_delete_state_ptr != NULL)
2171		V_pfsync_delete_state_ptr(s);
2172
2173	STATE_DEC_COUNTERS(s);
2174
2175	s->timeout = PFTM_UNLINKED;
2176
2177	/* Ensure we remove it from the list of halfopen states, if needed. */
2178	if (s->key[PF_SK_STACK] != NULL &&
2179	    s->key[PF_SK_STACK]->proto == IPPROTO_TCP)
2180		pf_set_protostate(s, PF_PEER_BOTH, TCPS_CLOSED);
2181
2182	PF_HASHROW_UNLOCK(ih);
2183
2184	pf_detach_state(s);
2185	/* pf_state_insert() initialises refs to 2 */
2186	return (pf_release_staten(s, 2));
2187}
2188
2189struct pf_kstate *
2190pf_alloc_state(int flags)
2191{
2192
2193	return (uma_zalloc(V_pf_state_z, flags | M_ZERO));
2194}
2195
2196void
2197pf_free_state(struct pf_kstate *cur)
2198{
2199	struct pf_krule_item *ri;
2200
2201	KASSERT(cur->refs == 0, ("%s: %p has refs", __func__, cur));
2202	KASSERT(cur->timeout == PFTM_UNLINKED, ("%s: timeout %u", __func__,
2203	    cur->timeout));
2204
2205	while ((ri = SLIST_FIRST(&cur->match_rules))) {
2206		SLIST_REMOVE_HEAD(&cur->match_rules, entry);
2207		free(ri, M_PF_RULE_ITEM);
2208	}
2209
2210	pf_normalize_tcp_cleanup(cur);
2211	uma_zfree(V_pf_state_z, cur);
2212	pf_counter_u64_add(&V_pf_status.fcounters[FCNT_STATE_REMOVALS], 1);
2213}
2214
2215/*
2216 * Called only from pf_purge_thread(), thus serialized.
2217 */
2218static u_int
2219pf_purge_expired_states(u_int i, int maxcheck)
2220{
2221	struct pf_idhash *ih;
2222	struct pf_kstate *s;
2223	struct pf_krule_item *mrm;
2224	size_t count __unused;
2225
2226	V_pf_status.states = uma_zone_get_cur(V_pf_state_z);
2227
	/*
	 * Go through the hash rows and unlink states that have expired.
	 */
2231	while (maxcheck > 0) {
2232		count = 0;
2233		ih = &V_pf_idhash[i];
2234
2235		/* only take the lock if we expect to do work */
2236		if (!LIST_EMPTY(&ih->states)) {
2237relock:
2238			PF_HASHROW_LOCK(ih);
2239			LIST_FOREACH(s, &ih->states, entry) {
2240				if (pf_state_expires(s) <= time_uptime) {
2241					V_pf_status.states -=
2242					    pf_unlink_state(s);
2243					goto relock;
2244				}
2245				s->rule.ptr->rule_ref |= PFRULE_REFS;
2246				if (s->nat_rule.ptr != NULL)
2247					s->nat_rule.ptr->rule_ref |= PFRULE_REFS;
2248				if (s->anchor.ptr != NULL)
2249					s->anchor.ptr->rule_ref |= PFRULE_REFS;
2250				s->kif->pfik_flags |= PFI_IFLAG_REFS;
2251				SLIST_FOREACH(mrm, &s->match_rules, entry)
2252					mrm->r->rule_ref |= PFRULE_REFS;
2253				if (s->rt_kif)
2254					s->rt_kif->pfik_flags |= PFI_IFLAG_REFS;
2255				count++;
2256			}
2257			PF_HASHROW_UNLOCK(ih);
2258		}
2259
2260		SDT_PROBE2(pf, purge, state, rowcount, i, count);
2261
2262		/* Return when we hit end of hash. */
2263		if (++i > pf_hashmask) {
2264			V_pf_status.states = uma_zone_get_cur(V_pf_state_z);
2265			return (0);
2266		}
2267
2268		maxcheck--;
2269	}
2270
2271	V_pf_status.states = uma_zone_get_cur(V_pf_state_z);
2272
2273	return (i);
2274}
2275
2276static void
2277pf_purge_unlinked_rules(void)
2278{
2279	struct pf_krulequeue tmpq;
2280	struct pf_krule *r, *r1;
2281
	/*
	 * If an overload task is pending, we'd better skip purging
	 * this time.  There is a tiny probability that the overload
	 * task references an already unlinked rule.
	 */
2288	PF_OVERLOADQ_LOCK();
2289	if (!SLIST_EMPTY(&V_pf_overloadqueue)) {
2290		PF_OVERLOADQ_UNLOCK();
2291		return;
2292	}
2293	PF_OVERLOADQ_UNLOCK();
2294
	/*
	 * Do naive mark-and-sweep garbage collection of old rules.
	 * The reference flag is raised by pf_purge_expired_states()
	 * and pf_purge_expired_src_nodes().
	 *
	 * To avoid a LOR between PF_UNLNKDRULES_LOCK and PF_RULES_WLOCK,
	 * use a temporary queue.
	 */
2303	TAILQ_INIT(&tmpq);
2304	PF_UNLNKDRULES_LOCK();
2305	TAILQ_FOREACH_SAFE(r, &V_pf_unlinked_rules, entries, r1) {
2306		if (!(r->rule_ref & PFRULE_REFS)) {
2307			TAILQ_REMOVE(&V_pf_unlinked_rules, r, entries);
2308			TAILQ_INSERT_TAIL(&tmpq, r, entries);
2309		} else
2310			r->rule_ref &= ~PFRULE_REFS;
2311	}
2312	PF_UNLNKDRULES_UNLOCK();
2313
2314	if (!TAILQ_EMPTY(&tmpq)) {
2315		PF_CONFIG_LOCK();
2316		PF_RULES_WLOCK();
2317		TAILQ_FOREACH_SAFE(r, &tmpq, entries, r1) {
2318			TAILQ_REMOVE(&tmpq, r, entries);
2319			pf_free_rule(r);
2320		}
2321		PF_RULES_WUNLOCK();
2322		PF_CONFIG_UNLOCK();
2323	}
2324}
2325
2326void
2327pf_print_host(struct pf_addr *addr, u_int16_t p, sa_family_t af)
2328{
2329	switch (af) {
2330#ifdef INET
2331	case AF_INET: {
2332		u_int32_t a = ntohl(addr->addr32[0]);
2333		printf("%u.%u.%u.%u", (a>>24)&255, (a>>16)&255,
2334		    (a>>8)&255, a&255);
2335		if (p) {
2336			p = ntohs(p);
2337			printf(":%u", p);
2338		}
2339		break;
2340	}
2341#endif /* INET */
2342#ifdef INET6
2343	case AF_INET6: {
2344		u_int16_t b;
2345		u_int8_t i, curstart, curend, maxstart, maxend;
2346		curstart = curend = maxstart = maxend = 255;
2347		for (i = 0; i < 8; i++) {
2348			if (!addr->addr16[i]) {
2349				if (curstart == 255)
2350					curstart = i;
2351				curend = i;
2352			} else {
2353				if ((curend - curstart) >
2354				    (maxend - maxstart)) {
2355					maxstart = curstart;
2356					maxend = curend;
2357				}
2358				curstart = curend = 255;
2359			}
2360		}
2361		if ((curend - curstart) >
2362		    (maxend - maxstart)) {
2363			maxstart = curstart;
2364			maxend = curend;
2365		}
2366		for (i = 0; i < 8; i++) {
2367			if (i >= maxstart && i <= maxend) {
2368				if (i == 0)
2369					printf(":");
2370				if (i == maxend)
2371					printf(":");
2372			} else {
2373				b = ntohs(addr->addr16[i]);
2374				printf("%x", b);
2375				if (i < 7)
2376					printf(":");
2377			}
2378		}
2379		if (p) {
2380			p = ntohs(p);
2381			printf("[%u]", p);
2382		}
2383		break;
2384	}
2385#endif /* INET6 */
2386	}
2387}
2388
2389void
2390pf_print_state(struct pf_kstate *s)
2391{
2392	pf_print_state_parts(s, NULL, NULL);
2393}
2394
2395static void
2396pf_print_state_parts(struct pf_kstate *s,
2397    struct pf_state_key *skwp, struct pf_state_key *sksp)
2398{
2399	struct pf_state_key *skw, *sks;
2400	u_int8_t proto, dir;
2401
2402	/* Do our best to fill these, but they're skipped if NULL */
2403	skw = skwp ? skwp : (s ? s->key[PF_SK_WIRE] : NULL);
2404	sks = sksp ? sksp : (s ? s->key[PF_SK_STACK] : NULL);
2405	proto = skw ? skw->proto : (sks ? sks->proto : 0);
2406	dir = s ? s->direction : 0;
2407
2408	switch (proto) {
2409	case IPPROTO_IPV4:
2410		printf("IPv4");
2411		break;
2412	case IPPROTO_IPV6:
2413		printf("IPv6");
2414		break;
2415	case IPPROTO_TCP:
2416		printf("TCP");
2417		break;
2418	case IPPROTO_UDP:
2419		printf("UDP");
2420		break;
2421	case IPPROTO_ICMP:
2422		printf("ICMP");
2423		break;
2424	case IPPROTO_ICMPV6:
2425		printf("ICMPv6");
2426		break;
2427	default:
2428		printf("%u", proto);
2429		break;
2430	}
2431	switch (dir) {
2432	case PF_IN:
2433		printf(" in");
2434		break;
2435	case PF_OUT:
2436		printf(" out");
2437		break;
2438	}
2439	if (skw) {
2440		printf(" wire: ");
2441		pf_print_host(&skw->addr[0], skw->port[0], skw->af);
2442		printf(" ");
2443		pf_print_host(&skw->addr[1], skw->port[1], skw->af);
2444	}
2445	if (sks) {
2446		printf(" stack: ");
2447		if (sks != skw) {
2448			pf_print_host(&sks->addr[0], sks->port[0], sks->af);
2449			printf(" ");
2450			pf_print_host(&sks->addr[1], sks->port[1], sks->af);
2451		} else
2452			printf("-");
2453	}
2454	if (s) {
2455		if (proto == IPPROTO_TCP) {
2456			printf(" [lo=%u high=%u win=%u modulator=%u",
2457			    s->src.seqlo, s->src.seqhi,
2458			    s->src.max_win, s->src.seqdiff);
2459			if (s->src.wscale && s->dst.wscale)
2460				printf(" wscale=%u",
2461				    s->src.wscale & PF_WSCALE_MASK);
2462			printf("]");
2463			printf(" [lo=%u high=%u win=%u modulator=%u",
2464			    s->dst.seqlo, s->dst.seqhi,
2465			    s->dst.max_win, s->dst.seqdiff);
2466			if (s->src.wscale && s->dst.wscale)
2467				printf(" wscale=%u",
2468				s->dst.wscale & PF_WSCALE_MASK);
2469			printf("]");
2470		}
2471		printf(" %u:%u", s->src.state, s->dst.state);
2472	}
2473}
2474
2475void
2476pf_print_flags(u_int8_t f)
2477{
2478	if (f)
2479		printf(" ");
2480	if (f & TH_FIN)
2481		printf("F");
2482	if (f & TH_SYN)
2483		printf("S");
2484	if (f & TH_RST)
2485		printf("R");
2486	if (f & TH_PUSH)
2487		printf("P");
2488	if (f & TH_ACK)
2489		printf("A");
2490	if (f & TH_URG)
2491		printf("U");
2492	if (f & TH_ECE)
2493		printf("E");
2494	if (f & TH_CWR)
2495		printf("W");
2496}
2497
2498#define	PF_SET_SKIP_STEPS(i)					\
2499	do {							\
2500		while (head[i] != cur) {			\
2501			head[i]->skip[i].ptr = cur;		\
2502			head[i] = TAILQ_NEXT(head[i], entries);	\
2503		}						\
2504	} while (0)
2505
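/*
 * Pre-compute the skip steps used by the rule evaluation loop: for every
 * rule and every comparison field (interface, direction, address family,
 * protocol, source/destination address and port) record the next rule that
 * differs in that field, so a failed comparison can skip the whole run of
 * rules that would fail for the same reason.
 */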
2506void
2507pf_calc_skip_steps(struct pf_krulequeue *rules)
2508{
2509	struct pf_krule *cur, *prev, *head[PF_SKIP_COUNT];
2510	int i;
2511
2512	cur = TAILQ_FIRST(rules);
2513	prev = cur;
2514	for (i = 0; i < PF_SKIP_COUNT; ++i)
2515		head[i] = cur;
2516	while (cur != NULL) {
2517		if (cur->kif != prev->kif || cur->ifnot != prev->ifnot)
2518			PF_SET_SKIP_STEPS(PF_SKIP_IFP);
2519		if (cur->direction != prev->direction)
2520			PF_SET_SKIP_STEPS(PF_SKIP_DIR);
2521		if (cur->af != prev->af)
2522			PF_SET_SKIP_STEPS(PF_SKIP_AF);
2523		if (cur->proto != prev->proto)
2524			PF_SET_SKIP_STEPS(PF_SKIP_PROTO);
2525		if (cur->src.neg != prev->src.neg ||
2526		    pf_addr_wrap_neq(&cur->src.addr, &prev->src.addr))
2527			PF_SET_SKIP_STEPS(PF_SKIP_SRC_ADDR);
2528		if (cur->src.port[0] != prev->src.port[0] ||
2529		    cur->src.port[1] != prev->src.port[1] ||
2530		    cur->src.port_op != prev->src.port_op)
2531			PF_SET_SKIP_STEPS(PF_SKIP_SRC_PORT);
2532		if (cur->dst.neg != prev->dst.neg ||
2533		    pf_addr_wrap_neq(&cur->dst.addr, &prev->dst.addr))
2534			PF_SET_SKIP_STEPS(PF_SKIP_DST_ADDR);
2535		if (cur->dst.port[0] != prev->dst.port[0] ||
2536		    cur->dst.port[1] != prev->dst.port[1] ||
2537		    cur->dst.port_op != prev->dst.port_op)
2538			PF_SET_SKIP_STEPS(PF_SKIP_DST_PORT);
2539
2540		prev = cur;
2541		cur = TAILQ_NEXT(cur, entries);
2542	}
2543	for (i = 0; i < PF_SKIP_COUNT; ++i)
2544		PF_SET_SKIP_STEPS(i);
2545}
2546
2547int
2548pf_addr_wrap_neq(struct pf_addr_wrap *aw1, struct pf_addr_wrap *aw2)
2549{
2550	if (aw1->type != aw2->type)
2551		return (1);
2552	switch (aw1->type) {
2553	case PF_ADDR_ADDRMASK:
2554	case PF_ADDR_RANGE:
2555		if (PF_ANEQ(&aw1->v.a.addr, &aw2->v.a.addr, AF_INET6))
2556			return (1);
2557		if (PF_ANEQ(&aw1->v.a.mask, &aw2->v.a.mask, AF_INET6))
2558			return (1);
2559		return (0);
2560	case PF_ADDR_DYNIFTL:
2561		return (aw1->p.dyn->pfid_kt != aw2->p.dyn->pfid_kt);
2562	case PF_ADDR_NOROUTE:
2563	case PF_ADDR_URPFFAILED:
2564		return (0);
2565	case PF_ADDR_TABLE:
2566		return (aw1->p.tbl != aw2->p.tbl);
2567	default:
2568		printf("invalid address type: %d\n", aw1->type);
2569		return (1);
2570	}
2571}
2572
2573/**
2574 * Checksum updates are a little complicated because the checksum in the TCP/UDP
2575 * header isn't always a full checksum. In some cases (i.e. output) it's a
2576 * pseudo-header checksum, which is a partial checksum over src/dst IP
2577 * addresses, protocol number and length.
2578 *
2579 * That means we have the following cases:
2580 *  * Input or forwarding: we don't have TSO, the checksum fields are full
2581 *  	checksums, we need to update the checksum whenever we change anything.
2582 *  * Output (i.e. the checksum is a pseudo-header checksum):
2583 *  	x The field being updated is src/dst address or affects the length of
2584 *  	the packet. We need to update the pseudo-header checksum (note that this
2585 *  	checksum is not ones' complement).
2586 *  	x Some other field is being modified (e.g. src/dst port numbers): We
2587 *  	don't have to update anything.
2588 **/
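/*
 * Illustrative example (not taken from real traffic): with a stored
 * checksum of 0xb1e6, changing a 16-bit field from 0x0050 to 0x0051 gives
 * 0xb1e6 + 0x0050 - 0x0051 = 0xb1e5 after folding any carry back into the
 * low 16 bits.  The UDP special case below preserves a checksum of zero
 * ("no checksum") and avoids generating zero as a valid checksum.
 */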
2589u_int16_t
2590pf_cksum_fixup(u_int16_t cksum, u_int16_t old, u_int16_t new, u_int8_t udp)
2591{
2592	u_int32_t x;
2593
2594	x = cksum + old - new;
2595	x = (x + (x >> 16)) & 0xffff;
2596
2597	/* optimise: eliminate a branch when not udp */
2598	if (udp && cksum == 0x0000)
		return (cksum);
2600	if (udp && x == 0x0000)
2601		x = 0xffff;
2602
2603	return (u_int16_t)(x);
2604}
2605
2606static void
2607pf_patch_8(struct mbuf *m, u_int16_t *cksum, u_int8_t *f, u_int8_t v, bool hi,
2608    u_int8_t udp)
2609{
2610	u_int16_t old = htons(hi ? (*f << 8) : *f);
2611	u_int16_t new = htons(hi ? ( v << 8) :  v);
2612
2613	if (*f == v)
2614		return;
2615
2616	*f = v;
2617
2618	if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6))
2619		return;
2620
2621	*cksum = pf_cksum_fixup(*cksum, old, new, udp);
2622}
2623
2624void
2625pf_patch_16_unaligned(struct mbuf *m, u_int16_t *cksum, void *f, u_int16_t v,
2626    bool hi, u_int8_t udp)
2627{
2628	u_int8_t *fb = (u_int8_t *)f;
2629	u_int8_t *vb = (u_int8_t *)&v;
2630
2631	pf_patch_8(m, cksum, fb++, *vb++, hi, udp);
2632	pf_patch_8(m, cksum, fb++, *vb++, !hi, udp);
2633}
2634
2635void
2636pf_patch_32_unaligned(struct mbuf *m, u_int16_t *cksum, void *f, u_int32_t v,
2637    bool hi, u_int8_t udp)
2638{
2639	u_int8_t *fb = (u_int8_t *)f;
2640	u_int8_t *vb = (u_int8_t *)&v;
2641
2642	pf_patch_8(m, cksum, fb++, *vb++, hi, udp);
2643	pf_patch_8(m, cksum, fb++, *vb++, !hi, udp);
2644	pf_patch_8(m, cksum, fb++, *vb++, hi, udp);
2645	pf_patch_8(m, cksum, fb++, *vb++, !hi, udp);
2646}
2647
2648u_int16_t
2649pf_proto_cksum_fixup(struct mbuf *m, u_int16_t cksum, u_int16_t old,
2650        u_int16_t new, u_int8_t udp)
2651{
2652	if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6))
2653		return (cksum);
2654
2655	return (pf_cksum_fixup(cksum, old, new, udp));
2656}
2657
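/*
 * Rewrite address *a and port *p to the new values *an and pn (e.g. for
 * NAT), updating the IPv4 header checksum *ic and the TCP/UDP checksum *pc
 * accordingly; 'u' marks a UDP checksum so that zero is handled specially.
 */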
2658static void
2659pf_change_ap(struct mbuf *m, struct pf_addr *a, u_int16_t *p, u_int16_t *ic,
2660        u_int16_t *pc, struct pf_addr *an, u_int16_t pn, u_int8_t u,
2661        sa_family_t af)
2662{
2663	struct pf_addr	ao;
2664	u_int16_t	po = *p;
2665
2666	PF_ACPY(&ao, a, af);
2667	PF_ACPY(a, an, af);
2668
2669	if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6))
2670		*pc = ~*pc;
2671
2672	*p = pn;
2673
2674	switch (af) {
2675#ifdef INET
2676	case AF_INET:
2677		*ic = pf_cksum_fixup(pf_cksum_fixup(*ic,
2678		    ao.addr16[0], an->addr16[0], 0),
2679		    ao.addr16[1], an->addr16[1], 0);
2680		*p = pn;
2681
2682		*pc = pf_cksum_fixup(pf_cksum_fixup(*pc,
2683		    ao.addr16[0], an->addr16[0], u),
2684		    ao.addr16[1], an->addr16[1], u);
2685
2686		*pc = pf_proto_cksum_fixup(m, *pc, po, pn, u);
2687		break;
2688#endif /* INET */
2689#ifdef INET6
2690	case AF_INET6:
2691		*pc = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
2692		    pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
2693		    pf_cksum_fixup(pf_cksum_fixup(*pc,
2694		    ao.addr16[0], an->addr16[0], u),
2695		    ao.addr16[1], an->addr16[1], u),
2696		    ao.addr16[2], an->addr16[2], u),
2697		    ao.addr16[3], an->addr16[3], u),
2698		    ao.addr16[4], an->addr16[4], u),
2699		    ao.addr16[5], an->addr16[5], u),
2700		    ao.addr16[6], an->addr16[6], u),
2701		    ao.addr16[7], an->addr16[7], u);
2702
2703		*pc = pf_proto_cksum_fixup(m, *pc, po, pn, u);
2704		break;
2705#endif /* INET6 */
2706	}
2707
2708	if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA |
2709	    CSUM_DELAY_DATA_IPV6)) {
2710		*pc = ~*pc;
2711		if (! *pc)
2712			*pc = 0xffff;
2713	}
2714}
2715
2716/* Changes a u_int32_t.  Uses a void * so there are no align restrictions */
2717void
2718pf_change_a(void *a, u_int16_t *c, u_int32_t an, u_int8_t u)
2719{
2720	u_int32_t	ao;
2721
2722	memcpy(&ao, a, sizeof(ao));
2723	memcpy(a, &an, sizeof(u_int32_t));
2724	*c = pf_cksum_fixup(pf_cksum_fixup(*c, ao / 65536, an / 65536, u),
2725	    ao % 65536, an % 65536, u);
2726}
2727
2728void
2729pf_change_proto_a(struct mbuf *m, void *a, u_int16_t *c, u_int32_t an, u_int8_t udp)
2730{
2731	u_int32_t	ao;
2732
2733	memcpy(&ao, a, sizeof(ao));
2734	memcpy(a, &an, sizeof(u_int32_t));
2735
2736	*c = pf_proto_cksum_fixup(m,
2737	    pf_proto_cksum_fixup(m, *c, ao / 65536, an / 65536, udp),
2738	    ao % 65536, an % 65536, udp);
2739}
2740
2741#ifdef INET6
2742static void
2743pf_change_a6(struct pf_addr *a, u_int16_t *c, struct pf_addr *an, u_int8_t u)
2744{
2745	struct pf_addr	ao;
2746
2747	PF_ACPY(&ao, a, AF_INET6);
2748	PF_ACPY(a, an, AF_INET6);
2749
2750	*c = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
2751	    pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
2752	    pf_cksum_fixup(pf_cksum_fixup(*c,
2753	    ao.addr16[0], an->addr16[0], u),
2754	    ao.addr16[1], an->addr16[1], u),
2755	    ao.addr16[2], an->addr16[2], u),
2756	    ao.addr16[3], an->addr16[3], u),
2757	    ao.addr16[4], an->addr16[4], u),
2758	    ao.addr16[5], an->addr16[5], u),
2759	    ao.addr16[6], an->addr16[6], u),
2760	    ao.addr16[7], an->addr16[7], u);
2761}
2762#endif /* INET6 */
2763
2764static void
2765pf_change_icmp(struct pf_addr *ia, u_int16_t *ip, struct pf_addr *oa,
2766    struct pf_addr *na, u_int16_t np, u_int16_t *pc, u_int16_t *h2c,
2767    u_int16_t *ic, u_int16_t *hc, u_int8_t u, sa_family_t af)
2768{
2769	struct pf_addr	oia, ooa;
2770
2771	PF_ACPY(&oia, ia, af);
2772	if (oa)
2773		PF_ACPY(&ooa, oa, af);
2774
2775	/* Change inner protocol port, fix inner protocol checksum. */
2776	if (ip != NULL) {
2777		u_int16_t	oip = *ip;
2778		u_int32_t	opc;
2779
2780		if (pc != NULL)
2781			opc = *pc;
2782		*ip = np;
2783		if (pc != NULL)
2784			*pc = pf_cksum_fixup(*pc, oip, *ip, u);
2785		*ic = pf_cksum_fixup(*ic, oip, *ip, 0);
2786		if (pc != NULL)
2787			*ic = pf_cksum_fixup(*ic, opc, *pc, 0);
2788	}
2789	/* Change inner ip address, fix inner ip and icmp checksums. */
2790	PF_ACPY(ia, na, af);
2791	switch (af) {
2792#ifdef INET
2793	case AF_INET: {
2794		u_int32_t	 oh2c = *h2c;
2795
2796		*h2c = pf_cksum_fixup(pf_cksum_fixup(*h2c,
2797		    oia.addr16[0], ia->addr16[0], 0),
2798		    oia.addr16[1], ia->addr16[1], 0);
2799		*ic = pf_cksum_fixup(pf_cksum_fixup(*ic,
2800		    oia.addr16[0], ia->addr16[0], 0),
2801		    oia.addr16[1], ia->addr16[1], 0);
2802		*ic = pf_cksum_fixup(*ic, oh2c, *h2c, 0);
2803		break;
2804	}
2805#endif /* INET */
2806#ifdef INET6
2807	case AF_INET6:
2808		*ic = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
2809		    pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
2810		    pf_cksum_fixup(pf_cksum_fixup(*ic,
2811		    oia.addr16[0], ia->addr16[0], u),
2812		    oia.addr16[1], ia->addr16[1], u),
2813		    oia.addr16[2], ia->addr16[2], u),
2814		    oia.addr16[3], ia->addr16[3], u),
2815		    oia.addr16[4], ia->addr16[4], u),
2816		    oia.addr16[5], ia->addr16[5], u),
2817		    oia.addr16[6], ia->addr16[6], u),
2818		    oia.addr16[7], ia->addr16[7], u);
2819		break;
2820#endif /* INET6 */
2821	}
2822	/* Outer ip address, fix outer ip or icmpv6 checksum, if necessary. */
2823	if (oa) {
2824		PF_ACPY(oa, na, af);
2825		switch (af) {
2826#ifdef INET
2827		case AF_INET:
2828			*hc = pf_cksum_fixup(pf_cksum_fixup(*hc,
2829			    ooa.addr16[0], oa->addr16[0], 0),
2830			    ooa.addr16[1], oa->addr16[1], 0);
2831			break;
2832#endif /* INET */
2833#ifdef INET6
2834		case AF_INET6:
2835			*ic = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
2836			    pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
2837			    pf_cksum_fixup(pf_cksum_fixup(*ic,
2838			    ooa.addr16[0], oa->addr16[0], u),
2839			    ooa.addr16[1], oa->addr16[1], u),
2840			    ooa.addr16[2], oa->addr16[2], u),
2841			    ooa.addr16[3], oa->addr16[3], u),
2842			    ooa.addr16[4], oa->addr16[4], u),
2843			    ooa.addr16[5], oa->addr16[5], u),
2844			    ooa.addr16[6], oa->addr16[6], u),
2845			    ooa.addr16[7], oa->addr16[7], u);
2846			break;
2847#endif /* INET6 */
2848		}
2849	}
2850}
2851
2852/*
2853 * Need to modulate the sequence numbers in the TCP SACK option
2854 * (credits to Krzysztof Pfaff for report and patch)
2855 */
2856static int
2857pf_modulate_sack(struct mbuf *m, int off, struct pf_pdesc *pd,
2858    struct tcphdr *th, struct pf_state_peer *dst)
2859{
2860	int hlen = (th->th_off << 2) - sizeof(*th), thoptlen = hlen;
2861	u_int8_t opts[TCP_MAXOLEN], *opt = opts;
2862	int copyback = 0, i, olen;
2863	struct sackblk sack;
2864
2865#define	TCPOLEN_SACKLEN	(TCPOLEN_SACK + 2)
2866	if (hlen < TCPOLEN_SACKLEN ||
2867	    !pf_pull_hdr(m, off + sizeof(*th), opts, hlen, NULL, NULL, pd->af))
		return (0);
2869
2870	while (hlen >= TCPOLEN_SACKLEN) {
2871		size_t startoff = opt - opts;
2872		olen = opt[1];
2873		switch (*opt) {
2874		case TCPOPT_EOL:	/* FALLTHROUGH */
2875		case TCPOPT_NOP:
2876			opt++;
2877			hlen--;
2878			break;
2879		case TCPOPT_SACK:
2880			if (olen > hlen)
2881				olen = hlen;
2882			if (olen >= TCPOLEN_SACKLEN) {
2883				for (i = 2; i + TCPOLEN_SACK <= olen;
2884				    i += TCPOLEN_SACK) {
2885					memcpy(&sack, &opt[i], sizeof(sack));
2886					pf_patch_32_unaligned(m,
2887					    &th->th_sum, &sack.start,
2888					    htonl(ntohl(sack.start) - dst->seqdiff),
2889					    PF_ALGNMNT(startoff),
2890					    0);
2891					pf_patch_32_unaligned(m, &th->th_sum,
2892					    &sack.end,
2893					    htonl(ntohl(sack.end) - dst->seqdiff),
2894					    PF_ALGNMNT(startoff),
2895					    0);
2896					memcpy(&opt[i], &sack, sizeof(sack));
2897				}
2898				copyback = 1;
2899			}
2900			/* FALLTHROUGH */
2901		default:
2902			if (olen < 2)
2903				olen = 2;
2904			hlen -= olen;
2905			opt += olen;
2906		}
2907	}
2908
2909	if (copyback)
2910		m_copyback(m, off + sizeof(*th), thoptlen, (caddr_t)opts);
2911	return (copyback);
2912}
2913
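/*
 * Build a self-contained TCP segment, optionally carrying an MSS option, in
 * a freshly allocated mbuf, with the IPv4 or IPv6 header and the TCP
 * checksum already filled in.  pf_send_tcp() wraps this and queues the
 * result on the send queue.
 */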
2914struct mbuf *
2915pf_build_tcp(const struct pf_krule *r, sa_family_t af,
2916    const struct pf_addr *saddr, const struct pf_addr *daddr,
2917    u_int16_t sport, u_int16_t dport, u_int32_t seq, u_int32_t ack,
2918    u_int8_t tcp_flags, u_int16_t win, u_int16_t mss, u_int8_t ttl,
2919    bool skip_firewall, u_int16_t mtag_tag, u_int16_t mtag_flags, int rtableid)
2920{
2921	struct mbuf	*m;
2922	int		 len, tlen;
2923#ifdef INET
2924	struct ip	*h = NULL;
2925#endif /* INET */
2926#ifdef INET6
2927	struct ip6_hdr	*h6 = NULL;
2928#endif /* INET6 */
2929	struct tcphdr	*th;
2930	char		*opt;
2931	struct pf_mtag  *pf_mtag;
2932
2933	len = 0;
2934	th = NULL;
2935
2936	/* maximum segment size tcp option */
2937	tlen = sizeof(struct tcphdr);
2938	if (mss)
2939		tlen += 4;
2940
2941	switch (af) {
2942#ifdef INET
2943	case AF_INET:
2944		len = sizeof(struct ip) + tlen;
2945		break;
2946#endif /* INET */
2947#ifdef INET6
2948	case AF_INET6:
2949		len = sizeof(struct ip6_hdr) + tlen;
2950		break;
2951#endif /* INET6 */
2952	default:
2953		panic("%s: unsupported af %d", __func__, af);
2954	}
2955
2956	m = m_gethdr(M_NOWAIT, MT_DATA);
2957	if (m == NULL)
2958		return (NULL);
2959
2960#ifdef MAC
2961	mac_netinet_firewall_send(m);
2962#endif
2963	if ((pf_mtag = pf_get_mtag(m)) == NULL) {
2964		m_freem(m);
2965		return (NULL);
2966	}
2967	if (skip_firewall)
2968		m->m_flags |= M_SKIP_FIREWALL;
2969	pf_mtag->tag = mtag_tag;
2970	pf_mtag->flags = mtag_flags;
2971
2972	if (rtableid >= 0)
2973		M_SETFIB(m, rtableid);
2974
2975#ifdef ALTQ
2976	if (r != NULL && r->qid) {
2977		pf_mtag->qid = r->qid;
2978
2979		/* add hints for ecn */
2980		pf_mtag->hdr = mtod(m, struct ip *);
2981	}
2982#endif /* ALTQ */
2983	m->m_data += max_linkhdr;
2984	m->m_pkthdr.len = m->m_len = len;
2985	/* The rest of the stack assumes a rcvif, so provide one.
2986	 * This is a locally generated packet, so .. close enough. */
2987	m->m_pkthdr.rcvif = V_loif;
2988	bzero(m->m_data, len);
2989	switch (af) {
2990#ifdef INET
2991	case AF_INET:
2992		h = mtod(m, struct ip *);
2993
2994		/* IP header fields included in the TCP checksum */
2995		h->ip_p = IPPROTO_TCP;
2996		h->ip_len = htons(tlen);
2997		h->ip_src.s_addr = saddr->v4.s_addr;
2998		h->ip_dst.s_addr = daddr->v4.s_addr;
2999
3000		th = (struct tcphdr *)((caddr_t)h + sizeof(struct ip));
3001		break;
3002#endif /* INET */
3003#ifdef INET6
3004	case AF_INET6:
3005		h6 = mtod(m, struct ip6_hdr *);
3006
3007		/* IP header fields included in the TCP checksum */
3008		h6->ip6_nxt = IPPROTO_TCP;
3009		h6->ip6_plen = htons(tlen);
3010		memcpy(&h6->ip6_src, &saddr->v6, sizeof(struct in6_addr));
3011		memcpy(&h6->ip6_dst, &daddr->v6, sizeof(struct in6_addr));
3012
3013		th = (struct tcphdr *)((caddr_t)h6 + sizeof(struct ip6_hdr));
3014		break;
3015#endif /* INET6 */
3016	}
3017
3018	/* TCP header */
3019	th->th_sport = sport;
3020	th->th_dport = dport;
3021	th->th_seq = htonl(seq);
3022	th->th_ack = htonl(ack);
3023	th->th_off = tlen >> 2;
3024	th->th_flags = tcp_flags;
3025	th->th_win = htons(win);
3026
3027	if (mss) {
3028		opt = (char *)(th + 1);
3029		opt[0] = TCPOPT_MAXSEG;
3030		opt[1] = 4;
3031		HTONS(mss);
3032		bcopy((caddr_t)&mss, (caddr_t)(opt + 2), 2);
3033	}
3034
3035	switch (af) {
3036#ifdef INET
3037	case AF_INET:
3038		/* TCP checksum */
3039		th->th_sum = in_cksum(m, len);
3040
3041		/* Finish the IP header */
3042		h->ip_v = 4;
3043		h->ip_hl = sizeof(*h) >> 2;
3044		h->ip_tos = IPTOS_LOWDELAY;
3045		h->ip_off = htons(V_path_mtu_discovery ? IP_DF : 0);
3046		h->ip_len = htons(len);
3047		h->ip_ttl = ttl ? ttl : V_ip_defttl;
3048		h->ip_sum = 0;
3049		break;
3050#endif /* INET */
3051#ifdef INET6
3052	case AF_INET6:
3053		/* TCP checksum */
3054		th->th_sum = in6_cksum(m, IPPROTO_TCP,
3055		    sizeof(struct ip6_hdr), tlen);
3056
3057		h6->ip6_vfc |= IPV6_VERSION;
3058		h6->ip6_hlim = IPV6_DEFHLIM;
3059		break;
3060#endif /* INET6 */
3061	}
3062
3063	return (m);
3064}
3065
3066static void
3067pf_send_sctp_abort(sa_family_t af, struct pf_pdesc *pd,
3068    uint8_t ttl, int rtableid)
3069{
3070	struct mbuf		*m;
3071#ifdef INET
3072	struct ip		*h = NULL;
3073#endif /* INET */
3074#ifdef INET6
3075	struct ip6_hdr		*h6 = NULL;
3076#endif /* INET6 */
3077	struct sctphdr		*hdr;
3078	struct sctp_chunkhdr	*chunk;
3079	struct pf_send_entry	*pfse;
3080	int			 off = 0;
3081
3082	MPASS(af == pd->af);
3083
3084	m = m_gethdr(M_NOWAIT, MT_DATA);
3085	if (m == NULL)
3086		return;
3087
3088	m->m_data += max_linkhdr;
3089	m->m_flags |= M_SKIP_FIREWALL;
3090	/* The rest of the stack assumes a rcvif, so provide one.
3091	 * This is a locally generated packet, so .. close enough. */
3092	m->m_pkthdr.rcvif = V_loif;
3093
3094	/* IPv4|6 header */
3095	switch (af) {
3096#ifdef INET
3097	case AF_INET:
3098		bzero(m->m_data, sizeof(struct ip) + sizeof(*hdr) + sizeof(*chunk));
3099
3100		h = mtod(m, struct ip *);
3101
		/* IPv4 header */
3103
3104		h->ip_p = IPPROTO_SCTP;
3105		h->ip_len = htons(sizeof(*h) + sizeof(*hdr) + sizeof(*chunk));
3106		h->ip_ttl = ttl ? ttl : V_ip_defttl;
3107		h->ip_src = pd->dst->v4;
3108		h->ip_dst = pd->src->v4;
3109
3110		off += sizeof(struct ip);
3111		break;
3112#endif /* INET */
3113#ifdef INET6
3114	case AF_INET6:
3115		bzero(m->m_data, sizeof(struct ip6_hdr) + sizeof(*hdr) + sizeof(*chunk));
3116
3117		h6 = mtod(m, struct ip6_hdr *);
3118
		/* IPv6 header */
3120		h6->ip6_vfc |= IPV6_VERSION;
3121		h6->ip6_nxt = IPPROTO_SCTP;
3122		h6->ip6_plen = htons(sizeof(*h6) + sizeof(*hdr) + sizeof(*chunk));
3123		h6->ip6_hlim = ttl ? ttl : V_ip6_defhlim;
3124		memcpy(&h6->ip6_src, &pd->dst->v6, sizeof(struct in6_addr));
3125		memcpy(&h6->ip6_dst, &pd->src->v6, sizeof(struct in6_addr));
3126
3127		off += sizeof(struct ip6_hdr);
3128		break;
3129#endif /* INET6 */
3130	}
3131
3132	/* SCTP header */
3133	hdr = mtodo(m, off);
3134
3135	hdr->src_port = pd->hdr.sctp.dest_port;
3136	hdr->dest_port = pd->hdr.sctp.src_port;
3137	hdr->v_tag = pd->sctp_initiate_tag;
3138	hdr->checksum = 0;
3139
3140	/* Abort chunk. */
3141	off += sizeof(struct sctphdr);
3142	chunk = mtodo(m, off);
3143
3144	chunk->chunk_type = SCTP_ABORT_ASSOCIATION;
3145	chunk->chunk_length = htons(sizeof(*chunk));
3146
3147	/* SCTP checksum */
3148	off += sizeof(*chunk);
3149	m->m_pkthdr.len = m->m_len = off;
3150
3151	pf_sctp_checksum(m, off - sizeof(*hdr) - sizeof(*chunk));
3152
3153	if (rtableid >= 0)
3154		M_SETFIB(m, rtableid);
3155
3156	/* Allocate outgoing queue entry, mbuf and mbuf tag. */
3157	pfse = malloc(sizeof(*pfse), M_PFTEMP, M_NOWAIT);
3158	if (pfse == NULL) {
3159		m_freem(m);
3160		return;
3161	}
3162
3163	switch (af) {
3164#ifdef INET
3165	case AF_INET:
3166		pfse->pfse_type = PFSE_IP;
3167		break;
3168#endif /* INET */
3169#ifdef INET6
3170	case AF_INET6:
3171		pfse->pfse_type = PFSE_IP6;
3172		break;
3173#endif /* INET6 */
3174	}
3175
3176	pfse->pfse_m = m;
3177	pf_send(pfse);
3178}
3179
3180void
3181pf_send_tcp(const struct pf_krule *r, sa_family_t af,
3182    const struct pf_addr *saddr, const struct pf_addr *daddr,
3183    u_int16_t sport, u_int16_t dport, u_int32_t seq, u_int32_t ack,
3184    u_int8_t tcp_flags, u_int16_t win, u_int16_t mss, u_int8_t ttl,
3185    bool skip_firewall, u_int16_t mtag_tag, u_int16_t mtag_flags, int rtableid)
3186{
3187	struct pf_send_entry *pfse;
3188	struct mbuf	*m;
3189
3190	m = pf_build_tcp(r, af, saddr, daddr, sport, dport, seq, ack, tcp_flags,
3191	    win, mss, ttl, skip_firewall, mtag_tag, mtag_flags, rtableid);
3192	if (m == NULL)
3193		return;
3194
3195	/* Allocate outgoing queue entry, mbuf and mbuf tag. */
3196	pfse = malloc(sizeof(*pfse), M_PFTEMP, M_NOWAIT);
3197	if (pfse == NULL) {
3198		m_freem(m);
3199		return;
3200	}
3201
3202	switch (af) {
3203#ifdef INET
3204	case AF_INET:
3205		pfse->pfse_type = PFSE_IP;
3206		break;
3207#endif /* INET */
3208#ifdef INET6
3209	case AF_INET6:
3210		pfse->pfse_type = PFSE_IP6;
3211		break;
3212#endif /* INET6 */
3213	}
3214
3215	pfse->pfse_m = m;
3216	pf_send(pfse);
3217}
3218
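/*
 * Handle "return" style block rules: undo any NAT translation recorded in
 * the state key, then answer the offending packet with a TCP RST (after
 * verifying its checksum), an SCTP ABORT or the configured ICMP/ICMP6
 * error, depending on the rule flags and protocol.
 */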
3219static void
3220pf_return(struct pf_krule *r, struct pf_krule *nr, struct pf_pdesc *pd,
3221    struct pf_state_key *sk, int off, struct mbuf *m, struct tcphdr *th,
3222    struct pfi_kkif *kif, u_int16_t bproto_sum, u_int16_t bip_sum, int hdrlen,
3223    u_short *reason, int rtableid)
3224{
3225	struct pf_addr	* const saddr = pd->src;
3226	struct pf_addr	* const daddr = pd->dst;
3227	sa_family_t	 af = pd->af;
3228
3229	/* undo NAT changes, if they have taken place */
3230	if (nr != NULL) {
3231		PF_ACPY(saddr, &sk->addr[pd->sidx], af);
3232		PF_ACPY(daddr, &sk->addr[pd->didx], af);
3233		if (pd->sport)
3234			*pd->sport = sk->port[pd->sidx];
3235		if (pd->dport)
3236			*pd->dport = sk->port[pd->didx];
3237		if (pd->proto_sum)
3238			*pd->proto_sum = bproto_sum;
3239		if (pd->ip_sum)
3240			*pd->ip_sum = bip_sum;
3241		m_copyback(m, off, hdrlen, pd->hdr.any);
3242	}
3243	if (pd->proto == IPPROTO_TCP &&
3244	    ((r->rule_flag & PFRULE_RETURNRST) ||
3245	    (r->rule_flag & PFRULE_RETURN)) &&
3246	    !(th->th_flags & TH_RST)) {
3247		u_int32_t	 ack = ntohl(th->th_seq) + pd->p_len;
3248		int		 len = 0;
3249#ifdef INET
3250		struct ip	*h4;
3251#endif
3252#ifdef INET6
3253		struct ip6_hdr	*h6;
3254#endif
3255
3256		switch (af) {
3257#ifdef INET
3258		case AF_INET:
3259			h4 = mtod(m, struct ip *);
3260			len = ntohs(h4->ip_len) - off;
3261			break;
3262#endif
3263#ifdef INET6
3264		case AF_INET6:
3265			h6 = mtod(m, struct ip6_hdr *);
3266			len = ntohs(h6->ip6_plen) - (off - sizeof(*h6));
3267			break;
3268#endif
3269		}
3270
3271		if (pf_check_proto_cksum(m, off, len, IPPROTO_TCP, af))
3272			REASON_SET(reason, PFRES_PROTCKSUM);
3273		else {
3274			if (th->th_flags & TH_SYN)
3275				ack++;
3276			if (th->th_flags & TH_FIN)
3277				ack++;
3278			pf_send_tcp(r, af, pd->dst,
3279				pd->src, th->th_dport, th->th_sport,
3280				ntohl(th->th_ack), ack, TH_RST|TH_ACK, 0, 0,
3281				r->return_ttl, true, 0, 0, rtableid);
3282		}
3283	} else if (pd->proto == IPPROTO_SCTP &&
3284	    (r->rule_flag & PFRULE_RETURN)) {
3285		pf_send_sctp_abort(af, pd, r->return_ttl, rtableid);
3286	} else if (pd->proto != IPPROTO_ICMP && af == AF_INET &&
3287		r->return_icmp)
3288		pf_send_icmp(m, r->return_icmp >> 8,
3289			r->return_icmp & 255, af, r, rtableid);
3290	else if (pd->proto != IPPROTO_ICMPV6 && af == AF_INET6 &&
3291		r->return_icmp6)
3292		pf_send_icmp(m, r->return_icmp6 >> 8,
3293			r->return_icmp6 & 255, af, r, rtableid);
3294}
3295
3296static int
3297pf_match_ieee8021q_pcp(u_int8_t prio, struct mbuf *m)
3298{
3299	struct m_tag *mtag;
3300	u_int8_t mpcp;
3301
3302	mtag = m_tag_locate(m, MTAG_8021Q, MTAG_8021Q_PCP_IN, NULL);
3303	if (mtag == NULL)
3304		return (0);
3305
3306	if (prio == PF_PRIO_ZERO)
3307		prio = 0;
3308
3309	mpcp = *(uint8_t *)(mtag + 1);
3310
3311	return (mpcp == prio);
3312}
3313
3314static int
3315pf_icmp_to_bandlim(uint8_t type)
3316{
3317	switch (type) {
3318		case ICMP_ECHO:
3319		case ICMP_ECHOREPLY:
3320			return (BANDLIM_ICMP_ECHO);
3321		case ICMP_TSTAMP:
3322		case ICMP_TSTAMPREPLY:
3323			return (BANDLIM_ICMP_TSTAMP);
3324		case ICMP_UNREACH:
3325		default:
3326			return (BANDLIM_ICMP_UNREACH);
3327	}
3328}
3329
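/*
 * Queue an ICMP or ICMP6 error in reply to mbuf m: apply rate limiting,
 * take a copy of the packet for icmp_error()/icmp6_error() to quote, mark
 * it to skip the firewall and hand it to the send queue.
 */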
3330static void
3331pf_send_icmp(struct mbuf *m, u_int8_t type, u_int8_t code, sa_family_t af,
3332    struct pf_krule *r, int rtableid)
3333{
3334	struct pf_send_entry *pfse;
3335	struct mbuf *m0;
3336	struct pf_mtag *pf_mtag;
3337
3338	/* ICMP packet rate limitation. */
3339#ifdef INET6
3340	if (af == AF_INET6) {
3341		if (icmp6_ratelimit(NULL, type, code))
3342			return;
3343	}
3344#endif
3345#ifdef INET
3346	if (af == AF_INET) {
3347		if (badport_bandlim(pf_icmp_to_bandlim(type)) != 0)
3348			return;
3349	}
3350#endif
3351
3352	/* Allocate outgoing queue entry, mbuf and mbuf tag. */
3353	pfse = malloc(sizeof(*pfse), M_PFTEMP, M_NOWAIT);
3354	if (pfse == NULL)
3355		return;
3356
3357	if ((m0 = m_copypacket(m, M_NOWAIT)) == NULL) {
3358		free(pfse, M_PFTEMP);
3359		return;
3360	}
3361
3362	if ((pf_mtag = pf_get_mtag(m0)) == NULL) {
3363		free(pfse, M_PFTEMP);
3364		return;
3365	}
3366	/* XXX: revisit */
3367	m0->m_flags |= M_SKIP_FIREWALL;
3368
3369	if (rtableid >= 0)
3370		M_SETFIB(m0, rtableid);
3371
3372#ifdef ALTQ
3373	if (r->qid) {
3374		pf_mtag->qid = r->qid;
3375		/* add hints for ecn */
3376		pf_mtag->hdr = mtod(m0, struct ip *);
3377	}
3378#endif /* ALTQ */
3379
3380	switch (af) {
3381#ifdef INET
3382	case AF_INET:
3383		pfse->pfse_type = PFSE_ICMP;
3384		break;
3385#endif /* INET */
3386#ifdef INET6
3387	case AF_INET6:
3388		pfse->pfse_type = PFSE_ICMP6;
3389		break;
3390#endif /* INET6 */
3391	}
3392	pfse->pfse_m = m0;
3393	pfse->icmpopts.type = type;
3394	pfse->icmpopts.code = code;
3395	pf_send(pfse);
3396}
3397
/*
 * Return 1 if address a matches address b under mask m, otherwise return 0.
 * If n (negation) is 0, they match when they are equal under the mask; if n
 * is non-zero, they match when they differ.
 */
3403int
3404pf_match_addr(u_int8_t n, struct pf_addr *a, struct pf_addr *m,
3405    struct pf_addr *b, sa_family_t af)
3406{
3407	int	match = 0;
3408
3409	switch (af) {
3410#ifdef INET
3411	case AF_INET:
3412		if (IN_ARE_MASKED_ADDR_EQUAL(a->v4, b->v4, m->v4))
3413			match++;
3414		break;
3415#endif /* INET */
3416#ifdef INET6
3417	case AF_INET6:
3418		if (IN6_ARE_MASKED_ADDR_EQUAL(&a->v6, &b->v6, &m->v6))
3419			match++;
3420		break;
3421#endif /* INET6 */
3422	}
3423	if (match) {
3424		if (n)
3425			return (0);
3426		else
3427			return (1);
3428	} else {
3429		if (n)
3430			return (1);
3431		else
3432			return (0);
3433	}
3434}
3435
3436/*
3437 * Return 1 if b <= a <= e, otherwise return 0.
3438 */
3439int
3440pf_match_addr_range(struct pf_addr *b, struct pf_addr *e,
3441    struct pf_addr *a, sa_family_t af)
3442{
3443	switch (af) {
3444#ifdef INET
3445	case AF_INET:
3446		if ((ntohl(a->addr32[0]) < ntohl(b->addr32[0])) ||
3447		    (ntohl(a->addr32[0]) > ntohl(e->addr32[0])))
3448			return (0);
3449		break;
3450#endif /* INET */
3451#ifdef INET6
3452	case AF_INET6: {
3453		int	i;
3454
3455		/* check a >= b */
3456		for (i = 0; i < 4; ++i)
3457			if (ntohl(a->addr32[i]) > ntohl(b->addr32[i]))
3458				break;
3459			else if (ntohl(a->addr32[i]) < ntohl(b->addr32[i]))
3460				return (0);
3461		/* check a <= e */
3462		for (i = 0; i < 4; ++i)
3463			if (ntohl(a->addr32[i]) < ntohl(e->addr32[i]))
3464				break;
3465			else if (ntohl(a->addr32[i]) > ntohl(e->addr32[i]))
3466				return (0);
3467		break;
3468	}
3469#endif /* INET6 */
3470	}
3471	return (1);
3472}
3473
3474static int
3475pf_match(u_int8_t op, u_int32_t a1, u_int32_t a2, u_int32_t p)
3476{
3477	switch (op) {
3478	case PF_OP_IRG:
3479		return ((p > a1) && (p < a2));
3480	case PF_OP_XRG:
3481		return ((p < a1) || (p > a2));
3482	case PF_OP_RRG:
3483		return ((p >= a1) && (p <= a2));
3484	case PF_OP_EQ:
3485		return (p == a1);
3486	case PF_OP_NE:
3487		return (p != a1);
3488	case PF_OP_LT:
3489		return (p < a1);
3490	case PF_OP_LE:
3491		return (p <= a1);
3492	case PF_OP_GT:
3493		return (p > a1);
3494	case PF_OP_GE:
3495		return (p >= a1);
3496	}
3497	return (0); /* never reached */
3498}
3499
3500int
3501pf_match_port(u_int8_t op, u_int16_t a1, u_int16_t a2, u_int16_t p)
3502{
3503	NTOHS(a1);
3504	NTOHS(a2);
3505	NTOHS(p);
3506	return (pf_match(op, a1, a2, p));
3507}
3508
3509static int
3510pf_match_uid(u_int8_t op, uid_t a1, uid_t a2, uid_t u)
3511{
3512	if (u == UID_MAX && op != PF_OP_EQ && op != PF_OP_NE)
3513		return (0);
3514	return (pf_match(op, a1, a2, u));
3515}
3516
3517static int
3518pf_match_gid(u_int8_t op, gid_t a1, gid_t a2, gid_t g)
3519{
3520	if (g == GID_MAX && op != PF_OP_EQ && op != PF_OP_NE)
3521		return (0);
3522	return (pf_match(op, a1, a2, g));
3523}
3524
3525int
3526pf_match_tag(struct mbuf *m, struct pf_krule *r, int *tag, int mtag)
3527{
3528	if (*tag == -1)
3529		*tag = mtag;
3530
3531	return ((!r->match_tag_not && r->match_tag == *tag) ||
3532	    (r->match_tag_not && r->match_tag != *tag));
3533}
3534
3535int
3536pf_tag_packet(struct mbuf *m, struct pf_pdesc *pd, int tag)
3537{
3538
3539	KASSERT(tag > 0, ("%s: tag %d", __func__, tag));
3540
3541	if (pd->pf_mtag == NULL && ((pd->pf_mtag = pf_get_mtag(m)) == NULL))
3542		return (ENOMEM);
3543
3544	pd->pf_mtag->tag = tag;
3545
3546	return (0);
3547}
3548
3549#define	PF_ANCHOR_STACKSIZE	32
3550struct pf_kanchor_stackframe {
3551	struct pf_kruleset	*rs;
3552	struct pf_krule		*r;	/* XXX: + match bit */
3553	struct pf_kanchor	*child;
3554};
3555
3556/*
3557 * XXX: We rely on malloc(9) returning pointer aligned addresses.
3558 */
3559#define	PF_ANCHORSTACK_MATCH	0x00000001
3560#define	PF_ANCHORSTACK_MASK	(PF_ANCHORSTACK_MATCH)
3561
3562#define	PF_ANCHOR_MATCH(f)	((uintptr_t)(f)->r & PF_ANCHORSTACK_MATCH)
3563#define	PF_ANCHOR_RULE(f)	(struct pf_krule *)			\
3564				((uintptr_t)(f)->r & ~PF_ANCHORSTACK_MASK)
3565#define	PF_ANCHOR_SET_MATCH(f)	do { (f)->r = (void *) 			\
3566				((uintptr_t)(f)->r | PF_ANCHORSTACK_MATCH);  \
3567} while (0)
3568
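/*
 * Descend into an anchor: push the current rule and ruleset onto the anchor
 * stack and continue evaluation at the first rule of the anchor's ruleset,
 * or of its first child for wildcard anchors.
 */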
3569void
3570pf_step_into_anchor(struct pf_kanchor_stackframe *stack, int *depth,
3571    struct pf_kruleset **rs, int n, struct pf_krule **r, struct pf_krule **a,
3572    int *match)
3573{
3574	struct pf_kanchor_stackframe	*f;
3575
3576	PF_RULES_RASSERT();
3577
3578	if (match)
3579		*match = 0;
3580	if (*depth >= PF_ANCHOR_STACKSIZE) {
3581		printf("%s: anchor stack overflow on %s\n",
3582		    __func__, (*r)->anchor->name);
3583		*r = TAILQ_NEXT(*r, entries);
3584		return;
3585	} else if (*depth == 0 && a != NULL)
3586		*a = *r;
3587	f = stack + (*depth)++;
3588	f->rs = *rs;
3589	f->r = *r;
3590	if ((*r)->anchor_wildcard) {
3591		struct pf_kanchor_node *parent = &(*r)->anchor->children;
3592
3593		if ((f->child = RB_MIN(pf_kanchor_node, parent)) == NULL) {
3594			*r = NULL;
3595			return;
3596		}
3597		*rs = &f->child->ruleset;
3598	} else {
3599		f->child = NULL;
3600		*rs = &(*r)->anchor->ruleset;
3601	}
3602	*r = TAILQ_FIRST((*rs)->rules[n].active.ptr);
3603}
3604
3605int
3606pf_step_out_of_anchor(struct pf_kanchor_stackframe *stack, int *depth,
3607    struct pf_kruleset **rs, int n, struct pf_krule **r, struct pf_krule **a,
3608    int *match)
3609{
3610	struct pf_kanchor_stackframe	*f;
3611	struct pf_krule *fr;
3612	int quick = 0;
3613
3614	PF_RULES_RASSERT();
3615
3616	do {
3617		if (*depth <= 0)
3618			break;
3619		f = stack + *depth - 1;
3620		fr = PF_ANCHOR_RULE(f);
3621		if (f->child != NULL) {
3622			/*
3623			 * This block traverses through
3624			 * a wildcard anchor.
3625			 */
3626			if (match != NULL && *match) {
3627				/*
3628				 * If any of "*" matched, then
3629				 * "foo/ *" matched, mark frame
3630				 * appropriately.
3631				 */
3632				PF_ANCHOR_SET_MATCH(f);
3633				*match = 0;
3634			}
3635			f->child = RB_NEXT(pf_kanchor_node,
3636			    &fr->anchor->children, f->child);
3637			if (f->child != NULL) {
3638				*rs = &f->child->ruleset;
3639				*r = TAILQ_FIRST((*rs)->rules[n].active.ptr);
3640				if (*r == NULL)
3641					continue;
3642				else
3643					break;
3644			}
3645		}
3646		(*depth)--;
3647		if (*depth == 0 && a != NULL)
3648			*a = NULL;
3649		*rs = f->rs;
3650		if (PF_ANCHOR_MATCH(f) || (match != NULL && *match))
3651			quick = fr->quick;
3652		*r = TAILQ_NEXT(fr, entries);
3653	} while (*r == NULL);
3654
3655	return (quick);
3656}
3657
3658struct pf_keth_anchor_stackframe {
3659	struct pf_keth_ruleset	*rs;
3660	struct pf_keth_rule	*r;	/* XXX: + match bit */
3661	struct pf_keth_anchor	*child;
3662};
3663
3664#define	PF_ETH_ANCHOR_MATCH(f)	((uintptr_t)(f)->r & PF_ANCHORSTACK_MATCH)
3665#define	PF_ETH_ANCHOR_RULE(f)	(struct pf_keth_rule *)			\
3666				((uintptr_t)(f)->r & ~PF_ANCHORSTACK_MASK)
3667#define	PF_ETH_ANCHOR_SET_MATCH(f)	do { (f)->r = (void *) 		\
3668				((uintptr_t)(f)->r | PF_ANCHORSTACK_MATCH);  \
3669} while (0)
3670
3671void
3672pf_step_into_keth_anchor(struct pf_keth_anchor_stackframe *stack, int *depth,
3673    struct pf_keth_ruleset **rs, struct pf_keth_rule **r,
3674    struct pf_keth_rule **a, int *match)
3675{
3676	struct pf_keth_anchor_stackframe	*f;
3677
3678	NET_EPOCH_ASSERT();
3679
3680	if (match)
3681		*match = 0;
3682	if (*depth >= PF_ANCHOR_STACKSIZE) {
3683		printf("%s: anchor stack overflow on %s\n",
3684		    __func__, (*r)->anchor->name);
3685		*r = TAILQ_NEXT(*r, entries);
3686		return;
3687	} else if (*depth == 0 && a != NULL)
3688		*a = *r;
3689	f = stack + (*depth)++;
3690	f->rs = *rs;
3691	f->r = *r;
3692	if ((*r)->anchor_wildcard) {
3693		struct pf_keth_anchor_node *parent = &(*r)->anchor->children;
3694
3695		if ((f->child = RB_MIN(pf_keth_anchor_node, parent)) == NULL) {
3696			*r = NULL;
3697			return;
3698		}
3699		*rs = &f->child->ruleset;
3700	} else {
3701		f->child = NULL;
3702		*rs = &(*r)->anchor->ruleset;
3703	}
3704	*r = TAILQ_FIRST((*rs)->active.rules);
3705}
3706
3707int
3708pf_step_out_of_keth_anchor(struct pf_keth_anchor_stackframe *stack, int *depth,
3709    struct pf_keth_ruleset **rs, struct pf_keth_rule **r,
3710    struct pf_keth_rule **a, int *match)
3711{
3712	struct pf_keth_anchor_stackframe	*f;
3713	struct pf_keth_rule *fr;
3714	int quick = 0;
3715
3716	NET_EPOCH_ASSERT();
3717
3718	do {
3719		if (*depth <= 0)
3720			break;
3721		f = stack + *depth - 1;
3722		fr = PF_ETH_ANCHOR_RULE(f);
3723		if (f->child != NULL) {
3724			/*
3725			 * This block traverses through
3726			 * a wildcard anchor.
3727			 */
3728			if (match != NULL && *match) {
3729				/*
3730				 * If any of "*" matched, then
3731				 * "foo/ *" matched, mark frame
3732				 * appropriately.
3733				 */
3734				PF_ETH_ANCHOR_SET_MATCH(f);
3735				*match = 0;
3736			}
3737			f->child = RB_NEXT(pf_keth_anchor_node,
3738			    &fr->anchor->children, f->child);
3739			if (f->child != NULL) {
3740				*rs = &f->child->ruleset;
3741				*r = TAILQ_FIRST((*rs)->active.rules);
3742				if (*r == NULL)
3743					continue;
3744				else
3745					break;
3746			}
3747		}
3748		(*depth)--;
3749		if (*depth == 0 && a != NULL)
3750			*a = NULL;
3751		*rs = f->rs;
3752		if (PF_ETH_ANCHOR_MATCH(f) || (match != NULL && *match))
3753			quick = fr->quick;
3754		*r = TAILQ_NEXT(fr, entries);
3755	} while (*r == NULL);
3756
3757	return (quick);
3758}
3759
3760#ifdef INET6
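/*
 * Combine the network part of the redirection address 'raddr', selected by
 * 'rmask', with the host part of 'saddr'.
 */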
3761void
3762pf_poolmask(struct pf_addr *naddr, struct pf_addr *raddr,
3763    struct pf_addr *rmask, struct pf_addr *saddr, sa_family_t af)
3764{
3765	switch (af) {
3766#ifdef INET
3767	case AF_INET:
3768		naddr->addr32[0] = (raddr->addr32[0] & rmask->addr32[0]) |
3769		((rmask->addr32[0] ^ 0xffffffff ) & saddr->addr32[0]);
3770		break;
3771#endif /* INET */
3772	case AF_INET6:
3773		naddr->addr32[0] = (raddr->addr32[0] & rmask->addr32[0]) |
3774		((rmask->addr32[0] ^ 0xffffffff ) & saddr->addr32[0]);
3775		naddr->addr32[1] = (raddr->addr32[1] & rmask->addr32[1]) |
3776		((rmask->addr32[1] ^ 0xffffffff ) & saddr->addr32[1]);
3777		naddr->addr32[2] = (raddr->addr32[2] & rmask->addr32[2]) |
3778		((rmask->addr32[2] ^ 0xffffffff ) & saddr->addr32[2]);
3779		naddr->addr32[3] = (raddr->addr32[3] & rmask->addr32[3]) |
3780		((rmask->addr32[3] ^ 0xffffffff ) & saddr->addr32[3]);
3781		break;
3782	}
3783}
3784
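/*
 * Increment an address by one; for IPv6 the carry propagates from the low
 * 32-bit word (addr32[3]) towards the high one (addr32[0]).
 */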
3785void
3786pf_addr_inc(struct pf_addr *addr, sa_family_t af)
3787{
3788	switch (af) {
3789#ifdef INET
3790	case AF_INET:
3791		addr->addr32[0] = htonl(ntohl(addr->addr32[0]) + 1);
3792		break;
3793#endif /* INET */
3794	case AF_INET6:
3795		if (addr->addr32[3] == 0xffffffff) {
3796			addr->addr32[3] = 0;
3797			if (addr->addr32[2] == 0xffffffff) {
3798				addr->addr32[2] = 0;
3799				if (addr->addr32[1] == 0xffffffff) {
3800					addr->addr32[1] = 0;
3801					addr->addr32[0] =
3802					    htonl(ntohl(addr->addr32[0]) + 1);
3803				} else
3804					addr->addr32[1] =
3805					    htonl(ntohl(addr->addr32[1]) + 1);
3806			} else
3807				addr->addr32[2] =
3808				    htonl(ntohl(addr->addr32[2]) + 1);
3809		} else
3810			addr->addr32[3] =
3811			    htonl(ntohl(addr->addr32[3]) + 1);
3812		break;
3813	}
3814}
3815#endif /* INET6 */
3816
3817void
3818pf_rule_to_actions(struct pf_krule *r, struct pf_rule_actions *a)
3819{
3820	/*
3821	 * Modern rules use the same flags in rules as they do in states.
3822	 */
3823	a->flags |= (r->scrub_flags & (PFSTATE_NODF|PFSTATE_RANDOMID|
3824	    PFSTATE_SCRUB_TCP|PFSTATE_SETPRIO));
3825
3826	/*
3827	 * Old-style scrub rules have different flags which need to be translated.
3828	 */
3829	if (r->rule_flag & PFRULE_RANDOMID)
3830		a->flags |= PFSTATE_RANDOMID;
3831	if (r->scrub_flags & PFSTATE_SETTOS || r->rule_flag & PFRULE_SET_TOS ) {
3832		a->flags |= PFSTATE_SETTOS;
3833		a->set_tos = r->set_tos;
3834	}
3835
3836	if (r->qid)
3837		a->qid = r->qid;
3838	if (r->pqid)
3839		a->pqid = r->pqid;
3840	if (r->rtableid >= 0)
3841		a->rtableid = r->rtableid;
3842	a->log |= r->log;
3843	if (r->min_ttl)
3844		a->min_ttl = r->min_ttl;
3845	if (r->max_mss)
3846		a->max_mss = r->max_mss;
3847	if (r->dnpipe)
3848		a->dnpipe = r->dnpipe;
3849	if (r->dnrpipe)
3850		a->dnrpipe = r->dnrpipe;
3851	if (r->dnpipe || r->dnrpipe) {
3852		if (r->free_flags & PFRULE_DN_IS_PIPE)
3853			a->flags |= PFSTATE_DN_IS_PIPE;
3854		else
3855			a->flags &= ~PFSTATE_DN_IS_PIPE;
3856	}
3857	if (r->scrub_flags & PFSTATE_SETPRIO) {
3858		a->set_prio[0] = r->set_prio[0];
3859		a->set_prio[1] = r->set_prio[1];
3860	}
3861}
3862
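/*
 * Look up the local socket owning this TCP or UDP connection and record its
 * credentials in pd->lookup.uid and pd->lookup.gid.  Returns 1 on success,
 * -1 if no matching socket exists or the protocol is not supported.
 */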
3863int
3864pf_socket_lookup(struct pf_pdesc *pd, struct mbuf *m)
3865{
3866	struct pf_addr		*saddr, *daddr;
3867	u_int16_t		 sport, dport;
3868	struct inpcbinfo	*pi;
3869	struct inpcb		*inp;
3870
3871	pd->lookup.uid = UID_MAX;
3872	pd->lookup.gid = GID_MAX;
3873
3874	switch (pd->proto) {
3875	case IPPROTO_TCP:
3876		sport = pd->hdr.tcp.th_sport;
3877		dport = pd->hdr.tcp.th_dport;
3878		pi = &V_tcbinfo;
3879		break;
3880	case IPPROTO_UDP:
3881		sport = pd->hdr.udp.uh_sport;
3882		dport = pd->hdr.udp.uh_dport;
3883		pi = &V_udbinfo;
3884		break;
3885	default:
3886		return (-1);
3887	}
3888	if (pd->dir == PF_IN) {
3889		saddr = pd->src;
3890		daddr = pd->dst;
3891	} else {
3892		u_int16_t	p;
3893
3894		p = sport;
3895		sport = dport;
3896		dport = p;
3897		saddr = pd->dst;
3898		daddr = pd->src;
3899	}
3900	switch (pd->af) {
3901#ifdef INET
3902	case AF_INET:
3903		inp = in_pcblookup_mbuf(pi, saddr->v4, sport, daddr->v4,
3904		    dport, INPLOOKUP_RLOCKPCB, NULL, m);
3905		if (inp == NULL) {
3906			inp = in_pcblookup_mbuf(pi, saddr->v4, sport,
3907			   daddr->v4, dport, INPLOOKUP_WILDCARD |
3908			   INPLOOKUP_RLOCKPCB, NULL, m);
3909			if (inp == NULL)
3910				return (-1);
3911		}
3912		break;
3913#endif /* INET */
3914#ifdef INET6
3915	case AF_INET6:
3916		inp = in6_pcblookup_mbuf(pi, &saddr->v6, sport, &daddr->v6,
3917		    dport, INPLOOKUP_RLOCKPCB, NULL, m);
3918		if (inp == NULL) {
3919			inp = in6_pcblookup_mbuf(pi, &saddr->v6, sport,
3920			    &daddr->v6, dport, INPLOOKUP_WILDCARD |
3921			    INPLOOKUP_RLOCKPCB, NULL, m);
3922			if (inp == NULL)
3923				return (-1);
3924		}
3925		break;
3926#endif /* INET6 */
3927
3928	default:
3929		return (-1);
3930	}
3931	INP_RLOCK_ASSERT(inp);
3932	pd->lookup.uid = inp->inp_cred->cr_uid;
3933	pd->lookup.gid = inp->inp_cred->cr_groups[0];
3934	INP_RUNLOCK(inp);
3935
3936	return (1);
3937}
3938
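/*
 * Walk the TCP options and return the advertised window scale factor,
 * capped at TCP_MAX_WINSHIFT and tagged with PF_WSCALE_FLAG when the option
 * is present; 0 if there are no options or the header cannot be pulled up.
 */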
3939u_int8_t
3940pf_get_wscale(struct mbuf *m, int off, u_int16_t th_off, sa_family_t af)
3941{
3942	int		 hlen;
3943	u_int8_t	 hdr[60];
3944	u_int8_t	*opt, optlen;
3945	u_int8_t	 wscale = 0;
3946
3947	hlen = th_off << 2;		/* hlen <= sizeof(hdr) */
3948	if (hlen <= sizeof(struct tcphdr))
3949		return (0);
3950	if (!pf_pull_hdr(m, off, hdr, hlen, NULL, NULL, af))
3951		return (0);
3952	opt = hdr + sizeof(struct tcphdr);
3953	hlen -= sizeof(struct tcphdr);
3954	while (hlen >= 3) {
3955		switch (*opt) {
3956		case TCPOPT_EOL:
3957		case TCPOPT_NOP:
3958			++opt;
3959			--hlen;
3960			break;
3961		case TCPOPT_WINDOW:
3962			wscale = opt[2];
3963			if (wscale > TCP_MAX_WINSHIFT)
3964				wscale = TCP_MAX_WINSHIFT;
3965			wscale |= PF_WSCALE_FLAG;
3966			/* FALLTHROUGH */
3967		default:
3968			optlen = opt[1];
3969			if (optlen < 2)
3970				optlen = 2;
3971			hlen -= optlen;
3972			opt += optlen;
3973			break;
3974		}
3975	}
3976	return (wscale);
3977}
3978
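/*
 * Walk the TCP options and return the advertised maximum segment size, or
 * V_tcp_mssdflt if no MSS option is found; 0 if the header cannot be
 * pulled up.
 */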
3979u_int16_t
3980pf_get_mss(struct mbuf *m, int off, u_int16_t th_off, sa_family_t af)
3981{
3982	int		 hlen;
3983	u_int8_t	 hdr[60];
3984	u_int8_t	*opt, optlen;
3985	u_int16_t	 mss = V_tcp_mssdflt;
3986
3987	hlen = th_off << 2;	/* hlen <= sizeof(hdr) */
3988	if (hlen <= sizeof(struct tcphdr))
3989		return (0);
3990	if (!pf_pull_hdr(m, off, hdr, hlen, NULL, NULL, af))
3991		return (0);
3992	opt = hdr + sizeof(struct tcphdr);
3993	hlen -= sizeof(struct tcphdr);
3994	while (hlen >= TCPOLEN_MAXSEG) {
3995		switch (*opt) {
3996		case TCPOPT_EOL:
3997		case TCPOPT_NOP:
3998			++opt;
3999			--hlen;
4000			break;
4001		case TCPOPT_MAXSEG:
4002			bcopy((caddr_t)(opt + 2), (caddr_t)&mss, 2);
4003			NTOHS(mss);
4004			/* FALLTHROUGH */
4005		default:
4006			optlen = opt[1];
4007			if (optlen < 2)
4008				optlen = 2;
4009			hlen -= optlen;
4010			opt += optlen;
4011			break;
4012		}
4013	}
4014	return (mss);
4015}
4016
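/*
 * Derive the MSS to announce to a peer from the MTU of the route towards
 * its address (falling back to V_tcp_mssdflt), clamped to the peer's own
 * offer and to a minimum of 64 bytes.  Used when synproxying connections.
 */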
4017static u_int16_t
4018pf_calc_mss(struct pf_addr *addr, sa_family_t af, int rtableid, u_int16_t offer)
4019{
4020	struct nhop_object *nh;
4021#ifdef INET6
4022	struct in6_addr		dst6;
4023	uint32_t		scopeid;
4024#endif /* INET6 */
4025	int			 hlen = 0;
4026	uint16_t		 mss = 0;
4027
4028	NET_EPOCH_ASSERT();
4029
4030	switch (af) {
4031#ifdef INET
4032	case AF_INET:
4033		hlen = sizeof(struct ip);
4034		nh = fib4_lookup(rtableid, addr->v4, 0, 0, 0);
4035		if (nh != NULL)
4036			mss = nh->nh_mtu - hlen - sizeof(struct tcphdr);
4037		break;
4038#endif /* INET */
4039#ifdef INET6
4040	case AF_INET6:
4041		hlen = sizeof(struct ip6_hdr);
4042		in6_splitscope(&addr->v6, &dst6, &scopeid);
4043		nh = fib6_lookup(rtableid, &dst6, scopeid, 0, 0);
4044		if (nh != NULL)
4045			mss = nh->nh_mtu - hlen - sizeof(struct tcphdr);
4046		break;
4047#endif /* INET6 */
4048	}
4049
4050	mss = max(V_tcp_mssdflt, mss);
4051	mss = min(mss, offer);
4052	mss = max(mss, 64);		/* sanity - at least max opt space */
4053	return (mss);
4054}
4055
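/*
 * Generate an initial sequence number for synproxied connections by MD5
 * hashing a per-VNET secret together with the connection's ports and
 * addresses, plus a monotonically increasing offset and a small random
 * increment.
 */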
4056static u_int32_t
4057pf_tcp_iss(struct pf_pdesc *pd)
4058{
4059	MD5_CTX ctx;
4060	u_int32_t digest[4];
4061
4062	if (V_pf_tcp_secret_init == 0) {
4063		arc4random_buf(&V_pf_tcp_secret, sizeof(V_pf_tcp_secret));
4064		MD5Init(&V_pf_tcp_secret_ctx);
4065		MD5Update(&V_pf_tcp_secret_ctx, V_pf_tcp_secret,
4066		    sizeof(V_pf_tcp_secret));
4067		V_pf_tcp_secret_init = 1;
4068	}
4069
4070	ctx = V_pf_tcp_secret_ctx;
4071
4072	MD5Update(&ctx, (char *)&pd->hdr.tcp.th_sport, sizeof(u_short));
4073	MD5Update(&ctx, (char *)&pd->hdr.tcp.th_dport, sizeof(u_short));
4074	if (pd->af == AF_INET6) {
4075		MD5Update(&ctx, (char *)&pd->src->v6, sizeof(struct in6_addr));
4076		MD5Update(&ctx, (char *)&pd->dst->v6, sizeof(struct in6_addr));
4077	} else {
4078		MD5Update(&ctx, (char *)&pd->src->v4, sizeof(struct in_addr));
4079		MD5Update(&ctx, (char *)&pd->dst->v4, sizeof(struct in_addr));
4080	}
4081	MD5Final((u_char *)digest, &ctx);
4082	V_pf_tcp_iss_off += 4096;
4083#define	ISN_RANDOM_INCREMENT (4096 - 1)
4084	return (digest[0] + (arc4random() & ISN_RANDOM_INCREMENT) +
4085	    V_pf_tcp_iss_off);
4086#undef	ISN_RANDOM_INCREMENT
4087}
4088
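/*
 * Match an Ethernet address against a rule address/mask pair, honoring the
 * rule's negation flag.  An unset rule address matches everything.
 */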
4089static bool
4090pf_match_eth_addr(const uint8_t *a, const struct pf_keth_rule_addr *r)
4091{
4092	bool match = true;
4093
4094	/* Always matches if not set */
4095	if (! r->isset)
4096		return (!r->neg);
4097
4098	for (int i = 0; i < ETHER_ADDR_LEN; i++) {
4099		if ((a[i] & r->mask[i]) != (r->addr[i] & r->mask[i])) {
4100			match = false;
4101			break;
4102		}
4103	}
4104
4105	return (match ^ r->neg);
4106}
4107
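/*
 * Match the packet's assigned tag (falling back to the mbuf's pf tag when
 * none has been set yet) against the rule's match_tag, honoring
 * match_tag_not.
 */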
4108static int
4109pf_match_eth_tag(struct mbuf *m, struct pf_keth_rule *r, int *tag, int mtag)
4110{
4111	if (*tag == -1)
4112		*tag = mtag;
4113
4114	return ((!r->match_tag_not && r->match_tag == *tag) ||
4115	    (r->match_tag_not && r->match_tag != *tag));
4116}
4117
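/*
 * Hand a packet directly to the given interface's transmit routine, as
 * requested by an Ethernet rule's bridge_to target.  The packet is dropped
 * if the interface is missing or of an unsupported type.
 */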
4118static void
4119pf_bridge_to(struct ifnet *ifp, struct mbuf *m)
4120{
4121	/* If we don't have the interface, drop the packet. */
4122	if (ifp == NULL) {
4123		m_freem(m);
4124		return;
4125	}
4126
4127	switch (ifp->if_type) {
4128	case IFT_ETHER:
4129	case IFT_XETHER:
4130	case IFT_L2VLAN:
4131	case IFT_BRIDGE:
4132	case IFT_IEEE8023ADLAG:
4133		break;
4134	default:
4135		m_freem(m);
4136		return;
4137	}
4138
4139	ifp->if_transmit(ifp, m);
4140}
4141
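/*
 * Evaluate the active Ethernet (layer 2) ruleset for a packet.  Finds the
 * last matching rule (or the first "quick" one), updates its counters and
 * applies its action: tagging, queue assignment, dummynet delay and optional
 * bridging to another interface.  Packets re-injected by dummynet are passed
 * through unconditionally.
 */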
4142static int
4143pf_test_eth_rule(int dir, struct pfi_kkif *kif, struct mbuf **m0)
4144{
4145#ifdef INET
4146	struct ip ip;
4147#endif
4148#ifdef INET6
4149	struct ip6_hdr ip6;
4150#endif
4151	struct mbuf *m = *m0;
4152	struct ether_header *e;
4153	struct pf_keth_rule *r, *rm, *a = NULL;
4154	struct pf_keth_ruleset *ruleset = NULL;
4155	struct pf_mtag *mtag;
4156	struct pf_keth_ruleq *rules;
4157	struct pf_addr *src = NULL, *dst = NULL;
4158	struct pfi_kkif *bridge_to;
4159	sa_family_t af = 0;
4160	uint16_t proto;
4161	int asd = 0, match = 0;
4162	int tag = -1;
4163	uint8_t action;
4164	struct pf_keth_anchor_stackframe	anchor_stack[PF_ANCHOR_STACKSIZE];
4165
4166	MPASS(kif->pfik_ifp->if_vnet == curvnet);
4167	NET_EPOCH_ASSERT();
4168
4169	PF_RULES_RLOCK_TRACKER;
4170
4171	SDT_PROBE3(pf, eth, test_rule, entry, dir, kif->pfik_ifp, m);
4172
4173	mtag = pf_find_mtag(m);
4174	if (mtag != NULL && mtag->flags & PF_MTAG_FLAG_DUMMYNET) {
4175		/* Dummynet re-injects packets after they've
4176		 * completed their delay. We've already
4177		 * processed them, so pass unconditionally. */
4178
4179		/* But only once. We may see the packet multiple times (e.g.
4180		 * PFIL_IN/PFIL_OUT). */
4181		pf_dummynet_flag_remove(m, mtag);
4182
4183		return (PF_PASS);
4184	}
4185
4186	ruleset = V_pf_keth;
4187	rules = ck_pr_load_ptr(&ruleset->active.rules);
4188	r = TAILQ_FIRST(rules);
4189	rm = NULL;
4190
4191	e = mtod(m, struct ether_header *);
4192	proto = ntohs(e->ether_type);
4193
4194	switch (proto) {
4195#ifdef INET
4196	case ETHERTYPE_IP: {
4197		if (m_length(m, NULL) < (sizeof(struct ether_header) +
4198		    sizeof(ip)))
4199			return (PF_DROP);
4200
4201		af = AF_INET;
4202		m_copydata(m, sizeof(struct ether_header), sizeof(ip),
4203		    (caddr_t)&ip);
4204		src = (struct pf_addr *)&ip.ip_src;
4205		dst = (struct pf_addr *)&ip.ip_dst;
4206		break;
4207	}
4208#endif /* INET */
4209#ifdef INET6
4210	case ETHERTYPE_IPV6: {
4211		if (m_length(m, NULL) < (sizeof(struct ether_header) +
4212		    sizeof(ip6)))
4213			return (PF_DROP);
4214
4215		af = AF_INET6;
4216		m_copydata(m, sizeof(struct ether_header), sizeof(ip6),
4217		    (caddr_t)&ip6);
4218		src = (struct pf_addr *)&ip6.ip6_src;
4219		dst = (struct pf_addr *)&ip6.ip6_dst;
4220		break;
4221	}
4222#endif /* INET6 */
4223	}
4224
4225	PF_RULES_RLOCK();
4226
4227	while (r != NULL) {
4228		counter_u64_add(r->evaluations, 1);
4229		SDT_PROBE2(pf, eth, test_rule, test, r->nr, r);
4230
4231		if (pfi_kkif_match(r->kif, kif) == r->ifnot) {
4232			SDT_PROBE3(pf, eth, test_rule, mismatch, r->nr, r,
4233			    "kif");
4234			r = r->skip[PFE_SKIP_IFP].ptr;
4235		}
4236		else if (r->direction && r->direction != dir) {
4237			SDT_PROBE3(pf, eth, test_rule, mismatch, r->nr, r,
4238			    "dir");
4239			r = r->skip[PFE_SKIP_DIR].ptr;
4240		}
4241		else if (r->proto && r->proto != proto) {
4242			SDT_PROBE3(pf, eth, test_rule, mismatch, r->nr, r,
4243			    "proto");
4244			r = r->skip[PFE_SKIP_PROTO].ptr;
4245		}
4246		else if (! pf_match_eth_addr(e->ether_shost, &r->src)) {
4247			SDT_PROBE3(pf, eth, test_rule, mismatch, r->nr, r,
4248			    "src");
4249			r = r->skip[PFE_SKIP_SRC_ADDR].ptr;
4250		}
4251		else if (! pf_match_eth_addr(e->ether_dhost, &r->dst)) {
4252			SDT_PROBE3(pf, eth, test_rule, mismatch, r->nr, r,
4253			    "dst");
4254			r = r->skip[PFE_SKIP_DST_ADDR].ptr;
4255		}
4256		else if (src != NULL && PF_MISMATCHAW(&r->ipsrc.addr, src, af,
4257		    r->ipsrc.neg, kif, M_GETFIB(m))) {
4258			SDT_PROBE3(pf, eth, test_rule, mismatch, r->nr, r,
4259			    "ip_src");
4260			r = r->skip[PFE_SKIP_SRC_IP_ADDR].ptr;
4261		}
4262		else if (dst != NULL && PF_MISMATCHAW(&r->ipdst.addr, dst, af,
4263		    r->ipdst.neg, kif, M_GETFIB(m))) {
4264			SDT_PROBE3(pf, eth, test_rule, mismatch, r->nr, r,
4265			    "ip_dst");
4266			r = r->skip[PFE_SKIP_DST_IP_ADDR].ptr;
4267		}
4268		else if (r->match_tag && !pf_match_eth_tag(m, r, &tag,
4269		    mtag ? mtag->tag : 0)) {
4270			SDT_PROBE3(pf, eth, test_rule, mismatch, r->nr, r,
4271			    "match_tag");
4272			r = TAILQ_NEXT(r, entries);
4273		}
4274		else {
4275			if (r->tag)
4276				tag = r->tag;
4277			if (r->anchor == NULL) {
4278				/* Rule matches */
4279				rm = r;
4280
4281				SDT_PROBE2(pf, eth, test_rule, match, r->nr, r);
4282
4283				if (r->quick)
4284					break;
4285
4286				r = TAILQ_NEXT(r, entries);
4287			} else {
4288				pf_step_into_keth_anchor(anchor_stack, &asd,
4289				    &ruleset, &r, &a, &match);
4290			}
4291		}
4292		if (r == NULL && pf_step_out_of_keth_anchor(anchor_stack, &asd,
4293		    &ruleset, &r, &a, &match))
4294			break;
4295	}
4296
4297	r = rm;
4298
4299	SDT_PROBE2(pf, eth, test_rule, final_match, (r != NULL ? r->nr : -1), r);
4300
4301	/* Default to pass. */
4302	if (r == NULL) {
4303		PF_RULES_RUNLOCK();
4304		return (PF_PASS);
4305	}
4306
4307	/* Execute action. */
4308	counter_u64_add(r->packets[dir == PF_OUT], 1);
4309	counter_u64_add(r->bytes[dir == PF_OUT], m_length(m, NULL));
4310	pf_update_timestamp(r);
4311
4312	/* Shortcut. Don't tag if we're just going to drop anyway. */
4313	if (r->action == PF_DROP) {
4314		PF_RULES_RUNLOCK();
4315		return (PF_DROP);
4316	}
4317
4318	if (tag > 0) {
4319		if (mtag == NULL)
4320			mtag = pf_get_mtag(m);
4321		if (mtag == NULL) {
4322			PF_RULES_RUNLOCK();
4323			counter_u64_add(V_pf_status.counters[PFRES_MEMORY], 1);
4324			return (PF_DROP);
4325		}
4326		mtag->tag = tag;
4327	}
4328
4329	if (r->qid != 0) {
4330		if (mtag == NULL)
4331			mtag = pf_get_mtag(m);
4332		if (mtag == NULL) {
4333			PF_RULES_RUNLOCK();
4334			counter_u64_add(V_pf_status.counters[PFRES_MEMORY], 1);
4335			return (PF_DROP);
4336		}
4337		mtag->qid = r->qid;
4338	}
4339
4340	action = r->action;
4341	bridge_to = r->bridge_to;
4342
4343	/* Dummynet */
4344	if (r->dnpipe) {
4345		struct ip_fw_args dnflow;
4346
4347		/* Drop packet if dummynet is not loaded. */
4348		if (ip_dn_io_ptr == NULL) {
4349			PF_RULES_RUNLOCK();
4350			m_freem(m);
4351			counter_u64_add(V_pf_status.counters[PFRES_MEMORY], 1);
4352			return (PF_DROP);
4353		}
4354		if (mtag == NULL)
4355			mtag = pf_get_mtag(m);
4356		if (mtag == NULL) {
4357			PF_RULES_RUNLOCK();
4358			counter_u64_add(V_pf_status.counters[PFRES_MEMORY], 1);
4359			return (PF_DROP);
4360		}
4361
4362		bzero(&dnflow, sizeof(dnflow));
4363
4364		/* We don't have port numbers here, so we set them to 0.  That
4365		 * means we'll be somewhat limited in distinguishing flows (i.e.
4366		 * only based on IP addresses, not on port numbers), but it's
4367		 * better than nothing. */
4368		dnflow.f_id.dst_port = 0;
4369		dnflow.f_id.src_port = 0;
4370		dnflow.f_id.proto = 0;
4371
4372		dnflow.rule.info = r->dnpipe;
4373		dnflow.rule.info |= IPFW_IS_DUMMYNET;
4374		if (r->dnflags & PFRULE_DN_IS_PIPE)
4375			dnflow.rule.info |= IPFW_IS_PIPE;
4376
4377		dnflow.f_id.extra = dnflow.rule.info;
4378
4379		dnflow.flags = dir == PF_IN ? IPFW_ARGS_IN : IPFW_ARGS_OUT;
4380		dnflow.flags |= IPFW_ARGS_ETHER;
4381		dnflow.ifp = kif->pfik_ifp;
4382
4383		switch (af) {
4384		case AF_INET:
4385			dnflow.f_id.addr_type = 4;
4386			dnflow.f_id.src_ip = src->v4.s_addr;
4387			dnflow.f_id.dst_ip = dst->v4.s_addr;
4388			break;
4389		case AF_INET6:
4390			dnflow.flags |= IPFW_ARGS_IP6;
4391			dnflow.f_id.addr_type = 6;
4392			dnflow.f_id.src_ip6 = src->v6;
4393			dnflow.f_id.dst_ip6 = dst->v6;
4394			break;
4395		}
4396
4397		PF_RULES_RUNLOCK();
4398
4399		mtag->flags |= PF_MTAG_FLAG_DUMMYNET;
4400		ip_dn_io_ptr(m0, &dnflow);
4401		if (*m0 != NULL)
4402			pf_dummynet_flag_remove(m, mtag);
4403	} else {
4404		PF_RULES_RUNLOCK();
4405	}
4406
4407	if (action == PF_PASS && bridge_to) {
4408		pf_bridge_to(bridge_to->pfik_ifp, *m0);
4409		*m0 = NULL; /* We've eaten the packet. */
4410	}
4411
4412	return (action);
4413}
4414
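/*
 * Run a packet that has no matching state through the main filter ruleset.
 * Any matching binat/nat/rdr rule is applied first and the packet headers
 * are rewritten accordingly; the last matching pass/block rule (or the first
 * "quick" one) then decides the verdict, and a state entry is created via
 * pf_create_state() when the rule keeps state or a translation was applied.
 */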
4415static int
4416pf_test_rule(struct pf_krule **rm, struct pf_kstate **sm, struct pfi_kkif *kif,
4417    struct mbuf *m, int off, struct pf_pdesc *pd, struct pf_krule **am,
4418    struct pf_kruleset **rsm, struct inpcb *inp)
4419{
4420	struct pf_krule		*nr = NULL;
4421	struct pf_addr		* const saddr = pd->src;
4422	struct pf_addr		* const daddr = pd->dst;
4423	sa_family_t		 af = pd->af;
4424	struct pf_krule		*r, *a = NULL;
4425	struct pf_kruleset	*ruleset = NULL;
4426	struct pf_krule_slist	 match_rules;
4427	struct pf_krule_item	*ri;
4428	struct pf_ksrc_node	*nsn = NULL;
4429	struct tcphdr		*th = &pd->hdr.tcp;
4430	struct pf_state_key	*sk = NULL, *nk = NULL;
4431	u_short			 reason;
4432	int			 rewrite = 0, hdrlen = 0;
4433	int			 tag = -1;
4434	int			 asd = 0;
4435	int			 match = 0;
4436	int			 state_icmp = 0;
4437	u_int16_t		 sport = 0, dport = 0;
4438	u_int16_t		 bproto_sum = 0, bip_sum = 0;
4439	u_int8_t		 icmptype = 0, icmpcode = 0;
4440	struct pf_kanchor_stackframe	anchor_stack[PF_ANCHOR_STACKSIZE];
4441
4442	PF_RULES_RASSERT();
4443
4444	if (inp != NULL) {
4445		INP_LOCK_ASSERT(inp);
4446		pd->lookup.uid = inp->inp_cred->cr_uid;
4447		pd->lookup.gid = inp->inp_cred->cr_groups[0];
4448		pd->lookup.done = 1;
4449	}
4450
4451	switch (pd->proto) {
4452	case IPPROTO_TCP:
4453		sport = th->th_sport;
4454		dport = th->th_dport;
4455		hdrlen = sizeof(*th);
4456		break;
4457	case IPPROTO_UDP:
4458		sport = pd->hdr.udp.uh_sport;
4459		dport = pd->hdr.udp.uh_dport;
4460		hdrlen = sizeof(pd->hdr.udp);
4461		break;
4462	case IPPROTO_SCTP:
4463		sport = pd->hdr.sctp.src_port;
4464		dport = pd->hdr.sctp.dest_port;
4465		hdrlen = sizeof(pd->hdr.sctp);
4466		break;
4467#ifdef INET
4468	case IPPROTO_ICMP:
4469		if (pd->af != AF_INET)
4470			break;
4471		sport = dport = pd->hdr.icmp.icmp_id;
4472		hdrlen = sizeof(pd->hdr.icmp);
4473		icmptype = pd->hdr.icmp.icmp_type;
4474		icmpcode = pd->hdr.icmp.icmp_code;
4475
4476		if (icmptype == ICMP_UNREACH ||
4477		    icmptype == ICMP_SOURCEQUENCH ||
4478		    icmptype == ICMP_REDIRECT ||
4479		    icmptype == ICMP_TIMXCEED ||
4480		    icmptype == ICMP_PARAMPROB)
4481			state_icmp++;
4482		break;
4483#endif /* INET */
4484#ifdef INET6
4485	case IPPROTO_ICMPV6:
4486		if (af != AF_INET6)
4487			break;
4488		sport = dport = pd->hdr.icmp6.icmp6_id;
4489		hdrlen = sizeof(pd->hdr.icmp6);
4490		icmptype = pd->hdr.icmp6.icmp6_type;
4491		icmpcode = pd->hdr.icmp6.icmp6_code;
4492
4493		if (icmptype == ICMP6_DST_UNREACH ||
4494		    icmptype == ICMP6_PACKET_TOO_BIG ||
4495		    icmptype == ICMP6_TIME_EXCEEDED ||
4496		    icmptype == ICMP6_PARAM_PROB)
4497			state_icmp++;
4498		break;
4499#endif /* INET6 */
4500	default:
4501		sport = dport = hdrlen = 0;
4502		break;
4503	}
4504
4505	r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr);
4506
4507	/* check packet for BINAT/NAT/RDR */
4508	if ((nr = pf_get_translation(pd, m, off, kif, &nsn, &sk,
4509	    &nk, saddr, daddr, sport, dport, anchor_stack)) != NULL) {
4510		KASSERT(sk != NULL, ("%s: null sk", __func__));
4511		KASSERT(nk != NULL, ("%s: null nk", __func__));
4512
4513		if (nr->log) {
4514			PFLOG_PACKET(kif, m, af, PF_PASS, PFRES_MATCH, nr, a,
4515			    ruleset, pd, 1);
4516		}
4517
4518		if (pd->ip_sum)
4519			bip_sum = *pd->ip_sum;
4520
4521		switch (pd->proto) {
4522		case IPPROTO_TCP:
4523			bproto_sum = th->th_sum;
4524			pd->proto_sum = &th->th_sum;
4525
4526			if (PF_ANEQ(saddr, &nk->addr[pd->sidx], af) ||
4527			    nk->port[pd->sidx] != sport) {
4528				pf_change_ap(m, saddr, &th->th_sport, pd->ip_sum,
4529				    &th->th_sum, &nk->addr[pd->sidx],
4530				    nk->port[pd->sidx], 0, af);
4531				pd->sport = &th->th_sport;
4532				sport = th->th_sport;
4533			}
4534
4535			if (PF_ANEQ(daddr, &nk->addr[pd->didx], af) ||
4536			    nk->port[pd->didx] != dport) {
4537				pf_change_ap(m, daddr, &th->th_dport, pd->ip_sum,
4538				    &th->th_sum, &nk->addr[pd->didx],
4539				    nk->port[pd->didx], 0, af);
4540				dport = th->th_dport;
4541				pd->dport = &th->th_dport;
4542			}
4543			rewrite++;
4544			break;
4545		case IPPROTO_UDP:
4546			bproto_sum = pd->hdr.udp.uh_sum;
4547			pd->proto_sum = &pd->hdr.udp.uh_sum;
4548
4549			if (PF_ANEQ(saddr, &nk->addr[pd->sidx], af) ||
4550			    nk->port[pd->sidx] != sport) {
4551				pf_change_ap(m, saddr, &pd->hdr.udp.uh_sport,
4552				    pd->ip_sum, &pd->hdr.udp.uh_sum,
4553				    &nk->addr[pd->sidx],
4554				    nk->port[pd->sidx], 1, af);
4555				sport = pd->hdr.udp.uh_sport;
4556				pd->sport = &pd->hdr.udp.uh_sport;
4557			}
4558
4559			if (PF_ANEQ(daddr, &nk->addr[pd->didx], af) ||
4560			    nk->port[pd->didx] != dport) {
4561				pf_change_ap(m, daddr, &pd->hdr.udp.uh_dport,
4562				    pd->ip_sum, &pd->hdr.udp.uh_sum,
4563				    &nk->addr[pd->didx],
4564				    nk->port[pd->didx], 1, af);
4565				dport = pd->hdr.udp.uh_dport;
4566				pd->dport = &pd->hdr.udp.uh_dport;
4567			}
4568			rewrite++;
4569			break;
4570		case IPPROTO_SCTP: {
4571			uint16_t checksum = 0;
4572
4573			if (PF_ANEQ(saddr, &nk->addr[pd->sidx], af) ||
4574			    nk->port[pd->sidx] != sport) {
4575				pf_change_ap(m, saddr, &pd->hdr.sctp.src_port,
4576				    pd->ip_sum, &checksum,
4577				    &nk->addr[pd->sidx],
4578				    nk->port[pd->sidx], 1, af);
4579			}
4580			if (PF_ANEQ(daddr, &nk->addr[pd->didx], af) ||
4581			    nk->port[pd->didx] != dport) {
4582				pf_change_ap(m, daddr, &pd->hdr.sctp.dest_port,
4583				    pd->ip_sum, &checksum,
4584				    &nk->addr[pd->didx],
4585				    nk->port[pd->didx], 1, af);
4586			}
4587			break;
4588		}
4589#ifdef INET
4590		case IPPROTO_ICMP:
4591			nk->port[0] = nk->port[1];
4592			if (PF_ANEQ(saddr, &nk->addr[pd->sidx], AF_INET))
4593				pf_change_a(&saddr->v4.s_addr, pd->ip_sum,
4594				    nk->addr[pd->sidx].v4.s_addr, 0);
4595
4596			if (PF_ANEQ(daddr, &nk->addr[pd->didx], AF_INET))
4597				pf_change_a(&daddr->v4.s_addr, pd->ip_sum,
4598				    nk->addr[pd->didx].v4.s_addr, 0);
4599
4600			if (nk->port[1] != pd->hdr.icmp.icmp_id) {
4601				pd->hdr.icmp.icmp_cksum = pf_cksum_fixup(
4602				    pd->hdr.icmp.icmp_cksum, sport,
4603				    nk->port[1], 0);
4604				pd->hdr.icmp.icmp_id = nk->port[1];
4605				pd->sport = &pd->hdr.icmp.icmp_id;
4606			}
4607			m_copyback(m, off, ICMP_MINLEN, (caddr_t)&pd->hdr.icmp);
4608			break;
4609#endif /* INET */
4610#ifdef INET6
4611		case IPPROTO_ICMPV6:
4612			nk->port[0] = nk->port[1];
4613			if (PF_ANEQ(saddr, &nk->addr[pd->sidx], AF_INET6))
4614				pf_change_a6(saddr, &pd->hdr.icmp6.icmp6_cksum,
4615				    &nk->addr[pd->sidx], 0);
4616
4617			if (PF_ANEQ(daddr, &nk->addr[pd->didx], AF_INET6))
4618				pf_change_a6(daddr, &pd->hdr.icmp6.icmp6_cksum,
4619				    &nk->addr[pd->didx], 0);
4620			rewrite++;
4621			break;
4622#endif /* INET6 */
4623		default:
4624			switch (af) {
4625#ifdef INET
4626			case AF_INET:
4627				if (PF_ANEQ(saddr,
4628				    &nk->addr[pd->sidx], AF_INET))
4629					pf_change_a(&saddr->v4.s_addr,
4630					    pd->ip_sum,
4631					    nk->addr[pd->sidx].v4.s_addr, 0);
4632
4633				if (PF_ANEQ(daddr,
4634				    &nk->addr[pd->didx], AF_INET))
4635					pf_change_a(&daddr->v4.s_addr,
4636					    pd->ip_sum,
4637					    nk->addr[pd->didx].v4.s_addr, 0);
4638				break;
4639#endif /* INET */
4640#ifdef INET6
4641			case AF_INET6:
4642				if (PF_ANEQ(saddr,
4643				    &nk->addr[pd->sidx], AF_INET6))
4644					PF_ACPY(saddr, &nk->addr[pd->sidx], af);
4645
4646				if (PF_ANEQ(daddr,
4647				    &nk->addr[pd->didx], AF_INET6))
4648					PF_ACPY(daddr, &nk->addr[pd->didx], af);
4649				break;
4650#endif /* INET6 */
4651			}
4652			break;
4653		}
4654		if (nr->natpass)
4655			r = NULL;
4656		pd->nat_rule = nr;
4657	}
4658
4659	SLIST_INIT(&match_rules);
4660	while (r != NULL) {
4661		pf_counter_u64_add(&r->evaluations, 1);
4662		if (pfi_kkif_match(r->kif, kif) == r->ifnot)
4663			r = r->skip[PF_SKIP_IFP].ptr;
4664		else if (r->direction && r->direction != pd->dir)
4665			r = r->skip[PF_SKIP_DIR].ptr;
4666		else if (r->af && r->af != af)
4667			r = r->skip[PF_SKIP_AF].ptr;
4668		else if (r->proto && r->proto != pd->proto)
4669			r = r->skip[PF_SKIP_PROTO].ptr;
4670		else if (PF_MISMATCHAW(&r->src.addr, saddr, af,
4671		    r->src.neg, kif, M_GETFIB(m)))
4672			r = r->skip[PF_SKIP_SRC_ADDR].ptr;
4673		/* tcp/udp only. port_op always 0 in other cases */
4674		else if (r->src.port_op && !pf_match_port(r->src.port_op,
4675		    r->src.port[0], r->src.port[1], sport))
4676			r = r->skip[PF_SKIP_SRC_PORT].ptr;
4677		else if (PF_MISMATCHAW(&r->dst.addr, daddr, af,
4678		    r->dst.neg, NULL, M_GETFIB(m)))
4679			r = r->skip[PF_SKIP_DST_ADDR].ptr;
4680		/* tcp/udp only. port_op always 0 in other cases */
4681		else if (r->dst.port_op && !pf_match_port(r->dst.port_op,
4682		    r->dst.port[0], r->dst.port[1], dport))
4683			r = r->skip[PF_SKIP_DST_PORT].ptr;
4684		/* icmp only. type always 0 in other cases */
4685		else if (r->type && r->type != icmptype + 1)
4686			r = TAILQ_NEXT(r, entries);
4687		/* icmp only. code always 0 in other cases */
4688		else if (r->code && r->code != icmpcode + 1)
4689			r = TAILQ_NEXT(r, entries);
4690		else if (r->tos && !(r->tos == pd->tos))
4691			r = TAILQ_NEXT(r, entries);
4692		else if (r->rule_flag & PFRULE_FRAGMENT)
4693			r = TAILQ_NEXT(r, entries);
4694		else if (pd->proto == IPPROTO_TCP &&
4695		    (r->flagset & th->th_flags) != r->flags)
4696			r = TAILQ_NEXT(r, entries);
4697		/* tcp/udp only. uid.op always 0 in other cases */
4698		else if (r->uid.op && (pd->lookup.done || (pd->lookup.done =
4699		    pf_socket_lookup(pd, m), 1)) &&
4700		    !pf_match_uid(r->uid.op, r->uid.uid[0], r->uid.uid[1],
4701		    pd->lookup.uid))
4702			r = TAILQ_NEXT(r, entries);
4703		/* tcp/udp only. gid.op always 0 in other cases */
4704		else if (r->gid.op && (pd->lookup.done || (pd->lookup.done =
4705		    pf_socket_lookup(pd, m), 1)) &&
4706		    !pf_match_gid(r->gid.op, r->gid.gid[0], r->gid.gid[1],
4707		    pd->lookup.gid))
4708			r = TAILQ_NEXT(r, entries);
4709		else if (r->prio &&
4710		    !pf_match_ieee8021q_pcp(r->prio, m))
4711			r = TAILQ_NEXT(r, entries);
4712		else if (r->prob &&
4713		    r->prob <= arc4random())
4714			r = TAILQ_NEXT(r, entries);
4715		else if (r->match_tag && !pf_match_tag(m, r, &tag,
4716		    pd->pf_mtag ? pd->pf_mtag->tag : 0))
4717			r = TAILQ_NEXT(r, entries);
4718		else if (r->os_fingerprint != PF_OSFP_ANY &&
4719		    (pd->proto != IPPROTO_TCP || !pf_osfp_match(
4720		    pf_osfp_fingerprint(pd, m, off, th),
4721		    r->os_fingerprint)))
4722			r = TAILQ_NEXT(r, entries);
4723		else {
4724			if (r->tag)
4725				tag = r->tag;
4726			if (r->anchor == NULL) {
4727				if (r->action == PF_MATCH) {
4728					ri = malloc(sizeof(struct pf_krule_item), M_PF_RULE_ITEM, M_NOWAIT | M_ZERO);
4729					if (ri == NULL) {
4730						REASON_SET(&reason, PFRES_MEMORY);
4731						goto cleanup;
4732					}
4733					ri->r = r;
4734					SLIST_INSERT_HEAD(&match_rules, ri, entry);
4735					pf_counter_u64_critical_enter();
4736					pf_counter_u64_add_protected(&r->packets[pd->dir == PF_OUT], 1);
4737					pf_counter_u64_add_protected(&r->bytes[pd->dir == PF_OUT], pd->tot_len);
4738					pf_counter_u64_critical_exit();
4739					pf_rule_to_actions(r, &pd->act);
4740					if (r->log)
4741						PFLOG_PACKET(kif, m, af,
4742						    r->action, PFRES_MATCH, r,
4743						    a, ruleset, pd, 1);
4744				} else {
4745					match = 1;
4746					*rm = r;
4747					*am = a;
4748					*rsm = ruleset;
4749				}
4750				if ((*rm)->quick)
4751					break;
4752				r = TAILQ_NEXT(r, entries);
4753			} else
4754				pf_step_into_anchor(anchor_stack, &asd,
4755				    &ruleset, PF_RULESET_FILTER, &r, &a,
4756				    &match);
4757		}
4758		if (r == NULL && pf_step_out_of_anchor(anchor_stack, &asd,
4759		    &ruleset, PF_RULESET_FILTER, &r, &a, &match))
4760			break;
4761	}
4762	r = *rm;
4763	a = *am;
4764	ruleset = *rsm;
4765
4766	REASON_SET(&reason, PFRES_MATCH);
4767
4768	/* apply actions for last matching pass/block rule */
4769	pf_rule_to_actions(r, &pd->act);
4770
4771	if (r->log) {
4772		if (rewrite)
4773			m_copyback(m, off, hdrlen, pd->hdr.any);
4774		PFLOG_PACKET(kif, m, af, r->action, reason, r, a, ruleset, pd, 1);
4775	}
4776
4777	if ((r->action == PF_DROP) &&
4778	    ((r->rule_flag & PFRULE_RETURNRST) ||
4779	    (r->rule_flag & PFRULE_RETURNICMP) ||
4780	    (r->rule_flag & PFRULE_RETURN))) {
4781		pf_return(r, nr, pd, sk, off, m, th, kif, bproto_sum,
4782		    bip_sum, hdrlen, &reason, r->rtableid);
4783	}
4784
4785	if (r->action == PF_DROP)
4786		goto cleanup;
4787
4788	if (tag > 0 && pf_tag_packet(m, pd, tag)) {
4789		REASON_SET(&reason, PFRES_MEMORY);
4790		goto cleanup;
4791	}
4792	if (pd->act.rtableid >= 0)
4793		M_SETFIB(m, pd->act.rtableid);
4794
4795	if (!state_icmp && (r->keep_state || nr != NULL ||
4796	    (pd->flags & PFDESC_TCP_NORM))) {
4797		int action;
4798		action = pf_create_state(r, nr, a, pd, nsn, nk, sk, m, off,
4799		    sport, dport, &rewrite, kif, sm, tag, bproto_sum, bip_sum,
4800		    hdrlen, &match_rules);
4801		if (action != PF_PASS) {
4802			if (action == PF_DROP &&
4803			    (r->rule_flag & PFRULE_RETURN))
4804				pf_return(r, nr, pd, sk, off, m, th, kif,
4805				    bproto_sum, bip_sum, hdrlen, &reason,
4806				    pd->act.rtableid);
4807			return (action);
4808		}
4809	} else {
4810		while ((ri = SLIST_FIRST(&match_rules))) {
4811			SLIST_REMOVE_HEAD(&match_rules, entry);
4812			free(ri, M_PF_RULE_ITEM);
4813		}
4814
4815		uma_zfree(V_pf_state_key_z, sk);
4816		uma_zfree(V_pf_state_key_z, nk);
4817	}
4818
4819	/* copy back packet headers if we performed NAT operations */
4820	if (rewrite)
4821		m_copyback(m, off, hdrlen, pd->hdr.any);
4822
4823	if (*sm != NULL && !((*sm)->state_flags & PFSTATE_NOSYNC) &&
4824	    pd->dir == PF_OUT &&
4825	    V_pfsync_defer_ptr != NULL && V_pfsync_defer_ptr(*sm, m))
4826		/*
4827		 * We want the state created, but we don't
4828		 * want to send this packet yet, in case a
4829		 * partner firewall has to know about it to
4830		 * allow replies through it.
4831		 */
4832		return (PF_DEFER);
4833
4834	return (PF_PASS);
4835
4836cleanup:
4837	while ((ri = SLIST_FIRST(&match_rules))) {
4838		SLIST_REMOVE_HEAD(&match_rules, entry);
4839		free(ri, M_PF_RULE_ITEM);
4840	}
4841
4842	uma_zfree(V_pf_state_key_z, sk);
4843	uma_zfree(V_pf_state_key_z, nk);
4844	return (PF_DROP);
4845}
4846
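/*
 * Allocate and initialize a state entry for a connection that was just
 * allowed by pf_test_rule(): set up source nodes, sequence tracking and
 * protocol-specific timeouts, insert the state keys and, for synproxy rules,
 * answer the initial SYN ourselves.
 */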
4847static int
4848pf_create_state(struct pf_krule *r, struct pf_krule *nr, struct pf_krule *a,
4849    struct pf_pdesc *pd, struct pf_ksrc_node *nsn, struct pf_state_key *nk,
4850    struct pf_state_key *sk, struct mbuf *m, int off, u_int16_t sport,
4851    u_int16_t dport, int *rewrite, struct pfi_kkif *kif, struct pf_kstate **sm,
4852    int tag, u_int16_t bproto_sum, u_int16_t bip_sum, int hdrlen,
4853    struct pf_krule_slist *match_rules)
4854{
4855	struct pf_kstate	*s = NULL;
4856	struct pf_ksrc_node	*sn = NULL;
4857	struct tcphdr		*th = &pd->hdr.tcp;
4858	u_int16_t		 mss = V_tcp_mssdflt;
4859	u_short			 reason, sn_reason;
4860	struct pf_krule_item	*ri;
4861
4862	/* check maximums */
4863	if (r->max_states &&
4864	    (counter_u64_fetch(r->states_cur) >= r->max_states)) {
4865		counter_u64_add(V_pf_status.lcounters[LCNT_STATES], 1);
4866		REASON_SET(&reason, PFRES_MAXSTATES);
4867		goto csfailed;
4868	}
4869	/* src node for filter rule */
4870	if ((r->rule_flag & PFRULE_SRCTRACK ||
4871	    r->rpool.opts & PF_POOL_STICKYADDR) &&
4872	    (sn_reason = pf_insert_src_node(&sn, r, pd->src, pd->af)) != 0) {
4873		REASON_SET(&reason, sn_reason);
4874		goto csfailed;
4875	}
4876	/* src node for translation rule */
4877	if (nr != NULL && (nr->rpool.opts & PF_POOL_STICKYADDR) &&
4878	    (sn_reason = pf_insert_src_node(&nsn, nr, &sk->addr[pd->sidx],
4879	    pd->af)) != 0 ) {
4880		REASON_SET(&reason, sn_reason);
4881		goto csfailed;
4882	}
4883	s = pf_alloc_state(M_NOWAIT);
4884	if (s == NULL) {
4885		REASON_SET(&reason, PFRES_MEMORY);
4886		goto csfailed;
4887	}
4888	s->rule.ptr = r;
4889	s->nat_rule.ptr = nr;
4890	s->anchor.ptr = a;
4891	bcopy(match_rules, &s->match_rules, sizeof(s->match_rules));
4892	memcpy(&s->act, &pd->act, sizeof(struct pf_rule_actions));
4893
4894	STATE_INC_COUNTERS(s);
4895	if (r->allow_opts)
4896		s->state_flags |= PFSTATE_ALLOWOPTS;
4897	if (r->rule_flag & PFRULE_STATESLOPPY)
4898		s->state_flags |= PFSTATE_SLOPPY;
4899	if (pd->flags & PFDESC_TCP_NORM) /* Set by old-style scrub rules */
4900		s->state_flags |= PFSTATE_SCRUB_TCP;
4901	if ((r->rule_flag & PFRULE_PFLOW) ||
4902	    (nr != NULL && nr->rule_flag & PFRULE_PFLOW))
4903		s->state_flags |= PFSTATE_PFLOW;
4904
4905	s->act.log = pd->act.log & PF_LOG_ALL;
4906	s->sync_state = PFSYNC_S_NONE;
4907	s->state_flags |= pd->act.flags; /* Only needed for pfsync and state export */
4908
4909	if (nr != NULL)
4910		s->act.log |= nr->log & PF_LOG_ALL;
4911	switch (pd->proto) {
4912	case IPPROTO_TCP:
4913		s->src.seqlo = ntohl(th->th_seq);
4914		s->src.seqhi = s->src.seqlo + pd->p_len + 1;
4915		if ((th->th_flags & (TH_SYN|TH_ACK)) == TH_SYN &&
4916		    r->keep_state == PF_STATE_MODULATE) {
4917			/* Generate sequence number modulator */
4918			if ((s->src.seqdiff = pf_tcp_iss(pd) - s->src.seqlo) ==
4919			    0)
4920				s->src.seqdiff = 1;
4921			pf_change_proto_a(m, &th->th_seq, &th->th_sum,
4922			    htonl(s->src.seqlo + s->src.seqdiff), 0);
4923			*rewrite = 1;
4924		} else
4925			s->src.seqdiff = 0;
4926		if (th->th_flags & TH_SYN) {
4927			s->src.seqhi++;
4928			s->src.wscale = pf_get_wscale(m, off,
4929			    th->th_off, pd->af);
4930		}
4931		s->src.max_win = MAX(ntohs(th->th_win), 1);
4932		if (s->src.wscale & PF_WSCALE_MASK) {
4933			/* Remove scale factor from initial window */
4934			int win = s->src.max_win;
4935			win += 1 << (s->src.wscale & PF_WSCALE_MASK);
4936			s->src.max_win = (win - 1) >>
4937			    (s->src.wscale & PF_WSCALE_MASK);
4938		}
4939		if (th->th_flags & TH_FIN)
4940			s->src.seqhi++;
4941		s->dst.seqhi = 1;
4942		s->dst.max_win = 1;
4943		pf_set_protostate(s, PF_PEER_SRC, TCPS_SYN_SENT);
4944		pf_set_protostate(s, PF_PEER_DST, TCPS_CLOSED);
4945		s->timeout = PFTM_TCP_FIRST_PACKET;
4946		atomic_add_32(&V_pf_status.states_halfopen, 1);
4947		break;
4948	case IPPROTO_UDP:
4949		pf_set_protostate(s, PF_PEER_SRC, PFUDPS_SINGLE);
4950		pf_set_protostate(s, PF_PEER_DST, PFUDPS_NO_TRAFFIC);
4951		s->timeout = PFTM_UDP_FIRST_PACKET;
4952		break;
4953	case IPPROTO_SCTP:
4954		pf_set_protostate(s, PF_PEER_SRC, SCTP_COOKIE_WAIT);
4955		pf_set_protostate(s, PF_PEER_DST, SCTP_CLOSED);
4956		s->timeout = PFTM_SCTP_FIRST_PACKET;
4957		break;
4958	case IPPROTO_ICMP:
4959#ifdef INET6
4960	case IPPROTO_ICMPV6:
4961#endif
4962		s->timeout = PFTM_ICMP_FIRST_PACKET;
4963		break;
4964	default:
4965		pf_set_protostate(s, PF_PEER_SRC, PFOTHERS_SINGLE);
4966		pf_set_protostate(s, PF_PEER_DST, PFOTHERS_NO_TRAFFIC);
4967		s->timeout = PFTM_OTHER_FIRST_PACKET;
4968	}
4969
4970	if (r->rt) {
4971		/* pf_map_addr increases the reason counters */
4972		if ((reason = pf_map_addr(pd->af, r, pd->src, &s->rt_addr,
4973		    &s->rt_kif, NULL, &sn)) != 0)
4974			goto csfailed;
4975		s->rt = r->rt;
4976	}
4977
4978	s->creation = s->expire = pf_get_uptime();
4979
4980	if (sn != NULL)
4981		s->src_node = sn;
4982	if (nsn != NULL) {
4983		/* XXX We only modify one side for now. */
4984		PF_ACPY(&nsn->raddr, &nk->addr[1], pd->af);
4985		s->nat_src_node = nsn;
4986	}
4987	if (pd->proto == IPPROTO_TCP) {
4988		if (s->state_flags & PFSTATE_SCRUB_TCP &&
4989		    pf_normalize_tcp_init(m, off, pd, th, &s->src, &s->dst)) {
4990			REASON_SET(&reason, PFRES_MEMORY);
4991			goto drop;
4992		}
4993		if (s->state_flags & PFSTATE_SCRUB_TCP && s->src.scrub &&
4994		    pf_normalize_tcp_stateful(m, off, pd, &reason, th, s,
4995		    &s->src, &s->dst, rewrite)) {
4996			/* This really shouldn't happen!!! */
4997			DPFPRINTF(PF_DEBUG_URGENT,
4998			    ("pf_normalize_tcp_stateful failed on first "
4999			     "pkt\n"));
5000			goto drop;
5001		}
5002	} else if (pd->proto == IPPROTO_SCTP) {
5003		if (pf_normalize_sctp_init(m, off, pd, &s->src, &s->dst))
5004			goto drop;
5005		if (! (pd->sctp_flags & (PFDESC_SCTP_INIT | PFDESC_SCTP_ADD_IP)))
5006			goto drop;
5007	}
5008	s->direction = pd->dir;
5009
5010	/*
5011	 * sk/nk could already have been set up by pf_get_translation().
5012	 */
5013	if (nr == NULL) {
5014		KASSERT((sk == NULL && nk == NULL), ("%s: nr %p sk %p, nk %p",
5015		    __func__, nr, sk, nk));
5016		sk = pf_state_key_setup(pd, pd->src, pd->dst, sport, dport);
5017		if (sk == NULL)
5018			goto csfailed;
5019		nk = sk;
5020	} else
5021		KASSERT((sk != NULL && nk != NULL), ("%s: nr %p sk %p, nk %p",
5022		    __func__, nr, sk, nk));
5023
5024	/* Swap sk/nk for PF_OUT. */
5025	if (pf_state_insert(BOUND_IFACE(s, kif), kif,
5026	    (pd->dir == PF_IN) ? sk : nk,
5027	    (pd->dir == PF_IN) ? nk : sk, s)) {
5028		REASON_SET(&reason, PFRES_STATEINS);
5029		goto drop;
5030	} else
5031		*sm = s;
5032
5033	if (tag > 0)
5034		s->tag = tag;
5035	if (pd->proto == IPPROTO_TCP && (th->th_flags & (TH_SYN|TH_ACK)) ==
5036	    TH_SYN && r->keep_state == PF_STATE_SYNPROXY) {
5037		pf_set_protostate(s, PF_PEER_SRC, PF_TCPS_PROXY_SRC);
5038		/* undo NAT changes, if they have taken place */
5039		if (nr != NULL) {
5040			struct pf_state_key *skt = s->key[PF_SK_WIRE];
5041			if (pd->dir == PF_OUT)
5042				skt = s->key[PF_SK_STACK];
5043			PF_ACPY(pd->src, &skt->addr[pd->sidx], pd->af);
5044			PF_ACPY(pd->dst, &skt->addr[pd->didx], pd->af);
5045			if (pd->sport)
5046				*pd->sport = skt->port[pd->sidx];
5047			if (pd->dport)
5048				*pd->dport = skt->port[pd->didx];
5049			if (pd->proto_sum)
5050				*pd->proto_sum = bproto_sum;
5051			if (pd->ip_sum)
5052				*pd->ip_sum = bip_sum;
5053			m_copyback(m, off, hdrlen, pd->hdr.any);
5054		}
5055		s->src.seqhi = htonl(arc4random());
5056		/* Find mss option */
5057		int rtid = M_GETFIB(m);
5058		mss = pf_get_mss(m, off, th->th_off, pd->af);
5059		mss = pf_calc_mss(pd->src, pd->af, rtid, mss);
5060		mss = pf_calc_mss(pd->dst, pd->af, rtid, mss);
5061		s->src.mss = mss;
5062		pf_send_tcp(r, pd->af, pd->dst, pd->src, th->th_dport,
5063		    th->th_sport, s->src.seqhi, ntohl(th->th_seq) + 1,
5064		    TH_SYN|TH_ACK, 0, s->src.mss, 0, true, 0, 0,
5065		    pd->act.rtableid);
5066		REASON_SET(&reason, PFRES_SYNPROXY);
5067		return (PF_SYNPROXY_DROP);
5068	}
5069
5070	return (PF_PASS);
5071
5072csfailed:
5073	while ((ri = SLIST_FIRST(match_rules))) {
5074		SLIST_REMOVE_HEAD(match_rules, entry);
5075		free(ri, M_PF_RULE_ITEM);
5076	}
5077
5078	uma_zfree(V_pf_state_key_z, sk);
5079	uma_zfree(V_pf_state_key_z, nk);
5080
5081	if (sn != NULL) {
5082		PF_SRC_NODE_LOCK(sn);
5083		if (--sn->states == 0 && sn->expire == 0) {
5084			pf_unlink_src_node(sn);
5085			uma_zfree(V_pf_sources_z, sn);
5086			counter_u64_add(
5087			    V_pf_status.scounters[SCNT_SRC_NODE_REMOVALS], 1);
5088		}
5089		PF_SRC_NODE_UNLOCK(sn);
5090	}
5091
5092	if (nsn != sn && nsn != NULL) {
5093		PF_SRC_NODE_LOCK(nsn);
5094		if (--nsn->states == 0 && nsn->expire == 0) {
5095			pf_unlink_src_node(nsn);
5096			uma_zfree(V_pf_sources_z, nsn);
5097			counter_u64_add(
5098			    V_pf_status.scounters[SCNT_SRC_NODE_REMOVALS], 1);
5099		}
5100		PF_SRC_NODE_UNLOCK(nsn);
5101	}
5102
5103drop:
5104	if (s != NULL) {
5105		pf_src_tree_remove_state(s);
5106		s->timeout = PFTM_UNLINKED;
5107		STATE_DEC_COUNTERS(s);
5108		pf_free_state(s);
5109	}
5110
5111	return (PF_DROP);
5112}
5113
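/*
 * Match a non-reassembled fragment against the filter ruleset.  Only
 * criteria that do not require transport headers are evaluated (no ports,
 * TCP flags or ICMP types), and no state is created.
 */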
5114static int
5115pf_test_fragment(struct pf_krule **rm, struct pfi_kkif *kif,
5116    struct mbuf *m, void *h, struct pf_pdesc *pd, struct pf_krule **am,
5117    struct pf_kruleset **rsm)
5118{
5119	struct pf_krule		*r, *a = NULL;
5120	struct pf_kruleset	*ruleset = NULL;
5121	struct pf_krule_slist	 match_rules;
5122	struct pf_krule_item	*ri;
5123	sa_family_t		 af = pd->af;
5124	u_short			 reason;
5125	int			 tag = -1;
5126	int			 asd = 0;
5127	int			 match = 0;
5128	struct pf_kanchor_stackframe	anchor_stack[PF_ANCHOR_STACKSIZE];
5129
5130	PF_RULES_RASSERT();
5131
5132	r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr);
5133	SLIST_INIT(&match_rules);
5134	while (r != NULL) {
5135		pf_counter_u64_add(&r->evaluations, 1);
5136		if (pfi_kkif_match(r->kif, kif) == r->ifnot)
5137			r = r->skip[PF_SKIP_IFP].ptr;
5138		else if (r->direction && r->direction != pd->dir)
5139			r = r->skip[PF_SKIP_DIR].ptr;
5140		else if (r->af && r->af != af)
5141			r = r->skip[PF_SKIP_AF].ptr;
5142		else if (r->proto && r->proto != pd->proto)
5143			r = r->skip[PF_SKIP_PROTO].ptr;
5144		else if (PF_MISMATCHAW(&r->src.addr, pd->src, af,
5145		    r->src.neg, kif, M_GETFIB(m)))
5146			r = r->skip[PF_SKIP_SRC_ADDR].ptr;
5147		else if (PF_MISMATCHAW(&r->dst.addr, pd->dst, af,
5148		    r->dst.neg, NULL, M_GETFIB(m)))
5149			r = r->skip[PF_SKIP_DST_ADDR].ptr;
5150		else if (r->tos && !(r->tos == pd->tos))
5151			r = TAILQ_NEXT(r, entries);
5152		else if (r->os_fingerprint != PF_OSFP_ANY)
5153			r = TAILQ_NEXT(r, entries);
5154		else if (pd->proto == IPPROTO_UDP &&
5155		    (r->src.port_op || r->dst.port_op))
5156			r = TAILQ_NEXT(r, entries);
5157		else if (pd->proto == IPPROTO_TCP &&
5158		    (r->src.port_op || r->dst.port_op || r->flagset))
5159			r = TAILQ_NEXT(r, entries);
5160		else if ((pd->proto == IPPROTO_ICMP ||
5161		    pd->proto == IPPROTO_ICMPV6) &&
5162		    (r->type || r->code))
5163			r = TAILQ_NEXT(r, entries);
5164		else if (r->prio &&
5165		    !pf_match_ieee8021q_pcp(r->prio, m))
5166			r = TAILQ_NEXT(r, entries);
5167		else if (r->prob && r->prob <=
5168		    (arc4random() % (UINT_MAX - 1) + 1))
5169			r = TAILQ_NEXT(r, entries);
5170		else if (r->match_tag && !pf_match_tag(m, r, &tag,
5171		    pd->pf_mtag ? pd->pf_mtag->tag : 0))
5172			r = TAILQ_NEXT(r, entries);
5173		else {
5174			if (r->anchor == NULL) {
5175				if (r->action == PF_MATCH) {
5176					ri = malloc(sizeof(struct pf_krule_item), M_PF_RULE_ITEM, M_NOWAIT | M_ZERO);
5177					if (ri == NULL) {
5178						REASON_SET(&reason, PFRES_MEMORY);
5179						goto cleanup;
5180					}
5181					ri->r = r;
5182					SLIST_INSERT_HEAD(&match_rules, ri, entry);
5183					pf_counter_u64_critical_enter();
5184					pf_counter_u64_add_protected(&r->packets[pd->dir == PF_OUT], 1);
5185					pf_counter_u64_add_protected(&r->bytes[pd->dir == PF_OUT], pd->tot_len);
5186					pf_counter_u64_critical_exit();
5187					pf_rule_to_actions(r, &pd->act);
5188					if (r->log)
5189						PFLOG_PACKET(kif, m, af,
5190						    r->action, PFRES_MATCH, r,
5191						    a, ruleset, pd, 1);
5192				} else {
5193					match = 1;
5194					*rm = r;
5195					*am = a;
5196					*rsm = ruleset;
5197				}
5198				if ((*rm)->quick)
5199					break;
5200				r = TAILQ_NEXT(r, entries);
5201			} else
5202				pf_step_into_anchor(anchor_stack, &asd,
5203				    &ruleset, PF_RULESET_FILTER, &r, &a,
5204				    &match);
5205		}
5206		if (r == NULL && pf_step_out_of_anchor(anchor_stack, &asd,
5207		    &ruleset, PF_RULESET_FILTER, &r, &a, &match))
5208			break;
5209	}
5210	r = *rm;
5211	a = *am;
5212	ruleset = *rsm;
5213
5214	REASON_SET(&reason, PFRES_MATCH);
5215
5216	/* apply actions for last matching pass/block rule */
5217	pf_rule_to_actions(r, &pd->act);
5218
5219	if (r->log)
5220		PFLOG_PACKET(kif, m, af, r->action, reason, r, a, ruleset, pd, 1);
5221
5222	if (r->action != PF_PASS)
5223		return (PF_DROP);
5224
5225	if (tag > 0 && pf_tag_packet(m, pd, tag)) {
5226		REASON_SET(&reason, PFRES_MEMORY);
5227		goto cleanup;
5228	}
5229
5230	return (PF_PASS);
5231
5232cleanup:
5233	while ((ri = SLIST_FIRST(&match_rules))) {
5234		SLIST_REMOVE_HEAD(&match_rules, entry);
5235		free(ri, M_PF_RULE_ITEM);
5236	}
5237
5238	return (PF_DROP);
5239}
5240
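/*
 * Full TCP state tracking: validate sequence and acknowledgement numbers
 * against the windows tracked for both peers, demodulate sequence numbers
 * where a modulator is in place, advance the peers' connection states and
 * pick the matching state timeout.  Returns PF_DROP on violations.
 */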
5241static int
5242pf_tcp_track_full(struct pf_kstate **state, struct pfi_kkif *kif,
5243    struct mbuf *m, int off, struct pf_pdesc *pd, u_short *reason,
5244    int *copyback)
5245{
5246	struct tcphdr		*th = &pd->hdr.tcp;
5247	struct pf_state_peer	*src, *dst;
5248	u_int16_t		 win = ntohs(th->th_win);
5249	u_int32_t		 ack, end, seq, orig_seq;
5250	u_int8_t		 sws, dws, psrc, pdst;
5251	int			 ackskew;
5252
5253	if (pd->dir == (*state)->direction) {
5254		src = &(*state)->src;
5255		dst = &(*state)->dst;
5256		psrc = PF_PEER_SRC;
5257		pdst = PF_PEER_DST;
5258	} else {
5259		src = &(*state)->dst;
5260		dst = &(*state)->src;
5261		psrc = PF_PEER_DST;
5262		pdst = PF_PEER_SRC;
5263	}
5264
5265	if (src->wscale && dst->wscale && !(th->th_flags & TH_SYN)) {
5266		sws = src->wscale & PF_WSCALE_MASK;
5267		dws = dst->wscale & PF_WSCALE_MASK;
5268	} else
5269		sws = dws = 0;
5270
5271	/*
5272	 * Sequence tracking algorithm from Guido van Rooij's paper:
5273	 *   http://www.madison-gurkha.com/publications/tcp_filtering/
5274	 *	tcp_filtering.ps
5275	 */
5276
5277	orig_seq = seq = ntohl(th->th_seq);
5278	if (src->seqlo == 0) {
5279		/* First packet from this end. Set its state */
5280
5281		if (((*state)->state_flags & PFSTATE_SCRUB_TCP || dst->scrub) &&
5282		    src->scrub == NULL) {
5283			if (pf_normalize_tcp_init(m, off, pd, th, src, dst)) {
5284				REASON_SET(reason, PFRES_MEMORY);
5285				return (PF_DROP);
5286			}
5287		}
5288
5289		/* Deferred generation of sequence number modulator */
5290		if (dst->seqdiff && !src->seqdiff) {
5291			/* use random iss for the TCP server */
5292			while ((src->seqdiff = arc4random() - seq) == 0)
5293				;
5294			ack = ntohl(th->th_ack) - dst->seqdiff;
5295			pf_change_proto_a(m, &th->th_seq, &th->th_sum, htonl(seq +
5296			    src->seqdiff), 0);
5297			pf_change_proto_a(m, &th->th_ack, &th->th_sum, htonl(ack), 0);
5298			*copyback = 1;
5299		} else {
5300			ack = ntohl(th->th_ack);
5301		}
5302
5303		end = seq + pd->p_len;
5304		if (th->th_flags & TH_SYN) {
5305			end++;
5306			if (dst->wscale & PF_WSCALE_FLAG) {
5307				src->wscale = pf_get_wscale(m, off, th->th_off,
5308				    pd->af);
5309				if (src->wscale & PF_WSCALE_FLAG) {
5310					/* Remove scale factor from initial
5311					 * window */
5312					sws = src->wscale & PF_WSCALE_MASK;
5313					win = ((u_int32_t)win + (1 << sws) - 1)
5314					    >> sws;
5315					dws = dst->wscale & PF_WSCALE_MASK;
5316				} else {
5317					/* fixup other window */
5318					dst->max_win <<= dst->wscale &
5319					    PF_WSCALE_MASK;
5320					/* in case of a retrans SYN|ACK */
5321					dst->wscale = 0;
5322				}
5323			}
5324		}
5325		if (th->th_flags & TH_FIN)
5326			end++;
5327
5328		src->seqlo = seq;
5329		if (src->state < TCPS_SYN_SENT)
5330			pf_set_protostate(*state, psrc, TCPS_SYN_SENT);
5331
5332		/*
5333		 * May need to slide the window (seqhi may have been set by
5334		 * the crappy stack check or if we picked up the connection
5335		 * after establishment)
5336		 */
5337		if (src->seqhi == 1 ||
5338		    SEQ_GEQ(end + MAX(1, dst->max_win << dws), src->seqhi))
5339			src->seqhi = end + MAX(1, dst->max_win << dws);
5340		if (win > src->max_win)
5341			src->max_win = win;
5342
5343	} else {
5344		ack = ntohl(th->th_ack) - dst->seqdiff;
5345		if (src->seqdiff) {
5346			/* Modulate sequence numbers */
5347			pf_change_proto_a(m, &th->th_seq, &th->th_sum, htonl(seq +
5348			    src->seqdiff), 0);
5349			pf_change_proto_a(m, &th->th_ack, &th->th_sum, htonl(ack), 0);
5350			*copyback = 1;
5351		}
5352		end = seq + pd->p_len;
5353		if (th->th_flags & TH_SYN)
5354			end++;
5355		if (th->th_flags & TH_FIN)
5356			end++;
5357	}
5358
5359	if ((th->th_flags & TH_ACK) == 0) {
5360		/* Let it pass through the ack skew check */
5361		ack = dst->seqlo;
5362	} else if ((ack == 0 &&
5363	    (th->th_flags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) ||
5364	    /* broken tcp stacks do not set ack */
5365	    (dst->state < TCPS_SYN_SENT)) {
5366		/*
5367		 * Many stacks (ours included) will set the ACK number in a
5368		 * FIN|ACK if the SYN times out -- no sequence to ACK.
5369		 */
5370		ack = dst->seqlo;
5371	}
5372
5373	if (seq == end) {
5374		/* Ease sequencing restrictions on no data packets */
5375		seq = src->seqlo;
5376		end = seq;
5377	}
5378
5379	ackskew = dst->seqlo - ack;
5380
5381	/*
5382	 * Need to demodulate the sequence numbers in any TCP SACK options
5383	 * (Selective ACK). We could optionally validate the SACK values
5384	 * against the current ACK window, either forwards or backwards, but
5385	 * I'm not confident that SACK has been implemented properly
5386	 * everywhere. It wouldn't surprise me if several stacks accidentally
5387	 * SACK too far backwards of previously ACKed data. There really aren't
5388	 * any security implications of bad SACKing unless the target stack
5389	 * doesn't validate the option length correctly. Someone trying to
5390	 * spoof into a TCP connection won't bother blindly sending SACK
5391	 * options anyway.
5392	 */
5393	if (dst->seqdiff && (th->th_off << 2) > sizeof(struct tcphdr)) {
5394		if (pf_modulate_sack(m, off, pd, th, dst))
5395			*copyback = 1;
5396	}
5397
5398#define	MAXACKWINDOW (0xffff + 1500)	/* 1500 is an arbitrary fudge factor */
5399	if (SEQ_GEQ(src->seqhi, end) &&
5400	    /* Last octet inside other's window space */
5401	    SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) &&
5402	    /* Retrans: not more than one window back */
5403	    (ackskew >= -MAXACKWINDOW) &&
5404	    /* Acking not more than one reassembled fragment backwards */
5405	    (ackskew <= (MAXACKWINDOW << sws)) &&
5406	    /* Acking not more than one window forward */
5407	    ((th->th_flags & TH_RST) == 0 || orig_seq == src->seqlo ||
5408	    (orig_seq == src->seqlo + 1) || (orig_seq + 1 == src->seqlo))) {
5409	    /* Require an exact/+1 sequence match on resets when possible */
5410
5411		if (dst->scrub || src->scrub) {
5412			if (pf_normalize_tcp_stateful(m, off, pd, reason, th,
5413			    *state, src, dst, copyback))
5414				return (PF_DROP);
5415		}
5416
5417		/* update max window */
5418		if (src->max_win < win)
5419			src->max_win = win;
5420		/* synchronize sequencing */
5421		if (SEQ_GT(end, src->seqlo))
5422			src->seqlo = end;
5423		/* slide the window of what the other end can send */
5424		if (SEQ_GEQ(ack + (win << sws), dst->seqhi))
5425			dst->seqhi = ack + MAX((win << sws), 1);
5426
5427		/* update states */
5428		if (th->th_flags & TH_SYN)
5429			if (src->state < TCPS_SYN_SENT)
5430				pf_set_protostate(*state, psrc, TCPS_SYN_SENT);
5431		if (th->th_flags & TH_FIN)
5432			if (src->state < TCPS_CLOSING)
5433				pf_set_protostate(*state, psrc, TCPS_CLOSING);
5434		if (th->th_flags & TH_ACK) {
5435			if (dst->state == TCPS_SYN_SENT) {
5436				pf_set_protostate(*state, pdst,
5437				    TCPS_ESTABLISHED);
5438				if (src->state == TCPS_ESTABLISHED &&
5439				    (*state)->src_node != NULL &&
5440				    pf_src_connlimit(state)) {
5441					REASON_SET(reason, PFRES_SRCLIMIT);
5442					return (PF_DROP);
5443				}
5444			} else if (dst->state == TCPS_CLOSING)
5445				pf_set_protostate(*state, pdst,
5446				    TCPS_FIN_WAIT_2);
5447		}
5448		if (th->th_flags & TH_RST)
5449			pf_set_protostate(*state, PF_PEER_BOTH, TCPS_TIME_WAIT);
5450
5451		/* update expire time */
5452		(*state)->expire = pf_get_uptime();
5453		if (src->state >= TCPS_FIN_WAIT_2 &&
5454		    dst->state >= TCPS_FIN_WAIT_2)
5455			(*state)->timeout = PFTM_TCP_CLOSED;
5456		else if (src->state >= TCPS_CLOSING &&
5457		    dst->state >= TCPS_CLOSING)
5458			(*state)->timeout = PFTM_TCP_FIN_WAIT;
5459		else if (src->state < TCPS_ESTABLISHED ||
5460		    dst->state < TCPS_ESTABLISHED)
5461			(*state)->timeout = PFTM_TCP_OPENING;
5462		else if (src->state >= TCPS_CLOSING ||
5463		    dst->state >= TCPS_CLOSING)
5464			(*state)->timeout = PFTM_TCP_CLOSING;
5465		else
5466			(*state)->timeout = PFTM_TCP_ESTABLISHED;
5467
5468		/* Fall through to PASS packet */
5469
5470	} else if ((dst->state < TCPS_SYN_SENT ||
5471		dst->state >= TCPS_FIN_WAIT_2 ||
5472		src->state >= TCPS_FIN_WAIT_2) &&
5473	    SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) &&
5474	    /* Within a window forward of the originating packet */
5475	    SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW)) {
5476	    /* Within a window backward of the originating packet */
5477
5478		/*
5479		 * This currently handles three situations:
5480		 *  1) Stupid stacks will shotgun SYNs before their peer
5481		 *     replies.
5482		 *  2) When PF catches an already established stream (the
5483		 *     firewall rebooted, the state table was flushed, routes
5484		 *     changed...)
5485		 *  3) Packets get funky immediately after the connection
5486		 *     closes (this should catch Solaris spurious ACK|FINs
5487		 *     that web servers like to spew after a close)
5488		 *
5489		 * This must be a little more careful than the above code
5490		 * since packet floods will also be caught here. We don't
5491		 * update the TTL here to mitigate the damage of a packet
5492		 * flood and so the same code can handle awkward establishment
5493		 * and a loosened connection close.
5494		 * In the establishment case, a correct peer response will
5495		 * validate the connection, go through the normal state code
5496		 * and keep updating the state TTL.
5497		 */
5498
5499		if (V_pf_status.debug >= PF_DEBUG_MISC) {
5500			printf("pf: loose state match: ");
5501			pf_print_state(*state);
5502			pf_print_flags(th->th_flags);
5503			printf(" seq=%u (%u) ack=%u len=%u ackskew=%d "
5504			    "pkts=%llu:%llu dir=%s,%s\n", seq, orig_seq, ack,
5505			    pd->p_len, ackskew, (unsigned long long)(*state)->packets[0],
5506			    (unsigned long long)(*state)->packets[1],
5507			    pd->dir == PF_IN ? "in" : "out",
5508			    pd->dir == (*state)->direction ? "fwd" : "rev");
5509		}
5510
5511		if (dst->scrub || src->scrub) {
5512			if (pf_normalize_tcp_stateful(m, off, pd, reason, th,
5513			    *state, src, dst, copyback))
5514				return (PF_DROP);
5515		}
5516
5517		/* update max window */
5518		if (src->max_win < win)
5519			src->max_win = win;
5520		/* synchronize sequencing */
5521		if (SEQ_GT(end, src->seqlo))
5522			src->seqlo = end;
5523		/* slide the window of what the other end can send */
5524		if (SEQ_GEQ(ack + (win << sws), dst->seqhi))
5525			dst->seqhi = ack + MAX((win << sws), 1);
5526
5527		/*
5528		 * Cannot set dst->seqhi here since this could be a shotgunned
5529		 * SYN and not an already established connection.
5530		 */
5531
5532		if (th->th_flags & TH_FIN)
5533			if (src->state < TCPS_CLOSING)
5534				pf_set_protostate(*state, psrc, TCPS_CLOSING);
5535		if (th->th_flags & TH_RST)
5536			pf_set_protostate(*state, PF_PEER_BOTH, TCPS_TIME_WAIT);
5537
5538		/* Fall through to PASS packet */
5539
5540	} else {
5541		if ((*state)->dst.state == TCPS_SYN_SENT &&
5542		    (*state)->src.state == TCPS_SYN_SENT) {
5543			/* Send RST for state mismatches during handshake */
5544			if (!(th->th_flags & TH_RST))
5545				pf_send_tcp((*state)->rule.ptr, pd->af,
5546				    pd->dst, pd->src, th->th_dport,
5547				    th->th_sport, ntohl(th->th_ack), 0,
5548				    TH_RST, 0, 0,
5549				    (*state)->rule.ptr->return_ttl, true, 0, 0,
5550				    (*state)->act.rtableid);
5551			src->seqlo = 0;
5552			src->seqhi = 1;
5553			src->max_win = 1;
5554		} else if (V_pf_status.debug >= PF_DEBUG_MISC) {
5555			printf("pf: BAD state: ");
5556			pf_print_state(*state);
5557			pf_print_flags(th->th_flags);
5558			printf(" seq=%u (%u) ack=%u len=%u ackskew=%d "
5559			    "pkts=%llu:%llu dir=%s,%s\n",
5560			    seq, orig_seq, ack, pd->p_len, ackskew,
5561			    (unsigned long long)(*state)->packets[0],
5562			    (unsigned long long)(*state)->packets[1],
5563			    pd->dir == PF_IN ? "in" : "out",
5564			    pd->dir == (*state)->direction ? "fwd" : "rev");
5565			printf("pf: State failure on: %c %c %c %c | %c %c\n",
5566			    SEQ_GEQ(src->seqhi, end) ? ' ' : '1',
5567			    SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) ?
5568			    ' ': '2',
5569			    (ackskew >= -MAXACKWINDOW) ? ' ' : '3',
5570			    (ackskew <= (MAXACKWINDOW << sws)) ? ' ' : '4',
5571			    SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) ?' ' :'5',
5572			    SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW) ?' ' :'6');
5573		}
5574		REASON_SET(reason, PFRES_BADSTATE);
5575		return (PF_DROP);
5576	}
5577
5578	return (PF_PASS);
5579}
5580
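/*
 * Simplified TCP tracking for "sloppy" states: follow the observable flags
 * (SYN/FIN/ACK/RST) to move the peers through the usual TCP states without
 * any sequence number validation.
 */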
5581static int
5582pf_tcp_track_sloppy(struct pf_kstate **state, struct pf_pdesc *pd, u_short *reason)
5583{
5584	struct tcphdr		*th = &pd->hdr.tcp;
5585	struct pf_state_peer	*src, *dst;
5586	u_int8_t		 psrc, pdst;
5587
5588	if (pd->dir == (*state)->direction) {
5589		src = &(*state)->src;
5590		dst = &(*state)->dst;
5591		psrc = PF_PEER_SRC;
5592		pdst = PF_PEER_DST;
5593	} else {
5594		src = &(*state)->dst;
5595		dst = &(*state)->src;
5596		psrc = PF_PEER_DST;
5597		pdst = PF_PEER_SRC;
5598	}
5599
5600	if (th->th_flags & TH_SYN)
5601		if (src->state < TCPS_SYN_SENT)
5602			pf_set_protostate(*state, psrc, TCPS_SYN_SENT);
5603	if (th->th_flags & TH_FIN)
5604		if (src->state < TCPS_CLOSING)
5605			pf_set_protostate(*state, psrc, TCPS_CLOSING);
5606	if (th->th_flags & TH_ACK) {
5607		if (dst->state == TCPS_SYN_SENT) {
5608			pf_set_protostate(*state, pdst, TCPS_ESTABLISHED);
5609			if (src->state == TCPS_ESTABLISHED &&
5610			    (*state)->src_node != NULL &&
5611			    pf_src_connlimit(state)) {
5612				REASON_SET(reason, PFRES_SRCLIMIT);
5613				return (PF_DROP);
5614			}
5615		} else if (dst->state == TCPS_CLOSING) {
5616			pf_set_protostate(*state, pdst, TCPS_FIN_WAIT_2);
5617		} else if (src->state == TCPS_SYN_SENT &&
5618		    dst->state < TCPS_SYN_SENT) {
5619			/*
5620			 * Handle a special sloppy case where we only see one
5621			 * half of the connection. If there is an ACK after
5622			 * the initial SYN without ever seeing a packet from
5623			 * the destination, set the connection to established.
5624			 */
5625			pf_set_protostate(*state, PF_PEER_BOTH,
5626			    TCPS_ESTABLISHED);
5627			dst->state = src->state = TCPS_ESTABLISHED;
5628			if ((*state)->src_node != NULL &&
5629			    pf_src_connlimit(state)) {
5630				REASON_SET(reason, PFRES_SRCLIMIT);
5631				return (PF_DROP);
5632			}
5633		} else if (src->state == TCPS_CLOSING &&
5634		    dst->state == TCPS_ESTABLISHED &&
5635		    dst->seqlo == 0) {
5636			/*
5637			 * Handle the closing of half connections where we
5638			 * don't see the full bidirectional FIN/ACK+ACK
5639			 * handshake.
5640			 */
5641			pf_set_protostate(*state, pdst, TCPS_CLOSING);
5642		}
5643	}
5644	if (th->th_flags & TH_RST)
5645		pf_set_protostate(*state, PF_PEER_BOTH, TCPS_TIME_WAIT);
5646
5647	/* update expire time */
5648	(*state)->expire = pf_get_uptime();
5649	if (src->state >= TCPS_FIN_WAIT_2 &&
5650	    dst->state >= TCPS_FIN_WAIT_2)
5651		(*state)->timeout = PFTM_TCP_CLOSED;
5652	else if (src->state >= TCPS_CLOSING &&
5653	    dst->state >= TCPS_CLOSING)
5654		(*state)->timeout = PFTM_TCP_FIN_WAIT;
5655	else if (src->state < TCPS_ESTABLISHED ||
5656	    dst->state < TCPS_ESTABLISHED)
5657		(*state)->timeout = PFTM_TCP_OPENING;
5658	else if (src->state >= TCPS_CLOSING ||
5659	    dst->state >= TCPS_CLOSING)
5660		(*state)->timeout = PFTM_TCP_CLOSING;
5661	else
5662		(*state)->timeout = PFTM_TCP_ESTABLISHED;
5663
5664	return (PF_PASS);
5665}
5666
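/*
 * Handle the two synproxy phases of a state: first complete the three-way
 * handshake with the client on behalf of the server, then open the real
 * connection to the server and splice the two halves together by
 * translating sequence numbers.
 */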
5667static int
5668pf_synproxy(struct pf_pdesc *pd, struct pf_kstate **state, u_short *reason)
5669{
5670	struct pf_state_key	*sk = (*state)->key[pd->didx];
5671	struct tcphdr		*th = &pd->hdr.tcp;
5672
5673	if ((*state)->src.state == PF_TCPS_PROXY_SRC) {
5674		if (pd->dir != (*state)->direction) {
5675			REASON_SET(reason, PFRES_SYNPROXY);
5676			return (PF_SYNPROXY_DROP);
5677		}
5678		if (th->th_flags & TH_SYN) {
5679			if (ntohl(th->th_seq) != (*state)->src.seqlo) {
5680				REASON_SET(reason, PFRES_SYNPROXY);
5681				return (PF_DROP);
5682			}
5683			pf_send_tcp((*state)->rule.ptr, pd->af, pd->dst,
5684			    pd->src, th->th_dport, th->th_sport,
5685			    (*state)->src.seqhi, ntohl(th->th_seq) + 1,
5686			    TH_SYN|TH_ACK, 0, (*state)->src.mss, 0, true, 0, 0,
5687			    (*state)->act.rtableid);
5688			REASON_SET(reason, PFRES_SYNPROXY);
5689			return (PF_SYNPROXY_DROP);
5690		} else if ((th->th_flags & (TH_ACK|TH_RST|TH_FIN)) != TH_ACK ||
5691		    (ntohl(th->th_ack) != (*state)->src.seqhi + 1) ||
5692		    (ntohl(th->th_seq) != (*state)->src.seqlo + 1)) {
5693			REASON_SET(reason, PFRES_SYNPROXY);
5694			return (PF_DROP);
5695		} else if ((*state)->src_node != NULL &&
5696		    pf_src_connlimit(state)) {
5697			REASON_SET(reason, PFRES_SRCLIMIT);
5698			return (PF_DROP);
5699		} else
5700			pf_set_protostate(*state, PF_PEER_SRC,
5701			    PF_TCPS_PROXY_DST);
5702	}
5703	if ((*state)->src.state == PF_TCPS_PROXY_DST) {
5704		if (pd->dir == (*state)->direction) {
5705			if (((th->th_flags & (TH_SYN|TH_ACK)) != TH_ACK) ||
5706			    (ntohl(th->th_ack) != (*state)->src.seqhi + 1) ||
5707			    (ntohl(th->th_seq) != (*state)->src.seqlo + 1)) {
5708				REASON_SET(reason, PFRES_SYNPROXY);
5709				return (PF_DROP);
5710			}
5711			(*state)->src.max_win = MAX(ntohs(th->th_win), 1);
5712			if ((*state)->dst.seqhi == 1)
5713				(*state)->dst.seqhi = htonl(arc4random());
5714			pf_send_tcp((*state)->rule.ptr, pd->af,
5715			    &sk->addr[pd->sidx], &sk->addr[pd->didx],
5716			    sk->port[pd->sidx], sk->port[pd->didx],
5717			    (*state)->dst.seqhi, 0, TH_SYN, 0,
5718			    (*state)->src.mss, 0, false, (*state)->tag, 0,
5719			    (*state)->act.rtableid);
5720			REASON_SET(reason, PFRES_SYNPROXY);
5721			return (PF_SYNPROXY_DROP);
5722		} else if (((th->th_flags & (TH_SYN|TH_ACK)) !=
5723		    (TH_SYN|TH_ACK)) ||
5724		    (ntohl(th->th_ack) != (*state)->dst.seqhi + 1)) {
5725			REASON_SET(reason, PFRES_SYNPROXY);
5726			return (PF_DROP);
5727		} else {
5728			(*state)->dst.max_win = MAX(ntohs(th->th_win), 1);
5729			(*state)->dst.seqlo = ntohl(th->th_seq);
5730			pf_send_tcp((*state)->rule.ptr, pd->af, pd->dst,
5731			    pd->src, th->th_dport, th->th_sport,
5732			    ntohl(th->th_ack), ntohl(th->th_seq) + 1,
5733			    TH_ACK, (*state)->src.max_win, 0, 0, false,
5734			    (*state)->tag, 0, (*state)->act.rtableid);
5735			pf_send_tcp((*state)->rule.ptr, pd->af,
5736			    &sk->addr[pd->sidx], &sk->addr[pd->didx],
5737			    sk->port[pd->sidx], sk->port[pd->didx],
5738			    (*state)->src.seqhi + 1, (*state)->src.seqlo + 1,
5739			    TH_ACK, (*state)->dst.max_win, 0, 0, true, 0, 0,
5740			    (*state)->act.rtableid);
5741			(*state)->src.seqdiff = (*state)->dst.seqhi -
5742			    (*state)->src.seqlo;
5743			(*state)->dst.seqdiff = (*state)->src.seqhi -
5744			    (*state)->dst.seqlo;
5745			(*state)->src.seqhi = (*state)->src.seqlo +
5746			    (*state)->dst.max_win;
5747			(*state)->dst.seqhi = (*state)->dst.seqlo +
5748			    (*state)->src.max_win;
5749			(*state)->src.wscale = (*state)->dst.wscale = 0;
5750			pf_set_protostate(*state, PF_PEER_BOTH,
5751			    TCPS_ESTABLISHED);
5752			REASON_SET(reason, PFRES_SYNPROXY);
5753			return (PF_SYNPROXY_DROP);
5754		}
5755	}
5756
5757	return (PF_PASS);
5758}
5759
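/*
 * Look up and update the state for a TCP packet: handle synproxy and
 * syncookie-triggered state reuse first, then run either the full or the
 * sloppy tracker, and finally apply NAT rewrites and copy the modified
 * TCP header back into the mbuf.
 */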
5760static int
5761pf_test_state_tcp(struct pf_kstate **state, struct pfi_kkif *kif,
5762    struct mbuf *m, int off, void *h, struct pf_pdesc *pd,
5763    u_short *reason)
5764{
5765	struct pf_state_key_cmp	 key;
5766	struct tcphdr		*th = &pd->hdr.tcp;
5767	int			 copyback = 0;
5768	int			 action;
5769	struct pf_state_peer	*src, *dst;
5770
5771	bzero(&key, sizeof(key));
5772	key.af = pd->af;
5773	key.proto = IPPROTO_TCP;
5774	if (pd->dir == PF_IN)	{	/* wire side, straight */
5775		PF_ACPY(&key.addr[0], pd->src, key.af);
5776		PF_ACPY(&key.addr[1], pd->dst, key.af);
5777		key.port[0] = th->th_sport;
5778		key.port[1] = th->th_dport;
5779	} else {			/* stack side, reverse */
5780		PF_ACPY(&key.addr[1], pd->src, key.af);
5781		PF_ACPY(&key.addr[0], pd->dst, key.af);
5782		key.port[1] = th->th_sport;
5783		key.port[0] = th->th_dport;
5784	}
5785
5786	STATE_LOOKUP(kif, &key, *state, pd);
5787
5788	if (pd->dir == (*state)->direction) {
5789		src = &(*state)->src;
5790		dst = &(*state)->dst;
5791	} else {
5792		src = &(*state)->dst;
5793		dst = &(*state)->src;
5794	}
5795
5796	if ((action = pf_synproxy(pd, state, reason)) != PF_PASS)
5797		return (action);
5798
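	/*
	 * Both peers have reached at least FIN_WAIT_2, so a new SYN (or an
	 * inbound ACK matching a syncookie) means the connection is being
	 * reused.  Unlink the stale state and drop this packet so a fresh
	 * state can be set up.
	 */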
5799	if (dst->state >= TCPS_FIN_WAIT_2 &&
5800	    src->state >= TCPS_FIN_WAIT_2 &&
5801	    (((th->th_flags & (TH_SYN|TH_ACK)) == TH_SYN) ||
5802	    ((th->th_flags & (TH_SYN|TH_ACK|TH_RST)) == TH_ACK &&
5803	    pf_syncookie_check(pd) && pd->dir == PF_IN))) {
5804		if (V_pf_status.debug >= PF_DEBUG_MISC) {
5805			printf("pf: state reuse ");
5806			pf_print_state(*state);
5807			pf_print_flags(th->th_flags);
5808			printf("\n");
5809		}
5810		/* XXX make sure it's the same direction ?? */
5811		pf_set_protostate(*state, PF_PEER_BOTH, TCPS_CLOSED);
5812		pf_unlink_state(*state);
5813		*state = NULL;
5814		return (PF_DROP);
5815	}
5816
5817	if ((*state)->state_flags & PFSTATE_SLOPPY) {
5818		if (pf_tcp_track_sloppy(state, pd, reason) == PF_DROP)
5819			return (PF_DROP);
5820	} else {
5821		if (pf_tcp_track_full(state, kif, m, off, pd, reason,
5822		    &copyback) == PF_DROP)
5823			return (PF_DROP);
5824	}
5825
5826	/* translate source/destination address, if necessary */
5827	if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
5828		struct pf_state_key *nk = (*state)->key[pd->didx];
5829
5830		if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], pd->af) ||
5831		    nk->port[pd->sidx] != th->th_sport)
5832			pf_change_ap(m, pd->src, &th->th_sport,
5833			    pd->ip_sum, &th->th_sum, &nk->addr[pd->sidx],
5834			    nk->port[pd->sidx], 0, pd->af);
5835
5836		if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], pd->af) ||
5837		    nk->port[pd->didx] != th->th_dport)
5838			pf_change_ap(m, pd->dst, &th->th_dport,
5839			    pd->ip_sum, &th->th_sum, &nk->addr[pd->didx],
5840			    nk->port[pd->didx], 0, pd->af);
5841		copyback = 1;
5842	}
5843
5844	/* Copyback sequence modulation or stateful scrub changes if needed */
5845	if (copyback)
5846		m_copyback(m, off, sizeof(*th), (caddr_t)th);
5847
5848	return (PF_PASS);
5849}
5850
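/*
 * Track a UDP pseudo-connection: peers advance from PFUDPS_SINGLE to
 * PFUDPS_MULTIPLE as traffic is seen in each direction, which selects the
 * matching timeout.  NAT rewrites are applied afterwards.
 */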
5851static int
5852pf_test_state_udp(struct pf_kstate **state, struct pfi_kkif *kif,
5853    struct mbuf *m, int off, void *h, struct pf_pdesc *pd)
5854{
5855	struct pf_state_peer	*src, *dst;
5856	struct pf_state_key_cmp	 key;
5857	struct udphdr		*uh = &pd->hdr.udp;
5858	uint8_t			 psrc, pdst;
5859
5860	bzero(&key, sizeof(key));
5861	key.af = pd->af;
5862	key.proto = IPPROTO_UDP;
5863	if (pd->dir == PF_IN)	{	/* wire side, straight */
5864		PF_ACPY(&key.addr[0], pd->src, key.af);
5865		PF_ACPY(&key.addr[1], pd->dst, key.af);
5866		key.port[0] = uh->uh_sport;
5867		key.port[1] = uh->uh_dport;
5868	} else {			/* stack side, reverse */
5869		PF_ACPY(&key.addr[1], pd->src, key.af);
5870		PF_ACPY(&key.addr[0], pd->dst, key.af);
5871		key.port[1] = uh->uh_sport;
5872		key.port[0] = uh->uh_dport;
5873	}
5874
5875	STATE_LOOKUP(kif, &key, *state, pd);
5876
5877	if (pd->dir == (*state)->direction) {
5878		src = &(*state)->src;
5879		dst = &(*state)->dst;
5880		psrc = PF_PEER_SRC;
5881		pdst = PF_PEER_DST;
5882	} else {
5883		src = &(*state)->dst;
5884		dst = &(*state)->src;
5885		psrc = PF_PEER_DST;
5886		pdst = PF_PEER_SRC;
5887	}
5888
5889	/* update states */
5890	if (src->state < PFUDPS_SINGLE)
5891		pf_set_protostate(*state, psrc, PFUDPS_SINGLE);
5892	if (dst->state == PFUDPS_SINGLE)
5893		pf_set_protostate(*state, pdst, PFUDPS_MULTIPLE);
5894
5895	/* update expire time */
5896	(*state)->expire = pf_get_uptime();
5897	if (src->state == PFUDPS_MULTIPLE && dst->state == PFUDPS_MULTIPLE)
5898		(*state)->timeout = PFTM_UDP_MULTIPLE;
5899	else
5900		(*state)->timeout = PFTM_UDP_SINGLE;
5901
5902	/* translate source/destination address, if necessary */
5903	if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
5904		struct pf_state_key *nk = (*state)->key[pd->didx];
5905
5906		if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], pd->af) ||
5907		    nk->port[pd->sidx] != uh->uh_sport)
5908			pf_change_ap(m, pd->src, &uh->uh_sport, pd->ip_sum,
5909			    &uh->uh_sum, &nk->addr[pd->sidx],
5910			    nk->port[pd->sidx], 1, pd->af);
5911
5912		if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], pd->af) ||
5913		    nk->port[pd->didx] != uh->uh_dport)
5914			pf_change_ap(m, pd->dst, &uh->uh_dport, pd->ip_sum,
5915			    &uh->uh_sum, &nk->addr[pd->didx],
5916			    nk->port[pd->didx], 1, pd->af);
5917		m_copyback(m, off, sizeof(*uh), (caddr_t)uh);
5918	}
5919
5920	return (PF_PASS);
5921}
5922
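/*
 * Track an SCTP association: advance the peer's protocol state based on the
 * chunk types collected in pd->sctp_flags (INIT -> COOKIE_WAIT,
 * COOKIE/HEARTBEAT-ACK -> ESTABLISHED, SHUTDOWN/ABORT -> SHUTDOWN_PENDING,
 * SHUTDOWN_COMPLETE -> CLOSED), record and verify the verification tags and
 * apply NAT rewrites.
 */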
5923static int
5924pf_test_state_sctp(struct pf_kstate **state, struct pfi_kkif *kif,
5925    struct mbuf *m, int off, void *h, struct pf_pdesc *pd, u_short *reason)
5926{
5927	struct pf_state_key_cmp	 key;
5928	struct pf_state_peer	*src, *dst;
5929	struct sctphdr		*sh = &pd->hdr.sctp;
5930	u_int8_t		 psrc; //, pdst;
5931
5932	bzero(&key, sizeof(key));
5933	key.af = pd->af;
5934	key.proto = IPPROTO_SCTP;
5935	if (pd->dir == PF_IN)	{	/* wire side, straight */
5936		PF_ACPY(&key.addr[0], pd->src, key.af);
5937		PF_ACPY(&key.addr[1], pd->dst, key.af);
5938		key.port[0] = sh->src_port;
5939		key.port[1] = sh->dest_port;
5940	} else {			/* stack side, reverse */
5941		PF_ACPY(&key.addr[1], pd->src, key.af);
5942		PF_ACPY(&key.addr[0], pd->dst, key.af);
5943		key.port[1] = sh->src_port;
5944		key.port[0] = sh->dest_port;
5945	}
5946
5947	STATE_LOOKUP(kif, &key, *state, pd);
5948
5949	if (pd->dir == (*state)->direction) {
5950		src = &(*state)->src;
5951		dst = &(*state)->dst;
5952		psrc = PF_PEER_SRC;
5953	} else {
5954		src = &(*state)->dst;
5955		dst = &(*state)->src;
5956		psrc = PF_PEER_DST;
5957	}
5958
5959	/* Track state. */
5960	if (pd->sctp_flags & PFDESC_SCTP_INIT) {
5961		if (src->state < SCTP_COOKIE_WAIT) {
5962			pf_set_protostate(*state, psrc, SCTP_COOKIE_WAIT);
5963			(*state)->timeout = PFTM_SCTP_OPENING;
5964		}
5965	}
5966	if (pd->sctp_flags & PFDESC_SCTP_INIT_ACK) {
5967		MPASS(dst->scrub != NULL);
5968		if (dst->scrub->pfss_v_tag == 0)
5969			dst->scrub->pfss_v_tag = pd->sctp_initiate_tag;
5970	}
5971
5972	if (pd->sctp_flags & (PFDESC_SCTP_COOKIE | PFDESC_SCTP_HEARTBEAT_ACK)) {
5973		if (src->state < SCTP_ESTABLISHED) {
5974			pf_set_protostate(*state, psrc, SCTP_ESTABLISHED);
5975			(*state)->timeout = PFTM_SCTP_ESTABLISHED;
5976		}
5977	}
5978	if (pd->sctp_flags & (PFDESC_SCTP_SHUTDOWN | PFDESC_SCTP_ABORT |
5979	    PFDESC_SCTP_SHUTDOWN_COMPLETE)) {
5980		if (src->state < SCTP_SHUTDOWN_PENDING) {
5981			pf_set_protostate(*state, psrc, SCTP_SHUTDOWN_PENDING);
5982			(*state)->timeout = PFTM_SCTP_CLOSING;
5983		}
5984	}
5985	if (pd->sctp_flags & (PFDESC_SCTP_SHUTDOWN_COMPLETE)) {
5986		pf_set_protostate(*state, psrc, SCTP_CLOSED);
5987		(*state)->timeout = PFTM_SCTP_CLOSED;
5988	}
5989
5990	if (src->scrub != NULL) {
5991		if (src->scrub->pfss_v_tag == 0) {
5992			src->scrub->pfss_v_tag = pd->hdr.sctp.v_tag;
5993		} else  if (src->scrub->pfss_v_tag != pd->hdr.sctp.v_tag)
5994			return (PF_DROP);
5995	}
5996
5997	(*state)->expire = pf_get_uptime();
5998
5999	/* translate source/destination address, if necessary */
6000	if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
6001		uint16_t checksum = 0;
6002		struct pf_state_key *nk = (*state)->key[pd->didx];
6003
6004		if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], pd->af) ||
6005		    nk->port[pd->sidx] != pd->hdr.sctp.src_port) {
6006			pf_change_ap(m, pd->src, &pd->hdr.sctp.src_port,
6007			    pd->ip_sum, &checksum, &nk->addr[pd->sidx],
6008			    nk->port[pd->sidx], 1, pd->af);
6009		}
6010
6011		if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], pd->af) ||
6012		    nk->port[pd->didx] != pd->hdr.sctp.dest_port) {
6013			pf_change_ap(m, pd->dst, &pd->hdr.sctp.dest_port,
6014			    pd->ip_sum, &checksum, &nk->addr[pd->didx],
6015			    nk->port[pd->didx], 1, pd->af);
6016		}
6017	}
6018
6019	return (PF_PASS);
6020}
6021
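/*
 * A state is being torn down: remove its wire address from the per-v_tag
 * multihome endpoint lists in both directions and free an endpoint once
 * its last source address is gone.
 */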
6022static void
6023pf_sctp_multihome_detach_addr(const struct pf_kstate *s)
6024{
6025	struct pf_sctp_endpoint key;
6026	struct pf_sctp_endpoint *ep;
6027	struct pf_state_key *sks = s->key[PF_SK_STACK];
6028	struct pf_sctp_source *i, *tmp;
6029
6030	if (sks == NULL || sks->proto != IPPROTO_SCTP || s->dst.scrub == NULL)
6031		return;
6032
6033	PF_SCTP_ENDPOINTS_LOCK();
6034
6035	key.v_tag = s->dst.scrub->pfss_v_tag;
6036	ep  = RB_FIND(pf_sctp_endpoints, &V_pf_sctp_endpoints, &key);
6037	if (ep != NULL) {
6038		TAILQ_FOREACH_SAFE(i, &ep->sources, entry, tmp) {
6039			if (pf_addr_cmp(&i->addr,
6040			    &s->key[PF_SK_WIRE]->addr[s->direction == PF_OUT],
6041			    s->key[PF_SK_WIRE]->af) == 0) {
6042				SDT_PROBE3(pf, sctp, multihome, remove,
6043				    key.v_tag, s, i);
6044				TAILQ_REMOVE(&ep->sources, i, entry);
6045				free(i, M_PFTEMP);
6046				break;
6047			}
6048		}
6049
6050		if (TAILQ_EMPTY(&ep->sources)) {
6051			RB_REMOVE(pf_sctp_endpoints, &V_pf_sctp_endpoints, ep);
6052			free(ep, M_PFTEMP);
6053		}
6054	}
6055
6056	/* Other direction. */
6057	key.v_tag = s->src.scrub->pfss_v_tag;
6058	ep = RB_FIND(pf_sctp_endpoints, &V_pf_sctp_endpoints, &key);
6059	if (ep != NULL) {
6060		TAILQ_FOREACH_SAFE(i, &ep->sources, entry, tmp) {
6061			if (pf_addr_cmp(&i->addr,
6062			    &s->key[PF_SK_WIRE]->addr[s->direction == PF_IN],
6063			    s->key[PF_SK_WIRE]->af) == 0) {
6064				SDT_PROBE3(pf, sctp, multihome, remove,
6065				    key.v_tag, s, i);
6066				TAILQ_REMOVE(&ep->sources, i, entry);
6067				free(i, M_PFTEMP);
6068				break;
6069			}
6070		}
6071
6072		if (TAILQ_EMPTY(&ep->sources)) {
6073			RB_REMOVE(pf_sctp_endpoints, &V_pf_sctp_endpoints, ep);
6074			free(ep, M_PFTEMP);
6075		}
6076	}
6077
6078	PF_SCTP_ENDPOINTS_UNLOCK();
6079}
6080
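/*
 * Remember an additional source address for the SCTP endpoint identified
 * by v_tag, allocating the endpoint entry on first use and skipping
 * duplicates.
 */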
6081static void
6082pf_sctp_multihome_add_addr(struct pf_pdesc *pd, struct pf_addr *a, uint32_t v_tag)
6083{
6084	struct pf_sctp_endpoint key = {
6085		.v_tag = v_tag,
6086	};
6087	struct pf_sctp_source *i;
6088	struct pf_sctp_endpoint *ep;
6089
6090	PF_SCTP_ENDPOINTS_LOCK();
6091
6092	ep = RB_FIND(pf_sctp_endpoints, &V_pf_sctp_endpoints, &key);
6093	if (ep == NULL) {
6094		ep = malloc(sizeof(struct pf_sctp_endpoint),
6095		    M_PFTEMP, M_NOWAIT);
6096		if (ep == NULL) {
6097			PF_SCTP_ENDPOINTS_UNLOCK();
6098			return;
6099		}
6100
6101		ep->v_tag = v_tag;
6102		TAILQ_INIT(&ep->sources);
6103		RB_INSERT(pf_sctp_endpoints, &V_pf_sctp_endpoints, ep);
6104	}
6105
6106	/* Avoid inserting duplicates. */
6107	TAILQ_FOREACH(i, &ep->sources, entry) {
6108		if (pf_addr_cmp(&i->addr, a, pd->af) == 0) {
6109			PF_SCTP_ENDPOINTS_UNLOCK();
6110			return;
6111		}
6112	}
6113
6114	i = malloc(sizeof(*i), M_PFTEMP, M_NOWAIT);
6115	if (i == NULL) {
6116		PF_SCTP_ENDPOINTS_UNLOCK();
6117		return;
6118	}
6119
6120	i->af = pd->af;
6121	memcpy(&i->addr, a, sizeof(*a));
6122	TAILQ_INSERT_TAIL(&ep->sources, i, entry);
6123	SDT_PROBE2(pf, sctp, multihome, add, v_tag, i);
6124
6125	PF_SCTP_ENDPOINTS_UNLOCK();
6126}
6127
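/*
 * Run the multihome jobs queued by pf_multihome_scan(), which could not
 * handle them directly while holding the state lock.  SCTP_ADD_IP_ADDRESS
 * jobs create additional floating states for the advertised addresses,
 * inheriting the verification tags of the original state;
 * SCTP_DEL_IP_ADDRESS jobs push matching states towards shutdown.  Jobs
 * queued while walking the list are handled in a second pass with
 * do_extra disabled.
 */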
6128static void
6129pf_sctp_multihome_delayed(struct pf_pdesc *pd, int off, struct pfi_kkif *kif,
6130    struct pf_kstate *s, int action)
6131{
6132	struct pf_sctp_multihome_job	*j, *tmp;
6133	struct pf_sctp_source		*i;
6134	int			 ret __unused;
6135	struct pf_kstate	*sm = NULL;
6136	struct pf_krule		*ra = NULL;
6137	struct pf_krule		*r = &V_pf_default_rule;
6138	struct pf_kruleset	*rs = NULL;
6139	bool do_extra = true;
6140
6141	PF_RULES_RLOCK_TRACKER;
6142
6143again:
6144	TAILQ_FOREACH_SAFE(j, &pd->sctp_multihome_jobs, next, tmp) {
6145		if (s == NULL || action != PF_PASS)
6146			goto free;
6147
6148		/* Confirm we don't recurse here. */
6149		MPASS(! (pd->sctp_flags & PFDESC_SCTP_ADD_IP));
6150
6151		switch (j->op) {
6152		case  SCTP_ADD_IP_ADDRESS: {
6153			uint32_t v_tag = pd->sctp_initiate_tag;
6154
6155			if (v_tag == 0) {
6156				if (s->direction == pd->dir)
6157					v_tag = s->src.scrub->pfss_v_tag;
6158				else
6159					v_tag = s->dst.scrub->pfss_v_tag;
6160			}
6161
6162			/*
6163			 * Avoid duplicating states. We'll already have
6164			 * created a state based on the source address of
6165			 * the packet, but SCTP endpoints may also list this
6166			 * address again in the INIT(_ACK) parameters.
6167			 */
6168			if (pf_addr_cmp(&j->src, pd->src, pd->af) == 0) {
6169				break;
6170			}
6171
6172			j->pd.sctp_flags |= PFDESC_SCTP_ADD_IP;
6173			PF_RULES_RLOCK();
6174			sm = NULL;
6175			/*
6176			 * New connections need to be floating, because
			 * we cannot know which interfaces they will use.
6178			 * That's why we pass V_pfi_all rather than kif.
6179			 */
6180			ret = pf_test_rule(&r, &sm, V_pfi_all,
6181			    j->m, off, &j->pd, &ra, &rs, NULL);
6182			PF_RULES_RUNLOCK();
6183			SDT_PROBE4(pf, sctp, multihome, test, kif, r, j->m, ret);
6184			if (ret != PF_DROP && sm != NULL) {
6185				/* Inherit v_tag values. */
6186				if (sm->direction == s->direction) {
6187					sm->src.scrub->pfss_v_tag = s->src.scrub->pfss_v_tag;
6188					sm->dst.scrub->pfss_v_tag = s->dst.scrub->pfss_v_tag;
6189				} else {
6190					sm->src.scrub->pfss_v_tag = s->dst.scrub->pfss_v_tag;
6191					sm->dst.scrub->pfss_v_tag = s->src.scrub->pfss_v_tag;
6192				}
6193				PF_STATE_UNLOCK(sm);
6194			} else {
				/* This can happen if we try duplicate inserts. */
6196				break;
6197			}
6198
6199			/* Only add the address if we've actually allowed the state. */
6200			pf_sctp_multihome_add_addr(pd, &j->src, v_tag);
6201
6202			if (! do_extra) {
6203				break;
6204			}
6205			/*
6206			 * We need to do this for each of our source addresses.
6207			 * Find those based on the verification tag.
6208			 */
6209			struct pf_sctp_endpoint key = {
6210				.v_tag = pd->hdr.sctp.v_tag,
6211			};
6212			struct pf_sctp_endpoint *ep;
6213
6214			PF_SCTP_ENDPOINTS_LOCK();
6215			ep = RB_FIND(pf_sctp_endpoints, &V_pf_sctp_endpoints, &key);
6216			if (ep == NULL) {
6217				PF_SCTP_ENDPOINTS_UNLOCK();
6218				break;
6219			}
6220			MPASS(ep != NULL);
6221
6222			TAILQ_FOREACH(i, &ep->sources, entry) {
6223				struct pf_sctp_multihome_job *nj;
6224
6225				/* SCTP can intermingle IPv4 and IPv6. */
6226				if (i->af != pd->af)
6227					continue;
6228
6229				nj = malloc(sizeof(*nj), M_PFTEMP, M_NOWAIT | M_ZERO);
6230				if (! nj) {
6231					continue;
6232				}
6233				memcpy(&nj->pd, &j->pd, sizeof(j->pd));
6234				memcpy(&nj->src, &j->src, sizeof(nj->src));
6235				nj->pd.src = &nj->src;
			/* New destination address! */
6237				memcpy(&nj->dst, &i->addr, sizeof(nj->dst));
6238				nj->pd.dst = &nj->dst;
6239				nj->m = j->m;
6240				nj->op = j->op;
6241
6242				TAILQ_INSERT_TAIL(&pd->sctp_multihome_jobs, nj, next);
6243			}
6244			PF_SCTP_ENDPOINTS_UNLOCK();
6245
6246			break;
6247		}
6248		case SCTP_DEL_IP_ADDRESS: {
6249			struct pf_state_key_cmp key;
6250			uint8_t psrc;
6251
6252			bzero(&key, sizeof(key));
6253			key.af = j->pd.af;
6254			key.proto = IPPROTO_SCTP;
6255			if (j->pd.dir == PF_IN)	{	/* wire side, straight */
6256				PF_ACPY(&key.addr[0], j->pd.src, key.af);
6257				PF_ACPY(&key.addr[1], j->pd.dst, key.af);
6258				key.port[0] = j->pd.hdr.sctp.src_port;
6259				key.port[1] = j->pd.hdr.sctp.dest_port;
6260			} else {			/* stack side, reverse */
6261				PF_ACPY(&key.addr[1], j->pd.src, key.af);
6262				PF_ACPY(&key.addr[0], j->pd.dst, key.af);
6263				key.port[1] = j->pd.hdr.sctp.src_port;
6264				key.port[0] = j->pd.hdr.sctp.dest_port;
6265			}
6266
6267			sm = pf_find_state(kif, &key, j->pd.dir);
6268			if (sm != NULL) {
6269				PF_STATE_LOCK_ASSERT(sm);
6270				if (j->pd.dir == sm->direction) {
6271					psrc = PF_PEER_SRC;
6272				} else {
6273					psrc = PF_PEER_DST;
6274				}
6275				pf_set_protostate(sm, psrc, SCTP_SHUTDOWN_PENDING);
6276				sm->timeout = PFTM_SCTP_CLOSING;
6277				PF_STATE_UNLOCK(sm);
6278			}
			break;
		}
		default:
			panic("Unknown op %#x", j->op);
		}
6284
6285	free:
6286		TAILQ_REMOVE(&pd->sctp_multihome_jobs, j, next);
6287		free(j, M_PFTEMP);
6288	}
6289
6290	/* We may have inserted extra work while processing the list. */
6291	if (! TAILQ_EMPTY(&pd->sctp_multihome_jobs)) {
6292		do_extra = false;
6293		goto again;
6294	}
6295}
6296
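/*
 * Walk the SCTP parameters in [start, start + len), queueing a multihome
 * job for every IPv4/IPv6 address parameter and recursing into ASCONF
 * ADD/DEL-IP parameters.  The jobs are executed later by
 * pf_sctp_multihome_delayed().
 */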
6297static int
6298pf_multihome_scan(struct mbuf *m, int start, int len, struct pf_pdesc *pd,
6299    struct pfi_kkif *kif, int op)
6300{
6301	int			 off = 0;
6302	struct pf_sctp_multihome_job	*job;
6303
6304	while (off < len) {
6305		struct sctp_paramhdr h;
6306
6307		if (!pf_pull_hdr(m, start + off, &h, sizeof(h), NULL, NULL,
6308		    pd->af))
6309			return (PF_DROP);
6310
6311		/* Parameters are at least 4 bytes. */
6312		if (ntohs(h.param_length) < 4)
6313			return (PF_DROP);
6314
6315		switch (ntohs(h.param_type)) {
6316		case  SCTP_IPV4_ADDRESS: {
6317			struct in_addr t;
6318
6319			if (ntohs(h.param_length) !=
6320			    (sizeof(struct sctp_paramhdr) + sizeof(t)))
6321				return (PF_DROP);
6322
6323			if (!pf_pull_hdr(m, start + off + sizeof(h), &t, sizeof(t),
6324			    NULL, NULL, pd->af))
6325				return (PF_DROP);
6326
6327			if (in_nullhost(t))
6328				t.s_addr = pd->src->v4.s_addr;
6329
6330			/*
6331			 * We hold the state lock (idhash) here, which means
6332			 * that we can't acquire the keyhash, or we'll get a
6333			 * LOR (and potentially double-lock things too). We also
6334			 * can't release the state lock here, so instead we'll
6335			 * enqueue this for async handling.
6336			 * There's a relatively small race here, in that a
			 * packet using the new addresses could arrive before
			 * the job runs, but that's just tough luck for it.
6339			 */
6340			job = malloc(sizeof(*job), M_PFTEMP, M_NOWAIT | M_ZERO);
6341			if (! job)
6342				return (PF_DROP);
6343
6344			memcpy(&job->pd, pd, sizeof(*pd));
6345
			/* New source address! */
6347			memcpy(&job->src, &t, sizeof(t));
6348			job->pd.src = &job->src;
6349			memcpy(&job->dst, pd->dst, sizeof(job->dst));
6350			job->pd.dst = &job->dst;
6351			job->m = m;
6352			job->op = op;
6353
6354			TAILQ_INSERT_TAIL(&pd->sctp_multihome_jobs, job, next);
6355			break;
6356		}
6357#ifdef INET6
6358		case SCTP_IPV6_ADDRESS: {
6359			struct in6_addr t;
6360
6361			if (ntohs(h.param_length) !=
6362			    (sizeof(struct sctp_paramhdr) + sizeof(t)))
6363				return (PF_DROP);
6364
6365			if (!pf_pull_hdr(m, start + off + sizeof(h), &t, sizeof(t),
6366			    NULL, NULL, pd->af))
6367				return (PF_DROP);
6368			if (memcmp(&t, &pd->src->v6, sizeof(t)) == 0)
6369				break;
6370			if (memcmp(&t, &in6addr_any, sizeof(t)) == 0)
6371				memcpy(&t, &pd->src->v6, sizeof(t));
6372
6373			job = malloc(sizeof(*job), M_PFTEMP, M_NOWAIT | M_ZERO);
6374			if (! job)
6375				return (PF_DROP);
6376
6377			memcpy(&job->pd, pd, sizeof(*pd));
6378			memcpy(&job->src, &t, sizeof(t));
6379			job->pd.src = &job->src;
6380			memcpy(&job->dst, pd->dst, sizeof(job->dst));
6381			job->pd.dst = &job->dst;
6382			job->m = m;
6383			job->op = op;
6384
6385			TAILQ_INSERT_TAIL(&pd->sctp_multihome_jobs, job, next);
6386			break;
6387		}
6388#endif
6389		case SCTP_ADD_IP_ADDRESS: {
6390			int ret;
6391			struct sctp_asconf_paramhdr ah;
6392
6393			if (!pf_pull_hdr(m, start + off, &ah, sizeof(ah),
6394			    NULL, NULL, pd->af))
6395				return (PF_DROP);
6396
6397			ret = pf_multihome_scan(m, start + off + sizeof(ah),
6398			    ntohs(ah.ph.param_length) - sizeof(ah), pd, kif,
6399			    SCTP_ADD_IP_ADDRESS);
6400			if (ret != PF_PASS)
6401				return (ret);
6402			break;
6403		}
6404		case SCTP_DEL_IP_ADDRESS: {
6405			int ret;
6406			struct sctp_asconf_paramhdr ah;
6407
6408			if (!pf_pull_hdr(m, start + off, &ah, sizeof(ah),
6409			    NULL, NULL, pd->af))
6410				return (PF_DROP);
6411			ret = pf_multihome_scan(m, start + off + sizeof(ah),
6412			    ntohs(ah.ph.param_length) - sizeof(ah), pd, kif,
6413			    SCTP_DEL_IP_ADDRESS);
6414			if (ret != PF_PASS)
6415				return (ret);
6416			break;
6417		}
6418		default:
6419			break;
6420		}
6421
6422		off += roundup(ntohs(h.param_length), 4);
6423	}
6424
6425	return (PF_PASS);
6426}
6427int
6428pf_multihome_scan_init(struct mbuf *m, int start, int len, struct pf_pdesc *pd,
6429    struct pfi_kkif *kif)
6430{
6431	start += sizeof(struct sctp_init_chunk);
6432	len -= sizeof(struct sctp_init_chunk);
6433
6434	return (pf_multihome_scan(m, start, len, pd, kif, SCTP_ADD_IP_ADDRESS));
6435}
6436
6437int
6438pf_multihome_scan_asconf(struct mbuf *m, int start, int len,
6439    struct pf_pdesc *pd, struct pfi_kkif *kif)
6440{
6441	start += sizeof(struct sctp_asconf_chunk);
6442	len -= sizeof(struct sctp_asconf_chunk);
6443
6444	return (pf_multihome_scan(m, start, len, pd, kif, SCTP_ADD_IP_ADDRESS));
6445}
6446
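/*
 * Match ICMP/ICMPv6 packets against existing states.  Queries and replies
 * are looked up by their ICMP id.  Error messages carry the start of the
 * offending packet, so the embedded IP header and the first bytes of the
 * inner protocol header are pulled up to find the state of the original
 * connection; for TCP the embedded sequence number is also sanity checked.
 * NAT rewrites are applied to both the outer and the embedded headers.
 */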
6447static int
6448pf_test_state_icmp(struct pf_kstate **state, struct pfi_kkif *kif,
6449    struct mbuf *m, int off, void *h, struct pf_pdesc *pd, u_short *reason)
6450{
6451	struct pf_addr  *saddr = pd->src, *daddr = pd->dst;
6452	u_int16_t	 icmpid = 0, *icmpsum;
6453	u_int8_t	 icmptype, icmpcode;
6454	int		 state_icmp = 0;
6455	struct pf_state_key_cmp key;
6456
6457	bzero(&key, sizeof(key));
6458	switch (pd->proto) {
6459#ifdef INET
6460	case IPPROTO_ICMP:
6461		icmptype = pd->hdr.icmp.icmp_type;
6462		icmpcode = pd->hdr.icmp.icmp_code;
6463		icmpid = pd->hdr.icmp.icmp_id;
6464		icmpsum = &pd->hdr.icmp.icmp_cksum;
6465
6466		if (icmptype == ICMP_UNREACH ||
6467		    icmptype == ICMP_SOURCEQUENCH ||
6468		    icmptype == ICMP_REDIRECT ||
6469		    icmptype == ICMP_TIMXCEED ||
6470		    icmptype == ICMP_PARAMPROB)
6471			state_icmp++;
6472		break;
6473#endif /* INET */
6474#ifdef INET6
6475	case IPPROTO_ICMPV6:
6476		icmptype = pd->hdr.icmp6.icmp6_type;
6477		icmpcode = pd->hdr.icmp6.icmp6_code;
6478		icmpid = pd->hdr.icmp6.icmp6_id;
6479		icmpsum = &pd->hdr.icmp6.icmp6_cksum;
6480
6481		if (icmptype == ICMP6_DST_UNREACH ||
6482		    icmptype == ICMP6_PACKET_TOO_BIG ||
6483		    icmptype == ICMP6_TIME_EXCEEDED ||
6484		    icmptype == ICMP6_PARAM_PROB)
6485			state_icmp++;
6486		break;
6487#endif /* INET6 */
6488	}
6489
6490	if (!state_icmp) {
6491		/*
6492		 * ICMP query/reply message not related to a TCP/UDP packet.
6493		 * Search for an ICMP state.
6494		 */
6495		key.af = pd->af;
6496		key.proto = pd->proto;
6497		key.port[0] = key.port[1] = icmpid;
6498		if (pd->dir == PF_IN)	{	/* wire side, straight */
6499			PF_ACPY(&key.addr[0], pd->src, key.af);
6500			PF_ACPY(&key.addr[1], pd->dst, key.af);
6501		} else {			/* stack side, reverse */
6502			PF_ACPY(&key.addr[1], pd->src, key.af);
6503			PF_ACPY(&key.addr[0], pd->dst, key.af);
6504		}
6505
6506		STATE_LOOKUP(kif, &key, *state, pd);
6507
6508		(*state)->expire = pf_get_uptime();
6509		(*state)->timeout = PFTM_ICMP_ERROR_REPLY;
6510
6511		/* translate source/destination address, if necessary */
6512		if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
6513			struct pf_state_key *nk = (*state)->key[pd->didx];
6514
6515			switch (pd->af) {
6516#ifdef INET
6517			case AF_INET:
6518				if (PF_ANEQ(pd->src,
6519				    &nk->addr[pd->sidx], AF_INET))
6520					pf_change_a(&saddr->v4.s_addr,
6521					    pd->ip_sum,
6522					    nk->addr[pd->sidx].v4.s_addr, 0);
6523
6524				if (PF_ANEQ(pd->dst, &nk->addr[pd->didx],
6525				    AF_INET))
6526					pf_change_a(&daddr->v4.s_addr,
6527					    pd->ip_sum,
6528					    nk->addr[pd->didx].v4.s_addr, 0);
6529
6530				if (nk->port[0] !=
6531				    pd->hdr.icmp.icmp_id) {
6532					pd->hdr.icmp.icmp_cksum =
6533					    pf_cksum_fixup(
6534					    pd->hdr.icmp.icmp_cksum, icmpid,
6535					    nk->port[pd->sidx], 0);
6536					pd->hdr.icmp.icmp_id =
6537					    nk->port[pd->sidx];
6538				}
6539
6540				m_copyback(m, off, ICMP_MINLEN,
6541				    (caddr_t )&pd->hdr.icmp);
6542				break;
6543#endif /* INET */
6544#ifdef INET6
6545			case AF_INET6:
6546				if (PF_ANEQ(pd->src,
6547				    &nk->addr[pd->sidx], AF_INET6))
6548					pf_change_a6(saddr,
6549					    &pd->hdr.icmp6.icmp6_cksum,
6550					    &nk->addr[pd->sidx], 0);
6551
6552				if (PF_ANEQ(pd->dst,
6553				    &nk->addr[pd->didx], AF_INET6))
6554					pf_change_a6(daddr,
6555					    &pd->hdr.icmp6.icmp6_cksum,
6556					    &nk->addr[pd->didx], 0);
6557
6558				m_copyback(m, off, sizeof(struct icmp6_hdr),
6559				    (caddr_t )&pd->hdr.icmp6);
6560				break;
6561#endif /* INET6 */
6562			}
6563		}
6564		return (PF_PASS);
6565
6566	} else {
6567		/*
6568		 * ICMP error message in response to a TCP/UDP packet.
6569		 * Extract the inner TCP/UDP header and search for that state.
6570		 */
6571
6572		struct pf_pdesc	pd2;
6573		bzero(&pd2, sizeof pd2);
6574#ifdef INET
6575		struct ip	h2;
6576#endif /* INET */
6577#ifdef INET6
6578		struct ip6_hdr	h2_6;
6579		int		terminal = 0;
6580#endif /* INET6 */
6581		int		ipoff2 = 0;
6582		int		off2 = 0;
6583
6584		pd2.af = pd->af;
6585		/* Payload packet is from the opposite direction. */
6586		pd2.sidx = (pd->dir == PF_IN) ? 1 : 0;
6587		pd2.didx = (pd->dir == PF_IN) ? 0 : 1;
6588		switch (pd->af) {
6589#ifdef INET
6590		case AF_INET:
6591			/* offset of h2 in mbuf chain */
6592			ipoff2 = off + ICMP_MINLEN;
6593
6594			if (!pf_pull_hdr(m, ipoff2, &h2, sizeof(h2),
6595			    NULL, reason, pd2.af)) {
6596				DPFPRINTF(PF_DEBUG_MISC,
6597				    ("pf: ICMP error message too short "
6598				    "(ip)\n"));
6599				return (PF_DROP);
6600			}
6601			/*
6602			 * ICMP error messages don't refer to non-first
6603			 * fragments
6604			 */
6605			if (h2.ip_off & htons(IP_OFFMASK)) {
6606				REASON_SET(reason, PFRES_FRAG);
6607				return (PF_DROP);
6608			}
6609
6610			/* offset of protocol header that follows h2 */
6611			off2 = ipoff2 + (h2.ip_hl << 2);
6612
6613			pd2.proto = h2.ip_p;
6614			pd2.src = (struct pf_addr *)&h2.ip_src;
6615			pd2.dst = (struct pf_addr *)&h2.ip_dst;
6616			pd2.ip_sum = &h2.ip_sum;
6617			break;
6618#endif /* INET */
6619#ifdef INET6
6620		case AF_INET6:
6621			ipoff2 = off + sizeof(struct icmp6_hdr);
6622
6623			if (!pf_pull_hdr(m, ipoff2, &h2_6, sizeof(h2_6),
6624			    NULL, reason, pd2.af)) {
6625				DPFPRINTF(PF_DEBUG_MISC,
6626				    ("pf: ICMP error message too short "
6627				    "(ip6)\n"));
6628				return (PF_DROP);
6629			}
6630			pd2.proto = h2_6.ip6_nxt;
6631			pd2.src = (struct pf_addr *)&h2_6.ip6_src;
6632			pd2.dst = (struct pf_addr *)&h2_6.ip6_dst;
6633			pd2.ip_sum = NULL;
6634			off2 = ipoff2 + sizeof(h2_6);
6635			do {
6636				switch (pd2.proto) {
6637				case IPPROTO_FRAGMENT:
6638					/*
6639					 * ICMPv6 error messages for
6640					 * non-first fragments
6641					 */
6642					REASON_SET(reason, PFRES_FRAG);
6643					return (PF_DROP);
6644				case IPPROTO_AH:
6645				case IPPROTO_HOPOPTS:
6646				case IPPROTO_ROUTING:
6647				case IPPROTO_DSTOPTS: {
6648					/* get next header and header length */
6649					struct ip6_ext opt6;
6650
6651					if (!pf_pull_hdr(m, off2, &opt6,
6652					    sizeof(opt6), NULL, reason,
6653					    pd2.af)) {
6654						DPFPRINTF(PF_DEBUG_MISC,
6655						    ("pf: ICMPv6 short opt\n"));
6656						return (PF_DROP);
6657					}
6658					if (pd2.proto == IPPROTO_AH)
6659						off2 += (opt6.ip6e_len + 2) * 4;
6660					else
6661						off2 += (opt6.ip6e_len + 1) * 8;
6662					pd2.proto = opt6.ip6e_nxt;
					/* go to the next header */
6664					break;
6665				}
6666				default:
6667					terminal++;
6668					break;
6669				}
6670			} while (!terminal);
6671			break;
6672#endif /* INET6 */
6673		}
6674
6675		if (PF_ANEQ(pd->dst, pd2.src, pd->af)) {
6676			if (V_pf_status.debug >= PF_DEBUG_MISC) {
6677				printf("pf: BAD ICMP %d:%d outer dst: ",
6678				    icmptype, icmpcode);
6679				pf_print_host(pd->src, 0, pd->af);
6680				printf(" -> ");
6681				pf_print_host(pd->dst, 0, pd->af);
6682				printf(" inner src: ");
6683				pf_print_host(pd2.src, 0, pd2.af);
6684				printf(" -> ");
6685				pf_print_host(pd2.dst, 0, pd2.af);
6686				printf("\n");
6687			}
6688			REASON_SET(reason, PFRES_BADSTATE);
6689			return (PF_DROP);
6690		}
6691
6692		switch (pd2.proto) {
6693		case IPPROTO_TCP: {
6694			struct tcphdr		 th;
6695			u_int32_t		 seq;
6696			struct pf_state_peer	*src, *dst;
6697			u_int8_t		 dws;
6698			int			 copyback = 0;
6699
6700			/*
6701			 * Only the first 8 bytes of the TCP header can be
			 * expected to be present. Don't access any TCP header
			 * fields after th_seq; an ackskew test is not possible.
6704			 */
6705			if (!pf_pull_hdr(m, off2, &th, 8, NULL, reason,
6706			    pd2.af)) {
6707				DPFPRINTF(PF_DEBUG_MISC,
6708				    ("pf: ICMP error message too short "
6709				    "(tcp)\n"));
6710				return (PF_DROP);
6711			}
6712
6713			key.af = pd2.af;
6714			key.proto = IPPROTO_TCP;
6715			PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
6716			PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
6717			key.port[pd2.sidx] = th.th_sport;
6718			key.port[pd2.didx] = th.th_dport;
6719
6720			STATE_LOOKUP(kif, &key, *state, pd);
6721
6722			if (pd->dir == (*state)->direction) {
6723				src = &(*state)->dst;
6724				dst = &(*state)->src;
6725			} else {
6726				src = &(*state)->src;
6727				dst = &(*state)->dst;
6728			}
6729
6730			if (src->wscale && dst->wscale)
6731				dws = dst->wscale & PF_WSCALE_MASK;
6732			else
6733				dws = 0;
6734
6735			/* Demodulate sequence number */
6736			seq = ntohl(th.th_seq) - src->seqdiff;
6737			if (src->seqdiff) {
6738				pf_change_a(&th.th_seq, icmpsum,
6739				    htonl(seq), 0);
6740				copyback = 1;
6741			}
6742
6743			if (!((*state)->state_flags & PFSTATE_SLOPPY) &&
6744			    (!SEQ_GEQ(src->seqhi, seq) ||
6745			    !SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)))) {
6746				if (V_pf_status.debug >= PF_DEBUG_MISC) {
6747					printf("pf: BAD ICMP %d:%d ",
6748					    icmptype, icmpcode);
6749					pf_print_host(pd->src, 0, pd->af);
6750					printf(" -> ");
6751					pf_print_host(pd->dst, 0, pd->af);
6752					printf(" state: ");
6753					pf_print_state(*state);
6754					printf(" seq=%u\n", seq);
6755				}
6756				REASON_SET(reason, PFRES_BADSTATE);
6757				return (PF_DROP);
6758			} else {
6759				if (V_pf_status.debug >= PF_DEBUG_MISC) {
6760					printf("pf: OK ICMP %d:%d ",
6761					    icmptype, icmpcode);
6762					pf_print_host(pd->src, 0, pd->af);
6763					printf(" -> ");
6764					pf_print_host(pd->dst, 0, pd->af);
6765					printf(" state: ");
6766					pf_print_state(*state);
6767					printf(" seq=%u\n", seq);
6768				}
6769			}
6770
6771			/* translate source/destination address, if necessary */
6772			if ((*state)->key[PF_SK_WIRE] !=
6773			    (*state)->key[PF_SK_STACK]) {
6774				struct pf_state_key *nk =
6775				    (*state)->key[pd->didx];
6776
6777				if (PF_ANEQ(pd2.src,
6778				    &nk->addr[pd2.sidx], pd2.af) ||
6779				    nk->port[pd2.sidx] != th.th_sport)
6780					pf_change_icmp(pd2.src, &th.th_sport,
6781					    daddr, &nk->addr[pd2.sidx],
6782					    nk->port[pd2.sidx], NULL,
6783					    pd2.ip_sum, icmpsum,
6784					    pd->ip_sum, 0, pd2.af);
6785
6786				if (PF_ANEQ(pd2.dst,
6787				    &nk->addr[pd2.didx], pd2.af) ||
6788				    nk->port[pd2.didx] != th.th_dport)
6789					pf_change_icmp(pd2.dst, &th.th_dport,
6790					    saddr, &nk->addr[pd2.didx],
6791					    nk->port[pd2.didx], NULL,
6792					    pd2.ip_sum, icmpsum,
6793					    pd->ip_sum, 0, pd2.af);
6794				copyback = 1;
6795			}
6796
6797			if (copyback) {
6798				switch (pd2.af) {
6799#ifdef INET
6800				case AF_INET:
6801					m_copyback(m, off, ICMP_MINLEN,
6802					    (caddr_t )&pd->hdr.icmp);
6803					m_copyback(m, ipoff2, sizeof(h2),
6804					    (caddr_t )&h2);
6805					break;
6806#endif /* INET */
6807#ifdef INET6
6808				case AF_INET6:
6809					m_copyback(m, off,
6810					    sizeof(struct icmp6_hdr),
6811					    (caddr_t )&pd->hdr.icmp6);
6812					m_copyback(m, ipoff2, sizeof(h2_6),
6813					    (caddr_t )&h2_6);
6814					break;
6815#endif /* INET6 */
6816				}
6817				m_copyback(m, off2, 8, (caddr_t)&th);
6818			}
6819
6820			return (PF_PASS);
6821			break;
6822		}
6823		case IPPROTO_UDP: {
6824			struct udphdr		uh;
6825
6826			if (!pf_pull_hdr(m, off2, &uh, sizeof(uh),
6827			    NULL, reason, pd2.af)) {
6828				DPFPRINTF(PF_DEBUG_MISC,
6829				    ("pf: ICMP error message too short "
6830				    "(udp)\n"));
6831				return (PF_DROP);
6832			}
6833
6834			key.af = pd2.af;
6835			key.proto = IPPROTO_UDP;
6836			PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
6837			PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
6838			key.port[pd2.sidx] = uh.uh_sport;
6839			key.port[pd2.didx] = uh.uh_dport;
6840
6841			STATE_LOOKUP(kif, &key, *state, pd);
6842
6843			/* translate source/destination address, if necessary */
6844			if ((*state)->key[PF_SK_WIRE] !=
6845			    (*state)->key[PF_SK_STACK]) {
6846				struct pf_state_key *nk =
6847				    (*state)->key[pd->didx];
6848
6849				if (PF_ANEQ(pd2.src,
6850				    &nk->addr[pd2.sidx], pd2.af) ||
6851				    nk->port[pd2.sidx] != uh.uh_sport)
6852					pf_change_icmp(pd2.src, &uh.uh_sport,
6853					    daddr, &nk->addr[pd2.sidx],
6854					    nk->port[pd2.sidx], &uh.uh_sum,
6855					    pd2.ip_sum, icmpsum,
6856					    pd->ip_sum, 1, pd2.af);
6857
6858				if (PF_ANEQ(pd2.dst,
6859				    &nk->addr[pd2.didx], pd2.af) ||
6860				    nk->port[pd2.didx] != uh.uh_dport)
6861					pf_change_icmp(pd2.dst, &uh.uh_dport,
6862					    saddr, &nk->addr[pd2.didx],
6863					    nk->port[pd2.didx], &uh.uh_sum,
6864					    pd2.ip_sum, icmpsum,
6865					    pd->ip_sum, 1, pd2.af);
6866
6867				switch (pd2.af) {
6868#ifdef INET
6869				case AF_INET:
6870					m_copyback(m, off, ICMP_MINLEN,
6871					    (caddr_t )&pd->hdr.icmp);
6872					m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2);
6873					break;
6874#endif /* INET */
6875#ifdef INET6
6876				case AF_INET6:
6877					m_copyback(m, off,
6878					    sizeof(struct icmp6_hdr),
6879					    (caddr_t )&pd->hdr.icmp6);
6880					m_copyback(m, ipoff2, sizeof(h2_6),
6881					    (caddr_t )&h2_6);
6882					break;
6883#endif /* INET6 */
6884				}
6885				m_copyback(m, off2, sizeof(uh), (caddr_t)&uh);
6886			}
6887			return (PF_PASS);
6888			break;
6889		}
6890#ifdef INET
6891		case IPPROTO_ICMP: {
6892			struct icmp		iih;
6893
6894			if (!pf_pull_hdr(m, off2, &iih, ICMP_MINLEN,
6895			    NULL, reason, pd2.af)) {
6896				DPFPRINTF(PF_DEBUG_MISC,
6897				    ("pf: ICMP error message too short i"
6898				    "(icmp)\n"));
6899				return (PF_DROP);
6900			}
6901
6902			key.af = pd2.af;
6903			key.proto = IPPROTO_ICMP;
6904			PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
6905			PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
6906			key.port[0] = key.port[1] = iih.icmp_id;
6907
6908			STATE_LOOKUP(kif, &key, *state, pd);
6909
6910			/* translate source/destination address, if necessary */
6911			if ((*state)->key[PF_SK_WIRE] !=
6912			    (*state)->key[PF_SK_STACK]) {
6913				struct pf_state_key *nk =
6914				    (*state)->key[pd->didx];
6915
6916				if (PF_ANEQ(pd2.src,
6917				    &nk->addr[pd2.sidx], pd2.af) ||
6918				    nk->port[pd2.sidx] != iih.icmp_id)
6919					pf_change_icmp(pd2.src, &iih.icmp_id,
6920					    daddr, &nk->addr[pd2.sidx],
6921					    nk->port[pd2.sidx], NULL,
6922					    pd2.ip_sum, icmpsum,
6923					    pd->ip_sum, 0, AF_INET);
6924
6925				if (PF_ANEQ(pd2.dst,
6926				    &nk->addr[pd2.didx], pd2.af) ||
6927				    nk->port[pd2.didx] != iih.icmp_id)
6928					pf_change_icmp(pd2.dst, &iih.icmp_id,
6929					    saddr, &nk->addr[pd2.didx],
6930					    nk->port[pd2.didx], NULL,
6931					    pd2.ip_sum, icmpsum,
6932					    pd->ip_sum, 0, AF_INET);
6933
6934				m_copyback(m, off, ICMP_MINLEN, (caddr_t)&pd->hdr.icmp);
6935				m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2);
6936				m_copyback(m, off2, ICMP_MINLEN, (caddr_t)&iih);
6937			}
6938			return (PF_PASS);
6939			break;
6940		}
6941#endif /* INET */
6942#ifdef INET6
6943		case IPPROTO_ICMPV6: {
6944			struct icmp6_hdr	iih;
6945
6946			if (!pf_pull_hdr(m, off2, &iih,
6947			    sizeof(struct icmp6_hdr), NULL, reason, pd2.af)) {
6948				DPFPRINTF(PF_DEBUG_MISC,
6949				    ("pf: ICMP error message too short "
6950				    "(icmp6)\n"));
6951				return (PF_DROP);
6952			}
6953
6954			key.af = pd2.af;
6955			key.proto = IPPROTO_ICMPV6;
6956			PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
6957			PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
6958			key.port[0] = key.port[1] = iih.icmp6_id;
6959
6960			STATE_LOOKUP(kif, &key, *state, pd);
6961
6962			/* translate source/destination address, if necessary */
6963			if ((*state)->key[PF_SK_WIRE] !=
6964			    (*state)->key[PF_SK_STACK]) {
6965				struct pf_state_key *nk =
6966				    (*state)->key[pd->didx];
6967
6968				if (PF_ANEQ(pd2.src,
6969				    &nk->addr[pd2.sidx], pd2.af) ||
6970				    nk->port[pd2.sidx] != iih.icmp6_id)
6971					pf_change_icmp(pd2.src, &iih.icmp6_id,
6972					    daddr, &nk->addr[pd2.sidx],
6973					    nk->port[pd2.sidx], NULL,
6974					    pd2.ip_sum, icmpsum,
6975					    pd->ip_sum, 0, AF_INET6);
6976
6977				if (PF_ANEQ(pd2.dst,
6978				    &nk->addr[pd2.didx], pd2.af) ||
6979				    nk->port[pd2.didx] != iih.icmp6_id)
6980					pf_change_icmp(pd2.dst, &iih.icmp6_id,
6981					    saddr, &nk->addr[pd2.didx],
6982					    nk->port[pd2.didx], NULL,
6983					    pd2.ip_sum, icmpsum,
6984					    pd->ip_sum, 0, AF_INET6);
6985
6986				m_copyback(m, off, sizeof(struct icmp6_hdr),
6987				    (caddr_t)&pd->hdr.icmp6);
6988				m_copyback(m, ipoff2, sizeof(h2_6), (caddr_t)&h2_6);
6989				m_copyback(m, off2, sizeof(struct icmp6_hdr),
6990				    (caddr_t)&iih);
6991			}
6992			return (PF_PASS);
6993			break;
6994		}
6995#endif /* INET6 */
6996		default: {
6997			key.af = pd2.af;
6998			key.proto = pd2.proto;
6999			PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
7000			PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
7001			key.port[0] = key.port[1] = 0;
7002
7003			STATE_LOOKUP(kif, &key, *state, pd);
7004
7005			/* translate source/destination address, if necessary */
7006			if ((*state)->key[PF_SK_WIRE] !=
7007			    (*state)->key[PF_SK_STACK]) {
7008				struct pf_state_key *nk =
7009				    (*state)->key[pd->didx];
7010
7011				if (PF_ANEQ(pd2.src,
7012				    &nk->addr[pd2.sidx], pd2.af))
7013					pf_change_icmp(pd2.src, NULL, daddr,
7014					    &nk->addr[pd2.sidx], 0, NULL,
7015					    pd2.ip_sum, icmpsum,
7016					    pd->ip_sum, 0, pd2.af);
7017
7018				if (PF_ANEQ(pd2.dst,
7019				    &nk->addr[pd2.didx], pd2.af))
7020					pf_change_icmp(pd2.dst, NULL, saddr,
7021					    &nk->addr[pd2.didx], 0, NULL,
7022					    pd2.ip_sum, icmpsum,
7023					    pd->ip_sum, 0, pd2.af);
7024
7025				switch (pd2.af) {
7026#ifdef INET
7027				case AF_INET:
7028					m_copyback(m, off, ICMP_MINLEN,
7029					    (caddr_t)&pd->hdr.icmp);
7030					m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2);
7031					break;
7032#endif /* INET */
7033#ifdef INET6
7034				case AF_INET6:
7035					m_copyback(m, off,
7036					    sizeof(struct icmp6_hdr),
7037					    (caddr_t )&pd->hdr.icmp6);
7038					m_copyback(m, ipoff2, sizeof(h2_6),
7039					    (caddr_t )&h2_6);
7040					break;
7041#endif /* INET6 */
7042				}
7043			}
7044			return (PF_PASS);
7045			break;
7046		}
7047		}
7048	}
7049}
7050
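/*
 * Track states for protocols other than TCP, UDP, SCTP and ICMP.  These
 * states are keyed on addresses only (ports are zero) and use the same
 * SINGLE/MULTIPLE progression as UDP to pick a timeout.
 */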
7051static int
7052pf_test_state_other(struct pf_kstate **state, struct pfi_kkif *kif,
7053    struct mbuf *m, struct pf_pdesc *pd)
7054{
7055	struct pf_state_peer	*src, *dst;
7056	struct pf_state_key_cmp	 key;
7057	uint8_t			 psrc, pdst;
7058
7059	bzero(&key, sizeof(key));
7060	key.af = pd->af;
7061	key.proto = pd->proto;
7062	if (pd->dir == PF_IN)	{
7063		PF_ACPY(&key.addr[0], pd->src, key.af);
7064		PF_ACPY(&key.addr[1], pd->dst, key.af);
7065		key.port[0] = key.port[1] = 0;
7066	} else {
7067		PF_ACPY(&key.addr[1], pd->src, key.af);
7068		PF_ACPY(&key.addr[0], pd->dst, key.af);
7069		key.port[1] = key.port[0] = 0;
7070	}
7071
7072	STATE_LOOKUP(kif, &key, *state, pd);
7073
7074	if (pd->dir == (*state)->direction) {
7075		src = &(*state)->src;
7076		dst = &(*state)->dst;
7077		psrc = PF_PEER_SRC;
7078		pdst = PF_PEER_DST;
7079	} else {
7080		src = &(*state)->dst;
7081		dst = &(*state)->src;
7082		psrc = PF_PEER_DST;
7083		pdst = PF_PEER_SRC;
7084	}
7085
7086	/* update states */
7087	if (src->state < PFOTHERS_SINGLE)
7088		pf_set_protostate(*state, psrc, PFOTHERS_SINGLE);
7089	if (dst->state == PFOTHERS_SINGLE)
7090		pf_set_protostate(*state, pdst, PFOTHERS_MULTIPLE);
7091
7092	/* update expire time */
7093	(*state)->expire = pf_get_uptime();
7094	if (src->state == PFOTHERS_MULTIPLE && dst->state == PFOTHERS_MULTIPLE)
7095		(*state)->timeout = PFTM_OTHER_MULTIPLE;
7096	else
7097		(*state)->timeout = PFTM_OTHER_SINGLE;
7098
7099	/* translate source/destination address, if necessary */
7100	if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
7101		struct pf_state_key *nk = (*state)->key[pd->didx];
7102
7103		KASSERT(nk, ("%s: nk is null", __func__));
7104		KASSERT(pd, ("%s: pd is null", __func__));
7105		KASSERT(pd->src, ("%s: pd->src is null", __func__));
7106		KASSERT(pd->dst, ("%s: pd->dst is null", __func__));
7107		switch (pd->af) {
7108#ifdef INET
7109		case AF_INET:
7110			if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], AF_INET))
7111				pf_change_a(&pd->src->v4.s_addr,
7112				    pd->ip_sum,
7113				    nk->addr[pd->sidx].v4.s_addr,
7114				    0);
7115
7116			if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], AF_INET))
7117				pf_change_a(&pd->dst->v4.s_addr,
7118				    pd->ip_sum,
7119				    nk->addr[pd->didx].v4.s_addr,
7120				    0);
7121
7122			break;
7123#endif /* INET */
7124#ifdef INET6
7125		case AF_INET6:
			if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], AF_INET6))
7127				PF_ACPY(pd->src, &nk->addr[pd->sidx], pd->af);
7128
			if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], AF_INET6))
7130				PF_ACPY(pd->dst, &nk->addr[pd->didx], pd->af);
7131#endif /* INET6 */
7132		}
7133	}
7134	return (PF_PASS);
7135}
7136
7137/*
7138 * ipoff and off are measured from the start of the mbuf chain.
7139 * h must be at "ipoff" on the mbuf chain.
7140 */
7141void *
7142pf_pull_hdr(struct mbuf *m, int off, void *p, int len,
7143    u_short *actionp, u_short *reasonp, sa_family_t af)
7144{
7145	switch (af) {
7146#ifdef INET
7147	case AF_INET: {
7148		struct ip	*h = mtod(m, struct ip *);
7149		u_int16_t	 fragoff = (ntohs(h->ip_off) & IP_OFFMASK) << 3;
7150
7151		if (fragoff) {
7152			if (fragoff >= len)
7153				ACTION_SET(actionp, PF_PASS);
7154			else {
7155				ACTION_SET(actionp, PF_DROP);
7156				REASON_SET(reasonp, PFRES_FRAG);
7157			}
7158			return (NULL);
7159		}
7160		if (m->m_pkthdr.len < off + len ||
7161		    ntohs(h->ip_len) < off + len) {
7162			ACTION_SET(actionp, PF_DROP);
7163			REASON_SET(reasonp, PFRES_SHORT);
7164			return (NULL);
7165		}
7166		break;
7167	}
7168#endif /* INET */
7169#ifdef INET6
7170	case AF_INET6: {
7171		struct ip6_hdr	*h = mtod(m, struct ip6_hdr *);
7172
7173		if (m->m_pkthdr.len < off + len ||
7174		    (ntohs(h->ip6_plen) + sizeof(struct ip6_hdr)) <
7175		    (unsigned)(off + len)) {
7176			ACTION_SET(actionp, PF_DROP);
7177			REASON_SET(reasonp, PFRES_SHORT);
7178			return (NULL);
7179		}
7180		break;
7181	}
7182#endif /* INET6 */
7183	}
7184	m_copydata(m, off, len, p);
7185	return (p);
7186}
7187
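/*
 * Check that addr is reachable in the FIB selected by rtableid; if kif
 * names a real interface, the route must also point back through that
 * interface (reverse path check).  Scope-embedded IPv6 addresses,
 * V_pfi_all and enc(4) interfaces always pass.
 */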
7188int
7189pf_routable(struct pf_addr *addr, sa_family_t af, struct pfi_kkif *kif,
7190    int rtableid)
7191{
7192	struct ifnet		*ifp;
7193
7194	/*
7195	 * Skip check for addresses with embedded interface scope,
7196	 * as they would always match anyway.
7197	 */
7198	if (af == AF_INET6 && IN6_IS_SCOPE_EMBED(&addr->v6))
7199		return (1);
7200
7201	if (af != AF_INET && af != AF_INET6)
7202		return (0);
7203
7204	if (kif == V_pfi_all)
7205		return (1);
7206
7207	/* Skip checks for ipsec interfaces */
7208	if (kif != NULL && kif->pfik_ifp->if_type == IFT_ENC)
7209		return (1);
7210
7211	ifp = (kif != NULL) ? kif->pfik_ifp : NULL;
7212
7213	switch (af) {
7214#ifdef INET6
7215	case AF_INET6:
7216		return (fib6_check_urpf(rtableid, &addr->v6, 0, NHR_NONE,
7217		    ifp));
7218#endif
7219#ifdef INET
7220	case AF_INET:
7221		return (fib4_check_urpf(rtableid, addr->v4, 0, NHR_NONE,
7222		    ifp));
7223#endif
7224	}
7225
7226	return (0);
7227}
7228
7229#ifdef INET
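/*
 * Forward a packet according to a route-to/reply-to/dup-to rule or state:
 * choose the outgoing interface and next hop from the rule's pool or the
 * state, re-run pf_test() for packets that were originally inbound, finish
 * any delayed checksums and fragment the packet if it exceeds the
 * interface MTU (or send an ICMP "needfrag" error when DF is set).
 */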
7230static void
7231pf_route(struct mbuf **m, struct pf_krule *r, struct ifnet *oifp,
7232    struct pf_kstate *s, struct pf_pdesc *pd, struct inpcb *inp)
7233{
7234	struct mbuf		*m0, *m1, *md;
7235	struct sockaddr_in	dst;
7236	struct ip		*ip;
7237	struct pfi_kkif		*nkif = NULL;
7238	struct ifnet		*ifp = NULL;
7239	struct pf_addr		 naddr;
7240	struct pf_ksrc_node	*sn = NULL;
7241	int			 error = 0;
7242	uint16_t		 ip_len, ip_off;
7243	uint16_t		 tmp;
7244	int			 r_rt, r_dir;
7245
7246	KASSERT(m && *m && r && oifp, ("%s: invalid parameters", __func__));
7247
7248	if (s) {
7249		r_rt = s->rt;
7250		r_dir = s->direction;
7251	} else {
7252		r_rt = r->rt;
7253		r_dir = r->direction;
7254	}
7255
7256	KASSERT(pd->dir == PF_IN || pd->dir == PF_OUT ||
7257	    r_dir == PF_IN || r_dir == PF_OUT, ("%s: invalid direction",
7258	    __func__));
7259
7260	if ((pd->pf_mtag == NULL &&
7261	    ((pd->pf_mtag = pf_get_mtag(*m)) == NULL)) ||
7262	    pd->pf_mtag->routed++ > 3) {
7263		m0 = *m;
7264		*m = NULL;
7265		goto bad_locked;
7266	}
7267
7268	if (r_rt == PF_DUPTO) {
7269		if ((pd->pf_mtag->flags & PF_MTAG_FLAG_DUPLICATED)) {
7270			if (s == NULL) {
7271				ifp = r->rpool.cur->kif ?
7272				    r->rpool.cur->kif->pfik_ifp : NULL;
7273			} else {
7274				ifp = s->rt_kif ? s->rt_kif->pfik_ifp : NULL;
7275				/* If pfsync'd */
7276				if (ifp == NULL && r->rpool.cur != NULL)
7277					ifp = r->rpool.cur->kif ?
7278					    r->rpool.cur->kif->pfik_ifp : NULL;
7279				PF_STATE_UNLOCK(s);
7280			}
7281			if (ifp == oifp) {
7282				/* When the 2nd interface is not skipped */
7283				return;
7284			} else {
7285				m0 = *m;
7286				*m = NULL;
7287				goto bad;
7288			}
7289		} else {
7290			pd->pf_mtag->flags |= PF_MTAG_FLAG_DUPLICATED;
7291			if (((m0 = m_dup(*m, M_NOWAIT)) == NULL)) {
7292				if (s)
7293					PF_STATE_UNLOCK(s);
7294				return;
7295			}
7296		}
7297	} else {
7298		if ((r_rt == PF_REPLYTO) == (r_dir == pd->dir)) {
7299			pf_dummynet(pd, s, r, m);
7300			if (s)
7301				PF_STATE_UNLOCK(s);
7302			return;
7303		}
7304		m0 = *m;
7305	}
7306
7307	ip = mtod(m0, struct ip *);
7308
7309	bzero(&dst, sizeof(dst));
7310	dst.sin_family = AF_INET;
7311	dst.sin_len = sizeof(dst);
7312	dst.sin_addr = ip->ip_dst;
7313
7314	bzero(&naddr, sizeof(naddr));
7315
7316	if (s == NULL) {
7317		if (TAILQ_EMPTY(&r->rpool.list)) {
7318			DPFPRINTF(PF_DEBUG_URGENT,
7319			    ("%s: TAILQ_EMPTY(&r->rpool.list)\n", __func__));
7320			goto bad_locked;
7321		}
7322		pf_map_addr(AF_INET, r, (struct pf_addr *)&ip->ip_src,
7323		    &naddr, &nkif, NULL, &sn);
7324		if (!PF_AZERO(&naddr, AF_INET))
7325			dst.sin_addr.s_addr = naddr.v4.s_addr;
7326		ifp = nkif ? nkif->pfik_ifp : NULL;
7327	} else {
7328		struct pfi_kkif *kif;
7329
7330		if (!PF_AZERO(&s->rt_addr, AF_INET))
7331			dst.sin_addr.s_addr =
7332			    s->rt_addr.v4.s_addr;
7333		ifp = s->rt_kif ? s->rt_kif->pfik_ifp : NULL;
7334		kif = s->rt_kif;
7335		/* If pfsync'd */
7336		if (ifp == NULL && r->rpool.cur != NULL) {
7337			ifp = r->rpool.cur->kif ?
7338			    r->rpool.cur->kif->pfik_ifp : NULL;
7339			kif = r->rpool.cur->kif;
7340		}
7341		if (ifp != NULL && kif != NULL &&
7342		    r->rule_flag & PFRULE_IFBOUND &&
7343		    r->rt == PF_REPLYTO &&
7344		    s->kif == V_pfi_all) {
7345			s->kif = kif;
7346			s->orig_kif = oifp->if_pf_kif;
7347		}
7348
7349		PF_STATE_UNLOCK(s);
7350	}
7351
7352	if (ifp == NULL)
7353		goto bad;
7354
7355	if (pd->dir == PF_IN) {
7356		if (pf_test(PF_OUT, PFIL_FWD, ifp, &m0, inp, &pd->act) != PF_PASS)
7357			goto bad;
7358		else if (m0 == NULL)
7359			goto done;
7360		if (m0->m_len < sizeof(struct ip)) {
7361			DPFPRINTF(PF_DEBUG_URGENT,
7362			    ("%s: m0->m_len < sizeof(struct ip)\n", __func__));
7363			goto bad;
7364		}
7365		ip = mtod(m0, struct ip *);
7366	}
7367
7368	if (ifp->if_flags & IFF_LOOPBACK)
7369		m0->m_flags |= M_SKIP_FIREWALL;
7370
7371	ip_len = ntohs(ip->ip_len);
7372	ip_off = ntohs(ip->ip_off);
7373
7374	/* Copied from FreeBSD 10.0-CURRENT ip_output. */
7375	m0->m_pkthdr.csum_flags |= CSUM_IP;
7376	if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA & ~ifp->if_hwassist) {
7377		in_delayed_cksum(m0);
7378		m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
7379	}
7380	if (m0->m_pkthdr.csum_flags & CSUM_SCTP & ~ifp->if_hwassist) {
7381		pf_sctp_checksum(m0, (uint32_t)(ip->ip_hl << 2));
7382		m0->m_pkthdr.csum_flags &= ~CSUM_SCTP;
7383	}
7384
7385	if (pd->dir == PF_IN) {
7386		/*
7387		 * Make sure dummynet gets the correct direction, in case it needs to
7388		 * re-inject later.
7389		 */
7390		pd->dir = PF_OUT;
7391
7392		/*
7393		 * The following processing is actually the rest of the inbound processing, even
7394		 * though we've marked it as outbound (so we don't look through dummynet) and it
7395		 * happens after the outbound processing (pf_test(PF_OUT) above).
7396		 * Swap the dummynet pipe numbers, because it's going to come to the wrong
7397		 * conclusion about what direction it's processing, and we can't fix it or it
7398		 * will re-inject incorrectly. Swapping the pipe numbers means that its incorrect
7399		 * decision will pick the right pipe, and everything will mostly work as expected.
7400		 */
7401		tmp = pd->act.dnrpipe;
7402		pd->act.dnrpipe = pd->act.dnpipe;
7403		pd->act.dnpipe = tmp;
7404	}
7405
7406	/*
7407	 * If small enough for interface, or the interface will take
7408	 * care of the fragmentation for us, we can just send directly.
7409	 */
7410	if (ip_len <= ifp->if_mtu ||
7411	    (m0->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0) {
7412		ip->ip_sum = 0;
7413		if (m0->m_pkthdr.csum_flags & CSUM_IP & ~ifp->if_hwassist) {
7414			ip->ip_sum = in_cksum(m0, ip->ip_hl << 2);
7415			m0->m_pkthdr.csum_flags &= ~CSUM_IP;
7416		}
7417		m_clrprotoflags(m0);	/* Avoid confusing lower layers. */
7418
7419		md = m0;
7420		error = pf_dummynet_route(pd, s, r, ifp, sintosa(&dst), &md);
7421		if (md != NULL)
7422			error = (*ifp->if_output)(ifp, md, sintosa(&dst), NULL);
7423		goto done;
7424	}
7425
	/* Balk when the DF bit is set or the interface doesn't support TSO. */
7427	if ((ip_off & IP_DF) || (m0->m_pkthdr.csum_flags & CSUM_TSO)) {
7428		error = EMSGSIZE;
7429		KMOD_IPSTAT_INC(ips_cantfrag);
7430		if (r_rt != PF_DUPTO) {
7431			if (s && pd->nat_rule != NULL)
7432				PACKET_UNDO_NAT(m0, pd,
7433				    (ip->ip_hl << 2) + (ip_off & IP_OFFMASK),
7434				    s);
7435
7436			icmp_error(m0, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG, 0,
7437			    ifp->if_mtu);
7438			goto done;
7439		} else
7440			goto bad;
7441	}
7442
7443	error = ip_fragment(ip, &m0, ifp->if_mtu, ifp->if_hwassist);
7444	if (error)
7445		goto bad;
7446
7447	for (; m0; m0 = m1) {
7448		m1 = m0->m_nextpkt;
7449		m0->m_nextpkt = NULL;
7450		if (error == 0) {
7451			m_clrprotoflags(m0);
7452			md = m0;
7453			pd->pf_mtag = pf_find_mtag(md);
7454			error = pf_dummynet_route(pd, s, r, ifp,
7455			    sintosa(&dst), &md);
7456			if (md != NULL)
7457				error = (*ifp->if_output)(ifp, md,
7458				    sintosa(&dst), NULL);
7459		} else
7460			m_freem(m0);
7461	}
7462
7463	if (error == 0)
7464		KMOD_IPSTAT_INC(ips_fragmented);
7465
7466done:
7467	if (r_rt != PF_DUPTO)
7468		*m = NULL;
7469	return;
7470
7471bad_locked:
7472	if (s)
7473		PF_STATE_UNLOCK(s);
7474bad:
7475	m_freem(m0);
7476	goto done;
7477}
7478#endif /* INET */
7479
7480#ifdef INET6
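/*
 * IPv6 counterpart of pf_route(): same interface and next hop selection,
 * but packets larger than the interface MTU are answered with an ICMPv6
 * "packet too big" error instead of being fragmented.
 */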
7481static void
7482pf_route6(struct mbuf **m, struct pf_krule *r, struct ifnet *oifp,
7483    struct pf_kstate *s, struct pf_pdesc *pd, struct inpcb *inp)
7484{
7485	struct mbuf		*m0, *md;
7486	struct sockaddr_in6	dst;
7487	struct ip6_hdr		*ip6;
7488	struct pfi_kkif		*nkif = NULL;
7489	struct ifnet		*ifp = NULL;
7490	struct pf_addr		 naddr;
7491	struct pf_ksrc_node	*sn = NULL;
7492	int			 r_rt, r_dir;
7493
7494	KASSERT(m && *m && r && oifp, ("%s: invalid parameters", __func__));
7495
7496	if (s) {
7497		r_rt = s->rt;
7498		r_dir = s->direction;
7499	} else {
7500		r_rt = r->rt;
7501		r_dir = r->direction;
7502	}
7503
7504	KASSERT(pd->dir == PF_IN || pd->dir == PF_OUT ||
7505	    r_dir == PF_IN || r_dir == PF_OUT, ("%s: invalid direction",
7506	    __func__));
7507
7508	if ((pd->pf_mtag == NULL &&
7509	    ((pd->pf_mtag = pf_get_mtag(*m)) == NULL)) ||
7510	    pd->pf_mtag->routed++ > 3) {
7511		m0 = *m;
7512		*m = NULL;
7513		goto bad_locked;
7514	}
7515
7516	if (r_rt == PF_DUPTO) {
7517		if ((pd->pf_mtag->flags & PF_MTAG_FLAG_DUPLICATED)) {
7518			if (s == NULL) {
7519				ifp = r->rpool.cur->kif ?
7520				    r->rpool.cur->kif->pfik_ifp : NULL;
7521			} else {
7522				ifp = s->rt_kif ? s->rt_kif->pfik_ifp : NULL;
7523				/* If pfsync'd */
7524				if (ifp == NULL && r->rpool.cur != NULL)
7525					ifp = r->rpool.cur->kif ?
7526					    r->rpool.cur->kif->pfik_ifp : NULL;
7527				PF_STATE_UNLOCK(s);
7528			}
7529			if (ifp == oifp) {
7530				/* When the 2nd interface is not skipped */
7531				return;
7532			} else {
7533				m0 = *m;
7534				*m = NULL;
7535				goto bad;
7536			}
7537		} else {
7538			pd->pf_mtag->flags |= PF_MTAG_FLAG_DUPLICATED;
7539			if (((m0 = m_dup(*m, M_NOWAIT)) == NULL)) {
7540				if (s)
7541					PF_STATE_UNLOCK(s);
7542				return;
7543			}
7544		}
7545	} else {
7546		if ((r_rt == PF_REPLYTO) == (r_dir == pd->dir)) {
7547			pf_dummynet(pd, s, r, m);
7548			if (s)
7549				PF_STATE_UNLOCK(s);
7550			return;
7551		}
7552		m0 = *m;
7553	}
7554
7555	ip6 = mtod(m0, struct ip6_hdr *);
7556
7557	bzero(&dst, sizeof(dst));
7558	dst.sin6_family = AF_INET6;
7559	dst.sin6_len = sizeof(dst);
7560	dst.sin6_addr = ip6->ip6_dst;
7561
7562	bzero(&naddr, sizeof(naddr));
7563
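	/*
	 * Pick the next hop.  Without a state, select an address and
	 * interface from the rule's pool via pf_map_addr().  With a state,
	 * reuse the address and interface recorded in the state, falling
	 * back to the rule's pool for states merged in via pfsync.
	 */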
7564	if (s == NULL) {
7565		if (TAILQ_EMPTY(&r->rpool.list)) {
7566			DPFPRINTF(PF_DEBUG_URGENT,
7567			    ("%s: TAILQ_EMPTY(&r->rpool.list)\n", __func__));
7568			goto bad_locked;
7569		}
7570		pf_map_addr(AF_INET6, r, (struct pf_addr *)&ip6->ip6_src,
7571		    &naddr, &nkif, NULL, &sn);
7572		if (!PF_AZERO(&naddr, AF_INET6))
7573			PF_ACPY((struct pf_addr *)&dst.sin6_addr,
7574			    &naddr, AF_INET6);
7575		ifp = nkif ? nkif->pfik_ifp : NULL;
7576	} else {
7577		struct pfi_kkif *kif;
7578
7579		if (!PF_AZERO(&s->rt_addr, AF_INET6))
7580			PF_ACPY((struct pf_addr *)&dst.sin6_addr,
7581			    &s->rt_addr, AF_INET6);
7582		ifp = s->rt_kif ? s->rt_kif->pfik_ifp : NULL;
7583		kif = s->rt_kif;
7584		/* If pfsync'd */
7585		if (ifp == NULL && r->rpool.cur != NULL) {
7586			ifp = r->rpool.cur->kif ?
7587			    r->rpool.cur->kif->pfik_ifp : NULL;
7588			kif = r->rpool.cur->kif;
7589		}
7590		if (ifp != NULL && kif != NULL &&
7591		    r->rule_flag & PFRULE_IFBOUND &&
7592		    r->rt == PF_REPLYTO &&
7593		    s->kif == V_pfi_all) {
7594			s->kif = kif;
7595			s->orig_kif = oifp->if_pf_kif;
7596		}
7597	}
7598
7599	if (s)
7600		PF_STATE_UNLOCK(s);
7601
7602	if (ifp == NULL)
7603		goto bad;
7604
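	/*
	 * A packet that arrived inbound and is being rerouted must still
	 * pass the outbound ruleset on the new interface before it is
	 * transmitted.
	 */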
7605	if (pd->dir == PF_IN) {
7606		if (pf_test6(PF_OUT, PFIL_FWD, ifp, &m0, inp, &pd->act) != PF_PASS)
7607			goto bad;
7608		else if (m0 == NULL)
7609			goto done;
7610		if (m0->m_len < sizeof(struct ip6_hdr)) {
7611			DPFPRINTF(PF_DEBUG_URGENT,
7612			    ("%s: m0->m_len < sizeof(struct ip6_hdr)\n",
7613			    __func__));
7614			goto bad;
7615		}
7616		ip6 = mtod(m0, struct ip6_hdr *);
7617	}
7618
7619	if (ifp->if_flags & IFF_LOOPBACK)
7620		m0->m_flags |= M_SKIP_FIREWALL;
7621
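	/*
	 * Complete transport checksums that were deferred to hardware the
	 * chosen outgoing interface cannot provide.
	 */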
7622	if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6 &
7623	    ~ifp->if_hwassist) {
7624		uint32_t plen = m0->m_pkthdr.len - sizeof(*ip6);
7625		in6_delayed_cksum(m0, plen, sizeof(struct ip6_hdr));
7626		m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA_IPV6;
7627	}
7628
7629	/*
7630	 * If the packet is too large for the outgoing interface,
7631	 * send back an icmp6 error.
7632	 */
7633	if (IN6_IS_SCOPE_EMBED(&dst.sin6_addr))
7634		dst.sin6_addr.s6_addr16[1] = htons(ifp->if_index);
7635	if ((u_long)m0->m_pkthdr.len <= ifp->if_mtu) {
7636		md = m0;
7637		pf_dummynet_route(pd, s, r, ifp, sintosa(&dst), &md);
7638		if (md != NULL)
7639			nd6_output_ifp(ifp, ifp, md, &dst, NULL);
7640	}
7641	else {
7642		in6_ifstat_inc(ifp, ifs6_in_toobig);
7643		if (r_rt != PF_DUPTO) {
7644			if (s && pd->nat_rule != NULL)
7645				PACKET_UNDO_NAT(m0, pd,
7646				    ((caddr_t)ip6 - m0->m_data) +
7647				    sizeof(struct ip6_hdr), s);
7648
7649			icmp6_error(m0, ICMP6_PACKET_TOO_BIG, 0, ifp->if_mtu);
7650		} else
7651			goto bad;
7652	}
7653
7654done:
7655	if (r_rt != PF_DUPTO)
7656		*m = NULL;
7657	return;
7658
7659bad_locked:
7660	if (s)
7661		PF_STATE_UNLOCK(s);
7662bad:
7663	m_freem(m0);
7664	goto done;
7665}
7666#endif /* INET6 */
7667
7668/*
7669 * FreeBSD supports cksum offloads for the following drivers:
7670 *  em(4), fxp(4), lge(4), nge(4), re(4), ti(4), txp(4), xl(4)
7671 *
7672 * CSUM_DATA_VALID | CSUM_PSEUDO_HDR :
7673 *  the network driver performed the cksum including the pseudo header;
7674 *  we only need to verify csum_data.
7675 * CSUM_DATA_VALID :
7676 *  the network driver performed the cksum, but an additional pseudo-header
7677 *  cksum computation over the partial csum_data is needed (i.e. no H/W
7678 *  support for the pseudo header, for instance sk(4) and possibly gem(4)).
7679 *
7680 * After validating the cksum of the packet, set both the CSUM_DATA_VALID and
7681 * CSUM_PSEUDO_HDR flags in order to avoid recomputing the cksum in the upper
7682 * TCP/UDP layer.
7683 * Also set csum_data to 0xffff so the upper layer treats the cksum as valid.
7684 */
7685static int
7686pf_check_proto_cksum(struct mbuf *m, int off, int len, u_int8_t p, sa_family_t af)
7687{
7688	u_int16_t sum = 0;
7689	int hw_assist = 0;
7690	struct ip *ip;
7691
7692	if (off < sizeof(struct ip) || len < sizeof(struct udphdr))
7693		return (1);
7694	if (m->m_pkthdr.len < off + len)
7695		return (1);
7696
7697	switch (p) {
7698	case IPPROTO_TCP:
7699		if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
7700			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) {
7701				sum = m->m_pkthdr.csum_data;
7702			} else {
7703				ip = mtod(m, struct ip *);
7704				sum = in_pseudo(ip->ip_src.s_addr,
7705				ip->ip_dst.s_addr, htonl((u_short)len +
7706				m->m_pkthdr.csum_data + IPPROTO_TCP));
7707			}
7708			sum ^= 0xffff;
7709			++hw_assist;
7710		}
7711		break;
7712	case IPPROTO_UDP:
7713		if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
7714			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) {
7715				sum = m->m_pkthdr.csum_data;
7716			} else {
7717				ip = mtod(m, struct ip *);
7718				sum = in_pseudo(ip->ip_src.s_addr,
7719				ip->ip_dst.s_addr, htonl((u_short)len +
7720				m->m_pkthdr.csum_data + IPPROTO_UDP));
7721			}
7722			sum ^= 0xffff;
7723			++hw_assist;
7724		}
7725		break;
7726	case IPPROTO_ICMP:
7727#ifdef INET6
7728	case IPPROTO_ICMPV6:
7729#endif /* INET6 */
7730		break;
7731	default:
7732		return (1);
7733	}
7734
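	/*
	 * No usable hardware assist: verify the checksum in software.
	 * IPv4 ICMP has no pseudo header, so only the ICMP message itself
	 * is summed; everything else goes through in4_cksum()/in6_cksum(),
	 * which include the pseudo header.
	 */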
7735	if (!hw_assist) {
7736		switch (af) {
7737		case AF_INET:
7738			if (p == IPPROTO_ICMP) {
7739				if (m->m_len < off)
7740					return (1);
7741				m->m_data += off;
7742				m->m_len -= off;
7743				sum = in_cksum(m, len);
7744				m->m_data -= off;
7745				m->m_len += off;
7746			} else {
7747				if (m->m_len < sizeof(struct ip))
7748					return (1);
7749				sum = in4_cksum(m, p, off, len);
7750			}
7751			break;
7752#ifdef INET6
7753		case AF_INET6:
7754			if (m->m_len < sizeof(struct ip6_hdr))
7755				return (1);
7756			sum = in6_cksum(m, p, off, len);
7757			break;
7758#endif /* INET6 */
7759		default:
7760			return (1);
7761		}
7762	}
7763	if (sum) {
7764		switch (p) {
7765		case IPPROTO_TCP:
7766		    {
7767			KMOD_TCPSTAT_INC(tcps_rcvbadsum);
7768			break;
7769		    }
7770		case IPPROTO_UDP:
7771		    {
7772			KMOD_UDPSTAT_INC(udps_badsum);
7773			break;
7774		    }
7775#ifdef INET
7776		case IPPROTO_ICMP:
7777		    {
7778			KMOD_ICMPSTAT_INC(icps_checksum);
7779			break;
7780		    }
7781#endif
7782#ifdef INET6
7783		case IPPROTO_ICMPV6:
7784		    {
7785			KMOD_ICMP6STAT_INC(icp6s_checksum);
7786			break;
7787		    }
7788#endif /* INET6 */
7789		}
7790		return (1);
7791	} else {
7792		if (p == IPPROTO_TCP || p == IPPROTO_UDP) {
7793			m->m_pkthdr.csum_flags |=
7794			    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
7795			m->m_pkthdr.csum_data = 0xffff;
7796		}
7797	}
7798	return (0);
7799}
7800
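/*
 * Translate the pf packet description into an ipfw flow descriptor for
 * dummynet.  Returns false if the packet has already been through dummynet
 * or if no pipe/queue is configured for its direction.
 */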
7801static bool
7802pf_pdesc_to_dnflow(const struct pf_pdesc *pd, const struct pf_krule *r,
7803    const struct pf_kstate *s, struct ip_fw_args *dnflow)
7804{
7805	int dndir = r->direction;
7806
7807	if (s && dndir == PF_INOUT) {
7808		dndir = s->direction;
7809	} else if (dndir == PF_INOUT) {
7810		/* Assume primary direction. Happens when we've set dnpipe in
7811		 * the ethernet level code. */
7812		dndir = pd->dir;
7813	}
7814
7815	if (pd->pf_mtag->flags & PF_MTAG_FLAG_DUMMYNETED)
7816		return (false);
7817
7818	memset(dnflow, 0, sizeof(*dnflow));
7819
7820	if (pd->dport != NULL)
7821		dnflow->f_id.dst_port = ntohs(*pd->dport);
7822	if (pd->sport != NULL)
7823		dnflow->f_id.src_port = ntohs(*pd->sport);
7824
7825	if (pd->dir == PF_IN)
7826		dnflow->flags |= IPFW_ARGS_IN;
7827	else
7828		dnflow->flags |= IPFW_ARGS_OUT;
7829
7830	if (pd->dir != dndir && pd->act.dnrpipe) {
7831		dnflow->rule.info = pd->act.dnrpipe;
7832	}
7833	else if (pd->dir == dndir && pd->act.dnpipe) {
7834		dnflow->rule.info = pd->act.dnpipe;
7835	}
7836	else {
7837		return (false);
7838	}
7839
7840	dnflow->rule.info |= IPFW_IS_DUMMYNET;
7841	if (r->free_flags & PFRULE_DN_IS_PIPE || pd->act.flags & PFSTATE_DN_IS_PIPE)
7842		dnflow->rule.info |= IPFW_IS_PIPE;
7843
7844	dnflow->f_id.proto = pd->proto;
7845	dnflow->f_id.extra = dnflow->rule.info;
7846	switch (pd->af) {
7847	case AF_INET:
7848		dnflow->f_id.addr_type = 4;
7849		dnflow->f_id.src_ip = ntohl(pd->src->v4.s_addr);
7850		dnflow->f_id.dst_ip = ntohl(pd->dst->v4.s_addr);
7851		break;
7852	case AF_INET6:
7853		dnflow->flags |= IPFW_ARGS_IP6;
7854		dnflow->f_id.addr_type = 6;
7855		dnflow->f_id.src_ip6 = pd->src->v6;
7856		dnflow->f_id.dst_ip6 = pd->dst->v6;
7857		break;
7858	default:
7859		panic("Invalid AF");
7860		break;
7861	}
7862
7863	return (true);
7864}
7865
7866int
7867pf_test_eth(int dir, int pflags, struct ifnet *ifp, struct mbuf **m0,
7868    struct inpcb *inp)
7869{
7870	struct pfi_kkif		*kif;
7871	struct mbuf		*m = *m0;
7872
7873	M_ASSERTPKTHDR(m);
7874	MPASS(ifp->if_vnet == curvnet);
7875	NET_EPOCH_ASSERT();
7876
7877	if (!V_pf_status.running)
7878		return (PF_PASS);
7879
7880	kif = (struct pfi_kkif *)ifp->if_pf_kif;
7881
7882	if (kif == NULL) {
7883		DPFPRINTF(PF_DEBUG_URGENT,
7884		    ("%s: kif == NULL, if_xname %s\n", __func__, ifp->if_xname));
7885		return (PF_DROP);
7886	}
7887	if (kif->pfik_flags & PFI_IFLAG_SKIP)
7888		return (PF_PASS);
7889
7890	if (m->m_flags & M_SKIP_FIREWALL)
7891		return (PF_PASS);
7892
7893	/* Stateless! */
7894	return (pf_test_eth_rule(dir, kif, m0));
7895}
7896
7897static __inline void
7898pf_dummynet_flag_remove(struct mbuf *m, struct pf_mtag *pf_mtag)
7899{
7900	struct m_tag *mtag;
7901
7902	pf_mtag->flags &= ~PF_MTAG_FLAG_DUMMYNET;
7903
7904	/* dummynet adds this tag, but pf does not need it,
7905	 * and keeping it creates unexpected behavior,
7906	 * e.g. in case of divert(4) usage right after dummynet. */
7907	mtag = m_tag_locate(m, MTAG_IPFW_RULE, 0, NULL);
7908	if (mtag != NULL)
7909		m_tag_delete(m, mtag);
7910}
7911
7912static int
7913pf_dummynet(struct pf_pdesc *pd, struct pf_kstate *s,
7914    struct pf_krule *r, struct mbuf **m0)
7915{
7916	return (pf_dummynet_route(pd, s, r, NULL, NULL, m0));
7917}
7918
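/*
 * Hand the packet to dummynet when a pipe or queue is configured.  If ifp
 * and sa are given (route-to path), the destination and interface are
 * stashed in the pf mtag so the packet can still be sent out the intended
 * interface once dummynet re-injects it.
 */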
7919static int
7920pf_dummynet_route(struct pf_pdesc *pd, struct pf_kstate *s,
7921    struct pf_krule *r, struct ifnet *ifp, struct sockaddr *sa,
7922    struct mbuf **m0)
7923{
7924	NET_EPOCH_ASSERT();
7925
7926	if (pd->act.dnpipe || pd->act.dnrpipe) {
7927		struct ip_fw_args dnflow;
7928		if (ip_dn_io_ptr == NULL) {
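			/* dummynet is not loaded; we cannot apply the pipe. */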
7929			m_freem(*m0);
7930			*m0 = NULL;
7931			return (ENOMEM);
7932		}
7933
7934		if (pd->pf_mtag == NULL &&
7935		    ((pd->pf_mtag = pf_get_mtag(*m0)) == NULL)) {
7936			m_freem(*m0);
7937			*m0 = NULL;
7938			return (ENOMEM);
7939		}
7940
7941		if (ifp != NULL) {
7942			pd->pf_mtag->flags |= PF_MTAG_FLAG_ROUTE_TO;
7943
7944			pd->pf_mtag->if_index = ifp->if_index;
7945			pd->pf_mtag->if_idxgen = ifp->if_idxgen;
7946
7947			MPASS(sa != NULL);
7948
7949			if (pd->af == AF_INET)
7950				memcpy(&pd->pf_mtag->dst, sa,
7951				    sizeof(struct sockaddr_in));
7952			else
7953				memcpy(&pd->pf_mtag->dst, sa,
7954				    sizeof(struct sockaddr_in6));
7955		}
7956
7957		if (s != NULL && s->nat_rule.ptr != NULL &&
7958		    s->nat_rule.ptr->action == PF_RDR &&
7959		    (
7960#ifdef INET
7961		    (pd->af == AF_INET && IN_LOOPBACK(ntohl(pd->dst->v4.s_addr))) ||
7962#endif
7963		    (pd->af == AF_INET6 && IN6_IS_ADDR_LOOPBACK(&pd->dst->v6)))) {
7964			/*
7965			 * If we're redirecting to loopback mark this packet
7966			 * as being local. Otherwise it might get dropped
7967			 * if dummynet re-injects.
7968			 */
7969			(*m0)->m_pkthdr.rcvif = V_loif;
7970		}
7971
7972		if (pf_pdesc_to_dnflow(pd, r, s, &dnflow)) {
7973			pd->pf_mtag->flags |= PF_MTAG_FLAG_DUMMYNET;
7974			pd->pf_mtag->flags |= PF_MTAG_FLAG_DUMMYNETED;
7975			ip_dn_io_ptr(m0, &dnflow);
7976			if (*m0 != NULL) {
7977				pd->pf_mtag->flags &= ~PF_MTAG_FLAG_ROUTE_TO;
7978				pf_dummynet_flag_remove(*m0, pd->pf_mtag);
7979			}
7980		}
7981	}
7982
7983	return (0);
7984}
7985
7986#ifdef INET
7987int
7988pf_test(int dir, int pflags, struct ifnet *ifp, struct mbuf **m0,
7989    struct inpcb *inp, struct pf_rule_actions *default_actions)
7990{
7991	struct pfi_kkif		*kif;
7992	u_short			 action, reason = 0;
7993	struct mbuf		*m = *m0;
7994	struct ip		*h = NULL;
7995	struct m_tag		*mtag;
7996	struct pf_krule		*a = NULL, *r = &V_pf_default_rule, *tr, *nr;
7997	struct pf_kstate	*s = NULL;
7998	struct pf_kruleset	*ruleset = NULL;
7999	struct pf_pdesc		 pd;
8000	int			 off, dirndx, use_2nd_queue = 0;
8001	uint16_t		 tag;
8002	uint8_t			 rt;
8003
8004	PF_RULES_RLOCK_TRACKER;
8005	KASSERT(dir == PF_IN || dir == PF_OUT, ("%s: bad direction %d\n", __func__, dir));
8006	M_ASSERTPKTHDR(m);
8007
8008	if (!V_pf_status.running)
8009		return (PF_PASS);
8010
8011	PF_RULES_RLOCK();
8012
8013	kif = (struct pfi_kkif *)ifp->if_pf_kif;
8014
8015	if (__predict_false(kif == NULL)) {
8016		DPFPRINTF(PF_DEBUG_URGENT,
8017		    ("pf_test: kif == NULL, if_xname %s\n", ifp->if_xname));
8018		PF_RULES_RUNLOCK();
8019		return (PF_DROP);
8020	}
8021	if (kif->pfik_flags & PFI_IFLAG_SKIP) {
8022		PF_RULES_RUNLOCK();
8023		return (PF_PASS);
8024	}
8025
8026	if (m->m_flags & M_SKIP_FIREWALL) {
8027		PF_RULES_RUNLOCK();
8028		return (PF_PASS);
8029	}
8030
8031	memset(&pd, 0, sizeof(pd));
8032	TAILQ_INIT(&pd.sctp_multihome_jobs);
8033	if (default_actions != NULL)
8034		memcpy(&pd.act, default_actions, sizeof(pd.act));
8035	pd.pf_mtag = pf_find_mtag(m);
8036
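	/*
	 * Dummynet has re-injected a packet that was on its way to a
	 * route-to interface: send it out that interface now instead of
	 * running the ruleset again.
	 */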
8037	if (pd.pf_mtag != NULL && (pd.pf_mtag->flags & PF_MTAG_FLAG_ROUTE_TO)) {
8038		pd.pf_mtag->flags &= ~PF_MTAG_FLAG_ROUTE_TO;
8039
8040		ifp = ifnet_byindexgen(pd.pf_mtag->if_index,
8041		    pd.pf_mtag->if_idxgen);
8042		if (ifp == NULL || ifp->if_flags & IFF_DYING) {
8043			PF_RULES_RUNLOCK();
8044			m_freem(*m0);
8045			*m0 = NULL;
8046			return (PF_PASS);
8047		}
8048		PF_RULES_RUNLOCK();
8049		(ifp->if_output)(ifp, m, sintosa(&pd.pf_mtag->dst), NULL);
8050		*m0 = NULL;
8051		return (PF_PASS);
8052	}
8053
8054	if (pd.pf_mtag && pd.pf_mtag->dnpipe) {
8055		pd.act.dnpipe = pd.pf_mtag->dnpipe;
8056		pd.act.flags = pd.pf_mtag->dnflags;
8057	}
8058
8059	if (ip_dn_io_ptr != NULL && pd.pf_mtag != NULL &&
8060	    pd.pf_mtag->flags & PF_MTAG_FLAG_DUMMYNET) {
8061		/* Dummynet re-injects packets after they've
8062		 * completed their delay. We've already
8063		 * processed them, so pass unconditionally. */
8064
8065		/* But only once. We may see the packet multiple times (e.g.
8066		 * PFIL_IN/PFIL_OUT). */
8067		pf_dummynet_flag_remove(m, pd.pf_mtag);
8068		PF_RULES_RUNLOCK();
8069
8070		return (PF_PASS);
8071	}
8072
8073	pd.sport = pd.dport = NULL;
8074	pd.proto_sum = NULL;
8075	pd.dir = dir;
8076	pd.sidx = (dir == PF_IN) ? 0 : 1;
8077	pd.didx = (dir == PF_IN) ? 1 : 0;
8078	pd.af = AF_INET;
8079	pd.act.rtableid = -1;
8080
8081	h = mtod(m, struct ip *);
8082	off = h->ip_hl << 2;
8083
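	/*
	 * A packet re-injected by divert(4) is marked as looped so it is
	 * not diverted again, and its divert/ipfw tags are stripped;
	 * packets seen for the first time are normalized and reassembled
	 * instead.
	 */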
8084	if (__predict_false(ip_divert_ptr != NULL) &&
8085	    ((mtag = m_tag_locate(m, MTAG_PF_DIVERT, 0, NULL)) != NULL)) {
8086		struct pf_divert_mtag *dt = (struct pf_divert_mtag *)(mtag+1);
8087		if ((dt->idir == PF_DIVERT_MTAG_DIR_IN && dir == PF_IN) ||
8088		    (dt->idir == PF_DIVERT_MTAG_DIR_OUT && dir == PF_OUT)) {
8089			if (pd.pf_mtag == NULL &&
8090			    ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) {
8091				action = PF_DROP;
8092				goto done;
8093			}
8094			pd.pf_mtag->flags |= PF_MTAG_FLAG_PACKET_LOOPED;
8095		}
8096		if (pd.pf_mtag && pd.pf_mtag->flags & PF_MTAG_FLAG_FASTFWD_OURS_PRESENT) {
8097			m->m_flags |= M_FASTFWD_OURS;
8098			pd.pf_mtag->flags &= ~PF_MTAG_FLAG_FASTFWD_OURS_PRESENT;
8099		}
8100		m_tag_delete(m, mtag);
8101
8102		mtag = m_tag_locate(m, MTAG_IPFW_RULE, 0, NULL);
8103		if (mtag != NULL)
8104			m_tag_delete(m, mtag);
8105	} else if (pf_normalize_ip(m0, kif, &reason, &pd) != PF_PASS) {
8106		/* We do IP header normalization and packet reassembly here */
8107		action = PF_DROP;
8108		goto done;
8109	}
8110	m = *m0;	/* pf_normalize messes with m0 */
8111	h = mtod(m, struct ip *);
8112
8113	off = h->ip_hl << 2;
8114	if (off < (int)sizeof(struct ip)) {
8115		action = PF_DROP;
8116		REASON_SET(&reason, PFRES_SHORT);
8117		pd.act.log = PF_LOG_FORCE;
8118		goto done;
8119	}
8120
8121	pd.src = (struct pf_addr *)&h->ip_src;
8122	pd.dst = (struct pf_addr *)&h->ip_dst;
8123	pd.ip_sum = &h->ip_sum;
8124	pd.proto = h->ip_p;
8125	pd.tos = h->ip_tos & ~IPTOS_ECN_MASK;
8126	pd.tot_len = ntohs(h->ip_len);
8127
8128	/* handle fragments that didn't get reassembled by normalization */
8129	if (h->ip_off & htons(IP_MF | IP_OFFMASK)) {
8130		action = pf_test_fragment(&r, kif, m, h, &pd, &a, &ruleset);
8131		goto done;
8132	}
8133
8134	switch (h->ip_p) {
8135	case IPPROTO_TCP: {
8136		if (!pf_pull_hdr(m, off, &pd.hdr.tcp, sizeof(pd.hdr.tcp),
8137		    &action, &reason, AF_INET)) {
8138			if (action != PF_PASS)
8139				pd.act.log = PF_LOG_FORCE;
8140			goto done;
8141		}
8142		pd.p_len = pd.tot_len - off - (pd.hdr.tcp.th_off << 2);
8143
8144		pd.sport = &pd.hdr.tcp.th_sport;
8145		pd.dport = &pd.hdr.tcp.th_dport;
8146
8147		/* Respond to SYN with a syncookie. */
8148		if ((pd.hdr.tcp.th_flags & (TH_SYN|TH_ACK|TH_RST)) == TH_SYN &&
8149		    pd.dir == PF_IN && pf_synflood_check(&pd)) {
8150			pf_syncookie_send(m, off, &pd);
8151			action = PF_DROP;
8152			break;
8153		}
8154
8155		if ((pd.hdr.tcp.th_flags & TH_ACK) && pd.p_len == 0)
8156			use_2nd_queue = 1;
8157		action = pf_normalize_tcp(kif, m, 0, off, h, &pd);
8158		if (action == PF_DROP)
8159			goto done;
8160		action = pf_test_state_tcp(&s, kif, m, off, h, &pd, &reason);
8161		if (action == PF_PASS) {
8162			if (V_pfsync_update_state_ptr != NULL)
8163				V_pfsync_update_state_ptr(s);
8164			r = s->rule.ptr;
8165			a = s->anchor.ptr;
8166		} else if (s == NULL) {
8167			/* Validate remote SYN|ACK, re-create original SYN if
8168			 * valid. */
8169			if ((pd.hdr.tcp.th_flags & (TH_SYN|TH_ACK|TH_RST)) ==
8170			    TH_ACK && pf_syncookie_validate(&pd) &&
8171			    pd.dir == PF_IN) {
8172				struct mbuf *msyn;
8173
8174				msyn = pf_syncookie_recreate_syn(h->ip_ttl, off,
8175				    &pd);
8176				if (msyn == NULL) {
8177					action = PF_DROP;
8178					break;
8179				}
8180
8181				action = pf_test(dir, pflags, ifp, &msyn, inp,
8182				    &pd.act);
8183				m_freem(msyn);
8184				if (action != PF_PASS)
8185					break;
8186
8187				action = pf_test_state_tcp(&s, kif, m, off, h,
8188				    &pd, &reason);
8189				if (action != PF_PASS || s == NULL) {
8190					action = PF_DROP;
8191					break;
8192				}
8193
8194				s->src.seqhi = ntohl(pd.hdr.tcp.th_ack) - 1;
8195				s->src.seqlo = ntohl(pd.hdr.tcp.th_seq) - 1;
8196				pf_set_protostate(s, PF_PEER_SRC, PF_TCPS_PROXY_DST);
8197				action = pf_synproxy(&pd, &s, &reason);
8198				break;
8199			} else {
8200				action = pf_test_rule(&r, &s, kif, m, off, &pd,
8201				    &a, &ruleset, inp);
8202			}
8203		}
8204		break;
8205	}
8206
8207	case IPPROTO_UDP: {
8208		if (!pf_pull_hdr(m, off, &pd.hdr.udp, sizeof(pd.hdr.udp),
8209		    &action, &reason, AF_INET)) {
8210			if (action != PF_PASS)
8211				pd.act.log = PF_LOG_FORCE;
8212			goto done;
8213		}
8214		pd.sport = &pd.hdr.udp.uh_sport;
8215		pd.dport = &pd.hdr.udp.uh_dport;
8216		if (pd.hdr.udp.uh_dport == 0 ||
8217		    ntohs(pd.hdr.udp.uh_ulen) > m->m_pkthdr.len - off ||
8218		    ntohs(pd.hdr.udp.uh_ulen) < sizeof(struct udphdr)) {
8219			action = PF_DROP;
8220			REASON_SET(&reason, PFRES_SHORT);
8221			goto done;
8222		}
8223		action = pf_test_state_udp(&s, kif, m, off, h, &pd);
8224		if (action == PF_PASS) {
8225			if (V_pfsync_update_state_ptr != NULL)
8226				V_pfsync_update_state_ptr(s);
8227			r = s->rule.ptr;
8228			a = s->anchor.ptr;
8229		} else if (s == NULL)
8230			action = pf_test_rule(&r, &s, kif, m, off, &pd,
8231			    &a, &ruleset, inp);
8232		break;
8233	}
8234
8235	case IPPROTO_SCTP: {
8236		if (!pf_pull_hdr(m, off, &pd.hdr.sctp, sizeof(pd.hdr.sctp),
8237		    &action, &reason, AF_INET)) {
8238			if (action != PF_PASS)
8239				pd.act.log |= PF_LOG_FORCE;
8240			goto done;
8241		}
8242		pd.p_len = pd.tot_len - off;
8243
8244		pd.sport = &pd.hdr.sctp.src_port;
8245		pd.dport = &pd.hdr.sctp.dest_port;
8246		if (pd.hdr.sctp.src_port == 0 || pd.hdr.sctp.dest_port == 0) {
8247			action = PF_DROP;
8248			REASON_SET(&reason, PFRES_SHORT);
8249			goto done;
8250		}
8251		action = pf_normalize_sctp(dir, kif, m, 0, off, h, &pd);
8252		if (action == PF_DROP)
8253			goto done;
8254		action = pf_test_state_sctp(&s, kif, m, off, h, &pd,
8255		    &reason);
8256		if (action == PF_PASS) {
8257			if (V_pfsync_update_state_ptr != NULL)
8258				V_pfsync_update_state_ptr(s);
8259			r = s->rule.ptr;
8260			a = s->anchor.ptr;
8261		} else {
8262			action = pf_test_rule(&r, &s, kif, m, off,
8263			    &pd, &a, &ruleset, inp);
8264		}
8265		break;
8266	}
8267
8268	case IPPROTO_ICMP: {
8269		if (!pf_pull_hdr(m, off, &pd.hdr.icmp, ICMP_MINLEN,
8270		    &action, &reason, AF_INET)) {
8271			if (action != PF_PASS)
8272				pd.act.log = PF_LOG_FORCE;
8273			goto done;
8274		}
8275		action = pf_test_state_icmp(&s, kif, m, off, h, &pd, &reason);
8276		if (action == PF_PASS) {
8277			if (V_pfsync_update_state_ptr != NULL)
8278				V_pfsync_update_state_ptr(s);
8279			r = s->rule.ptr;
8280			a = s->anchor.ptr;
8281		} else if (s == NULL)
8282			action = pf_test_rule(&r, &s, kif, m, off, &pd,
8283			    &a, &ruleset, inp);
8284		break;
8285	}
8286
8287#ifdef INET6
8288	case IPPROTO_ICMPV6: {
8289		action = PF_DROP;
8290		DPFPRINTF(PF_DEBUG_MISC,
8291		    ("pf: dropping IPv4 packet with ICMPv6 payload\n"));
8292		goto done;
8293	}
8294#endif
8295
8296	default:
8297		action = pf_test_state_other(&s, kif, m, &pd);
8298		if (action == PF_PASS) {
8299			if (V_pfsync_update_state_ptr != NULL)
8300				V_pfsync_update_state_ptr(s);
8301			r = s->rule.ptr;
8302			a = s->anchor.ptr;
8303		} else if (s == NULL)
8304			action = pf_test_rule(&r, &s, kif, m, off, &pd,
8305			    &a, &ruleset, inp);
8306		break;
8307	}
8308
8309done:
8310	PF_RULES_RUNLOCK();
8311	if (action == PF_PASS && h->ip_hl > 5 &&
8312	    !((s && s->state_flags & PFSTATE_ALLOWOPTS) || r->allow_opts)) {
8313		action = PF_DROP;
8314		REASON_SET(&reason, PFRES_IPOPTIONS);
8315		pd.act.log = PF_LOG_FORCE;
8316		DPFPRINTF(PF_DEBUG_MISC,
8317		    ("pf: dropping packet with ip options\n"));
8318	}
8319
8320	if (s) {
8321		uint8_t log = pd.act.log;
8322		memcpy(&pd.act, &s->act, sizeof(struct pf_rule_actions));
8323		pd.act.log |= log;
8324		tag = s->tag;
8325		rt = s->rt;
8326	} else {
8327		tag = r->tag;
8328		rt = r->rt;
8329	}
8330
8331	if (tag > 0 && pf_tag_packet(m, &pd, tag)) {
8332		action = PF_DROP;
8333		REASON_SET(&reason, PFRES_MEMORY);
8334	}
8335
8336	pf_scrub_ip(&m, &pd);
8337	if (pd.proto == IPPROTO_TCP && pd.act.max_mss)
8338		pf_normalize_mss(m, off, &pd);
8339
8340	if (pd.act.rtableid >= 0)
8341		M_SETFIB(m, pd.act.rtableid);
8342
8343	if (pd.act.flags & PFSTATE_SETPRIO) {
8344		if (pd.tos & IPTOS_LOWDELAY)
8345			use_2nd_queue = 1;
8346		if (vlan_set_pcp(m, pd.act.set_prio[use_2nd_queue])) {
8347			action = PF_DROP;
8348			REASON_SET(&reason, PFRES_MEMORY);
8349			pd.act.log = PF_LOG_FORCE;
8350			DPFPRINTF(PF_DEBUG_MISC,
8351			    ("pf: failed to allocate 802.1q mtag\n"));
8352		}
8353	}
8354
8355#ifdef ALTQ
8356	if (action == PF_PASS && pd.act.qid) {
8357		if (pd.pf_mtag == NULL &&
8358		    ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) {
8359			action = PF_DROP;
8360			REASON_SET(&reason, PFRES_MEMORY);
8361		} else {
8362			if (s != NULL)
8363				pd.pf_mtag->qid_hash = pf_state_hash(s);
8364			if (use_2nd_queue || (pd.tos & IPTOS_LOWDELAY))
8365				pd.pf_mtag->qid = pd.act.pqid;
8366			else
8367				pd.pf_mtag->qid = pd.act.qid;
8368			/* Add hints for ecn. */
8369			pd.pf_mtag->hdr = h;
8370		}
8371	}
8372#endif /* ALTQ */
8373
8374	/*
8375	 * connections redirected to loopback should not match sockets
8376	 * bound specifically to loopback due to security implications,
8377	 * see tcp_input() and in_pcblookup_listen().
8378	 */
8379	if (dir == PF_IN && action == PF_PASS && (pd.proto == IPPROTO_TCP ||
8380	    pd.proto == IPPROTO_UDP) && s != NULL && s->nat_rule.ptr != NULL &&
8381	    (s->nat_rule.ptr->action == PF_RDR ||
8382	    s->nat_rule.ptr->action == PF_BINAT) &&
8383	    IN_LOOPBACK(ntohl(pd.dst->v4.s_addr)))
8384		m->m_flags |= M_SKIP_FIREWALL;
8385
8386	if (__predict_false(ip_divert_ptr != NULL) && action == PF_PASS &&
8387	    r->divert.port && !PACKET_LOOPED(&pd)) {
8388		mtag = m_tag_alloc(MTAG_PF_DIVERT, 0,
8389		    sizeof(struct pf_divert_mtag), M_NOWAIT | M_ZERO);
8390		if (mtag != NULL) {
8391			((struct pf_divert_mtag *)(mtag+1))->port =
8392			    ntohs(r->divert.port);
8393			((struct pf_divert_mtag *)(mtag+1))->idir =
8394			    (dir == PF_IN) ? PF_DIVERT_MTAG_DIR_IN :
8395			    PF_DIVERT_MTAG_DIR_OUT;
8396
8397			if (s)
8398				PF_STATE_UNLOCK(s);
8399
8400			m_tag_prepend(m, mtag);
8401			if (m->m_flags & M_FASTFWD_OURS) {
8402				if (pd.pf_mtag == NULL &&
8403				    ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) {
8404					action = PF_DROP;
8405					REASON_SET(&reason, PFRES_MEMORY);
8406					pd.act.log = PF_LOG_FORCE;
8407					DPFPRINTF(PF_DEBUG_MISC,
8408					    ("pf: failed to allocate tag\n"));
8409				} else {
8410					pd.pf_mtag->flags |=
8411					    PF_MTAG_FLAG_FASTFWD_OURS_PRESENT;
8412					m->m_flags &= ~M_FASTFWD_OURS;
8413				}
8414			}
8415			ip_divert_ptr(*m0, dir == PF_IN);
8416			*m0 = NULL;
8417
8418			return (action);
8419		} else {
8420			/* XXX: ipfw has the same behaviour! */
8421			action = PF_DROP;
8422			REASON_SET(&reason, PFRES_MEMORY);
8423			pd.act.log = PF_LOG_FORCE;
8424			DPFPRINTF(PF_DEBUG_MISC,
8425			    ("pf: failed to allocate divert tag\n"));
8426		}
8427	}
8428	/* this flag will need revising if the pkt is forwarded */
8429	if (pd.pf_mtag)
8430		pd.pf_mtag->flags &= ~PF_MTAG_FLAG_PACKET_LOOPED;
8431
8432	if (pd.act.log) {
8433		struct pf_krule		*lr;
8434		struct pf_krule_item	*ri;
8435
8436		if (s != NULL && s->nat_rule.ptr != NULL &&
8437		    s->nat_rule.ptr->log & PF_LOG_ALL)
8438			lr = s->nat_rule.ptr;
8439		else
8440			lr = r;
8441
8442		if (pd.act.log & PF_LOG_FORCE || lr->log & PF_LOG_ALL)
8443			PFLOG_PACKET(kif, m, AF_INET, action, reason, lr, a,
8444			    ruleset, &pd, (s == NULL));
8445		if (s) {
8446			SLIST_FOREACH(ri, &s->match_rules, entry)
8447				if (ri->r->log & PF_LOG_ALL)
8448					PFLOG_PACKET(kif, m, AF_INET, action,
8449					    reason, ri->r, a, ruleset, &pd, 0);
8450		}
8451	}
8452
8453	pf_counter_u64_critical_enter();
8454	pf_counter_u64_add_protected(&kif->pfik_bytes[0][dir == PF_OUT][action != PF_PASS],
8455	    pd.tot_len);
8456	pf_counter_u64_add_protected(&kif->pfik_packets[0][dir == PF_OUT][action != PF_PASS],
8457	    1);
8458
8459	if (action == PF_PASS || r->action == PF_DROP) {
8460		dirndx = (dir == PF_OUT);
8461		pf_counter_u64_add_protected(&r->packets[dirndx], 1);
8462		pf_counter_u64_add_protected(&r->bytes[dirndx], pd.tot_len);
8463		pf_update_timestamp(r);
8464
8465		if (a != NULL) {
8466			pf_counter_u64_add_protected(&a->packets[dirndx], 1);
8467			pf_counter_u64_add_protected(&a->bytes[dirndx], pd.tot_len);
8468		}
8469		if (s != NULL) {
8470			struct pf_krule_item	*ri;
8471
8472			if (s->nat_rule.ptr != NULL) {
8473				pf_counter_u64_add_protected(&s->nat_rule.ptr->packets[dirndx],
8474				    1);
8475				pf_counter_u64_add_protected(&s->nat_rule.ptr->bytes[dirndx],
8476				    pd.tot_len);
8477			}
8478			if (s->src_node != NULL) {
8479				counter_u64_add(s->src_node->packets[dirndx],
8480				    1);
8481				counter_u64_add(s->src_node->bytes[dirndx],
8482				    pd.tot_len);
8483			}
8484			if (s->nat_src_node != NULL) {
8485				counter_u64_add(s->nat_src_node->packets[dirndx],
8486				    1);
8487				counter_u64_add(s->nat_src_node->bytes[dirndx],
8488				    pd.tot_len);
8489			}
8490			dirndx = (dir == s->direction) ? 0 : 1;
8491			s->packets[dirndx]++;
8492			s->bytes[dirndx] += pd.tot_len;
8493			SLIST_FOREACH(ri, &s->match_rules, entry) {
8494				pf_counter_u64_add_protected(&ri->r->packets[dirndx], 1);
8495				pf_counter_u64_add_protected(&ri->r->bytes[dirndx], pd.tot_len);
8496			}
8497		}
8498		tr = r;
8499		nr = (s != NULL) ? s->nat_rule.ptr : pd.nat_rule;
8500		if (nr != NULL && r == &V_pf_default_rule)
8501			tr = nr;
8502		if (tr->src.addr.type == PF_ADDR_TABLE)
8503			pfr_update_stats(tr->src.addr.p.tbl,
8504			    (s == NULL) ? pd.src :
8505			    &s->key[(s->direction == PF_IN)]->
8506				addr[(s->direction == PF_OUT)],
8507			    pd.af, pd.tot_len, dir == PF_OUT,
8508			    r->action == PF_PASS, tr->src.neg);
8509		if (tr->dst.addr.type == PF_ADDR_TABLE)
8510			pfr_update_stats(tr->dst.addr.p.tbl,
8511			    (s == NULL) ? pd.dst :
8512			    &s->key[(s->direction == PF_IN)]->
8513				addr[(s->direction == PF_IN)],
8514			    pd.af, pd.tot_len, dir == PF_OUT,
8515			    r->action == PF_PASS, tr->dst.neg);
8516	}
8517	pf_counter_u64_critical_exit();
8518
8519	switch (action) {
8520	case PF_SYNPROXY_DROP:
8521		m_freem(*m0);
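		/* FALLTHROUGH */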
8522	case PF_DEFER:
8523		*m0 = NULL;
8524		action = PF_PASS;
8525		break;
8526	case PF_DROP:
8527		m_freem(*m0);
8528		*m0 = NULL;
8529		break;
8530	default:
8531		/* pf_route() returns unlocked. */
8532		if (rt) {
8533			pf_route(m0, r, kif->pfik_ifp, s, &pd, inp);
8534			goto out;
8535		}
8536		if (pf_dummynet(&pd, s, r, m0) != 0) {
8537			action = PF_DROP;
8538			REASON_SET(&reason, PFRES_MEMORY);
8539		}
8540		break;
8541	}
8542
8543	SDT_PROBE4(pf, ip, test, done, action, reason, r, s);
8544
8545	if (s && action != PF_DROP) {
8546		if (!s->if_index_in && dir == PF_IN)
8547			s->if_index_in = ifp->if_index;
8548		else if (!s->if_index_out && dir == PF_OUT)
8549			s->if_index_out = ifp->if_index;
8550	}
8551
8552	if (s)
8553		PF_STATE_UNLOCK(s);
8554
8555out:
8556	pf_sctp_multihome_delayed(&pd, off, kif, s, action);
8557
8558	return (action);
8559}
8560#endif /* INET */
8561
8562#ifdef INET6
8563int
8564pf_test6(int dir, int pflags, struct ifnet *ifp, struct mbuf **m0, struct inpcb *inp,
8565    struct pf_rule_actions *default_actions)
8566{
8567	struct pfi_kkif		*kif;
8568	u_short			 action, reason = 0;
8569	struct mbuf		*m = *m0, *n = NULL;
8570	struct m_tag		*mtag;
8571	struct ip6_hdr		*h = NULL;
8572	struct pf_krule		*a = NULL, *r = &V_pf_default_rule, *tr, *nr;
8573	struct pf_kstate	*s = NULL;
8574	struct pf_kruleset	*ruleset = NULL;
8575	struct pf_pdesc		 pd;
8576	int			 off, terminal = 0, dirndx, rh_cnt = 0, use_2nd_queue = 0;
8577	uint16_t		 tag;
8578	uint8_t			 rt;
8579
8580	PF_RULES_RLOCK_TRACKER;
8581	KASSERT(dir == PF_IN || dir == PF_OUT, ("%s: bad direction %d\n", __func__, dir));
8582	M_ASSERTPKTHDR(m);
8583
8584	if (!V_pf_status.running)
8585		return (PF_PASS);
8586
8587	PF_RULES_RLOCK();
8588
8589	kif = (struct pfi_kkif *)ifp->if_pf_kif;
8590	if (__predict_false(kif == NULL)) {
8591		DPFPRINTF(PF_DEBUG_URGENT,
8592		    ("pf_test6: kif == NULL, if_xname %s\n", ifp->if_xname));
8593		PF_RULES_RUNLOCK();
8594		return (PF_DROP);
8595	}
8596	if (kif->pfik_flags & PFI_IFLAG_SKIP) {
8597		PF_RULES_RUNLOCK();
8598		return (PF_PASS);
8599	}
8600
8601	if (m->m_flags & M_SKIP_FIREWALL) {
8602		PF_RULES_RUNLOCK();
8603		return (PF_PASS);
8604	}
8605
8606	/*
8607	 * If we end up changing IP addresses (e.g. binat) the stack may get
8608	 * confused and fail to send the icmp6 packet too big error. Just send
8609	 * it here, before we do any NAT.
8610	 */
8611	if (dir == PF_OUT && pflags & PFIL_FWD && IN6_LINKMTU(ifp) < pf_max_frag_size(m)) {
8612		PF_RULES_RUNLOCK();
8613		*m0 = NULL;
8614		icmp6_error(m, ICMP6_PACKET_TOO_BIG, 0, IN6_LINKMTU(ifp));
8615		return (PF_DROP);
8616	}
8617
8618	memset(&pd, 0, sizeof(pd));
8619	TAILQ_INIT(&pd.sctp_multihome_jobs);
8620	if (default_actions != NULL)
8621		memcpy(&pd.act, default_actions, sizeof(pd.act));
8622	pd.pf_mtag = pf_find_mtag(m);
8623
8624	if (pd.pf_mtag != NULL && (pd.pf_mtag->flags & PF_MTAG_FLAG_ROUTE_TO)) {
8625		pd.pf_mtag->flags &= ~PF_MTAG_FLAG_ROUTE_TO;
8626
8627		ifp = ifnet_byindexgen(pd.pf_mtag->if_index,
8628		    pd.pf_mtag->if_idxgen);
8629		if (ifp == NULL || ifp->if_flags & IFF_DYING) {
8630			PF_RULES_RUNLOCK();
8631			m_freem(*m0);
8632			*m0 = NULL;
8633			return (PF_PASS);
8634		}
8635		PF_RULES_RUNLOCK();
8636		nd6_output_ifp(ifp, ifp, m,
8637		    (struct sockaddr_in6 *)&pd.pf_mtag->dst, NULL);
8638		*m0 = NULL;
8639		return (PF_PASS);
8640	}
8641
8642	if (pd.pf_mtag && pd.pf_mtag->dnpipe) {
8643		pd.act.dnpipe = pd.pf_mtag->dnpipe;
8644		pd.act.flags = pd.pf_mtag->dnflags;
8645	}
8646
8647	if (ip_dn_io_ptr != NULL && pd.pf_mtag != NULL &&
8648	    pd.pf_mtag->flags & PF_MTAG_FLAG_DUMMYNET) {
8649		pf_dummynet_flag_remove(m, pd.pf_mtag);
8650		/* Dummynet re-injects packets after they've
8651		 * completed their delay. We've already
8652		 * processed them, so pass unconditionally. */
8653		PF_RULES_RUNLOCK();
8654		return (PF_PASS);
8655	}
8656
8657	pd.sport = pd.dport = NULL;
8658	pd.ip_sum = NULL;
8659	pd.proto_sum = NULL;
8660	pd.dir = dir;
8661	pd.sidx = (dir == PF_IN) ? 0 : 1;
8662	pd.didx = (dir == PF_IN) ? 1 : 0;
8663	pd.af = AF_INET6;
8664	pd.act.rtableid = -1;
8665
8666	h = mtod(m, struct ip6_hdr *);
8667	off = ((caddr_t)h - m->m_data) + sizeof(struct ip6_hdr);
8668
8669	/* We do IP header normalization and packet reassembly here */
8670	if (pf_normalize_ip6(m0, kif, &reason, &pd) != PF_PASS) {
8671		action = PF_DROP;
8672		goto done;
8673	}
8674	m = *m0;	/* pf_normalize messes with m0 */
8675	h = mtod(m, struct ip6_hdr *);
8676	off = ((caddr_t)h - m->m_data) + sizeof(struct ip6_hdr);
8677
8678	/*
8679	 * We do not support jumbograms.  If we kept going, a zero ip6_plen
8680	 * would do something bad, so drop the packet for now.
8681	 */
8682	if (htons(h->ip6_plen) == 0) {
8683		action = PF_DROP;
8684		REASON_SET(&reason, PFRES_NORM);	/*XXX*/
8685		goto done;
8686	}
8687
8688	pd.src = (struct pf_addr *)&h->ip6_src;
8689	pd.dst = (struct pf_addr *)&h->ip6_dst;
8690	pd.tos = IPV6_DSCP(h);
8691	pd.tot_len = ntohs(h->ip6_plen) + sizeof(struct ip6_hdr);
8692
8693	pd.proto = h->ip6_nxt;
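	/*
	 * Walk the IPv6 extension header chain until a transport header is
	 * found, counting routing headers and rejecting deprecated type 0
	 * routing headers.
	 */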
8694	do {
8695		switch (pd.proto) {
8696		case IPPROTO_FRAGMENT:
8697			action = pf_test_fragment(&r, kif, m, h, &pd, &a,
8698			    &ruleset);
8699			if (action == PF_DROP)
8700				REASON_SET(&reason, PFRES_FRAG);
8701			goto done;
8702		case IPPROTO_ROUTING: {
8703			struct ip6_rthdr rthdr;
8704
8705			if (rh_cnt++) {
8706				DPFPRINTF(PF_DEBUG_MISC,
8707				    ("pf: IPv6 more than one rthdr\n"));
8708				action = PF_DROP;
8709				REASON_SET(&reason, PFRES_IPOPTIONS);
8710				pd.act.log = PF_LOG_FORCE;
8711				goto done;
8712			}
8713			if (!pf_pull_hdr(m, off, &rthdr, sizeof(rthdr), NULL,
8714			    &reason, pd.af)) {
8715				DPFPRINTF(PF_DEBUG_MISC,
8716				    ("pf: IPv6 short rthdr\n"));
8717				action = PF_DROP;
8718				REASON_SET(&reason, PFRES_SHORT);
8719				pd.act.log = PF_LOG_FORCE;
8720				goto done;
8721			}
8722			if (rthdr.ip6r_type == IPV6_RTHDR_TYPE_0) {
8723				DPFPRINTF(PF_DEBUG_MISC,
8724				    ("pf: IPv6 rthdr0\n"));
8725				action = PF_DROP;
8726				REASON_SET(&reason, PFRES_IPOPTIONS);
8727				pd.act.log = PF_LOG_FORCE;
8728				goto done;
8729			}
8730			/* FALLTHROUGH */
8731		}
8732		case IPPROTO_AH:
8733		case IPPROTO_HOPOPTS:
8734		case IPPROTO_DSTOPTS: {
8735			/* get next header and header length */
8736			struct ip6_ext	opt6;
8737
8738			if (!pf_pull_hdr(m, off, &opt6, sizeof(opt6),
8739			    NULL, &reason, pd.af)) {
8740				DPFPRINTF(PF_DEBUG_MISC,
8741				    ("pf: IPv6 short opt\n"));
8742				action = PF_DROP;
8743				pd.act.log = PF_LOG_FORCE;
8744				goto done;
8745			}
8746			if (pd.proto == IPPROTO_AH)
8747				off += (opt6.ip6e_len + 2) * 4;
8748			else
8749				off += (opt6.ip6e_len + 1) * 8;
8750			pd.proto = opt6.ip6e_nxt;
8751			/* goto the next header */
8752			break;
8753		}
8754		default:
8755			terminal++;
8756			break;
8757		}
8758	} while (!terminal);
8759
8760	/* if there's no routing header, use unmodified mbuf for checksumming */
8761	if (!n)
8762		n = m;
8763
8764	switch (pd.proto) {
8765	case IPPROTO_TCP: {
8766		if (!pf_pull_hdr(m, off, &pd.hdr.tcp, sizeof(pd.hdr.tcp),
8767		    &action, &reason, AF_INET6)) {
8768			if (action != PF_PASS)
8769				pd.act.log |= PF_LOG_FORCE;
8770			goto done;
8771		}
8772		pd.p_len = pd.tot_len - off - (pd.hdr.tcp.th_off << 2);
8773		pd.sport = &pd.hdr.tcp.th_sport;
8774		pd.dport = &pd.hdr.tcp.th_dport;
8775
8776		/* Respond to SYN with a syncookie. */
8777		if ((pd.hdr.tcp.th_flags & (TH_SYN|TH_ACK|TH_RST)) == TH_SYN &&
8778		    pd.dir == PF_IN && pf_synflood_check(&pd)) {
8779			pf_syncookie_send(m, off, &pd);
8780			action = PF_DROP;
8781			break;
8782		}
8783
8784		action = pf_normalize_tcp(kif, m, 0, off, h, &pd);
8785		if (action == PF_DROP)
8786			goto done;
8787		action = pf_test_state_tcp(&s, kif, m, off, h, &pd, &reason);
8788		if (action == PF_PASS) {
8789			if (V_pfsync_update_state_ptr != NULL)
8790				V_pfsync_update_state_ptr(s);
8791			r = s->rule.ptr;
8792			a = s->anchor.ptr;
8793		} else if (s == NULL) {
8794			/* Validate remote SYN|ACK, re-create original SYN if
8795			 * valid. */
8796			if ((pd.hdr.tcp.th_flags & (TH_SYN|TH_ACK|TH_RST)) ==
8797			    TH_ACK && pf_syncookie_validate(&pd) &&
8798			    pd.dir == PF_IN) {
8799				struct mbuf *msyn;
8800
8801				msyn = pf_syncookie_recreate_syn(h->ip6_hlim,
8802				    off, &pd);
8803				if (msyn == NULL) {
8804					action = PF_DROP;
8805					break;
8806				}
8807
8808				action = pf_test6(dir, pflags, ifp, &msyn, inp,
8809				    &pd.act);
8810				m_freem(msyn);
8811				if (action != PF_PASS)
8812					break;
8813
8814				action = pf_test_state_tcp(&s, kif, m, off, h,
8815				    &pd, &reason);
8816				if (action != PF_PASS || s == NULL) {
8817					action = PF_DROP;
8818					break;
8819				}
8820
8821				s->src.seqhi = ntohl(pd.hdr.tcp.th_ack) - 1;
8822				s->src.seqlo = ntohl(pd.hdr.tcp.th_seq) - 1;
8823				pf_set_protostate(s, PF_PEER_SRC, PF_TCPS_PROXY_DST);
8824
8825				action = pf_synproxy(&pd, &s, &reason);
8826				break;
8827			} else {
8828				action = pf_test_rule(&r, &s, kif, m, off, &pd,
8829				    &a, &ruleset, inp);
8830			}
8831		}
8832		break;
8833	}
8834
8835	case IPPROTO_UDP: {
8836		if (!pf_pull_hdr(m, off, &pd.hdr.udp, sizeof(pd.hdr.udp),
8837		    &action, &reason, AF_INET6)) {
8838			if (action != PF_PASS)
8839				pd.act.log |= PF_LOG_FORCE;
8840			goto done;
8841		}
8842		pd.sport = &pd.hdr.udp.uh_sport;
8843		pd.dport = &pd.hdr.udp.uh_dport;
8844		if (pd.hdr.udp.uh_dport == 0 ||
8845		    ntohs(pd.hdr.udp.uh_ulen) > m->m_pkthdr.len - off ||
8846		    ntohs(pd.hdr.udp.uh_ulen) < sizeof(struct udphdr)) {
8847			action = PF_DROP;
8848			REASON_SET(&reason, PFRES_SHORT);
8849			goto done;
8850		}
8851		action = pf_test_state_udp(&s, kif, m, off, h, &pd);
8852		if (action == PF_PASS) {
8853			if (V_pfsync_update_state_ptr != NULL)
8854				V_pfsync_update_state_ptr(s);
8855			r = s->rule.ptr;
8856			a = s->anchor.ptr;
8857		} else if (s == NULL)
8858			action = pf_test_rule(&r, &s, kif, m, off, &pd,
8859			    &a, &ruleset, inp);
8860		break;
8861	}
8862
8863	case IPPROTO_SCTP: {
8864		if (!pf_pull_hdr(m, off, &pd.hdr.sctp, sizeof(pd.hdr.sctp),
8865		    &action, &reason, AF_INET6)) {
8866			if (action != PF_PASS)
8867				pd.act.log |= PF_LOG_FORCE;
8868			goto done;
8869		}
8870		pd.sport = &pd.hdr.sctp.src_port;
8871		pd.dport = &pd.hdr.sctp.dest_port;
8872		if (pd.hdr.sctp.src_port == 0 || pd.hdr.sctp.dest_port == 0) {
8873			action = PF_DROP;
8874			REASON_SET(&reason, PFRES_SHORT);
8875			goto done;
8876		}
8877		action = pf_normalize_sctp(dir, kif, m, 0, off, h, &pd);
8878		if (action == PF_DROP)
8879			goto done;
8880		action = pf_test_state_sctp(&s, kif, m, off, h, &pd,
8881		    &reason);
8882		if (action == PF_PASS) {
8883			if (V_pfsync_update_state_ptr != NULL)
8884				V_pfsync_update_state_ptr(s);
8885			r = s->rule.ptr;
8886			a = s->anchor.ptr;
8887		} else {
8888			action = pf_test_rule(&r, &s, kif, m, off,
8889			    &pd, &a, &ruleset, inp);
8890		}
8891		break;
8892	}
8893
8894	case IPPROTO_ICMP: {
8895		action = PF_DROP;
8896		DPFPRINTF(PF_DEBUG_MISC,
8897		    ("pf: dropping IPv6 packet with ICMPv4 payload\n"));
8898		goto done;
8899	}
8900
8901	case IPPROTO_ICMPV6: {
8902		if (!pf_pull_hdr(m, off, &pd.hdr.icmp6, sizeof(pd.hdr.icmp6),
8903		    &action, &reason, AF_INET6)) {
8904			if (action != PF_PASS)
8905				pd.act.log |= PF_LOG_FORCE;
8906			goto done;
8907		}
8908		action = pf_test_state_icmp(&s, kif, m, off, h, &pd, &reason);
8909		if (action == PF_PASS) {
8910			if (V_pfsync_update_state_ptr != NULL)
8911				V_pfsync_update_state_ptr(s);
8912			r = s->rule.ptr;
8913			a = s->anchor.ptr;
8914		} else if (s == NULL)
8915			action = pf_test_rule(&r, &s, kif, m, off, &pd,
8916			    &a, &ruleset, inp);
8917		break;
8918	}
8919
8920	default:
8921		action = pf_test_state_other(&s, kif, m, &pd);
8922		if (action == PF_PASS) {
8923			if (V_pfsync_update_state_ptr != NULL)
8924				V_pfsync_update_state_ptr(s);
8925			r = s->rule.ptr;
8926			a = s->anchor.ptr;
8927		} else if (s == NULL)
8928			action = pf_test_rule(&r, &s, kif, m, off, &pd,
8929			    &a, &ruleset, inp);
8930		break;
8931	}
8932
8933done:
8934	PF_RULES_RUNLOCK();
8935	if (n != m) {
8936		m_freem(n);
8937		n = NULL;
8938	}
8939
8940	/* handle dangerous IPv6 extension headers. */
8941	if (action == PF_PASS && rh_cnt &&
8942	    !((s && s->state_flags & PFSTATE_ALLOWOPTS) || r->allow_opts)) {
8943		action = PF_DROP;
8944		REASON_SET(&reason, PFRES_IPOPTIONS);
8945		pd.act.log = r->log;
8946		DPFPRINTF(PF_DEBUG_MISC,
8947		    ("pf: dropping packet with dangerous v6 headers\n"));
8948	}
8949
8950	if (s) {
8951		uint8_t log = pd.act.log;
8952		memcpy(&pd.act, &s->act, sizeof(struct pf_rule_actions));
8953		pd.act.log |= log;
8954		tag = s->tag;
8955		rt = s->rt;
8956	} else {
8957		tag = r->tag;
8958		rt = r->rt;
8959	}
8960
8961	if (tag > 0 && pf_tag_packet(m, &pd, tag)) {
8962		action = PF_DROP;
8963		REASON_SET(&reason, PFRES_MEMORY);
8964	}
8965
8966	pf_scrub_ip6(&m, &pd);
8967	if (pd.proto == IPPROTO_TCP && pd.act.max_mss)
8968		pf_normalize_mss(m, off, &pd);
8969
8970	if (pd.act.rtableid >= 0)
8971		M_SETFIB(m, pd.act.rtableid);
8972
8973	if (pd.act.flags & PFSTATE_SETPRIO) {
8974		if (pd.tos & IPTOS_LOWDELAY)
8975			use_2nd_queue = 1;
8976		if (vlan_set_pcp(m, pd.act.set_prio[use_2nd_queue])) {
8977			action = PF_DROP;
8978			REASON_SET(&reason, PFRES_MEMORY);
8979			pd.act.log = PF_LOG_FORCE;
8980			DPFPRINTF(PF_DEBUG_MISC,
8981			    ("pf: failed to allocate 802.1q mtag\n"));
8982		}
8983	}
8984
8985#ifdef ALTQ
8986	if (action == PF_PASS && pd.act.qid) {
8987		if (pd.pf_mtag == NULL &&
8988		    ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) {
8989			action = PF_DROP;
8990			REASON_SET(&reason, PFRES_MEMORY);
8991		} else {
8992			if (s != NULL)
8993				pd.pf_mtag->qid_hash = pf_state_hash(s);
8994			if (pd.tos & IPTOS_LOWDELAY)
8995				pd.pf_mtag->qid = pd.act.pqid;
8996			else
8997				pd.pf_mtag->qid = pd.act.qid;
8998			/* Add hints for ecn. */
8999			pd.pf_mtag->hdr = h;
9000		}
9001	}
9002#endif /* ALTQ */
9003
9004	if (dir == PF_IN && action == PF_PASS && (pd.proto == IPPROTO_TCP ||
9005	    pd.proto == IPPROTO_UDP) && s != NULL && s->nat_rule.ptr != NULL &&
9006	    (s->nat_rule.ptr->action == PF_RDR ||
9007	    s->nat_rule.ptr->action == PF_BINAT) &&
9008	    IN6_IS_ADDR_LOOPBACK(&pd.dst->v6))
9009		m->m_flags |= M_SKIP_FIREWALL;
9010
9011	/* XXX: Anybody working on it?! */
9012	if (r->divert.port)
9013		printf("pf: divert(9) is not supported for IPv6\n");
9014
9015	if (pd.act.log) {
9016		struct pf_krule		*lr;
9017		struct pf_krule_item	*ri;
9018
9019		if (s != NULL && s->nat_rule.ptr != NULL &&
9020		    s->nat_rule.ptr->log & PF_LOG_ALL)
9021			lr = s->nat_rule.ptr;
9022		else
9023			lr = r;
9024
9025		if (pd.act.log & PF_LOG_FORCE || lr->log & PF_LOG_ALL)
9026			PFLOG_PACKET(kif, m, AF_INET6, action, reason, lr, a, ruleset,
9027			    &pd, (s == NULL));
9028		if (s) {
9029			SLIST_FOREACH(ri, &s->match_rules, entry)
9030				if (ri->r->log & PF_LOG_ALL)
9031					PFLOG_PACKET(kif, m, AF_INET6, action, reason,
9032					    ri->r, a, ruleset, &pd, 0);
9033		}
9034	}
9035
9036	pf_counter_u64_critical_enter();
9037	pf_counter_u64_add_protected(&kif->pfik_bytes[1][dir == PF_OUT][action != PF_PASS],
9038	    pd.tot_len);
9039	pf_counter_u64_add_protected(&kif->pfik_packets[1][dir == PF_OUT][action != PF_PASS],
9040	    1);
9041
9042	if (action == PF_PASS || r->action == PF_DROP) {
9043		dirndx = (dir == PF_OUT);
9044		pf_counter_u64_add_protected(&r->packets[dirndx], 1);
9045		pf_counter_u64_add_protected(&r->bytes[dirndx], pd.tot_len);
9046		if (a != NULL) {
9047			pf_counter_u64_add_protected(&a->packets[dirndx], 1);
9048			pf_counter_u64_add_protected(&a->bytes[dirndx], pd.tot_len);
9049		}
9050		if (s != NULL) {
9051			if (s->nat_rule.ptr != NULL) {
9052				pf_counter_u64_add_protected(&s->nat_rule.ptr->packets[dirndx],
9053				    1);
9054				pf_counter_u64_add_protected(&s->nat_rule.ptr->bytes[dirndx],
9055				    pd.tot_len);
9056			}
9057			if (s->src_node != NULL) {
9058				counter_u64_add(s->src_node->packets[dirndx],
9059				    1);
9060				counter_u64_add(s->src_node->bytes[dirndx],
9061				    pd.tot_len);
9062			}
9063			if (s->nat_src_node != NULL) {
9064				counter_u64_add(s->nat_src_node->packets[dirndx],
9065				    1);
9066				counter_u64_add(s->nat_src_node->bytes[dirndx],
9067				    pd.tot_len);
9068			}
9069			dirndx = (dir == s->direction) ? 0 : 1;
9070			s->packets[dirndx]++;
9071			s->bytes[dirndx] += pd.tot_len;
9072		}
9073		tr = r;
9074		nr = (s != NULL) ? s->nat_rule.ptr : pd.nat_rule;
9075		if (nr != NULL && r == &V_pf_default_rule)
9076			tr = nr;
9077		if (tr->src.addr.type == PF_ADDR_TABLE)
9078			pfr_update_stats(tr->src.addr.p.tbl,
9079			    (s == NULL) ? pd.src :
9080			    &s->key[(s->direction == PF_IN)]->addr[0],
9081			    pd.af, pd.tot_len, dir == PF_OUT,
9082			    r->action == PF_PASS, tr->src.neg);
9083		if (tr->dst.addr.type == PF_ADDR_TABLE)
9084			pfr_update_stats(tr->dst.addr.p.tbl,
9085			    (s == NULL) ? pd.dst :
9086			    &s->key[(s->direction == PF_IN)]->addr[1],
9087			    pd.af, pd.tot_len, dir == PF_OUT,
9088			    r->action == PF_PASS, tr->dst.neg);
9089	}
9090	pf_counter_u64_critical_exit();
9091
9092	switch (action) {
9093	case PF_SYNPROXY_DROP:
9094		m_freem(*m0);
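		/* FALLTHROUGH */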
9095	case PF_DEFER:
9096		*m0 = NULL;
9097		action = PF_PASS;
9098		break;
9099	case PF_DROP:
9100		m_freem(*m0);
9101		*m0 = NULL;
9102		break;
9103	default:
9104		/* pf_route6() returns unlocked. */
9105		if (rt) {
9106			pf_route6(m0, r, kif->pfik_ifp, s, &pd, inp);
9107			goto out;
9108		}
9109		if (pf_dummynet(&pd, s, r, m0) != 0) {
9110			action = PF_DROP;
9111			REASON_SET(&reason, PFRES_MEMORY);
9112		}
9113		break;
9114	}
9115
9116	if (s && action != PF_DROP) {
9117		if (!s->if_index_in && dir == PF_IN)
9118			s->if_index_in = ifp->if_index;
9119		else if (!s->if_index_out && dir == PF_OUT)
9120			s->if_index_out = ifp->if_index;
9121	}
9122
9123	if (s)
9124		PF_STATE_UNLOCK(s);
9125
9126	/* If reassembled packet passed, create new fragments. */
9127	if (action == PF_PASS && *m0 && dir == PF_OUT &&
9128	    (mtag = m_tag_find(m, PACKET_TAG_PF_REASSEMBLED, NULL)) != NULL)
9129		action = pf_refragment6(ifp, m0, mtag, pflags & PFIL_FWD);
9130
9131out:
9132	SDT_PROBE4(pf, ip, test6, done, action, reason, r, s);
9133
9134	pf_sctp_multihome_delayed(&pd, off, kif, s, action);
9135
9136	return (action);
9137}
9138#endif /* INET6 */
9139