/*
 * Copyright (c) 2007-2013 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*	$apfw: pf_norm.c,v 1.10 2008/08/28 19:10:53 jhw Exp $ */
/*	$OpenBSD: pf_norm.c,v 1.107 2006/04/16 00:59:52 pascoe Exp $ */

/*
 * Copyright 2001 Niels Provos <provos@citi.umich.edu>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/filio.h>
#include <sys/fcntl.h>
#include <sys/socket.h>
#include <sys/kernel.h>
#include <sys/time.h>
#include <sys/random.h>
#include <sys/mcache.h>

#include <net/if.h>
#include <net/if_types.h>
#include <net/bpf.h>
#include <net/route.h>
#include <net/if_pflog.h>

#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_fsm.h>
#include <netinet/udp.h>
#include <netinet/ip_icmp.h>

#if INET6
#include <netinet/ip6.h>
#endif /* INET6 */

#include <net/pfvar.h>

struct pf_frent {
	LIST_ENTRY(pf_frent)	fr_next;
	struct mbuf		*fr_m;
#define fr_ip		fr_u.fru_ipv4
#define fr_ip6		fr_u.fru_ipv6
	union {
		struct ip	*fru_ipv4;
		struct ip6_hdr	*fru_ipv6;
	} fr_u;
	struct ip6_frag		fr_ip6f_opt;
	int			fr_ip6f_hlen;
};

struct pf_frcache {
	LIST_ENTRY(pf_frcache) fr_next;
	uint16_t	fr_off;
	uint16_t	fr_end;
};

#define PFFRAG_SEENLAST	0x0001	/* Seen the last fragment for this packet */
#define PFFRAG_NOBUFFER	0x0002	/* Non-buffering fragment cache */
#define PFFRAG_DROP	0x0004	/* Drop all fragments */
#define BUFFER_FRAGMENTS(fr)	(!((fr)->fr_flags & PFFRAG_NOBUFFER))
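
/*
 * pf tracks fragments in one of two ways, chosen by the scrub rule: with
 * buffering (the default) the fragment mbufs are queued on a pf_fragment
 * and a fully reassembled packet is handed back, while the non-buffering
 * cache (PFFRAG_NOBUFFER, used for the fragment crop/drop options) only
 * records the byte ranges already seen so that overlaps can be trimmed
 * or dropped as the fragments pass through unreassembled.
 */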

struct pf_fragment {
	RB_ENTRY(pf_fragment) fr_entry;
	TAILQ_ENTRY(pf_fragment) frag_next;
	struct pf_addr	fr_srcx;
	struct pf_addr	fr_dstx;
	u_int8_t	fr_p;		/* protocol of this fragment */
	u_int8_t	fr_flags;	/* status flags */
	u_int16_t	fr_max;		/* fragment data max */
#define fr_id		fr_uid.fru_id4
#define fr_id6		fr_uid.fru_id6
	union {
		u_int16_t	fru_id4;
		u_int32_t	fru_id6;
	} fr_uid;
	int		fr_af;
	u_int32_t	fr_timeout;
#define fr_queue	fr_u.fru_queue
#define fr_cache	fr_u.fru_cache
	union {
		LIST_HEAD(pf_fragq, pf_frent) fru_queue;	/* buffering */
		LIST_HEAD(pf_cacheq, pf_frcache) fru_cache;	/* non-buf */
	} fr_u;
};

static TAILQ_HEAD(pf_fragqueue, pf_fragment)	pf_fragqueue;
static TAILQ_HEAD(pf_cachequeue, pf_fragment)	pf_cachequeue;

static __inline int  pf_frag_compare(struct pf_fragment *,
    struct pf_fragment *);
static RB_HEAD(pf_frag_tree, pf_fragment)	pf_frag_tree, pf_cache_tree;
RB_PROTOTYPE_SC(__private_extern__, pf_frag_tree, pf_fragment, fr_entry,
    pf_frag_compare);
RB_GENERATE(pf_frag_tree, pf_fragment, fr_entry, pf_frag_compare);

/* Private prototypes */
static void pf_ip6hdr2key(struct pf_fragment *, struct ip6_hdr *,
    struct ip6_frag *);
static void pf_ip2key(struct pf_fragment *, struct ip *);
static void pf_remove_fragment(struct pf_fragment *);
static void pf_flush_fragments(void);
static void pf_free_fragment(struct pf_fragment *);
static struct pf_fragment *pf_find_fragment_by_key(struct pf_fragment *,
    struct pf_frag_tree *);
static __inline struct pf_fragment *
    pf_find_fragment_by_ipv4_header(struct ip *, struct pf_frag_tree *);
static __inline struct pf_fragment *
    pf_find_fragment_by_ipv6_header(struct ip6_hdr *, struct ip6_frag *,
    struct pf_frag_tree *);
static struct mbuf *pf_reassemble(struct mbuf **, struct pf_fragment **,
    struct pf_frent *, int);
static struct mbuf *pf_fragcache(struct mbuf **, struct ip *,
    struct pf_fragment **, int, int, int *);
static struct mbuf *pf_reassemble6(struct mbuf **, struct pf_fragment **,
    struct pf_frent *, int);
static struct mbuf *pf_frag6cache(struct mbuf **, struct ip6_hdr*,
    struct ip6_frag *, struct pf_fragment **, int, int, int, int *);
static int pf_normalize_tcpopt(struct pf_rule *, int, struct pfi_kif *,
    struct pf_pdesc *, struct mbuf *, struct tcphdr *, int, int *);

#define	DPFPRINTF(x) do {				\
	if (pf_status.debug >= PF_DEBUG_MISC) {		\
		printf("%s: ", __func__);		\
		printf x;				\
	}						\
} while (0)

/* Globals */
struct pool		 pf_frent_pl, pf_frag_pl;
static struct pool	 pf_cache_pl, pf_cent_pl;
struct pool		 pf_state_scrub_pl;

static int		 pf_nfrents, pf_ncache;

void
pf_normalize_init(void)
{
	pool_init(&pf_frent_pl, sizeof (struct pf_frent), 0, 0, 0, "pffrent",
	    NULL);
	pool_init(&pf_frag_pl, sizeof (struct pf_fragment), 0, 0, 0, "pffrag",
	    NULL);
	pool_init(&pf_cache_pl, sizeof (struct pf_fragment), 0, 0, 0,
	    "pffrcache", NULL);
	pool_init(&pf_cent_pl, sizeof (struct pf_frcache), 0, 0, 0, "pffrcent",
	    NULL);
	pool_init(&pf_state_scrub_pl, sizeof (struct pf_state_scrub), 0, 0, 0,
	    "pfstscr", NULL);

	pool_sethiwat(&pf_frag_pl, PFFRAG_FRAG_HIWAT);
	pool_sethardlimit(&pf_frent_pl, PFFRAG_FRENT_HIWAT, NULL, 0);
	pool_sethardlimit(&pf_cache_pl, PFFRAG_FRCACHE_HIWAT, NULL, 0);
	pool_sethardlimit(&pf_cent_pl, PFFRAG_FRCENT_HIWAT, NULL, 0);

	TAILQ_INIT(&pf_fragqueue);
	TAILQ_INIT(&pf_cachequeue);
}

#if 0
void
pf_normalize_destroy(void)
{
	pool_destroy(&pf_state_scrub_pl);
	pool_destroy(&pf_cent_pl);
	pool_destroy(&pf_cache_pl);
	pool_destroy(&pf_frag_pl);
	pool_destroy(&pf_frent_pl);
}
#endif

int
pf_normalize_isempty(void)
{
	return (TAILQ_EMPTY(&pf_fragqueue) && TAILQ_EMPTY(&pf_cachequeue));
}

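/*
 * Order fragments by (address family, protocol, fragment id, source,
 * destination) so pf_fragment entries can be kept in the red-black trees
 * above; RB_FIND() with a key built by pf_ip2key() or pf_ip6hdr2key()
 * depends on this being a total order.
 */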
static __inline int
pf_frag_compare(struct pf_fragment *a, struct pf_fragment *b)
{
	int	diff;

	if ((diff = a->fr_af - b->fr_af))
		return (diff);
	else if ((diff = a->fr_p - b->fr_p))
		return (diff);
	else {
		struct pf_addr *sa = &a->fr_srcx;
		struct pf_addr *sb = &b->fr_srcx;
		struct pf_addr *da = &a->fr_dstx;
		struct pf_addr *db = &b->fr_dstx;

		switch (a->fr_af) {
#ifdef INET
		case AF_INET:
			if ((diff = a->fr_id - b->fr_id))
				return (diff);
			else if (sa->v4.s_addr < sb->v4.s_addr)
				return (-1);
			else if (sa->v4.s_addr > sb->v4.s_addr)
				return (1);
			else if (da->v4.s_addr < db->v4.s_addr)
				return (-1);
			else if (da->v4.s_addr > db->v4.s_addr)
				return (1);
			break;
#endif
#ifdef INET6
		case AF_INET6:
			if ((diff = a->fr_id6 - b->fr_id6))
				return (diff);
			else if (sa->addr32[3] < sb->addr32[3])
				return (-1);
			else if (sa->addr32[3] > sb->addr32[3])
				return (1);
			else if (sa->addr32[2] < sb->addr32[2])
				return (-1);
			else if (sa->addr32[2] > sb->addr32[2])
				return (1);
			else if (sa->addr32[1] < sb->addr32[1])
				return (-1);
			else if (sa->addr32[1] > sb->addr32[1])
				return (1);
			else if (sa->addr32[0] < sb->addr32[0])
				return (-1);
			else if (sa->addr32[0] > sb->addr32[0])
				return (1);
			else if (da->addr32[3] < db->addr32[3])
				return (-1);
			else if (da->addr32[3] > db->addr32[3])
				return (1);
			else if (da->addr32[2] < db->addr32[2])
				return (-1);
			else if (da->addr32[2] > db->addr32[2])
				return (1);
			else if (da->addr32[1] < db->addr32[1])
				return (-1);
			else if (da->addr32[1] > db->addr32[1])
				return (1);
			else if (da->addr32[0] < db->addr32[0])
				return (-1);
			else if (da->addr32[0] > db->addr32[0])
				return (1);
			break;
#endif
		default:
			VERIFY(0 && "only IPv4 and IPv6 supported!");
			break;
		}
	}
	return (0);
}

void
pf_purge_expired_fragments(void)
{
	struct pf_fragment *frag;
	u_int32_t expire = pf_time_second() -
	    pf_default_rule.timeout[PFTM_FRAG];

	while ((frag = TAILQ_LAST(&pf_fragqueue, pf_fragqueue)) != NULL) {
		VERIFY(BUFFER_FRAGMENTS(frag));
		if (frag->fr_timeout > expire)
			break;

		switch (frag->fr_af) {
		case AF_INET:
			DPFPRINTF(("expiring IPv4 %d(0x%llx) from queue.\n",
			    ntohs(frag->fr_id),
			    (uint64_t)VM_KERNEL_ADDRPERM(frag)));
			break;
		case AF_INET6:
			DPFPRINTF(("expiring IPv6 %d(0x%llx) from queue.\n",
			    ntohl(frag->fr_id6),
			    (uint64_t)VM_KERNEL_ADDRPERM(frag)));
			break;
		default:
			VERIFY(0 && "only IPv4 and IPv6 supported");
			break;
		}
		pf_free_fragment(frag);
	}

	while ((frag = TAILQ_LAST(&pf_cachequeue, pf_cachequeue)) != NULL) {
		VERIFY(!BUFFER_FRAGMENTS(frag));
		if (frag->fr_timeout > expire)
			break;

		switch (frag->fr_af) {
		case AF_INET:
			DPFPRINTF(("expiring IPv4 %d(0x%llx) from cache.\n",
			    ntohs(frag->fr_id),
			    (uint64_t)VM_KERNEL_ADDRPERM(frag)));
			break;
		case AF_INET6:
			DPFPRINTF(("expiring IPv6 %d(0x%llx) from cache.\n",
			    ntohl(frag->fr_id6),
			    (uint64_t)VM_KERNEL_ADDRPERM(frag)));
			break;
		default:
			VERIFY(0 && "only IPv4 and IPv6 supported");
			break;
		}
		pf_free_fragment(frag);
		VERIFY(TAILQ_EMPTY(&pf_cachequeue) ||
		    TAILQ_LAST(&pf_cachequeue, pf_cachequeue) != frag);
	}
}

/*
 * Try to flush old fragments to make space for new ones
 */

static void
pf_flush_fragments(void)
{
	struct pf_fragment	*frag;
	int			 goal;

	goal = pf_nfrents * 9 / 10;
	DPFPRINTF(("trying to free > %d frents\n",
	    pf_nfrents - goal));
	while (goal < pf_nfrents) {
		frag = TAILQ_LAST(&pf_fragqueue, pf_fragqueue);
		if (frag == NULL)
			break;
		pf_free_fragment(frag);
	}

	goal = pf_ncache * 9 / 10;
	DPFPRINTF(("trying to free > %d cache entries\n",
	    pf_ncache - goal));
	while (goal < pf_ncache) {
		frag = TAILQ_LAST(&pf_cachequeue, pf_cachequeue);
		if (frag == NULL)
			break;
		pf_free_fragment(frag);
	}
}

/* Frees the fragments and all associated entries */

static void
pf_free_fragment(struct pf_fragment *frag)
{
	struct pf_frent		*frent;
	struct pf_frcache	*frcache;

	/* Free all fragments */
	if (BUFFER_FRAGMENTS(frag)) {
		for (frent = LIST_FIRST(&frag->fr_queue); frent;
		    frent = LIST_FIRST(&frag->fr_queue)) {
			LIST_REMOVE(frent, fr_next);

			m_freem(frent->fr_m);
			pool_put(&pf_frent_pl, frent);
			pf_nfrents--;
		}
	} else {
		for (frcache = LIST_FIRST(&frag->fr_cache); frcache;
		    frcache = LIST_FIRST(&frag->fr_cache)) {
			LIST_REMOVE(frcache, fr_next);

			VERIFY(LIST_EMPTY(&frag->fr_cache) ||
			    LIST_FIRST(&frag->fr_cache)->fr_off >
			    frcache->fr_end);

			pool_put(&pf_cent_pl, frcache);
			pf_ncache--;
		}
	}

	pf_remove_fragment(frag);
}

static void
pf_ip6hdr2key(struct pf_fragment *key, struct ip6_hdr *ip6,
    struct ip6_frag *fh)
{
	key->fr_p = fh->ip6f_nxt;
	key->fr_id6 = fh->ip6f_ident;
	key->fr_af = AF_INET6;
	key->fr_srcx.v6 = ip6->ip6_src;
	key->fr_dstx.v6 = ip6->ip6_dst;
}

static void
pf_ip2key(struct pf_fragment *key, struct ip *ip)
{
	key->fr_p = ip->ip_p;
	key->fr_id = ip->ip_id;
	key->fr_af = AF_INET;
	key->fr_srcx.v4.s_addr = ip->ip_src.s_addr;
	key->fr_dstx.v4.s_addr = ip->ip_dst.s_addr;
}

static struct pf_fragment *
pf_find_fragment_by_key(struct pf_fragment *key, struct pf_frag_tree *tree)
{
	struct pf_fragment *frag;

	frag = RB_FIND(pf_frag_tree, tree, key);
	if (frag != NULL) {
		/* XXX Are we sure we want to update the timeout? */
		frag->fr_timeout = pf_time_second();
		if (BUFFER_FRAGMENTS(frag)) {
			TAILQ_REMOVE(&pf_fragqueue, frag, frag_next);
			TAILQ_INSERT_HEAD(&pf_fragqueue, frag, frag_next);
		} else {
			TAILQ_REMOVE(&pf_cachequeue, frag, frag_next);
			TAILQ_INSERT_HEAD(&pf_cachequeue, frag, frag_next);
		}
	}

	return (frag);
}

static __inline struct pf_fragment *
pf_find_fragment_by_ipv4_header(struct ip *ip, struct pf_frag_tree *tree)
{
	struct pf_fragment key;
	pf_ip2key(&key, ip);
	return pf_find_fragment_by_key(&key, tree);
}

static __inline struct pf_fragment *
pf_find_fragment_by_ipv6_header(struct ip6_hdr *ip6, struct ip6_frag *fh,
    struct pf_frag_tree *tree)
{
	struct pf_fragment key;
	pf_ip6hdr2key(&key, ip6, fh);
	return pf_find_fragment_by_key(&key, tree);
}

/* Removes a fragment from the fragment queue and frees the fragment */

static void
pf_remove_fragment(struct pf_fragment *frag)
{
	if (BUFFER_FRAGMENTS(frag)) {
		RB_REMOVE(pf_frag_tree, &pf_frag_tree, frag);
		TAILQ_REMOVE(&pf_fragqueue, frag, frag_next);
		pool_put(&pf_frag_pl, frag);
	} else {
		RB_REMOVE(pf_frag_tree, &pf_cache_tree, frag);
		TAILQ_REMOVE(&pf_cachequeue, frag, frag_next);
		pool_put(&pf_cache_pl, frag);
	}
}

#define FR_IP_OFF(fr)	((ntohs((fr)->fr_ip->ip_off) & IP_OFFMASK) << 3)
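/*
 * FR_IP_OFF() yields the fragment offset in bytes: the low 13 bits of
 * ip_off (IP_OFFMASK) count 8-byte units, so e.g. an offset field of
 * 185 means byte offset 185 * 8 = 1480.
 */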
static struct mbuf *
pf_reassemble(struct mbuf **m0, struct pf_fragment **frag,
    struct pf_frent *frent, int mff)
{
	struct mbuf	*m = *m0, *m2;
	struct pf_frent	*frea, *next;
	struct pf_frent	*frep = NULL;
	struct ip	*ip = frent->fr_ip;
	int		 hlen = ip->ip_hl << 2;
	u_int16_t	 off = (ntohs(ip->ip_off) & IP_OFFMASK) << 3;
	u_int16_t	 ip_len = ntohs(ip->ip_len) - ip->ip_hl * 4;
	u_int16_t	 fr_max = ip_len + off;

	VERIFY(*frag == NULL || BUFFER_FRAGMENTS(*frag));

	/* Strip off ip header */
	m->m_data += hlen;
	m->m_len -= hlen;

	/* Create a new reassembly queue for this packet */
	if (*frag == NULL) {
		*frag = pool_get(&pf_frag_pl, PR_NOWAIT);
		if (*frag == NULL) {
			pf_flush_fragments();
			*frag = pool_get(&pf_frag_pl, PR_NOWAIT);
			if (*frag == NULL)
				goto drop_fragment;
		}

		(*frag)->fr_flags = 0;
		(*frag)->fr_max = 0;
		(*frag)->fr_af = AF_INET;
		(*frag)->fr_srcx.v4 = frent->fr_ip->ip_src;
		(*frag)->fr_dstx.v4 = frent->fr_ip->ip_dst;
		(*frag)->fr_p = frent->fr_ip->ip_p;
		(*frag)->fr_id = frent->fr_ip->ip_id;
		(*frag)->fr_timeout = pf_time_second();
		LIST_INIT(&(*frag)->fr_queue);

		RB_INSERT(pf_frag_tree, &pf_frag_tree, *frag);
		TAILQ_INSERT_HEAD(&pf_fragqueue, *frag, frag_next);

		/* We do not have a previous fragment */
		frep = NULL;
		goto insert;
	}

	/*
	 * Find a fragment after the current one:
	 *  - off contains the real shifted offset.
	 */
	LIST_FOREACH(frea, &(*frag)->fr_queue, fr_next) {
		if (FR_IP_OFF(frea) > off)
			break;
		frep = frea;
	}

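	/*
	 * frep is now the last fragment starting at or before off, frea the
	 * first one starting after it.  Head overlap with frep is trimmed
	 * below ("precut"); tail overlap with frea and its successors is
	 * trimmed or the covered fragments are discarded ("aftercut").
	 */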
	VERIFY(frep != NULL || frea != NULL);

	if (frep != NULL &&
	    FR_IP_OFF(frep) + ntohs(frep->fr_ip->ip_len) - frep->fr_ip->ip_hl *
	    4 > off) {
		u_int16_t	precut;

		precut = FR_IP_OFF(frep) + ntohs(frep->fr_ip->ip_len) -
		    frep->fr_ip->ip_hl * 4 - off;
		if (precut >= ip_len)
			goto drop_fragment;
		m_adj(frent->fr_m, precut);
		DPFPRINTF(("overlap -%d\n", precut));
		/* Enforce 8 byte boundaries */
		ip->ip_off = htons(ntohs(ip->ip_off) + (precut >> 3));
		off = (ntohs(ip->ip_off) & IP_OFFMASK) << 3;
		ip_len -= precut;
		ip->ip_len = htons(ip_len);
	}

	for (; frea != NULL && ip_len + off > FR_IP_OFF(frea);
	    frea = next) {
		u_int16_t	aftercut;

		aftercut = ip_len + off - FR_IP_OFF(frea);
		DPFPRINTF(("adjust overlap %d\n", aftercut));
		if (aftercut < ntohs(frea->fr_ip->ip_len) - frea->fr_ip->ip_hl
		    * 4) {
			frea->fr_ip->ip_len =
			    htons(ntohs(frea->fr_ip->ip_len) - aftercut);
			frea->fr_ip->ip_off = htons(ntohs(frea->fr_ip->ip_off) +
			    (aftercut >> 3));
			m_adj(frea->fr_m, aftercut);
			break;
		}

		/* This fragment is completely overlapped, lose it */
		next = LIST_NEXT(frea, fr_next);
		m_freem(frea->fr_m);
		LIST_REMOVE(frea, fr_next);
		pool_put(&pf_frent_pl, frea);
		pf_nfrents--;
	}

insert:
	/* Update maximum data size */
	if ((*frag)->fr_max < fr_max)
		(*frag)->fr_max = fr_max;
	/* This is the last segment */
	if (!mff)
		(*frag)->fr_flags |= PFFRAG_SEENLAST;

	if (frep == NULL)
		LIST_INSERT_HEAD(&(*frag)->fr_queue, frent, fr_next);
	else
		LIST_INSERT_AFTER(frep, frent, fr_next);

	/* Check if we are completely reassembled */
	if (!((*frag)->fr_flags & PFFRAG_SEENLAST))
		return (NULL);

	/* Check if we have all the data */
	off = 0;
	for (frep = LIST_FIRST(&(*frag)->fr_queue); frep; frep = next) {
		next = LIST_NEXT(frep, fr_next);

		off += ntohs(frep->fr_ip->ip_len) - frep->fr_ip->ip_hl * 4;
		if (off < (*frag)->fr_max &&
		    (next == NULL || FR_IP_OFF(next) != off)) {
			DPFPRINTF(("missing fragment at %d, next %d, max %d\n",
			    off, next == NULL ? -1 : FR_IP_OFF(next),
			    (*frag)->fr_max));
			return (NULL);
		}
	}
	DPFPRINTF(("%d < %d?\n", off, (*frag)->fr_max));
	if (off < (*frag)->fr_max)
		return (NULL);

	/* We have all the data */
	frent = LIST_FIRST(&(*frag)->fr_queue);
	VERIFY(frent != NULL);
	if ((frent->fr_ip->ip_hl << 2) + off > IP_MAXPACKET) {
		DPFPRINTF(("drop: too big: %d\n", off));
		pf_free_fragment(*frag);
		*frag = NULL;
		return (NULL);
	}
	next = LIST_NEXT(frent, fr_next);

	/* Magic from ip_input */
	ip = frent->fr_ip;
	m = frent->fr_m;
	m2 = m->m_next;
	m->m_next = NULL;
	m_cat(m, m2);
	pool_put(&pf_frent_pl, frent);
	pf_nfrents--;
	for (frent = next; frent != NULL; frent = next) {
		next = LIST_NEXT(frent, fr_next);

		m2 = frent->fr_m;
		pool_put(&pf_frent_pl, frent);
		pf_nfrents--;
		m_cat(m, m2);
	}

	ip->ip_src = (*frag)->fr_srcx.v4;
	ip->ip_dst = (*frag)->fr_dstx.v4;

	/* Remove from fragment queue */
	pf_remove_fragment(*frag);
	*frag = NULL;

	hlen = ip->ip_hl << 2;
	ip->ip_len = htons(off + hlen);
	m->m_len += hlen;
	m->m_data -= hlen;

	/* some debugging cruft by sklower, below, will go away soon */
	/* XXX this should be done elsewhere */
	if (m->m_flags & M_PKTHDR) {
		int plen = 0;
		for (m2 = m; m2; m2 = m2->m_next)
			plen += m2->m_len;
		m->m_pkthdr.len = plen;
	}

	DPFPRINTF(("complete: 0x%llx(%d)\n",
	    (uint64_t)VM_KERNEL_ADDRPERM(m), ntohs(ip->ip_len)));
	return (m);

drop_fragment:
	/* Oops - fail safe - drop packet */
	pool_put(&pf_frent_pl, frent);
	pf_nfrents--;
	m_freem(m);
	return (NULL);
}

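/*
 * Non-buffering counterpart of pf_reassemble(): instead of queueing
 * mbufs it keeps a sorted list of (fr_off, fr_end) byte ranges that have
 * already been passed.  Duplicate data is trimmed from the mbuf (or, with
 * the fragment drop option, the whole chain is marked PFFRAG_DROP) and
 * the fragment itself is forwarded unreassembled.
 */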
static struct mbuf *
pf_fragcache(struct mbuf **m0, struct ip *h, struct pf_fragment **frag, int mff,
    int drop, int *nomem)
{
	struct mbuf		*m = *m0;
	struct pf_frcache	*frp, *fra, *cur = NULL;
	int			 ip_len = ntohs(h->ip_len) - (h->ip_hl << 2);
	u_int16_t		 off = ntohs(h->ip_off) << 3;
	u_int16_t		 fr_max = ip_len + off;
	int			 hosed = 0;

	VERIFY(*frag == NULL || !BUFFER_FRAGMENTS(*frag));

	/* Create a new range queue for this packet */
	if (*frag == NULL) {
		*frag = pool_get(&pf_cache_pl, PR_NOWAIT);
		if (*frag == NULL) {
			pf_flush_fragments();
			*frag = pool_get(&pf_cache_pl, PR_NOWAIT);
			if (*frag == NULL)
				goto no_mem;
		}

		/* Get an entry for the queue */
		cur = pool_get(&pf_cent_pl, PR_NOWAIT);
		if (cur == NULL) {
			pool_put(&pf_cache_pl, *frag);
			*frag = NULL;
			goto no_mem;
		}
		pf_ncache++;

		(*frag)->fr_flags = PFFRAG_NOBUFFER;
		(*frag)->fr_max = 0;
		(*frag)->fr_af = AF_INET;
		(*frag)->fr_srcx.v4 = h->ip_src;
		(*frag)->fr_dstx.v4 = h->ip_dst;
		(*frag)->fr_p = h->ip_p;
		(*frag)->fr_id = h->ip_id;
		(*frag)->fr_timeout = pf_time_second();

		cur->fr_off = off;
		cur->fr_end = fr_max;
		LIST_INIT(&(*frag)->fr_cache);
		LIST_INSERT_HEAD(&(*frag)->fr_cache, cur, fr_next);

		RB_INSERT(pf_frag_tree, &pf_cache_tree, *frag);
		TAILQ_INSERT_HEAD(&pf_cachequeue, *frag, frag_next);

		DPFPRINTF(("fragcache[%d]: new %d-%d\n", h->ip_id, off,
		    fr_max));

		goto pass;
	}

	/*
	 * Find a fragment after the current one:
	 *  - off contains the real shifted offset.
	 */
	frp = NULL;
	LIST_FOREACH(fra, &(*frag)->fr_cache, fr_next) {
		if (fra->fr_off > off)
			break;
		frp = fra;
	}

	VERIFY(frp != NULL || fra != NULL);

	if (frp != NULL) {
		int	precut;

		precut = frp->fr_end - off;
		if (precut >= ip_len) {
			/* Fragment is entirely a duplicate */
			DPFPRINTF(("fragcache[%d]: dead (%d-%d) %d-%d\n",
			    h->ip_id, frp->fr_off, frp->fr_end, off, fr_max));
			goto drop_fragment;
		}
		if (precut == 0) {
			/* They are adjacent.  Fixup cache entry */
			DPFPRINTF(("fragcache[%d]: adjacent (%d-%d) %d-%d\n",
			    h->ip_id, frp->fr_off, frp->fr_end, off, fr_max));
			frp->fr_end = fr_max;
		} else if (precut > 0) {
			/*
			 * The first part of this payload overlaps with a
			 * fragment that has already been passed.
			 * Need to trim off the first part of the payload.
			 * But to do so easily, we need to create another
			 * mbuf to throw the original header into.
			 */

			DPFPRINTF(("fragcache[%d]: chop %d (%d-%d) %d-%d\n",
			    h->ip_id, precut, frp->fr_off, frp->fr_end, off,
			    fr_max));

			off += precut;
			fr_max -= precut;
			/* Update the previous frag to encompass this one */
			frp->fr_end = fr_max;

			if (!drop) {
				/*
				 * XXX Optimization opportunity
				 * This is a very heavy way to trim the payload.
				 * we could do it much faster by diddling mbuf
				 * internals but that would be even less legible
				 * than this mbuf magic.  For my next trick,
				 * I'll pull a rabbit out of my laptop.
				 */
				*m0 = m_copym(m, 0, h->ip_hl << 2, M_NOWAIT);
				if (*m0 == NULL)
					goto no_mem;
				VERIFY((*m0)->m_next == NULL);
				m_adj(m, precut + (h->ip_hl << 2));
				m_cat(*m0, m);
				m = *m0;
				if (m->m_flags & M_PKTHDR) {
					int plen = 0;
					struct mbuf *t;
					for (t = m; t; t = t->m_next)
						plen += t->m_len;
					m->m_pkthdr.len = plen;
				}

				h = mtod(m, struct ip *);

				VERIFY((int)m->m_len ==
				    ntohs(h->ip_len) - precut);
				h->ip_off = htons(ntohs(h->ip_off) +
				    (precut >> 3));
				h->ip_len = htons(ntohs(h->ip_len) - precut);
			} else {
				hosed++;
			}
		} else {
			/* There is a gap between fragments */

			DPFPRINTF(("fragcache[%d]: gap %d (%d-%d) %d-%d\n",
			    h->ip_id, -precut, frp->fr_off, frp->fr_end, off,
			    fr_max));

			cur = pool_get(&pf_cent_pl, PR_NOWAIT);
			if (cur == NULL)
				goto no_mem;
			pf_ncache++;

			cur->fr_off = off;
			cur->fr_end = fr_max;
			LIST_INSERT_AFTER(frp, cur, fr_next);
		}
	}

	if (fra != NULL) {
		int	aftercut;
		int	merge = 0;

		aftercut = fr_max - fra->fr_off;
		if (aftercut == 0) {
			/* Adjacent fragments */
			DPFPRINTF(("fragcache[%d]: adjacent %d-%d (%d-%d)\n",
			    h->ip_id, off, fr_max, fra->fr_off, fra->fr_end));
			fra->fr_off = off;
			merge = 1;
		} else if (aftercut > 0) {
			/* Need to chop off the tail of this fragment */
			DPFPRINTF(("fragcache[%d]: chop %d %d-%d (%d-%d)\n",
			    h->ip_id, aftercut, off, fr_max, fra->fr_off,
			    fra->fr_end));
			fra->fr_off = off;
			fr_max -= aftercut;

			merge = 1;

			if (!drop) {
				m_adj(m, -aftercut);
				if (m->m_flags & M_PKTHDR) {
					int plen = 0;
					struct mbuf *t;
					for (t = m; t; t = t->m_next)
						plen += t->m_len;
					m->m_pkthdr.len = plen;
				}
				h = mtod(m, struct ip *);
				VERIFY((int)m->m_len ==
				    ntohs(h->ip_len) - aftercut);
				h->ip_len = htons(ntohs(h->ip_len) - aftercut);
			} else {
				hosed++;
			}
		} else if (frp == NULL) {
			/* There is a gap between fragments */
			DPFPRINTF(("fragcache[%d]: gap %d %d-%d (%d-%d)\n",
			    h->ip_id, -aftercut, off, fr_max, fra->fr_off,
			    fra->fr_end));

			cur = pool_get(&pf_cent_pl, PR_NOWAIT);
			if (cur == NULL)
				goto no_mem;
			pf_ncache++;

			cur->fr_off = off;
			cur->fr_end = fr_max;
			LIST_INSERT_BEFORE(fra, cur, fr_next);
		}

		/* Need to glue together two separate fragment descriptors */
		if (merge) {
			if (cur && fra->fr_off <= cur->fr_end) {
				/* Need to merge in a previous 'cur' */
				DPFPRINTF(("fragcache[%d]: adjacent(merge "
				    "%d-%d) %d-%d (%d-%d)\n",
				    h->ip_id, cur->fr_off, cur->fr_end, off,
				    fr_max, fra->fr_off, fra->fr_end));
				fra->fr_off = cur->fr_off;
				LIST_REMOVE(cur, fr_next);
				pool_put(&pf_cent_pl, cur);
				pf_ncache--;
				cur = NULL;

			} else if (frp && fra->fr_off <= frp->fr_end) {
				/* Need to merge in a modified 'frp' */
				VERIFY(cur == NULL);
				DPFPRINTF(("fragcache[%d]: adjacent(merge "
				    "%d-%d) %d-%d (%d-%d)\n",
				    h->ip_id, frp->fr_off, frp->fr_end, off,
				    fr_max, fra->fr_off, fra->fr_end));
				fra->fr_off = frp->fr_off;
				LIST_REMOVE(frp, fr_next);
				pool_put(&pf_cent_pl, frp);
				pf_ncache--;
				frp = NULL;

			}
		}
	}

	if (hosed) {
		/*
		 * We must keep tracking the overall fragment even when
		 * we're going to drop it anyway so that we know when to
		 * free the overall descriptor.  Thus we drop the frag late.
		 */
		goto drop_fragment;
	}

pass:
	/* Update maximum data size */
	if ((*frag)->fr_max < fr_max)
		(*frag)->fr_max = fr_max;

	/* This is the last segment */
	if (!mff)
		(*frag)->fr_flags |= PFFRAG_SEENLAST;

	/* Check if we are completely reassembled */
	if (((*frag)->fr_flags & PFFRAG_SEENLAST) &&
	    LIST_FIRST(&(*frag)->fr_cache)->fr_off == 0 &&
	    LIST_FIRST(&(*frag)->fr_cache)->fr_end == (*frag)->fr_max) {
		/* Remove from fragment queue */
		DPFPRINTF(("fragcache[%d]: done 0-%d\n", h->ip_id,
		    (*frag)->fr_max));
		pf_free_fragment(*frag);
		*frag = NULL;
	}

	return (m);

no_mem:
	*nomem = 1;

	/* Still need to pay attention to !IP_MF */
	if (!mff && *frag != NULL)
		(*frag)->fr_flags |= PFFRAG_SEENLAST;

	m_freem(m);
	return (NULL);

drop_fragment:

	/* Still need to pay attention to !IP_MF */
	if (!mff && *frag != NULL)
		(*frag)->fr_flags |= PFFRAG_SEENLAST;

	if (drop) {
		/* This fragment has been deemed bad.  Don't reass */
		if (((*frag)->fr_flags & PFFRAG_DROP) == 0)
			DPFPRINTF(("fragcache[%d]: dropping overall fragment\n",
			    h->ip_id));
		(*frag)->fr_flags |= PFFRAG_DROP;
	}

	m_freem(m);
	return (NULL);
}

#define FR_IP6_OFF(fr) \
	(ntohs((fr)->fr_ip6f_opt.ip6f_offlg & IP6F_OFF_MASK))
#define FR_IP6_PLEN(fr) (ntohs((fr)->fr_ip6->ip6_plen))
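/*
 * The 13-bit IPv6 fragment offset sits in the upper bits of ip6f_offlg
 * and thus already counts bytes once masked with IP6F_OFF_MASK and
 * byte-swapped, so FR_IP6_OFF() needs no shift; FR_IP6_PLEN() is the
 * payload length taken from the fragment's own IPv6 header.
 */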
static struct mbuf *
pf_reassemble6(struct mbuf **m0, struct pf_fragment **frag,
    struct pf_frent *frent, int mff)
{
	struct mbuf *m, *m2;
	struct pf_frent *frea, *frep, *next;
	struct ip6_hdr *ip6;
	int plen, off, fr_max;

	VERIFY(*frag == NULL || BUFFER_FRAGMENTS(*frag));
	m = *m0;
	frep = NULL;
	ip6 = frent->fr_ip6;
	off = FR_IP6_OFF(frent);
	plen = FR_IP6_PLEN(frent);
	fr_max = off + plen - (frent->fr_ip6f_hlen - sizeof *ip6);

	DPFPRINTF(("0x%llx IPv6 frag plen %u off %u fr_ip6f_hlen %u "
	    "fr_max %u m_len %u\n", (uint64_t)VM_KERNEL_ADDRPERM(m), plen, off,
	    frent->fr_ip6f_hlen, fr_max, m->m_len));

	/* strip off headers up to the fragment payload */
	m->m_data += frent->fr_ip6f_hlen;
	m->m_len -= frent->fr_ip6f_hlen;

	/* Create a new reassembly queue for this packet */
	if (*frag == NULL) {
		*frag = pool_get(&pf_frag_pl, PR_NOWAIT);
		if (*frag == NULL) {
			pf_flush_fragments();
			*frag = pool_get(&pf_frag_pl, PR_NOWAIT);
			if (*frag == NULL)
				goto drop_fragment;
		}

		(*frag)->fr_flags = 0;
		(*frag)->fr_max = 0;
		(*frag)->fr_af = AF_INET6;
		(*frag)->fr_srcx.v6 = frent->fr_ip6->ip6_src;
		(*frag)->fr_dstx.v6 = frent->fr_ip6->ip6_dst;
		(*frag)->fr_p = frent->fr_ip6f_opt.ip6f_nxt;
		(*frag)->fr_id6 = frent->fr_ip6f_opt.ip6f_ident;
		(*frag)->fr_timeout = pf_time_second();
		LIST_INIT(&(*frag)->fr_queue);

		RB_INSERT(pf_frag_tree, &pf_frag_tree, *frag);
		TAILQ_INSERT_HEAD(&pf_fragqueue, *frag, frag_next);

		/* We do not have a previous fragment */
		frep = NULL;
		goto insert;
	}

	/*
	 * Find a fragment after the current one:
	 *  - off contains the real shifted offset.
	 */
	LIST_FOREACH(frea, &(*frag)->fr_queue, fr_next) {
		if (FR_IP6_OFF(frea) > off)
			break;
		frep = frea;
	}

	VERIFY(frep != NULL || frea != NULL);

	if (frep != NULL &&
	    FR_IP6_OFF(frep) + FR_IP6_PLEN(frep) - frep->fr_ip6f_hlen > off) {
		u_int16_t precut;

		precut = FR_IP6_OFF(frep) + FR_IP6_PLEN(frep) -
		    frep->fr_ip6f_hlen - off;
		if (precut >= plen)
			goto drop_fragment;
		m_adj(frent->fr_m, precut);
		DPFPRINTF(("overlap -%d\n", precut));
		/*
		 * Advance the fragment offset past the trimmed data;
		 * ip6f_offlg carries the offset in bytes in its upper 13
		 * bits (8-byte aligned), so add precut, not precut >> 3.
		 */
		frent->fr_ip6f_opt.ip6f_offlg =
		    (frent->fr_ip6f_opt.ip6f_offlg & ~IP6F_OFF_MASK) |
		    (htons(off + precut) & IP6F_OFF_MASK);
		off = FR_IP6_OFF(frent);
		plen -= precut;
		ip6->ip6_plen = htons(plen);
	}

	for (; frea != NULL && plen + off > FR_IP6_OFF(frea); frea = next) {
		u_int16_t	aftercut;

		aftercut = plen + off - FR_IP6_OFF(frea);
		DPFPRINTF(("adjust overlap %d\n", aftercut));
		if (aftercut < FR_IP6_PLEN(frea) - frea->fr_ip6f_hlen) {
			frea->fr_ip6->ip6_plen = htons(FR_IP6_PLEN(frea) -
			    aftercut);
			/* advance frea's byte offset past the trimmed data */
			frea->fr_ip6f_opt.ip6f_offlg =
			    (frea->fr_ip6f_opt.ip6f_offlg & ~IP6F_OFF_MASK) |
			    (htons(FR_IP6_OFF(frea) + aftercut) &
			    IP6F_OFF_MASK);
			m_adj(frea->fr_m, aftercut);
			break;
		}

		/* This fragment is completely overlapped, lose it */
		next = LIST_NEXT(frea, fr_next);
		m_freem(frea->fr_m);
		LIST_REMOVE(frea, fr_next);
		pool_put(&pf_frent_pl, frea);
		pf_nfrents--;
	}

insert:
	/* Update maximum data size */
	if ((*frag)->fr_max < fr_max)
		(*frag)->fr_max = fr_max;
	/* This is the last segment */
	if (!mff)
		(*frag)->fr_flags |= PFFRAG_SEENLAST;

	if (frep == NULL)
		LIST_INSERT_HEAD(&(*frag)->fr_queue, frent, fr_next);
	else
		LIST_INSERT_AFTER(frep, frent, fr_next);

	/* Check if we are completely reassembled */
	if (!((*frag)->fr_flags & PFFRAG_SEENLAST))
		return (NULL);

	/* Check if we have all the data */
	off = 0;
	for (frep = LIST_FIRST(&(*frag)->fr_queue); frep; frep = next) {
		next = LIST_NEXT(frep, fr_next);
		off += FR_IP6_PLEN(frep) - (frep->fr_ip6f_hlen - sizeof *ip6);
		DPFPRINTF(("frep at %d, next %d, max %d\n",
		    off, next == NULL ? -1 : FR_IP6_OFF(next),
		    (*frag)->fr_max));
		if (off < (*frag)->fr_max &&
		    (next == NULL || FR_IP6_OFF(next) != off)) {
			DPFPRINTF(("missing fragment at %d, next %d, max %d\n",
			    off, next == NULL ? -1 : FR_IP6_OFF(next),
			    (*frag)->fr_max));
			return (NULL);
		}
	}
	DPFPRINTF(("%d < %d?\n", off, (*frag)->fr_max));
	if (off < (*frag)->fr_max)
		return (NULL);

	/* We have all the data */
	frent = LIST_FIRST(&(*frag)->fr_queue);
	VERIFY(frent != NULL);
	if (frent->fr_ip6f_hlen + off > IP_MAXPACKET) {
		DPFPRINTF(("drop: too big: %d\n", off));
		pf_free_fragment(*frag);
		*frag = NULL;
		return (NULL);
	}

	ip6 = frent->fr_ip6;
	ip6->ip6_nxt = (*frag)->fr_p;
	ip6->ip6_plen = htons(off);
	ip6->ip6_src = (*frag)->fr_srcx.v6;
	ip6->ip6_dst = (*frag)->fr_dstx.v6;

	/* Remove from fragment queue */
	pf_remove_fragment(*frag);
	*frag = NULL;

	m = frent->fr_m;
	m->m_len += sizeof(struct ip6_hdr);
	m->m_data -= sizeof(struct ip6_hdr);
	memmove(m->m_data, ip6, sizeof(struct ip6_hdr));

	next = LIST_NEXT(frent, fr_next);
	pool_put(&pf_frent_pl, frent);
	pf_nfrents--;
	for (frent = next; next != NULL; frent = next) {
		m2 = frent->fr_m;

		m_cat(m, m2);
		next = LIST_NEXT(frent, fr_next);
		pool_put(&pf_frent_pl, frent);
		pf_nfrents--;
	}

	/* XXX this should be done elsewhere */
	if (m->m_flags & M_PKTHDR) {
		int pktlen = 0;
		for (m2 = m; m2; m2 = m2->m_next)
			pktlen += m2->m_len;
		m->m_pkthdr.len = pktlen;
	}

	DPFPRINTF(("complete: 0x%llx ip6_plen %d m_pkthdr.len %d\n",
	    (uint64_t)VM_KERNEL_ADDRPERM(m), ntohs(ip6->ip6_plen),
	    m->m_pkthdr.len));

	return (m);

drop_fragment:
	/* Oops - fail safe - drop packet */
	pool_put(&pf_frent_pl, frent);
	pf_nfrents--;
	m_freem(m);
	return (NULL);
}

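/*
 * IPv6 counterpart of pf_fragcache(): tracks the byte ranges already
 * passed for a fragment id and trims or drops overlapping data without
 * buffering the fragments themselves.
 */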
static struct mbuf *
pf_frag6cache(struct mbuf **m0, struct ip6_hdr *h, struct ip6_frag *fh,
    struct pf_fragment **frag, int hlen, int mff, int drop, int *nomem)
{
	struct mbuf *m = *m0;
	u_int16_t plen, off, fr_max;
	struct pf_frcache *frp, *fra, *cur = NULL;
	int hosed = 0;

	VERIFY(*frag == NULL || !BUFFER_FRAGMENTS(*frag));
	m = *m0;
	off = ntohs(fh->ip6f_offlg & IP6F_OFF_MASK);
	plen = ntohs(h->ip6_plen) - (hlen - sizeof *h);

	/*
	 * Apple Modification: dimambro@apple.com. The hlen passed into
	 * this function includes all the headers associated with the
	 * packet, and may include routing headers, so to get to the data
	 * payload as stored in the original IPv6 header we need to
	 * subtract all those headers and the IPv6 header itself.
	 *
	 * The 'fr_max' local variable should contain the offset from the
	 * start of the reassembled packet to the octet just past the end
	 * of the octets in the current fragment where:
	 * - 'off' is the offset from the start of the reassembled packet
	 *   to the first octet in the fragment,
	 * - 'plen' is the "payload data length", excluding all the IPv6
	 *   headers of the fragment,
	 * - 'hlen' is computed in pf_normalize_ip6() as the offset from
	 *   the start of the IPv6 packet to the beginning of the data.
	 */
	fr_max = off + plen;

	DPFPRINTF(("0x%llx plen %u off %u fr_max %u\n",
	    (uint64_t)VM_KERNEL_ADDRPERM(m), plen, off, fr_max));

	/* Create a new range queue for this packet */
	if (*frag == NULL) {
		*frag = pool_get(&pf_cache_pl, PR_NOWAIT);
		if (*frag == NULL) {
			pf_flush_fragments();
			*frag = pool_get(&pf_cache_pl, PR_NOWAIT);
			if (*frag == NULL)
				goto no_mem;
		}

		/* Get an entry for the queue */
		cur = pool_get(&pf_cent_pl, PR_NOWAIT);
		if (cur == NULL) {
			pool_put(&pf_cache_pl, *frag);
			*frag = NULL;
			goto no_mem;
		}
		pf_ncache++;

		(*frag)->fr_flags = PFFRAG_NOBUFFER;
		(*frag)->fr_max = 0;
		(*frag)->fr_af = AF_INET6;
		(*frag)->fr_srcx.v6 = h->ip6_src;
		(*frag)->fr_dstx.v6 = h->ip6_dst;
		(*frag)->fr_p = fh->ip6f_nxt;
		(*frag)->fr_id6 = fh->ip6f_ident;
		(*frag)->fr_timeout = pf_time_second();

		cur->fr_off = off;
		cur->fr_end = fr_max;
		LIST_INIT(&(*frag)->fr_cache);
		LIST_INSERT_HEAD(&(*frag)->fr_cache, cur, fr_next);

		RB_INSERT(pf_frag_tree, &pf_cache_tree, *frag);
		TAILQ_INSERT_HEAD(&pf_cachequeue, *frag, frag_next);

		DPFPRINTF(("frag6cache[%d]: new %d-%d\n", ntohl(fh->ip6f_ident),
		    off, fr_max));

		goto pass;
	}

	/*
	 * Find a fragment after the current one:
	 *  - off contains the real shifted offset.
	 */
	frp = NULL;
	LIST_FOREACH(fra, &(*frag)->fr_cache, fr_next) {
		if (fra->fr_off > off)
			break;
		frp = fra;
	}

	VERIFY(frp != NULL || fra != NULL);

	if (frp != NULL) {
		int precut;

		precut = frp->fr_end - off;
		if (precut >= plen) {
			/* Fragment is entirely a duplicate */
			DPFPRINTF(("frag6cache[%u]: dead (%d-%d) %d-%d\n",
			    ntohl(fh->ip6f_ident), frp->fr_off, frp->fr_end,
			    off, fr_max));
			goto drop_fragment;
		}
		if (precut == 0) {
			/* They are adjacent.  Fixup cache entry */
			DPFPRINTF(("frag6cache[%u]: adjacent (%d-%d) %d-%d\n",
			    ntohl(fh->ip6f_ident), frp->fr_off, frp->fr_end,
			    off, fr_max));
			frp->fr_end = fr_max;
		} else if (precut > 0) {
			/*
			 * The first part of this payload overlaps with a
			 * fragment that has already been passed.
			 * Need to trim off the first part of the payload.
			 * But to do so easily, we need to create another
			 * mbuf to throw the original header into.
			 */

			DPFPRINTF(("frag6cache[%u]: chop %d (%d-%d) %d-%d\n",
			    ntohl(fh->ip6f_ident), precut, frp->fr_off,
			    frp->fr_end, off, fr_max));

			off += precut;
			fr_max -= precut;
			/* Update the previous frag to encompass this one */
			frp->fr_end = fr_max;

			if (!drop) {
				/*
				 * XXX Optimization opportunity
				 * This is a very heavy way to trim the payload.
				 * we could do it much faster by diddling mbuf
				 * internals but that would be even less legible
				 * than this mbuf magic.  For my next trick,
				 * I'll pull a rabbit out of my laptop.
				 */
				*m0 = m_copym(m, 0, hlen, M_NOWAIT);
				if (*m0 == NULL)
					goto no_mem;
				VERIFY((*m0)->m_next == NULL);
				m_adj(m, precut + hlen);
				m_cat(*m0, m);
				m = *m0;
				if (m->m_flags & M_PKTHDR) {
					int pktlen = 0;
					struct mbuf *t;
					for (t = m; t; t = t->m_next)
						pktlen += t->m_len;
					m->m_pkthdr.len = pktlen;
				}

				h = mtod(m, struct ip6_hdr *);

				VERIFY((int)m->m_len ==
				    ntohs(h->ip6_plen) - precut);
				/* byte offset lives in the upper 13 bits */
				fh->ip6f_offlg =
				    (fh->ip6f_offlg & ~IP6F_OFF_MASK) |
				    (htons(off) & IP6F_OFF_MASK);
				h->ip6_plen = htons(ntohs(h->ip6_plen) -
				    precut);
			} else {
				hosed++;
			}
		} else {
			/* There is a gap between fragments */

			DPFPRINTF(("frag6cache[%u]: gap %d (%d-%d) %d-%d\n",
			    ntohl(fh->ip6f_ident), -precut, frp->fr_off,
			    frp->fr_end, off, fr_max));

			cur = pool_get(&pf_cent_pl, PR_NOWAIT);
			if (cur == NULL)
				goto no_mem;
			pf_ncache++;

			cur->fr_off = off;
			cur->fr_end = fr_max;
			LIST_INSERT_AFTER(frp, cur, fr_next);
		}
	}

	if (fra != NULL) {
		int	aftercut;
		int	merge = 0;

		aftercut = fr_max - fra->fr_off;
		if (aftercut == 0) {
			/* Adjacent fragments */
			DPFPRINTF(("frag6cache[%u]: adjacent %d-%d (%d-%d)\n",
			    ntohl(fh->ip6f_ident), off, fr_max, fra->fr_off,
			    fra->fr_end));
			fra->fr_off = off;
			merge = 1;
		} else if (aftercut > 0) {
			/* Need to chop off the tail of this fragment */
			DPFPRINTF(("frag6cache[%u]: chop %d %d-%d (%d-%d)\n",
			    ntohl(fh->ip6f_ident), aftercut, off, fr_max,
			    fra->fr_off, fra->fr_end));
			fra->fr_off = off;
			fr_max -= aftercut;

			merge = 1;

			if (!drop) {
				m_adj(m, -aftercut);
				if (m->m_flags & M_PKTHDR) {
					int pktlen = 0;
					struct mbuf *t;
					for (t = m; t; t = t->m_next)
						pktlen += t->m_len;
					m->m_pkthdr.len = pktlen;
				}
				h = mtod(m, struct ip6_hdr *);
				VERIFY((int)m->m_len ==
				    ntohs(h->ip6_plen) - aftercut);
				h->ip6_plen =
				    htons(ntohs(h->ip6_plen) - aftercut);
			} else {
				hosed++;
			}
		} else if (frp == NULL) {
			/* There is a gap between fragments */
			DPFPRINTF(("frag6cache[%u]: gap %d %d-%d (%d-%d)\n",
			    ntohl(fh->ip6f_ident), -aftercut, off, fr_max,
			    fra->fr_off, fra->fr_end));

			cur = pool_get(&pf_cent_pl, PR_NOWAIT);
			if (cur == NULL)
				goto no_mem;
			pf_ncache++;

			cur->fr_off = off;
			cur->fr_end = fr_max;
			LIST_INSERT_BEFORE(fra, cur, fr_next);
		}

		/* Need to glue together two separate fragment descriptors */
		if (merge) {
			if (cur && fra->fr_off <= cur->fr_end) {
				/* Need to merge in a previous 'cur' */
				DPFPRINTF(("frag6cache[%u]: adjacent(merge "
				    "%d-%d) %d-%d (%d-%d)\n",
				    ntohl(fh->ip6f_ident), cur->fr_off,
				    cur->fr_end, off, fr_max, fra->fr_off,
				    fra->fr_end));
				fra->fr_off = cur->fr_off;
				LIST_REMOVE(cur, fr_next);
				pool_put(&pf_cent_pl, cur);
				pf_ncache--;
				cur = NULL;
			} else if (frp && fra->fr_off <= frp->fr_end) {
				/* Need to merge in a modified 'frp' */
				VERIFY(cur == NULL);
				DPFPRINTF(("frag6cache[%u]: adjacent(merge "
				    "%d-%d) %d-%d (%d-%d)\n",
				    ntohl(fh->ip6f_ident), frp->fr_off,
				    frp->fr_end, off, fr_max, fra->fr_off,
				    fra->fr_end));
				fra->fr_off = frp->fr_off;
				LIST_REMOVE(frp, fr_next);
				pool_put(&pf_cent_pl, frp);
				pf_ncache--;
				frp = NULL;
			}
		}
	}

	if (hosed) {
		/*
		 * We must keep tracking the overall fragment even when
		 * we're going to drop it anyway so that we know when to
		 * free the overall descriptor.  Thus we drop the frag late.
		 */
		goto drop_fragment;
	}

pass:
	/* Update maximum data size */
	if ((*frag)->fr_max < fr_max)
		(*frag)->fr_max = fr_max;

	/* This is the last segment */
	if (!mff)
		(*frag)->fr_flags |= PFFRAG_SEENLAST;

	/* Check if we are completely reassembled */
	if (((*frag)->fr_flags & PFFRAG_SEENLAST) &&
	    LIST_FIRST(&(*frag)->fr_cache)->fr_off == 0 &&
	    LIST_FIRST(&(*frag)->fr_cache)->fr_end == (*frag)->fr_max) {
		/* Remove from fragment queue */
		DPFPRINTF(("frag6cache[%u]: done 0-%d\n",
		    ntohl(fh->ip6f_ident), (*frag)->fr_max));
		pf_free_fragment(*frag);
		*frag = NULL;
	}

	return (m);

no_mem:
	*nomem = 1;

	/* Still need to pay attention to !IP_MF */
	if (!mff && *frag != NULL)
		(*frag)->fr_flags |= PFFRAG_SEENLAST;

	m_freem(m);
	return (NULL);

drop_fragment:

	/* Still need to pay attention to !IP_MF */
	if (!mff && *frag != NULL)
		(*frag)->fr_flags |= PFFRAG_SEENLAST;

	if (drop) {
		/* This fragment has been deemed bad.  Don't reass */
		if (((*frag)->fr_flags & PFFRAG_DROP) == 0)
			DPFPRINTF(("frag6cache[%u]: dropping overall fragment\n",
			    ntohl(fh->ip6f_ident)));
		(*frag)->fr_flags |= PFFRAG_DROP;
	}

	m_freem(m);
	return (NULL);
}

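/*
 * IPv4 normalization entry point, invoked from pf_test(): walk the scrub
 * ruleset, sanity-check the header, and route fragments either through
 * full reassembly (pf_reassemble()) or through the non-buffering cache
 * (pf_fragcache()) when the rule crops or drops fragments.
 */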
int
pf_normalize_ip(struct mbuf **m0, int dir, struct pfi_kif *kif, u_short *reason,
    struct pf_pdesc *pd)
{
	struct mbuf		*m = *m0;
	struct pf_rule		*r;
	struct pf_frent		*frent;
	struct pf_fragment	*frag = NULL;
	struct ip		*h = mtod(m, struct ip *);
	int			 mff = (ntohs(h->ip_off) & IP_MF);
	int			 hlen = h->ip_hl << 2;
	u_int16_t		 fragoff = (ntohs(h->ip_off) & IP_OFFMASK) << 3;
	u_int16_t		 fr_max;
	int			 ip_len;
	int			 ip_off;
	int			 asd = 0;
	struct pf_ruleset	*ruleset = NULL;

	r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr);
	while (r != NULL) {
		r->evaluations++;
		if (pfi_kif_match(r->kif, kif) == r->ifnot)
			r = r->skip[PF_SKIP_IFP].ptr;
		else if (r->direction && r->direction != dir)
			r = r->skip[PF_SKIP_DIR].ptr;
		else if (r->af && r->af != AF_INET)
			r = r->skip[PF_SKIP_AF].ptr;
		else if (r->proto && r->proto != h->ip_p)
			r = r->skip[PF_SKIP_PROTO].ptr;
		else if (PF_MISMATCHAW(&r->src.addr,
		    (struct pf_addr *)&h->ip_src.s_addr, AF_INET,
		    r->src.neg, kif))
			r = r->skip[PF_SKIP_SRC_ADDR].ptr;
		else if (PF_MISMATCHAW(&r->dst.addr,
		    (struct pf_addr *)&h->ip_dst.s_addr, AF_INET,
		    r->dst.neg, NULL))
			r = r->skip[PF_SKIP_DST_ADDR].ptr;
		else {
			if (r->anchor == NULL)
				break;
			else
				pf_step_into_anchor(&asd, &ruleset,
				    PF_RULESET_SCRUB, &r, NULL, NULL);
		}
		if (r == NULL && pf_step_out_of_anchor(&asd, &ruleset,
		    PF_RULESET_SCRUB, &r, NULL, NULL))
			break;
	}

	if (r == NULL || r->action == PF_NOSCRUB)
		return (PF_PASS);
	else {
		r->packets[dir == PF_OUT]++;
		r->bytes[dir == PF_OUT] += pd->tot_len;
	}

	/* Check for illegal packets */
	if (hlen < (int)sizeof (struct ip))
		goto drop;

	if (hlen > ntohs(h->ip_len))
		goto drop;

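	/*
	 * The header checksum fixups below use pf_cksum_fixup(), which
	 * folds the old and new 16-bit values into ip_sum incrementally
	 * (in the style of RFC 1624) instead of recomputing the whole
	 * checksum.
	 */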
	/* Clear IP_DF if the rule uses the no-df option */
	if (r->rule_flag & PFRULE_NODF && h->ip_off & htons(IP_DF)) {
		u_int16_t ipoff = h->ip_off;

		h->ip_off &= htons(~IP_DF);
		h->ip_sum = pf_cksum_fixup(h->ip_sum, ipoff, h->ip_off, 0);
	}

	/* We will need other tests here */
	if (!fragoff && !mff)
		goto no_fragment;

	/*
	 * We're dealing with a fragment now. Don't allow fragments
	 * with IP_DF to enter the cache. If the flag was cleared by
	 * no-df above, fine. Otherwise drop it.
	 */
	if (h->ip_off & htons(IP_DF)) {
		DPFPRINTF(("IP_DF\n"));
		goto bad;
	}

	ip_len = ntohs(h->ip_len) - hlen;
	ip_off = (ntohs(h->ip_off) & IP_OFFMASK) << 3;

	/* All fragments are 8 byte aligned */
	if (mff && (ip_len & 0x7)) {
		DPFPRINTF(("mff and %d\n", ip_len));
		goto bad;
	}

	/* Respect maximum length */
	if (fragoff + ip_len > IP_MAXPACKET) {
		DPFPRINTF(("max packet %d\n", fragoff + ip_len));
		goto bad;
	}
	fr_max = fragoff + ip_len;

	if ((r->rule_flag & (PFRULE_FRAGCROP|PFRULE_FRAGDROP)) == 0) {
		/* Fully buffer all of the fragments */

		frag = pf_find_fragment_by_ipv4_header(h, &pf_frag_tree);
		/* Check if we saw the last fragment already */
		if (frag != NULL && (frag->fr_flags & PFFRAG_SEENLAST) &&
		    fr_max > frag->fr_max)
			goto bad;

		/* Get an entry for the fragment queue */
		frent = pool_get(&pf_frent_pl, PR_NOWAIT);
		if (frent == NULL) {
			REASON_SET(reason, PFRES_MEMORY);
			return (PF_DROP);
		}
		pf_nfrents++;
		frent->fr_ip = h;
		frent->fr_m = m;

		/* Might return a completely reassembled mbuf, or NULL */
		DPFPRINTF(("reass IPv4 frag %d @ %d-%d\n", ntohs(h->ip_id),
		    fragoff, fr_max));
		*m0 = m = pf_reassemble(m0, &frag, frent, mff);

		if (m == NULL)
			return (PF_DROP);

		VERIFY(m->m_flags & M_PKTHDR);

		/* use mtag from concatenated mbuf chain */
		pd->pf_mtag = pf_find_mtag(m);
#if DIAGNOSTIC
		if (pd->pf_mtag == NULL) {
			printf("%s: pf_find_mtag returned NULL(1)\n", __func__);
			if ((pd->pf_mtag = pf_get_mtag(m)) == NULL) {
				m_freem(m);
				*m0 = NULL;
				goto no_mem;
			}
		}
#endif
		if (frag != NULL && (frag->fr_flags & PFFRAG_DROP))
			goto drop;

		h = mtod(m, struct ip *);
	} else {
		/* non-buffering fragment cache (drops or masks overlaps) */
		int	nomem = 0;

		if (dir == PF_OUT && (pd->pf_mtag->pftag_flags &
		    PF_TAG_FRAGCACHE)) {
			/*
			 * Already passed the fragment cache in the
			 * input direction.  If we continued, it would
			 * appear to be a dup and would be dropped.
			 */
			goto fragment_pass;
		}

		frag = pf_find_fragment_by_ipv4_header(h, &pf_cache_tree);

		/* Check if we saw the last fragment already */
		if (frag != NULL && (frag->fr_flags & PFFRAG_SEENLAST) &&
		    fr_max > frag->fr_max) {
			if (r->rule_flag & PFRULE_FRAGDROP)
				frag->fr_flags |= PFFRAG_DROP;
			goto bad;
		}

		*m0 = m = pf_fragcache(m0, h, &frag, mff,
		    (r->rule_flag & PFRULE_FRAGDROP) ? 1 : 0, &nomem);
		if (m == NULL) {
			if (nomem)
				goto no_mem;
			goto drop;
		}

		VERIFY(m->m_flags & M_PKTHDR);

		/* use mtag from copied and trimmed mbuf chain */
		pd->pf_mtag = pf_find_mtag(m);
#if DIAGNOSTIC
		if (pd->pf_mtag == NULL) {
			printf("%s: pf_find_mtag returned NULL(2)\n", __func__);
			if ((pd->pf_mtag = pf_get_mtag(m)) == NULL) {
				m_freem(m);
				*m0 = NULL;
				goto no_mem;
			}
		}
#endif
		if (dir == PF_IN)
			pd->pf_mtag->pftag_flags |= PF_TAG_FRAGCACHE;

		if (frag != NULL && (frag->fr_flags & PFFRAG_DROP))
			goto drop;
		goto fragment_pass;
	}

no_fragment:
	/* At this point, only IP_DF is allowed in ip_off */
	if (h->ip_off & ~htons(IP_DF)) {
		u_int16_t ipoff = h->ip_off;

		h->ip_off &= htons(IP_DF);
		h->ip_sum = pf_cksum_fixup(h->ip_sum, ipoff, h->ip_off, 0);
	}

	/* Enforce a minimum ttl, may cause endless packet loops */
	if (r->min_ttl && h->ip_ttl < r->min_ttl) {
		u_int16_t ip_ttl = h->ip_ttl;

		h->ip_ttl = r->min_ttl;
		h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_ttl, h->ip_ttl, 0);
	}
	if (r->rule_flag & PFRULE_RANDOMID) {
		u_int16_t oip_id = h->ip_id;

		h->ip_id = ip_randomid();
		h->ip_sum = pf_cksum_fixup(h->ip_sum, oip_id, h->ip_id, 0);
	}
	if ((r->rule_flag & (PFRULE_FRAGCROP|PFRULE_FRAGDROP)) == 0)
		pd->flags |= PFDESC_IP_REAS;

	return (PF_PASS);

fragment_pass:
	/* Enforce a minimum ttl, may cause endless packet loops */
	if (r->min_ttl && h->ip_ttl < r->min_ttl) {
		u_int16_t ip_ttl = h->ip_ttl;

		h->ip_ttl = r->min_ttl;
		h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_ttl, h->ip_ttl, 0);
	}
	if ((r->rule_flag & (PFRULE_FRAGCROP|PFRULE_FRAGDROP)) == 0)
		pd->flags |= PFDESC_IP_REAS;
	return (PF_PASS);

no_mem:
	REASON_SET(reason, PFRES_MEMORY);
	if (r != NULL && r->log)
		PFLOG_PACKET(kif, h, m, AF_INET, dir, *reason, r,
		    NULL, NULL, pd);
	return (PF_DROP);

drop:
	REASON_SET(reason, PFRES_NORM);
	if (r != NULL && r->log)
		PFLOG_PACKET(kif, h, m, AF_INET, dir, *reason, r,
		    NULL, NULL, pd);
	return (PF_DROP);

bad:
	DPFPRINTF(("dropping bad IPv4 fragment\n"));

	/* Free associated fragments */
	if (frag != NULL)
		pf_free_fragment(frag);

	REASON_SET(reason, PFRES_FRAG);
	if (r != NULL && r->log)
		PFLOG_PACKET(kif, h, m, AF_INET, dir, *reason, r,
		    NULL, NULL, pd);

	return (PF_DROP);
}

#if INET6
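/*
 * IPv6 normalization entry point: walk the scrub ruleset, then parse the
 * extension header chain until a terminal header is found; a fragment
 * header diverts the packet to the reassembly path below.
 */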
1799int
1800pf_normalize_ip6(struct mbuf **m0, int dir, struct pfi_kif *kif,
1801    u_short *reason, struct pf_pdesc *pd)
1802{
1803	struct mbuf		*m = *m0;
1804	struct pf_rule		*r;
1805	struct ip6_hdr		*h = mtod(m, struct ip6_hdr *);
1806	int			 off;
1807	struct ip6_ext		 ext;
1808/* adi XXX */
1809#if 0
1810	struct ip6_opt		 opt;
1811	struct ip6_opt_jumbo	 jumbo;
1812	int			 optend;
1813	int			 ooff;
1814#endif
1815	struct ip6_frag		 frag;
1816	u_int32_t		 jumbolen = 0, plen;
1817	u_int16_t		 fragoff = 0;
1818	u_int8_t		 proto;
1819	int			 terminal;
1820	struct pf_frent		*frent;
1821	struct pf_fragment	*pff = NULL;
1822	int			 mff = 0, rh_cnt = 0;
1823	u_int16_t		 fr_max;
1824	int			 asd = 0;
1825	struct pf_ruleset	*ruleset = NULL;
1826
	r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr);
	while (r != NULL) {
		r->evaluations++;
		if (pfi_kif_match(r->kif, kif) == r->ifnot)
			r = r->skip[PF_SKIP_IFP].ptr;
		else if (r->direction && r->direction != dir)
			r = r->skip[PF_SKIP_DIR].ptr;
		else if (r->af && r->af != AF_INET6)
			r = r->skip[PF_SKIP_AF].ptr;
#if 0 /* header chain! */
		else if (r->proto && r->proto != h->ip6_nxt)
			r = r->skip[PF_SKIP_PROTO].ptr;
#endif
		else if (PF_MISMATCHAW(&r->src.addr,
		    (struct pf_addr *)&h->ip6_src, AF_INET6,
		    r->src.neg, kif))
			r = r->skip[PF_SKIP_SRC_ADDR].ptr;
		else if (PF_MISMATCHAW(&r->dst.addr,
		    (struct pf_addr *)&h->ip6_dst, AF_INET6,
		    r->dst.neg, NULL))
			r = r->skip[PF_SKIP_DST_ADDR].ptr;
		else {
			if (r->anchor == NULL)
				break;
			else
				pf_step_into_anchor(&asd, &ruleset,
				    PF_RULESET_SCRUB, &r, NULL, NULL);
		}
		if (r == NULL && pf_step_out_of_anchor(&asd, &ruleset,
		    PF_RULESET_SCRUB, &r, NULL, NULL))
			break;
	}

	if (r == NULL || r->action == PF_NOSCRUB)
		return (PF_PASS);
	else {
		r->packets[dir == PF_OUT]++;
		r->bytes[dir == PF_OUT] += pd->tot_len;
	}

	/* Check for illegal packets */
	if ((int)(sizeof (struct ip6_hdr) + IPV6_MAXPACKET) < m->m_pkthdr.len)
		goto drop;

	off = sizeof (struct ip6_hdr);
	proto = h->ip6_nxt;
	terminal = 0;
	do {
		pd->proto = proto;
		switch (proto) {
		case IPPROTO_FRAGMENT:
			goto fragment;
			/* NOTREACHED */
		case IPPROTO_AH:
		case IPPROTO_ROUTING:
		case IPPROTO_DSTOPTS:
			if (!pf_pull_hdr(m, off, &ext, sizeof (ext), NULL,
			    NULL, AF_INET6))
				goto shortpkt;
			/*
			 * <jhw@apple.com>
			 * Multiple routing headers not allowed.
			 * Routing header type zero considered harmful.
			 */
			if (proto == IPPROTO_ROUTING) {
				const struct ip6_rthdr *rh =
				    (const struct ip6_rthdr *)&ext;
				if (rh_cnt++)
					goto drop;
				if (rh->ip6r_type == IPV6_RTHDR_TYPE_0)
					goto drop;
			}
			/* Advance past this header (routing headers too) */
			if (proto == IPPROTO_AH)
				off += (ext.ip6e_len + 2) * 4;
			else
				off += (ext.ip6e_len + 1) * 8;
			proto = ext.ip6e_nxt;
			break;
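			/*
			 * Length arithmetic above, for reference: AH
			 * measures ip6e_len in 32-bit words minus two, so
			 * ip6e_len 4 spans (4 + 2) * 4 = 24 bytes; the
			 * other extension headers use 8-byte units minus
			 * one, so ip6e_len 0 spans 8 bytes (RFC 4302,
			 * RFC 2460).
			 */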
		case IPPROTO_HOPOPTS:
/* adi XXX */
#if 0
			if (!pf_pull_hdr(m, off, &ext, sizeof (ext), NULL,
			    NULL, AF_INET6))
				goto shortpkt;
			optend = off + (ext.ip6e_len + 1) * 8;
			ooff = off + sizeof (ext);
			do {
				if (!pf_pull_hdr(m, ooff, &opt.ip6o_type,
				    sizeof (opt.ip6o_type), NULL, NULL,
				    AF_INET6))
					goto shortpkt;
				if (opt.ip6o_type == IP6OPT_PAD1) {
					ooff++;
					continue;
				}
				if (!pf_pull_hdr(m, ooff, &opt, sizeof (opt),
				    NULL, NULL, AF_INET6))
					goto shortpkt;
				if (ooff + sizeof (opt) + opt.ip6o_len > optend)
					goto drop;
				switch (opt.ip6o_type) {
				case IP6OPT_JUMBO:
					if (h->ip6_plen != 0)
						goto drop;
					if (!pf_pull_hdr(m, ooff, &jumbo,
					    sizeof (jumbo), NULL, NULL,
					    AF_INET6))
						goto shortpkt;
					memcpy(&jumbolen, jumbo.ip6oj_jumbo_len,
					    sizeof (jumbolen));
					jumbolen = ntohl(jumbolen);
					if (jumbolen <= IPV6_MAXPACKET)
						goto drop;
					if (sizeof (struct ip6_hdr) +
					    jumbolen != m->m_pkthdr.len)
						goto drop;
					break;
				default:
					break;
				}
				ooff += sizeof (opt) + opt.ip6o_len;
			} while (ooff < optend);

			off = optend;
			proto = ext.ip6e_nxt;
			break;
#endif
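			/*
			 * FALLTHROUGH: with the option parser above
			 * disabled, a hop-by-hop options header simply
			 * terminates the header walk.
			 */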
		default:
			terminal = 1;
			break;
		}
	} while (!terminal);

	/* Either the jumbo payload option must be present, or plen > 0 */
	if (ntohs(h->ip6_plen) == 0)
		plen = jumbolen;
	else
		plen = ntohs(h->ip6_plen);
	if (plen == 0)
		goto drop;
	if ((int)(sizeof (struct ip6_hdr) + plen) > m->m_pkthdr.len)
		goto shortpkt;

	/* Enforce a minimum ttl; raising it may cause endless packet loops */
	if (r->min_ttl && h->ip6_hlim < r->min_ttl)
		h->ip6_hlim = r->min_ttl;

	return (PF_PASS);

fragment:
	if (ntohs(h->ip6_plen) == 0 || jumbolen)
		goto drop;
	plen = ntohs(h->ip6_plen);

	if (!pf_pull_hdr(m, off, &frag, sizeof (frag), NULL, NULL, AF_INET6))
		goto shortpkt;
	fragoff = ntohs(frag.ip6f_offlg & IP6F_OFF_MASK);
	pd->proto = frag.ip6f_nxt;
	mff = ntohs(frag.ip6f_offlg & IP6F_MORE_FRAG);
	off += sizeof (frag);
	if (fragoff + (plen - off) > IPV6_MAXPACKET)
		goto badfrag;

	fr_max = fragoff + plen - (off - sizeof (struct ip6_hdr));
	DPFPRINTF(("0x%llx IPv6 frag plen %u mff %d off %u fragoff %u "
	    "fr_max %u\n", (uint64_t)VM_KERNEL_ADDRPERM(m), plen, mff, off,
	    fragoff, fr_max));
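	/*
	 * ip6f_offlg carries the 13-bit fragment offset (in 8-byte units)
	 * in its upper bits, so masking and byte-swapping above yields the
	 * offset in bytes directly: a raw offset field of 185 comes out as
	 * fragoff 1480.
	 */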

	if ((r->rule_flag & (PFRULE_FRAGCROP|PFRULE_FRAGDROP)) == 0) {
		/* Fully buffer all of the fragments */
		pd->flags |= PFDESC_IP_REAS;

		pff = pf_find_fragment_by_ipv6_header(h, &frag,
		    &pf_frag_tree);

		/* Check if we saw the last fragment already */
		if (pff != NULL && (pff->fr_flags & PFFRAG_SEENLAST) &&
		    fr_max > pff->fr_max)
			goto badfrag;

		/* Get an entry for the fragment queue */
		frent = pool_get(&pf_frent_pl, PR_NOWAIT);
		if (frent == NULL) {
			REASON_SET(reason, PFRES_MEMORY);
			return (PF_DROP);
		}
		pf_nfrents++;
		frent->fr_ip6 = h;
		frent->fr_m = m;
		frent->fr_ip6f_opt = frag;
		frent->fr_ip6f_hlen = off;

		/* Might return a completely reassembled mbuf, or NULL */
		DPFPRINTF(("reass IPv6 frag %d @ %d-%d\n",
		    ntohl(frag.ip6f_ident), fragoff, fr_max));
		*m0 = m = pf_reassemble6(m0, &pff, frent, mff);

		if (m == NULL)
			return (PF_DROP);

		if (pff != NULL && (pff->fr_flags & PFFRAG_DROP))
			goto drop;

		h = mtod(m, struct ip6_hdr *);
	} else if (dir == PF_IN ||
	    !(pd->pf_mtag->pftag_flags & PF_TAG_FRAGCACHE)) {
		/* non-buffering fragment cache (overlaps: see RFC 5722) */
		int nomem = 0;

		pff = pf_find_fragment_by_ipv6_header(h, &frag,
		    &pf_cache_tree);

		/* Check if we saw the last fragment already */
		if (pff != NULL && (pff->fr_flags & PFFRAG_SEENLAST) &&
		    fr_max > pff->fr_max) {
			if (r->rule_flag & PFRULE_FRAGDROP)
				pff->fr_flags |= PFFRAG_DROP;
			goto badfrag;
		}

		*m0 = m = pf_frag6cache(m0, h, &frag, &pff, off, mff,
		    (r->rule_flag & PFRULE_FRAGDROP) ? 1 : 0, &nomem);
		if (m == NULL) {
			if (nomem)
				goto no_mem;
			goto drop;
		}

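		/*
		 * Tag inbound packets so the cache is consulted only once:
		 * when the same packet comes back through pf on output,
		 * the PF_TAG_FRAGCACHE test above skips the cache.
		 */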
		if (dir == PF_IN)
			pd->pf_mtag->pftag_flags |= PF_TAG_FRAGCACHE;

		if (pff != NULL && (pff->fr_flags & PFFRAG_DROP))
			goto drop;
	}

	/* Enforce a minimum ttl; raising it may cause endless packet loops */
	if (r->min_ttl && h->ip6_hlim < r->min_ttl)
		h->ip6_hlim = r->min_ttl;
	return (PF_PASS);

no_mem:
	REASON_SET(reason, PFRES_MEMORY);
	goto dropout;

shortpkt:
	REASON_SET(reason, PFRES_SHORT);
	goto dropout;

drop:
	REASON_SET(reason, PFRES_NORM);
	goto dropout;

badfrag:
	DPFPRINTF(("dropping bad IPv6 fragment\n"));
	REASON_SET(reason, PFRES_FRAG);
	goto dropout;

dropout:
	if (pff != NULL)
		pf_free_fragment(pff);
	if (r != NULL && r->log)
		PFLOG_PACKET(kif, h, m, AF_INET6, dir, *reason, r, NULL, NULL, pd);
	return (PF_DROP);
}
#endif /* INET6 */

int
pf_normalize_tcp(int dir, struct pfi_kif *kif, struct mbuf *m, int ipoff,
    int off, void *h, struct pf_pdesc *pd)
{
#pragma unused(ipoff, h)
	struct pf_rule	*r, *rm = NULL;
	struct tcphdr	*th = pd->hdr.tcp;
	int		 rewrite = 0;
	int		 asd = 0;
	u_short		 reason;
	u_int8_t	 flags;
	sa_family_t	 af = pd->af;
	struct pf_ruleset *ruleset = NULL;
	union pf_state_xport sxport, dxport;

	sxport.port = th->th_sport;
	dxport.port = th->th_dport;

	r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr);
	while (r != NULL) {
		r->evaluations++;
		if (pfi_kif_match(r->kif, kif) == r->ifnot)
			r = r->skip[PF_SKIP_IFP].ptr;
		else if (r->direction && r->direction != dir)
			r = r->skip[PF_SKIP_DIR].ptr;
		else if (r->af && r->af != af)
			r = r->skip[PF_SKIP_AF].ptr;
		else if (r->proto && r->proto != pd->proto)
			r = r->skip[PF_SKIP_PROTO].ptr;
		else if (PF_MISMATCHAW(&r->src.addr, pd->src, af,
		    r->src.neg, kif))
			r = r->skip[PF_SKIP_SRC_ADDR].ptr;
		else if (r->src.xport.range.op &&
		    !pf_match_xport(r->src.xport.range.op, r->proto_variant,
		    &r->src.xport, &sxport))
			r = r->skip[PF_SKIP_SRC_PORT].ptr;
		else if (PF_MISMATCHAW(&r->dst.addr, pd->dst, af,
		    r->dst.neg, NULL))
			r = r->skip[PF_SKIP_DST_ADDR].ptr;
		else if (r->dst.xport.range.op &&
		    !pf_match_xport(r->dst.xport.range.op, r->proto_variant,
		    &r->dst.xport, &dxport))
			r = r->skip[PF_SKIP_DST_PORT].ptr;
		else if (r->os_fingerprint != PF_OSFP_ANY &&
		    !pf_osfp_match(pf_osfp_fingerprint(pd, m, off, th),
		    r->os_fingerprint))
			r = TAILQ_NEXT(r, entries);
		else {
			if (r->anchor == NULL) {
				rm = r;
				break;
			} else {
				pf_step_into_anchor(&asd, &ruleset,
				    PF_RULESET_SCRUB, &r, NULL, NULL);
			}
		}
		if (r == NULL && pf_step_out_of_anchor(&asd, &ruleset,
		    PF_RULESET_SCRUB, &r, NULL, NULL))
			break;
	}

	if (rm == NULL || rm->action == PF_NOSCRUB)
		return (PF_PASS);
	else {
		r->packets[dir == PF_OUT]++;
		r->bytes[dir == PF_OUT] += pd->tot_len;
	}

	if (rm->rule_flag & PFRULE_REASSEMBLE_TCP)
		pd->flags |= PFDESC_TCP_NORM;

	flags = th->th_flags;
	if (flags & TH_SYN) {
		/* Illegal packet */
		if (flags & TH_RST)
			goto tcp_drop;

		if (flags & TH_FIN)
			flags &= ~TH_FIN;
	} else {
		/* Illegal packet */
		if (!(flags & (TH_ACK|TH_RST)))
			goto tcp_drop;
	}

	if (!(flags & TH_ACK)) {
		/* These flags are only valid if ACK is set */
		if ((flags & TH_FIN) || (flags & TH_PUSH) || (flags & TH_URG))
			goto tcp_drop;
	}

	/* Check for illegal header length */
	if (th->th_off < (sizeof (struct tcphdr) >> 2))
		goto tcp_drop;

	/* If flags changed, or reserved data set, then adjust */
	if (flags != th->th_flags || th->th_x2 != 0) {
		u_int16_t	ov, nv;

		ov = *(u_int16_t *)(&th->th_ack + 1);
		th->th_flags = flags;
		th->th_x2 = 0;
		nv = *(u_int16_t *)(&th->th_ack + 1);

		th->th_sum = pf_cksum_fixup(th->th_sum, ov, nv, 0);
		rewrite = 1;
	}
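	/*
	 * The fixup above works because &th->th_ack + 1 points at the
	 * 16-bit word holding th_off, th_x2 and th_flags, so one
	 * incremental checksum update covers both the cleared reserved
	 * bits and any flag changes.
	 */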

	/* Remove urgent pointer, if TH_URG is not set */
	if (!(flags & TH_URG) && th->th_urp) {
		th->th_sum = pf_cksum_fixup(th->th_sum, th->th_urp, 0, 0);
		th->th_urp = 0;
		rewrite = 1;
	}

	/* Process options */
	if (r->max_mss) {
		int rv = pf_normalize_tcpopt(r, dir, kif, pd, m, th, off,
		    &rewrite);
		if (rv == PF_DROP)
			return rv;
		m = pd->mp;
	}

	/* Copy back the packet headers if we sanitized them */
	if (rewrite) {
		struct mbuf *mw = pf_lazy_makewritable(pd, m,
		    off + sizeof (*th));
		if (!mw) {
			REASON_SET(&reason, PFRES_MEMORY);
			if (r->log)
				PFLOG_PACKET(kif, h, m, af, dir, reason,
				    r, 0, 0, pd);
			return PF_DROP;
		}

		m_copyback(mw, off, sizeof (*th), th);
	}

	return (PF_PASS);

tcp_drop:
	REASON_SET(&reason, PFRES_NORM);
	if (rm != NULL && r->log)
		PFLOG_PACKET(kif, h, m, af, dir, reason, r, NULL, NULL, pd);
	return (PF_DROP);
}

int
pf_normalize_tcp_init(struct mbuf *m, int off, struct pf_pdesc *pd,
    struct tcphdr *th, struct pf_state_peer *src, struct pf_state_peer *dst)
{
#pragma unused(dst)
	u_int32_t tsval, tsecr;
	u_int8_t hdr[60];
	u_int8_t *opt;

	VERIFY(src->scrub == NULL);

	src->scrub = pool_get(&pf_state_scrub_pl, PR_NOWAIT);
	if (src->scrub == NULL)
		return (1);
	bzero(src->scrub, sizeof (*src->scrub));

	switch (pd->af) {
#if INET
	case AF_INET: {
		struct ip *h = mtod(m, struct ip *);
		src->scrub->pfss_ttl = h->ip_ttl;
		break;
	}
#endif /* INET */
#if INET6
	case AF_INET6: {
		struct ip6_hdr *h = mtod(m, struct ip6_hdr *);
		src->scrub->pfss_ttl = h->ip6_hlim;
		break;
	}
#endif /* INET6 */
	}

	/*
	 * All normalizations below are only begun if we see the start of
	 * the connection.  They must all set an enabled bit in pfss_flags.
	 */
	if ((th->th_flags & TH_SYN) == 0)
		return (0);

	if (th->th_off > (sizeof (struct tcphdr) >> 2) && src->scrub &&
	    pf_pull_hdr(m, off, hdr, th->th_off << 2, NULL, NULL, pd->af)) {
		/* Diddle with TCP options */
		int hlen;
		opt = hdr + sizeof (struct tcphdr);
		hlen = (th->th_off << 2) - sizeof (struct tcphdr);
		while (hlen >= TCPOLEN_TIMESTAMP) {
			switch (*opt) {
			case TCPOPT_EOL:	/* FALLTHROUGH */
			case TCPOPT_NOP:
				opt++;
				hlen--;
				break;
			case TCPOPT_TIMESTAMP:
				if (opt[1] >= TCPOLEN_TIMESTAMP) {
					src->scrub->pfss_flags |=
					    PFSS_TIMESTAMP;
					src->scrub->pfss_ts_mod =
					    htonl(random());

					/* note PFSS_PAWS not set yet */
					memcpy(&tsval, &opt[2],
					    sizeof (u_int32_t));
					memcpy(&tsecr, &opt[6],
					    sizeof (u_int32_t));
					src->scrub->pfss_tsval0 = ntohl(tsval);
					src->scrub->pfss_tsval = ntohl(tsval);
					src->scrub->pfss_tsecr = ntohl(tsecr);
					getmicrouptime(&src->scrub->pfss_last);
				}
				/* FALLTHROUGH */
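				/*
				 * Advance by at least two bytes so a
				 * malformed option carrying a zero
				 * length octet cannot stall this loop.
				 */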
			default:
				hlen -= MAX(opt[1], 2);
				opt += MAX(opt[1], 2);
				break;
			}
		}
	}

	return (0);
}

void
pf_normalize_tcp_cleanup(struct pf_state *state)
{
	if (state->src.scrub)
		pool_put(&pf_state_scrub_pl, state->src.scrub);
	if (state->dst.scrub)
		pool_put(&pf_state_scrub_pl, state->dst.scrub);

	/* Someday... flush the TCP segment reassembly descriptors. */
}

int
pf_normalize_tcp_stateful(struct mbuf *m, int off, struct pf_pdesc *pd,
    u_short *reason, struct tcphdr *th, struct pf_state *state,
    struct pf_state_peer *src, struct pf_state_peer *dst, int *writeback)
{
	struct timeval uptime;
	u_int32_t tsval, tsecr;
	u_int tsval_from_last;
	u_int8_t hdr[60];
	u_int8_t *opt;
	int copyback = 0;
	int got_ts = 0;

	VERIFY(src->scrub || dst->scrub);

	/*
	 * Enforce the minimum TTL seen for this connection.  This negates
	 * a common technique used to evade intrusion detection systems
	 * and to confuse firewall state tracking.
	 */
	switch (pd->af) {
#if INET
	case AF_INET: {
		if (src->scrub) {
			struct ip *h = mtod(m, struct ip *);
			if (h->ip_ttl > src->scrub->pfss_ttl)
				src->scrub->pfss_ttl = h->ip_ttl;
			h->ip_ttl = src->scrub->pfss_ttl;
		}
		break;
	}
#endif /* INET */
#if INET6
	case AF_INET6: {
		if (src->scrub) {
			struct ip6_hdr *h = mtod(m, struct ip6_hdr *);
			if (h->ip6_hlim > src->scrub->pfss_ttl)
				src->scrub->pfss_ttl = h->ip6_hlim;
			h->ip6_hlim = src->scrub->pfss_ttl;
		}
		break;
	}
#endif /* INET6 */
	}

	if (th->th_off > (sizeof (struct tcphdr) >> 2) &&
	    ((src->scrub && (src->scrub->pfss_flags & PFSS_TIMESTAMP)) ||
	    (dst->scrub && (dst->scrub->pfss_flags & PFSS_TIMESTAMP))) &&
	    pf_pull_hdr(m, off, hdr, th->th_off << 2, NULL, NULL, pd->af)) {
		/* Diddle with TCP options */
		int hlen;
		opt = hdr + sizeof (struct tcphdr);
		hlen = (th->th_off << 2) - sizeof (struct tcphdr);
		while (hlen >= TCPOLEN_TIMESTAMP) {
			switch (*opt) {
			case TCPOPT_EOL:	/* FALLTHROUGH */
			case TCPOPT_NOP:
				opt++;
				hlen--;
				break;
			case TCPOPT_TIMESTAMP:
				/*
				 * Modulate the timestamps.  Raw TCP
				 * timestamps can be used for NAT detection,
				 * OS uptime determination, or reboot
				 * detection, so we hide them behind a
				 * per-state random offset.
				 */
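				/*
				 * Conceptually: with a random offset of
				 * 4096, an outgoing tsval of 5000 leaves
				 * here as 9096, and the peer's echoed
				 * tsecr of 9096 is mapped back to 5000
				 * by the code below.
				 */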

				if (got_ts) {
					/* Huh?  Multiple timestamps!? */
					if (pf_status.debug >= PF_DEBUG_MISC) {
						DPFPRINTF(("multiple TS??"));
						pf_print_state(state);
						printf("\n");
					}
					REASON_SET(reason, PFRES_TS);
					return (PF_DROP);
				}
				if (opt[1] >= TCPOLEN_TIMESTAMP) {
					memcpy(&tsval, &opt[2],
					    sizeof (u_int32_t));
					if (tsval && src->scrub &&
					    (src->scrub->pfss_flags &
					    PFSS_TIMESTAMP)) {
						tsval = ntohl(tsval);
						pf_change_a(&opt[2],
						    &th->th_sum,
						    htonl(tsval +
						    src->scrub->pfss_ts_mod),
						    0);
						copyback = 1;
					}

					/* Modulate TS reply iff valid (!0) */
					memcpy(&tsecr, &opt[6],
					    sizeof (u_int32_t));
					if (tsecr && dst->scrub &&
					    (dst->scrub->pfss_flags &
					    PFSS_TIMESTAMP)) {
						tsecr = ntohl(tsecr)
						    - dst->scrub->pfss_ts_mod;
						pf_change_a(&opt[6],
						    &th->th_sum, htonl(tsecr),
						    0);
						copyback = 1;
					}
					got_ts = 1;
				}
				/* FALLTHROUGH */
			default:
				hlen -= MAX(opt[1], 2);
				opt += MAX(opt[1], 2);
				break;
			}
		}
		if (copyback) {
			/* Copy back the options; caller copies back header */
			int optoff = off + sizeof (*th);
			int optlen = (th->th_off << 2) - sizeof (*th);
			m = pf_lazy_makewritable(pd, m, optoff + optlen);
			if (!m) {
				REASON_SET(reason, PFRES_MEMORY);
				return PF_DROP;
			}
			*writeback = optoff + optlen;
			m_copyback(m, optoff, optlen, hdr + sizeof (*th));
		}
	}

	/*
	 * Must invalidate PAWS checks on connections idle for too long.
	 * The fastest allowed timestamp clock is 1ms.  At that rate the
	 * timestamp covers half of its 32-bit space (the PAWS comparison
	 * horizon) in about 24 days.  XXX Right now our lower-bound TS
	 * echo check only works for the first 12 days of a connection,
	 * before the TS can exhaust half of its 32-bit space.
	 */
#define TS_MAX_IDLE	(24*24*60*60)
#define TS_MAX_CONN	(12*24*60*60)	/* XXX remove when better tsecr check */
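	/*
	 * For reference: 2^31 ms at the maximum 1 kHz timestamp rate is
	 * about 24.8 days, hence the 24-day idle cutoff above; TS_MAX_CONN
	 * is half of that while the weak tsecr lowerbound check remains.
	 */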

	getmicrouptime(&uptime);
	if (src->scrub && (src->scrub->pfss_flags & PFSS_PAWS) &&
	    (uptime.tv_sec - src->scrub->pfss_last.tv_sec > TS_MAX_IDLE ||
	    pf_time_second() - state->creation > TS_MAX_CONN)) {
		if (pf_status.debug >= PF_DEBUG_MISC) {
			DPFPRINTF(("src idled out of PAWS\n"));
			pf_print_state(state);
			printf("\n");
		}
		src->scrub->pfss_flags = (src->scrub->pfss_flags & ~PFSS_PAWS)
		    | PFSS_PAWS_IDLED;
	}
	if (dst->scrub && (dst->scrub->pfss_flags & PFSS_PAWS) &&
	    uptime.tv_sec - dst->scrub->pfss_last.tv_sec > TS_MAX_IDLE) {
		if (pf_status.debug >= PF_DEBUG_MISC) {
			DPFPRINTF(("dst idled out of PAWS\n"));
			pf_print_state(state);
			printf("\n");
		}
		dst->scrub->pfss_flags = (dst->scrub->pfss_flags & ~PFSS_PAWS)
		    | PFSS_PAWS_IDLED;
	}

	if (got_ts && src->scrub && dst->scrub &&
	    (src->scrub->pfss_flags & PFSS_PAWS) &&
	    (dst->scrub->pfss_flags & PFSS_PAWS)) {
		/*
		 * Validate that the timestamps are "in-window".
		 * RFC1323 describes TCP Timestamp options that allow
		 * measurement of RTT (round trip time) and PAWS
		 * (protection against wrapped sequence numbers).  PAWS
		 * gives us a set of rules for rejecting packets on
		 * long fat pipes (packets that were somehow delayed
		 * in transit longer than the time it took to send the
		 * full TCP sequence space of 4GB).  We can use these
		 * rules and infer a few others that will let us treat
		 * the 32bit timestamp and the 32bit echoed timestamp
		 * as sequence numbers to prevent a blind attacker from
		 * inserting packets into a connection.
		 *
		 * RFC1323 tells us:
		 *  - The timestamp on this packet must be greater than
		 *    or equal to the last value echoed by the other
		 *    endpoint.  The RFC says those will be discarded
		 *    since it is a dup that has already been acked.
		 *    This gives us a lowerbound on the timestamp.
		 *        timestamp >= other last echoed timestamp
		 *  - The timestamp will be less than or equal to
		 *    the last timestamp plus the time between the
		 *    last packet and now.  The RFC defines the max
		 *    clock rate as 1ms.  We will allow clocks to be
		 *    up to 10% fast and will allow a total difference
		 *    of 30 seconds due to a route change.  And this
		 *    gives us an upperbound on the timestamp.
		 *        timestamp <= last timestamp + max ticks
		 *    We have to be careful here.  Windows will send an
		 *    initial timestamp of zero and then initialize it
		 *    to a random value after the 3whs; presumably to
		 *    avoid a DoS by having to call an expensive RNG
		 *    during a SYN flood.  Proof MS has at least one
		 *    good security geek.
		 *
		 *  - The TCP timestamp option must also echo the other
		 *    endpoint's timestamp.  The timestamp echoed is the
		 *    one carried on the earliest unacknowledged segment
		 *    on the left edge of the sequence window.  The RFC
		 *    states that the host will reject any echoed
		 *    timestamps that were larger than any ever sent.
		 *    This gives us an upperbound on the TS echo.
		 *        tsecr <= largest_tsval
		 *  - The lowerbound on the TS echo is a little more
		 *    tricky to determine.  The other endpoint's echoed
		 *    values will not decrease.  But there may be
		 *    network conditions that re-order packets and
		 *    cause our view of them to decrease.  For now the
		 *    only lowerbound we can safely determine is that
		 *    the TS echo will never be less than the original
		 *    TS.  XXX There is probably a better lowerbound.
		 *    Remove TS_MAX_CONN with better lowerbound check.
		 *        tsecr >= other original TS
		 *
		 * It is also important to note that the fastest
		 * timestamp clock of 1ms will wrap half its 32bit
		 * space in about 24 days.  So we just disable TS
		 * checking after 24 days of idle time.  We actually
		 * must use a 12d connection limit until we can come up
		 * with a better lowerbound to the TS echo check.
		 */
		struct timeval delta_ts;
		int ts_fudge;

		/*
		 * PFTM_TS_DIFF is how many seconds of leeway to allow
		 * a host's timestamp; the previous packet may have been
		 * delayed in transit for much longer than this packet.
		 */
		if ((ts_fudge = state->rule.ptr->timeout[PFTM_TS_DIFF]) == 0)
			ts_fudge = pf_default_rule.timeout[PFTM_TS_DIFF];

		/* Calculate max ticks since the last timestamp */
#define TS_MAXFREQ	1100		/* RFC max TS freq of 1kHz + 10% skew */
#define TS_MICROSECS	1000000		/* microseconds per second */
		timersub(&uptime, &src->scrub->pfss_last, &delta_ts);
		tsval_from_last = (delta_ts.tv_sec + ts_fudge) * TS_MAXFREQ;
		tsval_from_last += delta_ts.tv_usec / (TS_MICROSECS/TS_MAXFREQ);
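		/*
		 * e.g. 2 seconds of idle time with a 30 second ts_fudge
		 * allows (2 + 30) * 1100 = 35200 ticks, plus the
		 * sub-second residue at the same 1.1 kHz ceiling.
		 */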

		if ((src->state >= TCPS_ESTABLISHED &&
		    dst->state >= TCPS_ESTABLISHED) &&
		    (SEQ_LT(tsval, dst->scrub->pfss_tsecr) ||
		    SEQ_GT(tsval, src->scrub->pfss_tsval + tsval_from_last) ||
		    (tsecr && (SEQ_GT(tsecr, dst->scrub->pfss_tsval) ||
		    SEQ_LT(tsecr, dst->scrub->pfss_tsval0))))) {
			/*
			 * Bad RFC1323 implementation or an insertion attack.
			 *
			 * - Solaris 2.6 and 2.7 are known to send another ACK
			 *   after the FIN,FIN|ACK,ACK closing that carries
			 *   an old timestamp.
			 */

			DPFPRINTF(("Timestamp failed %c%c%c%c\n",
			    SEQ_LT(tsval, dst->scrub->pfss_tsecr) ? '0' : ' ',
			    SEQ_GT(tsval, src->scrub->pfss_tsval +
			    tsval_from_last) ? '1' : ' ',
			    SEQ_GT(tsecr, dst->scrub->pfss_tsval) ? '2' : ' ',
			    SEQ_LT(tsecr, dst->scrub->pfss_tsval0) ? '3' : ' '));
			DPFPRINTF((" tsval: %u  tsecr: %u  +ticks: %u  "
			    "idle: %lus %ums\n",
			    tsval, tsecr, tsval_from_last, delta_ts.tv_sec,
			    delta_ts.tv_usec / 1000));
			DPFPRINTF((" src->tsval: %u  tsecr: %u\n",
			    src->scrub->pfss_tsval, src->scrub->pfss_tsecr));
			DPFPRINTF((" dst->tsval: %u  tsecr: %u  tsval0: %u\n",
			    dst->scrub->pfss_tsval, dst->scrub->pfss_tsecr,
			    dst->scrub->pfss_tsval0));
			if (pf_status.debug >= PF_DEBUG_MISC) {
				pf_print_state(state);
				pf_print_flags(th->th_flags);
				printf("\n");
			}
			REASON_SET(reason, PFRES_TS);
			return (PF_DROP);
		}

		/* XXX I'd really like to require tsecr but it's optional */

	} else if (!got_ts && (th->th_flags & TH_RST) == 0 &&
	    ((src->state == TCPS_ESTABLISHED && dst->state == TCPS_ESTABLISHED)
	    || pd->p_len > 0 || (th->th_flags & TH_SYN)) &&
	    src->scrub && dst->scrub &&
	    (src->scrub->pfss_flags & PFSS_PAWS) &&
	    (dst->scrub->pfss_flags & PFSS_PAWS)) {
		/*
		 * Didn't send a timestamp.  Timestamps aren't really useful
		 * when:
		 *  - connection opening or closing (often not even sent).
		 *    But we must not let an attacker put a FIN on a
		 *    data packet to sneak it through our ESTABLISHED check.
		 *  - on a TCP reset.  RFC suggests not even looking at TS.
		 *  - on an empty ACK.  The TS will not be echoed so it will
		 *    probably not help keep the RTT calculation in sync and
		 *    there isn't as much danger when the sequence numbers
		 *    got wrapped.  So some stacks don't include TS on empty
		 *    ACKs :-(
		 *
		 * To minimize the disruption to mostly RFC1323 conformant
		 * stacks, we will only require timestamps on data packets.
		 *
		 * And what do ya know, we cannot require timestamps on data
		 * packets.  There appear to be devices that do legitimate
		 * TCP connection hijacking.  There are HTTP devices that allow
		 * a 3whs (with timestamps) and then buffer the HTTP request.
		 * If the intermediate device has the HTTP response cached, it
		 * will spoof the response but not bother timestamping its
		 * packets.  So we can look for the presence of a timestamp in
		 * the first data packet and if there, require it in all future
		 * packets.
		 */

		if (pd->p_len > 0 && (src->scrub->pfss_flags & PFSS_DATA_TS)) {
			/*
			 * Hey!  Someone tried to sneak a packet in.  Or the
			 * stack changed its RFC1323 behavior?!?!
			 */
			if (pf_status.debug >= PF_DEBUG_MISC) {
				DPFPRINTF(("Did not receive expected RFC1323 "
				    "timestamp\n"));
				pf_print_state(state);
				pf_print_flags(th->th_flags);
				printf("\n");
			}
			REASON_SET(reason, PFRES_TS);
			return (PF_DROP);
		}
	}

	/*
	 * We will note whether a host sends its data packets with or
	 * without timestamps, and require all later data packets to
	 * contain a timestamp if the first one did.  PAWS implicitly
	 * requires that all data packets be timestamped.  But there
	 * appear to be middle-man devices that hijack TCP streams
	 * immediately after the 3whs and don't timestamp their packets
	 * (seen in a WWW accelerator or cache).
	 */
	if (pd->p_len > 0 && src->scrub && (src->scrub->pfss_flags &
	    (PFSS_TIMESTAMP|PFSS_DATA_TS|PFSS_DATA_NOTS)) == PFSS_TIMESTAMP) {
		if (got_ts)
			src->scrub->pfss_flags |= PFSS_DATA_TS;
		else {
			src->scrub->pfss_flags |= PFSS_DATA_NOTS;
			if (pf_status.debug >= PF_DEBUG_MISC && dst->scrub &&
			    (dst->scrub->pfss_flags & PFSS_TIMESTAMP)) {
				/* Don't warn if other host rejected RFC1323 */
				DPFPRINTF(("Broken RFC1323 stack did not "
				    "timestamp data packet. Disabled PAWS "
				    "security.\n"));
				pf_print_state(state);
				pf_print_flags(th->th_flags);
				printf("\n");
			}
		}
	}

	/*
	 * Update PAWS values
	 */
	if (got_ts && src->scrub && PFSS_TIMESTAMP == (src->scrub->pfss_flags &
	    (PFSS_PAWS_IDLED|PFSS_TIMESTAMP))) {
		getmicrouptime(&src->scrub->pfss_last);
		if (SEQ_GEQ(tsval, src->scrub->pfss_tsval) ||
		    (src->scrub->pfss_flags & PFSS_PAWS) == 0)
			src->scrub->pfss_tsval = tsval;

		if (tsecr) {
			if (SEQ_GEQ(tsecr, src->scrub->pfss_tsecr) ||
			    (src->scrub->pfss_flags & PFSS_PAWS) == 0)
				src->scrub->pfss_tsecr = tsecr;

			if ((src->scrub->pfss_flags & PFSS_PAWS) == 0 &&
			    (SEQ_LT(tsval, src->scrub->pfss_tsval0) ||
			    src->scrub->pfss_tsval0 == 0)) {
				/* tsval0 MUST be the lowest timestamp */
				src->scrub->pfss_tsval0 = tsval;
			}

			/* Only fully initialized after a TS gets echoed */
			if ((src->scrub->pfss_flags & PFSS_PAWS) == 0)
				src->scrub->pfss_flags |= PFSS_PAWS;
		}
	}

	/* I have a dream....  TCP segment reassembly.... */
	return (0);
}

static int
pf_normalize_tcpopt(struct pf_rule *r, int dir, struct pfi_kif *kif,
    struct pf_pdesc *pd, struct mbuf *m, struct tcphdr *th, int off,
    int *rewrptr)
{
#pragma unused(dir, kif)
	sa_family_t af = pd->af;
	u_int16_t	*mss;
	int		thoff;
	int		opt, cnt, optlen = 0;
	int		rewrite = 0;
	u_char		opts[MAX_TCPOPTLEN];
	u_char		*optp = opts;

	thoff = th->th_off << 2;
	cnt = thoff - sizeof (struct tcphdr);

	if (cnt > 0 && !pf_pull_hdr(m, off + sizeof (*th), opts, cnt,
	    NULL, NULL, af))
		return PF_DROP;

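	/*
	 * Walk the options.  The only normalization applied here is MSS
	 * clamping: e.g. under "scrub ... max-mss 1440", a SYN advertising
	 * MSS 1460 is rewritten to 1440 and the checksum patched
	 * incrementally below.
	 */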
	for (; cnt > 0; cnt -= optlen, optp += optlen) {
		opt = optp[0];
		if (opt == TCPOPT_EOL)
			break;
		if (opt == TCPOPT_NOP)
			optlen = 1;
		else {
			if (cnt < 2)
				break;
			optlen = optp[1];
			if (optlen < 2 || optlen > cnt)
				break;
		}
		switch (opt) {
		case TCPOPT_MAXSEG:
			mss = (u_int16_t *)(void *)(optp + 2);
			if (ntohs(*mss) > r->max_mss) {
				/*
				 * <jhw@apple.com>
				 *  Only do the TCP checksum fixup if delayed
				 * checksum calculation will not be performed.
				 */
				if (m->m_pkthdr.rcvif ||
				    !(m->m_pkthdr.csum_flags & CSUM_TCP))
					th->th_sum = pf_cksum_fixup(th->th_sum,
					    *mss, htons(r->max_mss), 0);
				*mss = htons(r->max_mss);
				rewrite = 1;
			}
			break;
		default:
			break;
		}
	}

	if (rewrite) {
		struct mbuf *mw;
		u_short reason;

		mw = pf_lazy_makewritable(pd, pd->mp,
		    off + sizeof (*th) + thoff);
		if (!mw) {
			REASON_SET(&reason, PFRES_MEMORY);
			if (r->log)
				PFLOG_PACKET(kif, h, m, af, dir, reason,
				    r, 0, 0, pd);
			return PF_DROP;
		}

		*rewrptr = 1;
		m_copyback(mw, off + sizeof (*th), thoff - sizeof (*th), opts);
	}

	return PF_PASS;
}
