pf_norm.c revision 293896
1/*	$OpenBSD: pf_norm.c,v 1.114 2009/01/29 14:11:45 henning Exp $ */
2
3/*
4 * Copyright 2001 Niels Provos <provos@citi.umich.edu>
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28#ifdef __FreeBSD__
29#include "opt_inet.h"
30#include "opt_inet6.h"
31#include "opt_pf.h"
32
33#include <sys/cdefs.h>
34__FBSDID("$FreeBSD: releng/9.3/sys/contrib/pf/net/pf_norm.c 293896 2016-01-14 09:11:26Z glebius $");
35
36#ifdef DEV_PFLOG
37#define	NPFLOG	DEV_PFLOG
38#else
39#define	NPFLOG	0
40#endif
41#else
42#include "pflog.h"
43#endif
44
45#include <sys/param.h>
46#include <sys/systm.h>
47#include <sys/mbuf.h>
48#include <sys/filio.h>
49#include <sys/fcntl.h>
50#include <sys/socket.h>
51#include <sys/kernel.h>
52#include <sys/time.h>
53#ifndef __FreeBSD__
54#include <sys/pool.h>
55
56#include <dev/rndvar.h>
57#endif
58#include <net/if.h>
59#include <net/if_types.h>
60#include <net/bpf.h>
61#include <net/route.h>
62#include <net/if_pflog.h>
63
64#include <netinet/in.h>
65#include <netinet/in_var.h>
66#include <netinet/in_systm.h>
67#include <netinet/ip.h>
68#include <netinet/ip_var.h>
69#include <netinet/tcp.h>
70#include <netinet/tcp_seq.h>
71#include <netinet/udp.h>
72#include <netinet/ip_icmp.h>
73
74#ifdef INET6
75#include <netinet/ip6.h>
76#endif /* INET6 */
77
78#include <net/pfvar.h>
79
/*
 * Per-fragment bookkeeping records (only declared here for non-FreeBSD
 * builds; NOTE(review): presumably supplied by a header on FreeBSD --
 * confirm).
 *
 * pf_frent   - one buffered fragment, linked into a pf_fragment's
 *              fr_queue by pf_reassemble().
 * pf_frcache - one byte range [fr_off, fr_end) that has already been
 *              seen, linked into a pf_fragment's fr_cache by
 *              pf_fragcache().
 */
80#ifndef __FreeBSD__
81struct pf_frent {
82	LIST_ENTRY(pf_frent) fr_next;	/* link in pf_fragment.fr_queue */
83	struct ip *fr_ip;		/* IP header of this fragment */
84	struct mbuf *fr_m;		/* mbuf chain holding this fragment */
85};
86
87struct pf_frcache {
88	LIST_ENTRY(pf_frcache) fr_next;	/* link in pf_fragment.fr_cache */
89	uint16_t	fr_off;		/* first byte offset covered */
90	uint16_t	fr_end;		/* offset just past the covered range */
91};
92#endif
93
/* Bits stored in pf_fragment.fr_flags. */
94#define PFFRAG_SEENLAST	0x0001		/* Seen the last fragment for this */
95#define PFFRAG_NOBUFFER	0x0002		/* Non-buffering fragment cache */
96#define PFFRAG_DROP	0x0004		/* Drop all fragments */
/*
 * True when the descriptor buffers whole fragments (fr_queue is the
 * active union member); false when it only tracks ranges (fr_cache).
 */
97#define BUFFER_FRAGMENTS(fr)	(!((fr)->fr_flags & PFFRAG_NOBUFFER))
98
/*
 * Reassembly / fragment-cache descriptor for one fragmented datagram.
 * It is keyed by (fr_id, fr_p, fr_src, fr_dst) -- see pf_frag_compare() --
 * and lives both in a red-black tree and in an LRU tail queue.
 */
99#ifndef __FreeBSD__
100struct pf_fragment {
101	RB_ENTRY(pf_fragment) fr_entry;		/* node in pf_frag_tree */
102	TAILQ_ENTRY(pf_fragment) frag_next;	/* link in frag/cache queue */
103	struct in_addr	fr_src;		/* copied from the IP source address */
104	struct in_addr	fr_dst;		/* copied from the IP destination */
105	u_int8_t	fr_p;		/* protocol of this fragment */
106	u_int8_t	fr_flags;	/* status flags */
107	u_int16_t	fr_id;		/* fragment id for reassemble */
108	u_int16_t	fr_max;		/* fragment data max */
109	u_int32_t	fr_timeout;	/* time_second of last create/lookup */
/* Shorthand for the active member of fr_u; pick with BUFFER_FRAGMENTS(). */
110#define fr_queue	fr_u.fru_queue
111#define fr_cache	fr_u.fru_cache
112	union {
113		LIST_HEAD(pf_fragq, pf_frent) fru_queue;	/* buffering */
114		LIST_HEAD(pf_cacheq, pf_frcache) fru_cache;	/* non-buf */
115	} fr_u;
116};
117#endif
118
/*
 * LRU queues of fragment descriptors.  Descriptors are inserted (and
 * re-inserted on lookup) at the head, and the purge/flush routines take
 * victims from the tail.  pf_fragqueue holds buffering descriptors,
 * pf_cachequeue holds non-buffering (cache) descriptors.
 * On FreeBSD the heads are declared through VNET_DEFINE and reached via
 * the V_ accessor macros.
 */
119#ifdef __FreeBSD__
120TAILQ_HEAD(pf_fragqueue, pf_fragment);
121TAILQ_HEAD(pf_cachequeue, pf_fragment);
122VNET_DEFINE(struct pf_fragqueue,	pf_fragqueue);
123#define	V_pf_fragqueue			VNET(pf_fragqueue)
124VNET_DEFINE(struct pf_cachequeue,	pf_cachequeue);
125#define	V_pf_cachequeue			VNET(pf_cachequeue)
126#else
127TAILQ_HEAD(pf_fragqueue, pf_fragment)	pf_fragqueue;
128TAILQ_HEAD(pf_cachequeue, pf_fragment)	pf_cachequeue;
129#endif
130
/* Ordering function for the fragment trees; defined further below. */
131#ifndef __FreeBSD__
132static __inline int	 pf_frag_compare(struct pf_fragment *,
133			    struct pf_fragment *);
134#else
135static int		 pf_frag_compare(struct pf_fragment *,
136			    struct pf_fragment *);
137#endif
138
/*
 * Two red-black trees share the same node type and comparator:
 * pf_frag_tree indexes buffering descriptors, pf_cache_tree indexes
 * non-buffering (cache) descriptors.
 */
139#ifdef __FreeBSD__
140RB_HEAD(pf_frag_tree, pf_fragment);
141VNET_DEFINE(struct pf_frag_tree,	pf_frag_tree);
142#define	V_pf_frag_tree			VNET(pf_frag_tree)
143VNET_DEFINE(struct pf_frag_tree,	pf_cache_tree);
144#define	V_pf_cache_tree			VNET(pf_cache_tree)
145#else
146RB_HEAD(pf_frag_tree, pf_fragment)	pf_frag_tree, pf_cache_tree;
147#endif
/* Emit the tree routines, linked through fr_entry, ordered by pf_frag_compare. */
148RB_PROTOTYPE(pf_frag_tree, pf_fragment, fr_entry, pf_frag_compare);
149RB_GENERATE(pf_frag_tree, pf_fragment, fr_entry, pf_frag_compare);
150
151/* Private prototypes */
/* Fill the lookup-key fields of a pf_fragment from an IP header. */
152void			 pf_ip2key(struct pf_fragment *, struct ip *);
/* Unlink a descriptor from its tree and queue and return it to its pool. */
153void			 pf_remove_fragment(struct pf_fragment *);
/* Free least-recently-used descriptors to make room for new ones. */
154void			 pf_flush_fragments(void);
/* Free a descriptor together with every entry hanging off it. */
155void			 pf_free_fragment(struct pf_fragment *);
/* Look up the descriptor matching an IP header in the given tree. */
156struct pf_fragment	*pf_find_fragment(struct ip *, struct pf_frag_tree *);
/* Buffering reassembly: queue a fragment, return the packet when complete. */
157struct mbuf		*pf_reassemble(struct mbuf **, struct pf_fragment **,
158			    struct pf_frent *, int);
/* Non-buffering path: track seen byte ranges, trim or drop overlaps. */
159struct mbuf		*pf_fragcache(struct mbuf **, struct ip*,
160			    struct pf_fragment **, int, int, int *);
/* Defined outside the portion of the file documented here. */
161int			 pf_normalize_tcpopt(struct pf_rule *, struct mbuf *,
162			    struct tcphdr *, int, sa_family_t);
163void			 pf_scrub_ip(struct mbuf **, u_int32_t, u_int8_t,
164			    u_int8_t);
165#ifdef INET6
166void			 pf_scrub_ip6(struct mbuf **, u_int8_t);
167#endif
/*
 * Debug printf.  When the pf status debug level is at least
 * PF_DEBUG_MISC, print the calling function's name followed by the
 * message.  The argument must be a complete, parenthesized printf()
 * argument list, e.g. DPFPRINTF(("value %d\n", v)).
 * The two variants differ only in how the status structure is reached
 * (V_pf_status on FreeBSD, pf_status otherwise).
 */
168#ifdef __FreeBSD__
169#define	DPFPRINTF(x) do {				\
170	if (V_pf_status.debug >= PF_DEBUG_MISC) {	\
171		printf("%s: ", __func__);		\
172		printf x ;				\
173	}						\
174} while(0)
175#else
176#define	DPFPRINTF(x) do {				\
177	if (pf_status.debug >= PF_DEBUG_MISC) {		\
178		printf("%s: ", __func__);		\
179		printf x ;				\
180	}						\
181} while(0)
182#endif
183
184/* Globals */
/*
 * Allocators (UMA zones on FreeBSD, pools otherwise):
 *   pf_frent_pl       - struct pf_frent   (buffered fragment entries)
 *   pf_frag_pl        - struct pf_fragment for buffering descriptors
 *   pf_cache_pl       - struct pf_fragment for cache descriptors
 *   pf_cent_pl        - struct pf_frcache (seen-range entries)
 *   pf_state_scrub_pl - struct pf_state_scrub
 *
 * pf_nfrents and pf_ncache count the live pf_frent and pf_frcache
 * entries; they are decremented wherever an entry is returned to its
 * pool and drive the 90% goals in pf_flush_fragments().
 */
185#ifdef __FreeBSD__
186VNET_DEFINE(uma_zone_t,		pf_frent_pl);
187VNET_DEFINE(uma_zone_t,		pf_frag_pl);
188VNET_DEFINE(uma_zone_t,		pf_cache_pl);
189VNET_DEFINE(uma_zone_t,		pf_cent_pl);
190VNET_DEFINE(uma_zone_t,		pf_state_scrub_pl);
191
192VNET_DEFINE(int,		pf_nfrents);
193#define	V_pf_nfrents		VNET(pf_nfrents)
194VNET_DEFINE(int,		pf_ncache);
195#define	V_pf_ncache		VNET(pf_ncache)
196#else
197struct pool		 pf_frent_pl, pf_frag_pl, pf_cache_pl, pf_cent_pl;
198struct pool		 pf_state_scrub_pl;
199int			 pf_nfrents, pf_ncache;
200#endif
201
202void
203pf_normalize_init(void)
204{
205#ifdef __FreeBSD__
206	/*
207	 * XXX
208	 * No high water mark support(It's hint not hard limit).
209	 * uma_zone_set_max(pf_frag_pl, PFFRAG_FRAG_HIWAT);
210	 */
211	uma_zone_set_max(V_pf_frent_pl, PFFRAG_FRENT_HIWAT);
212	uma_zone_set_max(V_pf_cache_pl, PFFRAG_FRCACHE_HIWAT);
213	uma_zone_set_max(V_pf_cent_pl, PFFRAG_FRCENT_HIWAT);
214#else
215	pool_init(&pf_frent_pl, sizeof(struct pf_frent), 0, 0, 0, "pffrent",
216	    NULL);
217	pool_init(&pf_frag_pl, sizeof(struct pf_fragment), 0, 0, 0, "pffrag",
218	    NULL);
219	pool_init(&pf_cache_pl, sizeof(struct pf_fragment), 0, 0, 0,
220	    "pffrcache", NULL);
221	pool_init(&pf_cent_pl, sizeof(struct pf_frcache), 0, 0, 0, "pffrcent",
222	    NULL);
223	pool_init(&pf_state_scrub_pl, sizeof(struct pf_state_scrub), 0, 0, 0,
224	    "pfstscr", NULL);
225
226	pool_sethiwat(&pf_frag_pl, PFFRAG_FRAG_HIWAT);
227	pool_sethardlimit(&pf_frent_pl, PFFRAG_FRENT_HIWAT, NULL, 0);
228	pool_sethardlimit(&pf_cache_pl, PFFRAG_FRCACHE_HIWAT, NULL, 0);
229	pool_sethardlimit(&pf_cent_pl, PFFRAG_FRCENT_HIWAT, NULL, 0);
230#endif
231
232#ifdef __FreeBSD__
233	TAILQ_INIT(&V_pf_fragqueue);
234	TAILQ_INIT(&V_pf_cachequeue);
235#else
236	TAILQ_INIT(&pf_fragqueue);
237	TAILQ_INIT(&pf_cachequeue);
238#endif
239}
240
241#ifdef __FreeBSD__
242static int
243#else
244static __inline int
245#endif
246pf_frag_compare(struct pf_fragment *a, struct pf_fragment *b)
247{
248	int	diff;
249
250	if ((diff = a->fr_id - b->fr_id))
251		return (diff);
252	else if ((diff = a->fr_p - b->fr_p))
253		return (diff);
254	else if (a->fr_src.s_addr < b->fr_src.s_addr)
255		return (-1);
256	else if (a->fr_src.s_addr > b->fr_src.s_addr)
257		return (1);
258	else if (a->fr_dst.s_addr < b->fr_dst.s_addr)
259		return (-1);
260	else if (a->fr_dst.s_addr > b->fr_dst.s_addr)
261		return (1);
262	return (0);
263}
264
265void
266pf_purge_expired_fragments(void)
267{
268	struct pf_fragment	*frag;
269#ifdef __FreeBSD__
270	u_int32_t		 expire = time_second -
271				    V_pf_default_rule.timeout[PFTM_FRAG];
272#else
273	u_int32_t		 expire = time_second -
274				    pf_default_rule.timeout[PFTM_FRAG];
275#endif
276
277#ifdef __FreeBSD__
278	while ((frag = TAILQ_LAST(&V_pf_fragqueue, pf_fragqueue)) != NULL) {
279		KASSERT((BUFFER_FRAGMENTS(frag)),
280		    ("BUFFER_FRAGMENTS(frag) == 0: %s", __FUNCTION__));
281#else
282	while ((frag = TAILQ_LAST(&pf_fragqueue, pf_fragqueue)) != NULL) {
283		KASSERT(BUFFER_FRAGMENTS(frag));
284#endif
285		if (frag->fr_timeout > expire)
286			break;
287
288		DPFPRINTF(("expiring %d(%p)\n", frag->fr_id, frag));
289		pf_free_fragment(frag);
290	}
291
292#ifdef __FreeBSD__
293	while ((frag = TAILQ_LAST(&V_pf_cachequeue, pf_cachequeue)) != NULL) {
294		KASSERT((!BUFFER_FRAGMENTS(frag)),
295		    ("BUFFER_FRAGMENTS(frag) != 0: %s", __FUNCTION__));
296#else
297	while ((frag = TAILQ_LAST(&pf_cachequeue, pf_cachequeue)) != NULL) {
298		KASSERT(!BUFFER_FRAGMENTS(frag));
299#endif
300		if (frag->fr_timeout > expire)
301			break;
302
303		DPFPRINTF(("expiring %d(%p)\n", frag->fr_id, frag));
304		pf_free_fragment(frag);
305#ifdef __FreeBSD__
306		KASSERT((TAILQ_EMPTY(&V_pf_cachequeue) ||
307		    TAILQ_LAST(&V_pf_cachequeue, pf_cachequeue) != frag),
308		    ("!(TAILQ_EMPTY() || TAILQ_LAST() == farg): %s",
309		    __FUNCTION__));
310#else
311		KASSERT(TAILQ_EMPTY(&pf_cachequeue) ||
312		    TAILQ_LAST(&pf_cachequeue, pf_cachequeue) != frag);
313#endif
314	}
315}
316
317/*
318 * Try to flush old fragments to make space for new ones
319 */
320
321void
322pf_flush_fragments(void)
323{
324	struct pf_fragment	*frag;
325	int			 goal;
326
327#ifdef __FreeBSD__
328	goal = V_pf_nfrents * 9 / 10;
329	DPFPRINTF(("trying to free > %d frents\n",
330	    V_pf_nfrents - goal));
331	while (goal < V_pf_nfrents) {
332#else
333	goal = pf_nfrents * 9 / 10;
334	DPFPRINTF(("trying to free > %d frents\n",
335	    pf_nfrents - goal));
336	while (goal < pf_nfrents) {
337#endif
338#ifdef __FreeBSD__
339		frag = TAILQ_LAST(&V_pf_fragqueue, pf_fragqueue);
340#else
341		frag = TAILQ_LAST(&pf_fragqueue, pf_fragqueue);
342#endif
343		if (frag == NULL)
344			break;
345		pf_free_fragment(frag);
346	}
347
348
349#ifdef __FreeBSD__
350	goal = V_pf_ncache * 9 / 10;
351	DPFPRINTF(("trying to free > %d cache entries\n",
352	    V_pf_ncache - goal));
353	while (goal < V_pf_ncache) {
354#else
355	goal = pf_ncache * 9 / 10;
356	DPFPRINTF(("trying to free > %d cache entries\n",
357	    pf_ncache - goal));
358	while (goal < pf_ncache) {
359#endif
360#ifdef __FreeBSD__
361		frag = TAILQ_LAST(&V_pf_cachequeue, pf_cachequeue);
362#else
363		frag = TAILQ_LAST(&pf_cachequeue, pf_cachequeue);
364#endif
365		if (frag == NULL)
366			break;
367		pf_free_fragment(frag);
368	}
369}
370
371/* Frees the fragments and all associated entries */
372
373void
374pf_free_fragment(struct pf_fragment *frag)
375{
376	struct pf_frent		*frent;
377	struct pf_frcache	*frcache;
378
379	/* Free all fragments */
380	if (BUFFER_FRAGMENTS(frag)) {
381		for (frent = LIST_FIRST(&frag->fr_queue); frent;
382		    frent = LIST_FIRST(&frag->fr_queue)) {
383			LIST_REMOVE(frent, fr_next);
384
385			m_freem(frent->fr_m);
386#ifdef __FreeBSD__
387			pool_put(&V_pf_frent_pl, frent);
388			V_pf_nfrents--;
389#else
390			pool_put(&pf_frent_pl, frent);
391			pf_nfrents--;
392#endif
393		}
394	} else {
395		for (frcache = LIST_FIRST(&frag->fr_cache); frcache;
396		    frcache = LIST_FIRST(&frag->fr_cache)) {
397			LIST_REMOVE(frcache, fr_next);
398
399#ifdef __FreeBSD__
400			KASSERT((LIST_EMPTY(&frag->fr_cache) ||
401			    LIST_FIRST(&frag->fr_cache)->fr_off >
402			    frcache->fr_end),
403			    ("! (LIST_EMPTY() || LIST_FIRST()->fr_off >"
404			      " frcache->fr_end): %s", __FUNCTION__));
405
406			pool_put(&V_pf_cent_pl, frcache);
407			V_pf_ncache--;
408#else
409			KASSERT(LIST_EMPTY(&frag->fr_cache) ||
410			    LIST_FIRST(&frag->fr_cache)->fr_off >
411			    frcache->fr_end);
412
413			pool_put(&pf_cent_pl, frcache);
414			pf_ncache--;
415#endif
416		}
417	}
418
419	pf_remove_fragment(frag);
420}
421
422void
423pf_ip2key(struct pf_fragment *key, struct ip *ip)
424{
425	key->fr_p = ip->ip_p;
426	key->fr_id = ip->ip_id;
427	key->fr_src.s_addr = ip->ip_src.s_addr;
428	key->fr_dst.s_addr = ip->ip_dst.s_addr;
429}
430
431struct pf_fragment *
432pf_find_fragment(struct ip *ip, struct pf_frag_tree *tree)
433{
434	struct pf_fragment	 key;
435	struct pf_fragment	*frag;
436
437	pf_ip2key(&key, ip);
438
439	frag = RB_FIND(pf_frag_tree, tree, &key);
440	if (frag != NULL) {
441		/* XXX Are we sure we want to update the timeout? */
442		frag->fr_timeout = time_second;
443		if (BUFFER_FRAGMENTS(frag)) {
444#ifdef __FreeBSD__
445			TAILQ_REMOVE(&V_pf_fragqueue, frag, frag_next);
446			TAILQ_INSERT_HEAD(&V_pf_fragqueue, frag, frag_next);
447#else
448			TAILQ_REMOVE(&pf_fragqueue, frag, frag_next);
449			TAILQ_INSERT_HEAD(&pf_fragqueue, frag, frag_next);
450#endif
451		} else {
452#ifdef __FreeBSD__
453			TAILQ_REMOVE(&V_pf_cachequeue, frag, frag_next);
454			TAILQ_INSERT_HEAD(&V_pf_cachequeue, frag, frag_next);
455#else
456			TAILQ_REMOVE(&pf_cachequeue, frag, frag_next);
457			TAILQ_INSERT_HEAD(&pf_cachequeue, frag, frag_next);
458#endif
459		}
460	}
461
462	return (frag);
463}
464
465/* Removes a fragment from the fragment queue and frees the fragment */
466
467void
468pf_remove_fragment(struct pf_fragment *frag)
469{
470	if (BUFFER_FRAGMENTS(frag)) {
471#ifdef __FreeBSD__
472		RB_REMOVE(pf_frag_tree, &V_pf_frag_tree, frag);
473		TAILQ_REMOVE(&V_pf_fragqueue, frag, frag_next);
474		pool_put(&V_pf_frag_pl, frag);
475#else
476		RB_REMOVE(pf_frag_tree, &pf_frag_tree, frag);
477		TAILQ_REMOVE(&pf_fragqueue, frag, frag_next);
478		pool_put(&pf_frag_pl, frag);
479#endif
480	} else {
481#ifdef __FreeBSD__
482		RB_REMOVE(pf_frag_tree, &V_pf_cache_tree, frag);
483		TAILQ_REMOVE(&V_pf_cachequeue, frag, frag_next);
484		pool_put(&V_pf_cache_pl, frag);
485#else
486		RB_REMOVE(pf_frag_tree, &pf_cache_tree, frag);
487		TAILQ_REMOVE(&pf_cachequeue, frag, frag_next);
488		pool_put(&pf_cache_pl, frag);
489#endif
490	}
491}
492
493#define FR_IP_OFF(fr)	((ntohs((fr)->fr_ip->ip_off) & IP_OFFMASK) << 3)
494struct mbuf *
495pf_reassemble(struct mbuf **m0, struct pf_fragment **frag,
496    struct pf_frent *frent, int mff)
497{
498	struct mbuf	*m = *m0, *m2;
499	struct pf_frent	*frea, *next;
500	struct pf_frent	*frep = NULL;
501	struct ip	*ip = frent->fr_ip;
502	int		 hlen = ip->ip_hl << 2;
503	u_int16_t	 off = (ntohs(ip->ip_off) & IP_OFFMASK) << 3;
504	u_int16_t	 ip_len = ntohs(ip->ip_len) - ip->ip_hl * 4;
505	u_int16_t	 max = ip_len + off;
506
507#ifdef __FreeBSD__
508	KASSERT((*frag == NULL || BUFFER_FRAGMENTS(*frag)),
509	    ("! (*frag == NULL || BUFFER_FRAGMENTS(*frag)): %s", __FUNCTION__));
510#else
511	KASSERT(*frag == NULL || BUFFER_FRAGMENTS(*frag));
512#endif
513
514	/* Strip off ip header */
515	m->m_data += hlen;
516	m->m_len -= hlen;
517
518	/* Create a new reassembly queue for this packet */
519	if (*frag == NULL) {
520#ifdef __FreeBSD__
521		*frag = pool_get(&V_pf_frag_pl, PR_NOWAIT);
522#else
523		*frag = pool_get(&pf_frag_pl, PR_NOWAIT);
524#endif
525		if (*frag == NULL) {
526			pf_flush_fragments();
527#ifdef __FreeBSD__
528			*frag = pool_get(&V_pf_frag_pl, PR_NOWAIT);
529#else
530			*frag = pool_get(&pf_frag_pl, PR_NOWAIT);
531#endif
532			if (*frag == NULL)
533				goto drop_fragment;
534		}
535
536		(*frag)->fr_flags = 0;
537		(*frag)->fr_max = 0;
538		(*frag)->fr_src = frent->fr_ip->ip_src;
539		(*frag)->fr_dst = frent->fr_ip->ip_dst;
540		(*frag)->fr_p = frent->fr_ip->ip_p;
541		(*frag)->fr_id = frent->fr_ip->ip_id;
542		(*frag)->fr_timeout = time_second;
543		LIST_INIT(&(*frag)->fr_queue);
544
545#ifdef __FreeBSD__
546		RB_INSERT(pf_frag_tree, &V_pf_frag_tree, *frag);
547		TAILQ_INSERT_HEAD(&V_pf_fragqueue, *frag, frag_next);
548#else
549		RB_INSERT(pf_frag_tree, &pf_frag_tree, *frag);
550		TAILQ_INSERT_HEAD(&pf_fragqueue, *frag, frag_next);
551#endif
552
553		/* We do not have a previous fragment */
554		frep = NULL;
555		goto insert;
556	}
557
558	/*
559	 * Find a fragment after the current one:
560	 *  - off contains the real shifted offset.
561	 */
562	LIST_FOREACH(frea, &(*frag)->fr_queue, fr_next) {
563		if (FR_IP_OFF(frea) > off)
564			break;
565		frep = frea;
566	}
567
568#ifdef __FreeBSD__
569	KASSERT((frep != NULL || frea != NULL),
570	    ("!(frep != NULL || frea != NULL): %s", __FUNCTION__));;
571#else
572	KASSERT(frep != NULL || frea != NULL);
573#endif
574
575	if (frep != NULL &&
576	    FR_IP_OFF(frep) + ntohs(frep->fr_ip->ip_len) - frep->fr_ip->ip_hl *
577	    4 > off)
578	{
579		u_int16_t	precut;
580
581		precut = FR_IP_OFF(frep) + ntohs(frep->fr_ip->ip_len) -
582		    frep->fr_ip->ip_hl * 4 - off;
583		if (precut >= ip_len)
584			goto drop_fragment;
585		m_adj(frent->fr_m, precut);
586		DPFPRINTF(("overlap -%d\n", precut));
587		/* Enforce 8 byte boundaries */
588		ip->ip_off = htons(ntohs(ip->ip_off) + (precut >> 3));
589		off = (ntohs(ip->ip_off) & IP_OFFMASK) << 3;
590		ip_len -= precut;
591		ip->ip_len = htons(ip_len);
592	}
593
594	for (; frea != NULL && ip_len + off > FR_IP_OFF(frea);
595	    frea = next)
596	{
597		u_int16_t	aftercut;
598
599		aftercut = ip_len + off - FR_IP_OFF(frea);
600		DPFPRINTF(("adjust overlap %d\n", aftercut));
601		if (aftercut < ntohs(frea->fr_ip->ip_len) - frea->fr_ip->ip_hl
602		    * 4)
603		{
604			frea->fr_ip->ip_len =
605			    htons(ntohs(frea->fr_ip->ip_len) - aftercut);
606			frea->fr_ip->ip_off = htons(ntohs(frea->fr_ip->ip_off) +
607			    (aftercut >> 3));
608			m_adj(frea->fr_m, aftercut);
609			break;
610		}
611
612		/* This fragment is completely overlapped, lose it */
613		next = LIST_NEXT(frea, fr_next);
614		m_freem(frea->fr_m);
615		LIST_REMOVE(frea, fr_next);
616#ifdef __FreeBSD__
617		pool_put(&V_pf_frent_pl, frea);
618		V_pf_nfrents--;
619#else
620		pool_put(&pf_frent_pl, frea);
621		pf_nfrents--;
622#endif
623	}
624
625 insert:
626	/* Update maximum data size */
627	if ((*frag)->fr_max < max)
628		(*frag)->fr_max = max;
629	/* This is the last segment */
630	if (!mff)
631		(*frag)->fr_flags |= PFFRAG_SEENLAST;
632
633	if (frep == NULL)
634		LIST_INSERT_HEAD(&(*frag)->fr_queue, frent, fr_next);
635	else
636		LIST_INSERT_AFTER(frep, frent, fr_next);
637
638	/* Check if we are completely reassembled */
639	if (!((*frag)->fr_flags & PFFRAG_SEENLAST))
640		return (NULL);
641
642	/* Check if we have all the data */
643	off = 0;
644	for (frep = LIST_FIRST(&(*frag)->fr_queue); frep; frep = next) {
645		next = LIST_NEXT(frep, fr_next);
646
647		off += ntohs(frep->fr_ip->ip_len) - frep->fr_ip->ip_hl * 4;
648		if (off < (*frag)->fr_max &&
649		    (next == NULL || FR_IP_OFF(next) != off))
650		{
651			DPFPRINTF(("missing fragment at %d, next %d, max %d\n",
652			    off, next == NULL ? -1 : FR_IP_OFF(next),
653			    (*frag)->fr_max));
654			return (NULL);
655		}
656	}
657	DPFPRINTF(("%d < %d?\n", off, (*frag)->fr_max));
658	if (off < (*frag)->fr_max)
659		return (NULL);
660
661	/* We have all the data */
662	frent = LIST_FIRST(&(*frag)->fr_queue);
663#ifdef __FreeBSD__
664	KASSERT((frent != NULL), ("frent == NULL: %s", __FUNCTION__));
665#else
666	KASSERT(frent != NULL);
667#endif
668	if ((frent->fr_ip->ip_hl << 2) + off > IP_MAXPACKET) {
669		DPFPRINTF(("drop: too big: %d\n", off));
670		pf_free_fragment(*frag);
671		*frag = NULL;
672		return (NULL);
673	}
674	next = LIST_NEXT(frent, fr_next);
675
676	/* Magic from ip_input */
677	ip = frent->fr_ip;
678	m = frent->fr_m;
679	m2 = m->m_next;
680	m->m_next = NULL;
681	m_cat(m, m2);
682#ifdef __FreeBSD__
683	pool_put(&V_pf_frent_pl, frent);
684	V_pf_nfrents--;
685#else
686	pool_put(&pf_frent_pl, frent);
687	pf_nfrents--;
688#endif
689	for (frent = next; frent != NULL; frent = next) {
690		next = LIST_NEXT(frent, fr_next);
691
692		m2 = frent->fr_m;
693#ifdef __FreeBSD__
694		pool_put(&V_pf_frent_pl, frent);
695		V_pf_nfrents--;
696#else
697		pool_put(&pf_frent_pl, frent);
698		pf_nfrents--;
699#endif
700#ifdef __FreeBSD__
701		m->m_pkthdr.csum_flags &= m2->m_pkthdr.csum_flags;
702		m->m_pkthdr.csum_data += m2->m_pkthdr.csum_data;
703#endif
704		m_cat(m, m2);
705	}
706
707#ifdef __FreeBSD__
708	while (m->m_pkthdr.csum_data & 0xffff0000)
709		m->m_pkthdr.csum_data = (m->m_pkthdr.csum_data & 0xffff) +
710		    (m->m_pkthdr.csum_data >> 16);
711#endif
712	ip->ip_src = (*frag)->fr_src;
713	ip->ip_dst = (*frag)->fr_dst;
714
715	/* Remove from fragment queue */
716	pf_remove_fragment(*frag);
717	*frag = NULL;
718
719	hlen = ip->ip_hl << 2;
720	ip->ip_len = htons(off + hlen);
721	m->m_len += hlen;
722	m->m_data -= hlen;
723
724	/* some debugging cruft by sklower, below, will go away soon */
725	/* XXX this should be done elsewhere */
726	if (m->m_flags & M_PKTHDR) {
727		int plen = 0;
728		for (m2 = m; m2; m2 = m2->m_next)
729			plen += m2->m_len;
730		m->m_pkthdr.len = plen;
731	}
732
733	DPFPRINTF(("complete: %p(%d)\n", m, ntohs(ip->ip_len)));
734	return (m);
735
736 drop_fragment:
737	/* Oops - fail safe - drop packet */
738#ifdef __FreeBSD__
739	pool_put(&V_pf_frent_pl, frent);
740	V_pf_nfrents--;
741#else
742	pool_put(&pf_frent_pl, frent);
743	pf_nfrents--;
744#endif
745	m_freem(m);
746	return (NULL);
747}
748
749struct mbuf *
750pf_fragcache(struct mbuf **m0, struct ip *h, struct pf_fragment **frag, int mff,
751    int drop, int *nomem)
752{
753	struct mbuf		*m = *m0;
754	struct pf_frcache	*frp, *fra, *cur = NULL;
755	int			 ip_len = ntohs(h->ip_len) - (h->ip_hl << 2);
756	u_int16_t		 off = ntohs(h->ip_off) << 3;
757	u_int16_t		 max = ip_len + off;
758	int			 hosed = 0;
759
760#ifdef __FreeBSD__
761	KASSERT((*frag == NULL || !BUFFER_FRAGMENTS(*frag)),
762	    ("!(*frag == NULL || !BUFFER_FRAGMENTS(*frag)): %s", __FUNCTION__));
763#else
764	KASSERT(*frag == NULL || !BUFFER_FRAGMENTS(*frag));
765#endif
766
767	/* Create a new range queue for this packet */
768	if (*frag == NULL) {
769#ifdef __FreeBSD__
770		*frag = pool_get(&V_pf_cache_pl, PR_NOWAIT);
771#else
772		*frag = pool_get(&pf_cache_pl, PR_NOWAIT);
773#endif
774		if (*frag == NULL) {
775			pf_flush_fragments();
776#ifdef __FreeBSD__
777			*frag = pool_get(&V_pf_cache_pl, PR_NOWAIT);
778#else
779			*frag = pool_get(&pf_cache_pl, PR_NOWAIT);
780#endif
781			if (*frag == NULL)
782				goto no_mem;
783		}
784
785		/* Get an entry for the queue */
786#ifdef __FreeBSD__
787		cur = pool_get(&V_pf_cent_pl, PR_NOWAIT);
788		if (cur == NULL) {
789			pool_put(&V_pf_cache_pl, *frag);
790#else
791		cur = pool_get(&pf_cent_pl, PR_NOWAIT);
792		if (cur == NULL) {
793			pool_put(&pf_cache_pl, *frag);
794#endif
795			*frag = NULL;
796			goto no_mem;
797		}
798#ifdef __FreeBSD__
799		V_pf_ncache++;
800#else
801		pf_ncache++;
802#endif
803
804		(*frag)->fr_flags = PFFRAG_NOBUFFER;
805		(*frag)->fr_max = 0;
806		(*frag)->fr_src = h->ip_src;
807		(*frag)->fr_dst = h->ip_dst;
808		(*frag)->fr_p = h->ip_p;
809		(*frag)->fr_id = h->ip_id;
810		(*frag)->fr_timeout = time_second;
811
812		cur->fr_off = off;
813		cur->fr_end = max;
814		LIST_INIT(&(*frag)->fr_cache);
815		LIST_INSERT_HEAD(&(*frag)->fr_cache, cur, fr_next);
816
817#ifdef __FreeBSD__
818		RB_INSERT(pf_frag_tree, &V_pf_cache_tree, *frag);
819		TAILQ_INSERT_HEAD(&V_pf_cachequeue, *frag, frag_next);
820#else
821		RB_INSERT(pf_frag_tree, &pf_cache_tree, *frag);
822		TAILQ_INSERT_HEAD(&pf_cachequeue, *frag, frag_next);
823#endif
824
825		DPFPRINTF(("fragcache[%d]: new %d-%d\n", h->ip_id, off, max));
826
827		goto pass;
828	}
829
830	/*
831	 * Find a fragment after the current one:
832	 *  - off contains the real shifted offset.
833	 */
834	frp = NULL;
835	LIST_FOREACH(fra, &(*frag)->fr_cache, fr_next) {
836		if (fra->fr_off > off)
837			break;
838		frp = fra;
839	}
840
841#ifdef __FreeBSD__
842	KASSERT((frp != NULL || fra != NULL),
843	    ("!(frp != NULL || fra != NULL): %s", __FUNCTION__));
844#else
845	KASSERT(frp != NULL || fra != NULL);
846#endif
847
848	if (frp != NULL) {
849		int	precut;
850
851		precut = frp->fr_end - off;
852		if (precut >= ip_len) {
853			/* Fragment is entirely a duplicate */
854			DPFPRINTF(("fragcache[%d]: dead (%d-%d) %d-%d\n",
855			    h->ip_id, frp->fr_off, frp->fr_end, off, max));
856			goto drop_fragment;
857		}
858		if (precut == 0) {
859			/* They are adjacent.  Fixup cache entry */
860			DPFPRINTF(("fragcache[%d]: adjacent (%d-%d) %d-%d\n",
861			    h->ip_id, frp->fr_off, frp->fr_end, off, max));
862			frp->fr_end = max;
863		} else if (precut > 0) {
864			/* The first part of this payload overlaps with a
865			 * fragment that has already been passed.
866			 * Need to trim off the first part of the payload.
867			 * But to do so easily, we need to create another
868			 * mbuf to throw the original header into.
869			 */
870
871			DPFPRINTF(("fragcache[%d]: chop %d (%d-%d) %d-%d\n",
872			    h->ip_id, precut, frp->fr_off, frp->fr_end, off,
873			    max));
874
875			off += precut;
876			max -= precut;
877			/* Update the previous frag to encompass this one */
878			frp->fr_end = max;
879
880			if (!drop) {
881				/* XXX Optimization opportunity
882				 * This is a very heavy way to trim the payload.
883				 * we could do it much faster by diddling mbuf
884				 * internals but that would be even less legible
885				 * than this mbuf magic.  For my next trick,
886				 * I'll pull a rabbit out of my laptop.
887				 */
888#ifdef __FreeBSD__
889				*m0 = m_dup(m, M_DONTWAIT);
890#else
891				*m0 = m_copym2(m, 0, h->ip_hl << 2, M_NOWAIT);
892#endif
893				if (*m0 == NULL)
894					goto no_mem;
895#ifdef __FreeBSD__
896				/* From KAME Project : We have missed this! */
897				m_adj(*m0, (h->ip_hl << 2) -
898				    (*m0)->m_pkthdr.len);
899
900				KASSERT(((*m0)->m_next == NULL),
901				    ("(*m0)->m_next != NULL: %s",
902				    __FUNCTION__));
903#else
904				KASSERT((*m0)->m_next == NULL);
905#endif
906				m_adj(m, precut + (h->ip_hl << 2));
907				m_cat(*m0, m);
908				m = *m0;
909				if (m->m_flags & M_PKTHDR) {
910					int plen = 0;
911					struct mbuf *t;
912					for (t = m; t; t = t->m_next)
913						plen += t->m_len;
914					m->m_pkthdr.len = plen;
915				}
916
917
918				h = mtod(m, struct ip *);
919
920#ifdef __FreeBSD__
921				KASSERT(((int)m->m_len ==
922				    ntohs(h->ip_len) - precut),
923				    ("m->m_len != ntohs(h->ip_len) - precut: %s",
924				    __FUNCTION__));
925#else
926				KASSERT((int)m->m_len ==
927				    ntohs(h->ip_len) - precut);
928#endif
929				h->ip_off = htons(ntohs(h->ip_off) +
930				    (precut >> 3));
931				h->ip_len = htons(ntohs(h->ip_len) - precut);
932			} else {
933				hosed++;
934			}
935		} else {
936			/* There is a gap between fragments */
937
938			DPFPRINTF(("fragcache[%d]: gap %d (%d-%d) %d-%d\n",
939			    h->ip_id, -precut, frp->fr_off, frp->fr_end, off,
940			    max));
941
942#ifdef __FreeBSD__
943			cur = pool_get(&V_pf_cent_pl, PR_NOWAIT);
944#else
945			cur = pool_get(&pf_cent_pl, PR_NOWAIT);
946#endif
947			if (cur == NULL)
948				goto no_mem;
949#ifdef __FreeBSD__
950			V_pf_ncache++;
951#else
952			pf_ncache++;
953#endif
954
955			cur->fr_off = off;
956			cur->fr_end = max;
957			LIST_INSERT_AFTER(frp, cur, fr_next);
958		}
959	}
960
961	if (fra != NULL) {
962		int	aftercut;
963		int	merge = 0;
964
965		aftercut = max - fra->fr_off;
966		if (aftercut == 0) {
967			/* Adjacent fragments */
968			DPFPRINTF(("fragcache[%d]: adjacent %d-%d (%d-%d)\n",
969			    h->ip_id, off, max, fra->fr_off, fra->fr_end));
970			fra->fr_off = off;
971			merge = 1;
972		} else if (aftercut > 0) {
973			/* Need to chop off the tail of this fragment */
974			DPFPRINTF(("fragcache[%d]: chop %d %d-%d (%d-%d)\n",
975			    h->ip_id, aftercut, off, max, fra->fr_off,
976			    fra->fr_end));
977			fra->fr_off = off;
978			max -= aftercut;
979
980			merge = 1;
981
982			if (!drop) {
983				m_adj(m, -aftercut);
984				if (m->m_flags & M_PKTHDR) {
985					int plen = 0;
986					struct mbuf *t;
987					for (t = m; t; t = t->m_next)
988						plen += t->m_len;
989					m->m_pkthdr.len = plen;
990				}
991				h = mtod(m, struct ip *);
992#ifdef __FreeBSD__
993				KASSERT(((int)m->m_len == ntohs(h->ip_len) - aftercut),
994				    ("m->m_len != ntohs(h->ip_len) - aftercut: %s",
995				    __FUNCTION__));
996#else
997				KASSERT((int)m->m_len ==
998				    ntohs(h->ip_len) - aftercut);
999#endif
1000				h->ip_len = htons(ntohs(h->ip_len) - aftercut);
1001			} else {
1002				hosed++;
1003			}
1004		} else if (frp == NULL) {
1005			/* There is a gap between fragments */
1006			DPFPRINTF(("fragcache[%d]: gap %d %d-%d (%d-%d)\n",
1007			    h->ip_id, -aftercut, off, max, fra->fr_off,
1008			    fra->fr_end));
1009
1010#ifdef __FreeBSD__
1011			cur = pool_get(&V_pf_cent_pl, PR_NOWAIT);
1012#else
1013			cur = pool_get(&pf_cent_pl, PR_NOWAIT);
1014#endif
1015			if (cur == NULL)
1016				goto no_mem;
1017#ifdef __FreeBSD__
1018			V_pf_ncache++;
1019#else
1020			pf_ncache++;
1021#endif
1022
1023			cur->fr_off = off;
1024			cur->fr_end = max;
1025			LIST_INSERT_BEFORE(fra, cur, fr_next);
1026		}
1027
1028
1029		/* Need to glue together two separate fragment descriptors */
1030		if (merge) {
1031			if (cur && fra->fr_off <= cur->fr_end) {
1032				/* Need to merge in a previous 'cur' */
1033				DPFPRINTF(("fragcache[%d]: adjacent(merge "
1034				    "%d-%d) %d-%d (%d-%d)\n",
1035				    h->ip_id, cur->fr_off, cur->fr_end, off,
1036				    max, fra->fr_off, fra->fr_end));
1037				fra->fr_off = cur->fr_off;
1038				LIST_REMOVE(cur, fr_next);
1039#ifdef __FreeBSD__
1040				pool_put(&V_pf_cent_pl, cur);
1041				V_pf_ncache--;
1042#else
1043				pool_put(&pf_cent_pl, cur);
1044				pf_ncache--;
1045#endif
1046				cur = NULL;
1047
1048			} else if (frp && fra->fr_off <= frp->fr_end) {
1049				/* Need to merge in a modified 'frp' */
1050#ifdef __FreeBSD__
1051				KASSERT((cur == NULL), ("cur != NULL: %s",
1052				    __FUNCTION__));
1053#else
1054				KASSERT(cur == NULL);
1055#endif
1056				DPFPRINTF(("fragcache[%d]: adjacent(merge "
1057				    "%d-%d) %d-%d (%d-%d)\n",
1058				    h->ip_id, frp->fr_off, frp->fr_end, off,
1059				    max, fra->fr_off, fra->fr_end));
1060				fra->fr_off = frp->fr_off;
1061				LIST_REMOVE(frp, fr_next);
1062#ifdef __FreeBSD__
1063				pool_put(&V_pf_cent_pl, frp);
1064				V_pf_ncache--;
1065#else
1066				pool_put(&pf_cent_pl, frp);
1067				pf_ncache--;
1068#endif
1069				frp = NULL;
1070
1071			}
1072		}
1073	}
1074
1075	if (hosed) {
1076		/*
1077		 * We must keep tracking the overall fragment even when
1078		 * we're going to drop it anyway so that we know when to
1079		 * free the overall descriptor.  Thus we drop the frag late.
1080		 */
1081		goto drop_fragment;
1082	}
1083
1084
1085 pass:
1086	/* Update maximum data size */
1087	if ((*frag)->fr_max < max)
1088		(*frag)->fr_max = max;
1089
1090	/* This is the last segment */
1091	if (!mff)
1092		(*frag)->fr_flags |= PFFRAG_SEENLAST;
1093
1094	/* Check if we are completely reassembled */
1095	if (((*frag)->fr_flags & PFFRAG_SEENLAST) &&
1096	    LIST_FIRST(&(*frag)->fr_cache)->fr_off == 0 &&
1097	    LIST_FIRST(&(*frag)->fr_cache)->fr_end == (*frag)->fr_max) {
1098		/* Remove from fragment queue */
1099		DPFPRINTF(("fragcache[%d]: done 0-%d\n", h->ip_id,
1100		    (*frag)->fr_max));
1101		pf_free_fragment(*frag);
1102		*frag = NULL;
1103	}
1104
1105	return (m);
1106
1107 no_mem:
1108	*nomem = 1;
1109
1110	/* Still need to pay attention to !IP_MF */
1111	if (!mff && *frag != NULL)
1112		(*frag)->fr_flags |= PFFRAG_SEENLAST;
1113
1114	m_freem(m);
1115	return (NULL);
1116
1117 drop_fragment:
1118
1119	/* Still need to pay attention to !IP_MF */
1120	if (!mff && *frag != NULL)
1121		(*frag)->fr_flags |= PFFRAG_SEENLAST;
1122
1123	if (drop) {
1124		/* This fragment has been deemed bad.  Don't reass */
1125		if (((*frag)->fr_flags & PFFRAG_DROP) == 0)
1126			DPFPRINTF(("fragcache[%d]: dropping overall fragment\n",
1127			    h->ip_id));
1128		(*frag)->fr_flags |= PFFRAG_DROP;
1129	}
1130
1131	m_freem(m);
1132	return (NULL);
1133}
1134
1135#ifdef INET
1136int
1137pf_normalize_ip(struct mbuf **m0, int dir, struct pfi_kif *kif, u_short *reason,
1138    struct pf_pdesc *pd)
1139{
1140	struct mbuf		*m = *m0;
1141	struct pf_rule		*r;
1142	struct pf_frent		*frent;
1143	struct pf_fragment	*frag = NULL;
1144	struct ip		*h = mtod(m, struct ip *);
1145	int			 mff = (ntohs(h->ip_off) & IP_MF);
1146	int			 hlen = h->ip_hl << 2;
1147	u_int16_t		 fragoff = (ntohs(h->ip_off) & IP_OFFMASK) << 3;
1148	u_int16_t		 max;
1149	int			 ip_len;
1150	int			 ip_off;
1151	int			 tag = -1;
1152
1153	r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr);
1154	while (r != NULL) {
1155		r->evaluations++;
1156		if (pfi_kif_match(r->kif, kif) == r->ifnot)
1157			r = r->skip[PF_SKIP_IFP].ptr;
1158		else if (r->direction && r->direction != dir)
1159			r = r->skip[PF_SKIP_DIR].ptr;
1160		else if (r->af && r->af != AF_INET)
1161			r = r->skip[PF_SKIP_AF].ptr;
1162		else if (r->proto && r->proto != h->ip_p)
1163			r = r->skip[PF_SKIP_PROTO].ptr;
1164		else if (PF_MISMATCHAW(&r->src.addr,
1165		    (struct pf_addr *)&h->ip_src.s_addr, AF_INET,
1166		    r->src.neg, kif, M_GETFIB(m)))
1167			r = r->skip[PF_SKIP_SRC_ADDR].ptr;
1168		else if (PF_MISMATCHAW(&r->dst.addr,
1169		    (struct pf_addr *)&h->ip_dst.s_addr, AF_INET,
1170		    r->dst.neg, NULL, M_GETFIB(m)))
1171			r = r->skip[PF_SKIP_DST_ADDR].ptr;
1172#ifdef __FreeBSD__
1173		else if (r->match_tag && !pf_match_tag(m, r, &tag, pd->pf_mtag))
1174#else
1175		else if (r->match_tag && !pf_match_tag(m, r, &tag))
1176#endif
1177			r = TAILQ_NEXT(r, entries);
1178		else
1179			break;
1180	}
1181
1182	if (r == NULL || r->action == PF_NOSCRUB)
1183		return (PF_PASS);
1184	else {
1185		r->packets[dir == PF_OUT]++;
1186		r->bytes[dir == PF_OUT] += pd->tot_len;
1187	}
1188
1189	/* Check for illegal packets */
1190	if (hlen < (int)sizeof(struct ip))
1191		goto drop;
1192
1193	if (hlen > ntohs(h->ip_len))
1194		goto drop;
1195
1196	/* Clear IP_DF if the rule uses the no-df option */
1197	if (r->rule_flag & PFRULE_NODF && h->ip_off & htons(IP_DF)) {
1198		u_int16_t ip_off = h->ip_off;
1199
1200		h->ip_off &= htons(~IP_DF);
1201		h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_off, h->ip_off, 0);
1202	}
1203
1204	/* We will need other tests here */
1205	if (!fragoff && !mff)
1206		goto no_fragment;
1207
1208	/* We're dealing with a fragment now. Don't allow fragments
1209	 * with IP_DF to enter the cache. If the flag was cleared by
1210	 * no-df above, fine. Otherwise drop it.
1211	 */
1212	if (h->ip_off & htons(IP_DF)) {
1213		DPFPRINTF(("IP_DF\n"));
1214		goto bad;
1215	}
1216
1217	ip_len = ntohs(h->ip_len) - hlen;
1218	ip_off = (ntohs(h->ip_off) & IP_OFFMASK) << 3;
1219
1220	/* All fragments are 8 byte aligned */
1221	if (mff && (ip_len & 0x7)) {
1222		DPFPRINTF(("mff and %d\n", ip_len));
1223		goto bad;
1224	}
1225
1226	/* Respect maximum length */
1227	if (fragoff + ip_len > IP_MAXPACKET) {
1228		DPFPRINTF(("max packet %d\n", fragoff + ip_len));
1229		goto bad;
1230	}
1231	max = fragoff + ip_len;
1232
1233	if ((r->rule_flag & (PFRULE_FRAGCROP|PFRULE_FRAGDROP)) == 0) {
1234		/* Fully buffer all of the fragments */
1235
1236#ifdef __FreeBSD__
1237		frag = pf_find_fragment(h, &V_pf_frag_tree);
1238#else
1239		frag = pf_find_fragment(h, &pf_frag_tree);
1240#endif
1241
1242		/* Check if we saw the last fragment already */
1243		if (frag != NULL && (frag->fr_flags & PFFRAG_SEENLAST) &&
1244		    max > frag->fr_max)
1245			goto bad;
1246
1247		/* Get an entry for the fragment queue */
1248#ifdef __FreeBSD__
1249		frent = pool_get(&V_pf_frent_pl, PR_NOWAIT);
1250#else
1251		frent = pool_get(&pf_frent_pl, PR_NOWAIT);
1252#endif
1253		if (frent == NULL) {
1254			REASON_SET(reason, PFRES_MEMORY);
1255			return (PF_DROP);
1256		}
1257#ifdef __FreeBSD__
1258		V_pf_nfrents++;
1259#else
1260		pf_nfrents++;
1261#endif
1262		frent->fr_ip = h;
1263		frent->fr_m = m;
1264
1265		/* Might return a completely reassembled mbuf, or NULL */
1266		DPFPRINTF(("reass frag %d @ %d-%d\n", h->ip_id, fragoff, max));
1267		*m0 = m = pf_reassemble(m0, &frag, frent, mff);
1268
1269		if (m == NULL)
1270			return (PF_DROP);
1271
1272		/* use mtag from concatenated mbuf chain */
1273		pd->pf_mtag = pf_find_mtag(m);
1274#ifdef DIAGNOSTIC
1275		if (pd->pf_mtag == NULL) {
1276			printf("%s: pf_find_mtag returned NULL(1)\n", __func__);
1277			if ((pd->pf_mtag = pf_get_mtag(m)) == NULL) {
1278				m_freem(m);
1279				*m0 = NULL;
1280				goto no_mem;
1281			}
1282		}
1283#endif
1284		if (frag != NULL && (frag->fr_flags & PFFRAG_DROP))
1285			goto drop;
1286
1287		h = mtod(m, struct ip *);
1288	} else {
1289		/* non-buffering fragment cache (drops or masks overlaps) */
1290		int	nomem = 0;
1291
1292#ifdef __FreeBSD__
1293		if (dir == PF_OUT && pd->pf_mtag->flags & PF_TAG_FRAGCACHE) {
1294#else
1295		if (dir == PF_OUT && m->m_pkthdr.pf.flags & PF_TAG_FRAGCACHE) {
1296#endif
1297			/*
1298			 * Already passed the fragment cache in the
1299			 * input direction.  If we continued, it would
1300			 * appear to be a dup and would be dropped.
1301			 */
1302			goto fragment_pass;
1303		}
1304
1305#ifdef __FreeBSD__
1306		frag = pf_find_fragment(h, &V_pf_cache_tree);
1307#else
1308		frag = pf_find_fragment(h, &pf_cache_tree);
1309#endif
1310
1311		/* Check if we saw the last fragment already */
1312		if (frag != NULL && (frag->fr_flags & PFFRAG_SEENLAST) &&
1313		    max > frag->fr_max) {
1314			if (r->rule_flag & PFRULE_FRAGDROP)
1315				frag->fr_flags |= PFFRAG_DROP;
1316			goto bad;
1317		}
1318
1319		*m0 = m = pf_fragcache(m0, h, &frag, mff,
1320		    (r->rule_flag & PFRULE_FRAGDROP) ? 1 : 0, &nomem);
1321		if (m == NULL) {
1322			if (nomem)
1323				goto no_mem;
1324			goto drop;
1325		}
1326
1327		/* use mtag from copied and trimmed mbuf chain */
1328		pd->pf_mtag = pf_find_mtag(m);
1329#ifdef DIAGNOSTIC
1330		if (pd->pf_mtag == NULL) {
1331			printf("%s: pf_find_mtag returned NULL(2)\n", __func__);
1332			if ((pd->pf_mtag = pf_get_mtag(m)) == NULL) {
1333				m_freem(m);
1334				*m0 = NULL;
1335				goto no_mem;
1336			}
1337		}
1338#endif
1339		if (dir == PF_IN)
1340#ifdef __FreeBSD__
1341			pd->pf_mtag->flags |= PF_TAG_FRAGCACHE;
1342#else
1343			m->m_pkthdr.pf.flags |= PF_TAG_FRAGCACHE;
1344#endif
1345
1346		if (frag != NULL && (frag->fr_flags & PFFRAG_DROP))
1347			goto drop;
1348		goto fragment_pass;
1349	}
1350
1351 no_fragment:
1352	/* At this point, only IP_DF is allowed in ip_off */
1353	if (h->ip_off & ~htons(IP_DF)) {
1354		u_int16_t ip_off = h->ip_off;
1355
1356		h->ip_off &= htons(IP_DF);
1357		h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_off, h->ip_off, 0);
1358	}
1359
1360	/* not missing a return here */
1361
1362 fragment_pass:
1363	pf_scrub_ip(&m, r->rule_flag, r->min_ttl, r->set_tos);
1364
1365	if ((r->rule_flag & (PFRULE_FRAGCROP|PFRULE_FRAGDROP)) == 0)
1366		pd->flags |= PFDESC_IP_REAS;
1367	return (PF_PASS);
1368
1369 no_mem:
1370	REASON_SET(reason, PFRES_MEMORY);
1371	if (r != NULL && r->log)
1372		PFLOG_PACKET(kif, h, m, AF_INET, dir, *reason, r, NULL, NULL, pd);
1373	return (PF_DROP);
1374
1375 drop:
1376	REASON_SET(reason, PFRES_NORM);
1377	if (r != NULL && r->log)
1378		PFLOG_PACKET(kif, h, m, AF_INET, dir, *reason, r, NULL, NULL, pd);
1379	return (PF_DROP);
1380
1381 bad:
1382	DPFPRINTF(("dropping bad fragment\n"));
1383
1384	/* Free associated fragments */
1385	if (frag != NULL)
1386		pf_free_fragment(frag);
1387
1388	REASON_SET(reason, PFRES_FRAG);
1389	if (r != NULL && r->log)
1390		PFLOG_PACKET(kif, h, m, AF_INET, dir, *reason, r, NULL, NULL, pd);
1391
1392	return (PF_DROP);
1393}
1394#endif
1395
1396#ifdef INET6
1397int
1398pf_normalize_ip6(struct mbuf **m0, int dir, struct pfi_kif *kif,
1399    u_short *reason, struct pf_pdesc *pd)
1400{
1401	struct mbuf		*m = *m0;
1402	struct pf_rule		*r;
1403	struct ip6_hdr		*h = mtod(m, struct ip6_hdr *);
1404	int			 off;
1405	struct ip6_ext		 ext;
1406	struct ip6_opt		 opt;
1407	struct ip6_opt_jumbo	 jumbo;
1408	struct ip6_frag		 frag;
1409	u_int32_t		 jumbolen = 0, plen;
1410	u_int16_t		 fragoff = 0;
1411	int			 optend;
1412	int			 ooff;
1413	u_int8_t		 proto;
1414	int			 terminal;
1415
1416	r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr);
1417	while (r != NULL) {
1418		r->evaluations++;
1419		if (pfi_kif_match(r->kif, kif) == r->ifnot)
1420			r = r->skip[PF_SKIP_IFP].ptr;
1421		else if (r->direction && r->direction != dir)
1422			r = r->skip[PF_SKIP_DIR].ptr;
1423		else if (r->af && r->af != AF_INET6)
1424			r = r->skip[PF_SKIP_AF].ptr;
1425#if 0 /* header chain! */
1426		else if (r->proto && r->proto != h->ip6_nxt)
1427			r = r->skip[PF_SKIP_PROTO].ptr;
1428#endif
1429		else if (PF_MISMATCHAW(&r->src.addr,
1430		    (struct pf_addr *)&h->ip6_src, AF_INET6,
1431		    r->src.neg, kif, M_GETFIB(m)))
1432			r = r->skip[PF_SKIP_SRC_ADDR].ptr;
1433		else if (PF_MISMATCHAW(&r->dst.addr,
1434		    (struct pf_addr *)&h->ip6_dst, AF_INET6,
1435		    r->dst.neg, NULL, M_GETFIB(m)))
1436			r = r->skip[PF_SKIP_DST_ADDR].ptr;
1437		else
1438			break;
1439	}
1440
1441	if (r == NULL || r->action == PF_NOSCRUB)
1442		return (PF_PASS);
1443	else {
1444		r->packets[dir == PF_OUT]++;
1445		r->bytes[dir == PF_OUT] += pd->tot_len;
1446	}
1447
1448	/* Check for illegal packets */
1449	if (sizeof(struct ip6_hdr) + IPV6_MAXPACKET < m->m_pkthdr.len)
1450		goto drop;
1451
1452	off = sizeof(struct ip6_hdr);
1453	proto = h->ip6_nxt;
1454	terminal = 0;
1455	do {
1456		switch (proto) {
1457		case IPPROTO_FRAGMENT:
1458			goto fragment;
1459			break;
1460		case IPPROTO_AH:
1461		case IPPROTO_ROUTING:
1462		case IPPROTO_DSTOPTS:
1463			if (!pf_pull_hdr(m, off, &ext, sizeof(ext), NULL,
1464			    NULL, AF_INET6))
1465				goto shortpkt;
1466			if (proto == IPPROTO_AH)
1467				off += (ext.ip6e_len + 2) * 4;
1468			else
1469				off += (ext.ip6e_len + 1) * 8;
1470			proto = ext.ip6e_nxt;
1471			break;
1472		case IPPROTO_HOPOPTS:
1473			if (!pf_pull_hdr(m, off, &ext, sizeof(ext), NULL,
1474			    NULL, AF_INET6))
1475				goto shortpkt;
1476			optend = off + (ext.ip6e_len + 1) * 8;
1477			ooff = off + sizeof(ext);
1478			do {
1479				if (!pf_pull_hdr(m, ooff, &opt.ip6o_type,
1480				    sizeof(opt.ip6o_type), NULL, NULL,
1481				    AF_INET6))
1482					goto shortpkt;
1483				if (opt.ip6o_type == IP6OPT_PAD1) {
1484					ooff++;
1485					continue;
1486				}
1487				if (!pf_pull_hdr(m, ooff, &opt, sizeof(opt),
1488				    NULL, NULL, AF_INET6))
1489					goto shortpkt;
1490				if (ooff + sizeof(opt) + opt.ip6o_len > optend)
1491					goto drop;
1492				switch (opt.ip6o_type) {
1493				case IP6OPT_JUMBO:
1494					if (h->ip6_plen != 0)
1495						goto drop;
1496					if (!pf_pull_hdr(m, ooff, &jumbo,
1497					    sizeof(jumbo), NULL, NULL,
1498					    AF_INET6))
1499						goto shortpkt;
1500					memcpy(&jumbolen, jumbo.ip6oj_jumbo_len,
1501					    sizeof(jumbolen));
1502					jumbolen = ntohl(jumbolen);
1503					if (jumbolen <= IPV6_MAXPACKET)
1504						goto drop;
1505					if (sizeof(struct ip6_hdr) + jumbolen !=
1506					    m->m_pkthdr.len)
1507						goto drop;
1508					break;
1509				default:
1510					break;
1511				}
1512				ooff += sizeof(opt) + opt.ip6o_len;
1513			} while (ooff < optend);
1514
1515			off = optend;
1516			proto = ext.ip6e_nxt;
1517			break;
1518		default:
1519			terminal = 1;
1520			break;
1521		}
1522	} while (!terminal);
1523
1524	/* jumbo payload option must be present, or plen > 0 */
1525	if (ntohs(h->ip6_plen) == 0)
1526		plen = jumbolen;
1527	else
1528		plen = ntohs(h->ip6_plen);
1529	if (plen == 0)
1530		goto drop;
1531	if (sizeof(struct ip6_hdr) + plen > m->m_pkthdr.len)
1532		goto shortpkt;
1533
1534	pf_scrub_ip6(&m, r->min_ttl);
1535
1536	return (PF_PASS);
1537
1538 fragment:
1539	if (ntohs(h->ip6_plen) == 0 || jumbolen)
1540		goto drop;
1541	plen = ntohs(h->ip6_plen);
1542
1543	if (!pf_pull_hdr(m, off, &frag, sizeof(frag), NULL, NULL, AF_INET6))
1544		goto shortpkt;
1545	fragoff = ntohs(frag.ip6f_offlg & IP6F_OFF_MASK);
1546	if (fragoff + (plen - off - sizeof(frag)) > IPV6_MAXPACKET)
1547		goto badfrag;
1548
1549	/* do something about it */
1550	/* remember to set pd->flags |= PFDESC_IP_REAS */
1551	return (PF_PASS);
1552
1553 shortpkt:
1554	REASON_SET(reason, PFRES_SHORT);
1555	if (r != NULL && r->log)
1556		PFLOG_PACKET(kif, h, m, AF_INET6, dir, *reason, r, NULL, NULL, pd);
1557	return (PF_DROP);
1558
1559 drop:
1560	REASON_SET(reason, PFRES_NORM);
1561	if (r != NULL && r->log)
1562		PFLOG_PACKET(kif, h, m, AF_INET6, dir, *reason, r, NULL, NULL, pd);
1563	return (PF_DROP);
1564
1565 badfrag:
1566	REASON_SET(reason, PFRES_FRAG);
1567	if (r != NULL && r->log)
1568		PFLOG_PACKET(kif, h, m, AF_INET6, dir, *reason, r, NULL, NULL, pd);
1569	return (PF_DROP);
1570}
1571#endif /* INET6 */
1572
1573int
1574pf_normalize_tcp(int dir, struct pfi_kif *kif, struct mbuf *m, int ipoff,
1575    int off, void *h, struct pf_pdesc *pd)
1576{
1577	struct pf_rule	*r, *rm = NULL;
1578	struct tcphdr	*th = pd->hdr.tcp;
1579	int		 rewrite = 0;
1580	u_short		 reason;
1581	u_int8_t	 flags;
1582	sa_family_t	 af = pd->af;
1583
1584	r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr);
1585	while (r != NULL) {
1586		r->evaluations++;
1587		if (pfi_kif_match(r->kif, kif) == r->ifnot)
1588			r = r->skip[PF_SKIP_IFP].ptr;
1589		else if (r->direction && r->direction != dir)
1590			r = r->skip[PF_SKIP_DIR].ptr;
1591		else if (r->af && r->af != af)
1592			r = r->skip[PF_SKIP_AF].ptr;
1593		else if (r->proto && r->proto != pd->proto)
1594			r = r->skip[PF_SKIP_PROTO].ptr;
1595		else if (PF_MISMATCHAW(&r->src.addr, pd->src, af,
1596		    r->src.neg, kif, M_GETFIB(m)))
1597			r = r->skip[PF_SKIP_SRC_ADDR].ptr;
1598		else if (r->src.port_op && !pf_match_port(r->src.port_op,
1599			    r->src.port[0], r->src.port[1], th->th_sport))
1600			r = r->skip[PF_SKIP_SRC_PORT].ptr;
1601		else if (PF_MISMATCHAW(&r->dst.addr, pd->dst, af,
1602		    r->dst.neg, NULL, M_GETFIB(m)))
1603			r = r->skip[PF_SKIP_DST_ADDR].ptr;
1604		else if (r->dst.port_op && !pf_match_port(r->dst.port_op,
1605			    r->dst.port[0], r->dst.port[1], th->th_dport))
1606			r = r->skip[PF_SKIP_DST_PORT].ptr;
1607		else if (r->os_fingerprint != PF_OSFP_ANY && !pf_osfp_match(
1608			    pf_osfp_fingerprint(pd, m, off, th),
1609			    r->os_fingerprint))
1610			r = TAILQ_NEXT(r, entries);
1611		else {
1612			rm = r;
1613			break;
1614		}
1615	}
1616
1617	if (rm == NULL || rm->action == PF_NOSCRUB)
1618		return (PF_PASS);
1619	else {
1620		r->packets[dir == PF_OUT]++;
1621		r->bytes[dir == PF_OUT] += pd->tot_len;
1622	}
1623
1624	if (rm->rule_flag & PFRULE_REASSEMBLE_TCP)
1625		pd->flags |= PFDESC_TCP_NORM;
1626
1627	flags = th->th_flags;
1628	if (flags & TH_SYN) {
1629		/* Illegal packet */
1630		if (flags & TH_RST)
1631			goto tcp_drop;
1632
1633		if (flags & TH_FIN)
1634			flags &= ~TH_FIN;
1635	} else {
1636		/* Illegal packet */
1637		if (!(flags & (TH_ACK|TH_RST)))
1638			goto tcp_drop;
1639	}
1640
1641	if (!(flags & TH_ACK)) {
1642		/* These flags are only valid if ACK is set */
1643		if ((flags & TH_FIN) || (flags & TH_PUSH) || (flags & TH_URG))
1644			goto tcp_drop;
1645	}
1646
1647	/* Check for illegal header length */
1648	if (th->th_off < (sizeof(struct tcphdr) >> 2))
1649		goto tcp_drop;
1650
1651	/* If flags changed, or reserved data set, then adjust */
1652	if (flags != th->th_flags || th->th_x2 != 0) {
1653		u_int16_t	ov, nv;
1654
1655		ov = *(u_int16_t *)(&th->th_ack + 1);
1656		th->th_flags = flags;
1657		th->th_x2 = 0;
1658		nv = *(u_int16_t *)(&th->th_ack + 1);
1659
1660		th->th_sum = pf_proto_cksum_fixup(m, th->th_sum, ov, nv, 0);
1661		rewrite = 1;
1662	}
1663
1664	/* Remove urgent pointer, if TH_URG is not set */
1665	if (!(flags & TH_URG) && th->th_urp) {
1666		th->th_sum = pf_proto_cksum_fixup(m, th->th_sum, th->th_urp,
1667		    0, 0);
1668		th->th_urp = 0;
1669		rewrite = 1;
1670	}
1671
1672	/* Process options */
1673	if (r->max_mss && pf_normalize_tcpopt(r, m, th, off, pd->af))
1674		rewrite = 1;
1675
1676	/* copy back packet headers if we sanitized */
1677	if (rewrite)
1678#ifdef __FreeBSD__
1679		m_copyback(m, off, sizeof(*th), (caddr_t)th);
1680#else
1681		m_copyback(m, off, sizeof(*th), th);
1682#endif
1683
1684	return (PF_PASS);
1685
1686 tcp_drop:
1687	REASON_SET(&reason, PFRES_NORM);
1688	if (rm != NULL && r->log)
1689		PFLOG_PACKET(kif, h, m, AF_INET, dir, reason, r, NULL, NULL, pd);
1690	return (PF_DROP);
1691}
1692
1693int
1694pf_normalize_tcp_init(struct mbuf *m, int off, struct pf_pdesc *pd,
1695    struct tcphdr *th, struct pf_state_peer *src, struct pf_state_peer *dst)
1696{
1697	u_int32_t tsval, tsecr;
1698	u_int8_t hdr[60];
1699	u_int8_t *opt;
1700
1701#ifdef __FreeBSD__
1702	KASSERT((src->scrub == NULL),
1703	    ("pf_normalize_tcp_init: src->scrub != NULL"));
1704
1705	src->scrub = pool_get(&V_pf_state_scrub_pl, PR_NOWAIT);
1706#else
1707	KASSERT(src->scrub == NULL);
1708
1709	src->scrub = pool_get(&pf_state_scrub_pl, PR_NOWAIT);
1710#endif
1711	if (src->scrub == NULL)
1712		return (1);
1713	bzero(src->scrub, sizeof(*src->scrub));
1714
1715	switch (pd->af) {
1716#ifdef INET
1717	case AF_INET: {
1718		struct ip *h = mtod(m, struct ip *);
1719		src->scrub->pfss_ttl = h->ip_ttl;
1720		break;
1721	}
1722#endif /* INET */
1723#ifdef INET6
1724	case AF_INET6: {
1725		struct ip6_hdr *h = mtod(m, struct ip6_hdr *);
1726		src->scrub->pfss_ttl = h->ip6_hlim;
1727		break;
1728	}
1729#endif /* INET6 */
1730	}
1731
1732
1733	/*
1734	 * All normalizations below are only begun if we see the start of
1735	 * the connections.  They must all set an enabled bit in pfss_flags
1736	 */
1737	if ((th->th_flags & TH_SYN) == 0)
1738		return (0);
1739
1740
1741	if (th->th_off > (sizeof(struct tcphdr) >> 2) && src->scrub &&
1742	    pf_pull_hdr(m, off, hdr, th->th_off << 2, NULL, NULL, pd->af)) {
1743		/* Diddle with TCP options */
1744		int hlen;
1745		opt = hdr + sizeof(struct tcphdr);
1746		hlen = (th->th_off << 2) - sizeof(struct tcphdr);
1747		while (hlen >= TCPOLEN_TIMESTAMP) {
1748			switch (*opt) {
1749			case TCPOPT_EOL:	/* FALLTHROUGH */
1750			case TCPOPT_NOP:
1751				opt++;
1752				hlen--;
1753				break;
1754			case TCPOPT_TIMESTAMP:
1755				if (opt[1] >= TCPOLEN_TIMESTAMP) {
1756					src->scrub->pfss_flags |=
1757					    PFSS_TIMESTAMP;
1758					src->scrub->pfss_ts_mod =
1759					    htonl(arc4random());
1760
1761					/* note PFSS_PAWS not set yet */
1762					memcpy(&tsval, &opt[2],
1763					    sizeof(u_int32_t));
1764					memcpy(&tsecr, &opt[6],
1765					    sizeof(u_int32_t));
1766					src->scrub->pfss_tsval0 = ntohl(tsval);
1767					src->scrub->pfss_tsval = ntohl(tsval);
1768					src->scrub->pfss_tsecr = ntohl(tsecr);
1769					getmicrouptime(&src->scrub->pfss_last);
1770				}
1771				/* FALLTHROUGH */
1772			default:
1773				hlen -= MAX(opt[1], 2);
1774				opt += MAX(opt[1], 2);
1775				break;
1776			}
1777		}
1778	}
1779
1780	return (0);
1781}
1782
1783void
1784pf_normalize_tcp_cleanup(struct pf_state *state)
1785{
1786#ifdef __FreeBSD__
1787	if (state->src.scrub)
1788		pool_put(&V_pf_state_scrub_pl, state->src.scrub);
1789	if (state->dst.scrub)
1790		pool_put(&V_pf_state_scrub_pl, state->dst.scrub);
1791#else
1792	if (state->src.scrub)
1793		pool_put(&pf_state_scrub_pl, state->src.scrub);
1794	if (state->dst.scrub)
1795		pool_put(&pf_state_scrub_pl, state->dst.scrub);
1796#endif
1797
1798	/* Someday... flush the TCP segment reassembly descriptors. */
1799}
1800
1801int
1802pf_normalize_tcp_stateful(struct mbuf *m, int off, struct pf_pdesc *pd,
1803    u_short *reason, struct tcphdr *th, struct pf_state *state,
1804    struct pf_state_peer *src, struct pf_state_peer *dst, int *writeback)
1805{
1806	struct timeval uptime;
1807	u_int32_t tsval, tsecr;
1808	u_int tsval_from_last;
1809	u_int8_t hdr[60];
1810	u_int8_t *opt;
1811	int copyback = 0;
1812	int got_ts = 0;
1813
1814#ifdef __FreeBSD__
1815	KASSERT((src->scrub || dst->scrub),
1816	    ("pf_normalize_tcp_statefull: src->scrub && dst->scrub!"));
1817#else
1818	KASSERT(src->scrub || dst->scrub);
1819#endif
1820
1821	/*
1822	 * Enforce the minimum TTL seen for this connection.  Negate a common
1823	 * technique to evade an intrusion detection system and confuse
1824	 * firewall state code.
1825	 */
1826	switch (pd->af) {
1827#ifdef INET
1828	case AF_INET: {
1829		if (src->scrub) {
1830			struct ip *h = mtod(m, struct ip *);
1831			if (h->ip_ttl > src->scrub->pfss_ttl)
1832				src->scrub->pfss_ttl = h->ip_ttl;
1833			h->ip_ttl = src->scrub->pfss_ttl;
1834		}
1835		break;
1836	}
1837#endif /* INET */
1838#ifdef INET6
1839	case AF_INET6: {
1840		if (src->scrub) {
1841			struct ip6_hdr *h = mtod(m, struct ip6_hdr *);
1842			if (h->ip6_hlim > src->scrub->pfss_ttl)
1843				src->scrub->pfss_ttl = h->ip6_hlim;
1844			h->ip6_hlim = src->scrub->pfss_ttl;
1845		}
1846		break;
1847	}
1848#endif /* INET6 */
1849	}
1850
1851	if (th->th_off > (sizeof(struct tcphdr) >> 2) &&
1852	    ((src->scrub && (src->scrub->pfss_flags & PFSS_TIMESTAMP)) ||
1853	    (dst->scrub && (dst->scrub->pfss_flags & PFSS_TIMESTAMP))) &&
1854	    pf_pull_hdr(m, off, hdr, th->th_off << 2, NULL, NULL, pd->af)) {
1855		/* Diddle with TCP options */
1856		int hlen;
1857		opt = hdr + sizeof(struct tcphdr);
1858		hlen = (th->th_off << 2) - sizeof(struct tcphdr);
1859		while (hlen >= TCPOLEN_TIMESTAMP) {
1860			switch (*opt) {
1861			case TCPOPT_EOL:	/* FALLTHROUGH */
1862			case TCPOPT_NOP:
1863				opt++;
1864				hlen--;
1865				break;
1866			case TCPOPT_TIMESTAMP:
1867				/* Modulate the timestamps.  Can be used for
1868				 * NAT detection, OS uptime determination or
1869				 * reboot detection.
1870				 */
1871
1872				if (got_ts) {
1873					/* Huh?  Multiple timestamps!? */
1874#ifdef __FreeBSD__
1875					if (V_pf_status.debug >= PF_DEBUG_MISC) {
1876#else
1877					if (pf_status.debug >= PF_DEBUG_MISC) {
1878#endif
1879						DPFPRINTF(("multiple TS??"));
1880						pf_print_state(state);
1881						printf("\n");
1882					}
1883					REASON_SET(reason, PFRES_TS);
1884					return (PF_DROP);
1885				}
1886				if (opt[1] >= TCPOLEN_TIMESTAMP) {
1887					memcpy(&tsval, &opt[2],
1888					    sizeof(u_int32_t));
1889					if (tsval && src->scrub &&
1890					    (src->scrub->pfss_flags &
1891					    PFSS_TIMESTAMP)) {
1892						tsval = ntohl(tsval);
1893						pf_change_proto_a(m, &opt[2],
1894						    &th->th_sum,
1895						    htonl(tsval +
1896						    src->scrub->pfss_ts_mod),
1897						    0);
1898						copyback = 1;
1899					}
1900
1901					/* Modulate TS reply iff valid (!0) */
1902					memcpy(&tsecr, &opt[6],
1903					    sizeof(u_int32_t));
1904					if (tsecr && dst->scrub &&
1905					    (dst->scrub->pfss_flags &
1906					    PFSS_TIMESTAMP)) {
1907						tsecr = ntohl(tsecr)
1908						    - dst->scrub->pfss_ts_mod;
1909						pf_change_proto_a(m, &opt[6],
1910						    &th->th_sum, htonl(tsecr),
1911						    0);
1912						copyback = 1;
1913					}
1914					got_ts = 1;
1915				}
1916				/* FALLTHROUGH */
1917			default:
1918				hlen -= MAX(opt[1], 2);
1919				opt += MAX(opt[1], 2);
1920				break;
1921			}
1922		}
1923		if (copyback) {
1924			/* Copyback the options, caller copys back header */
1925			*writeback = 1;
1926			m_copyback(m, off + sizeof(struct tcphdr),
1927			    (th->th_off << 2) - sizeof(struct tcphdr), hdr +
1928			    sizeof(struct tcphdr));
1929		}
1930	}
1931
1932
1933	/*
1934	 * Must invalidate PAWS checks on connections idle for too long.
1935	 * The fastest allowed timestamp clock is 1ms.  That turns out to
1936	 * be about 24 days before it wraps.  XXX Right now our lowerbound
1937	 * TS echo check only works for the first 12 days of a connection
1938	 * when the TS has exhausted half its 32bit space
1939	 */
1940#define TS_MAX_IDLE	(24*24*60*60)
1941#define TS_MAX_CONN	(12*24*60*60)	/* XXX remove when better tsecr check */
1942
1943	getmicrouptime(&uptime);
1944	if (src->scrub && (src->scrub->pfss_flags & PFSS_PAWS) &&
1945	    (uptime.tv_sec - src->scrub->pfss_last.tv_sec > TS_MAX_IDLE ||
1946	    time_second - state->creation > TS_MAX_CONN))  {
1947#ifdef __FreeBSD__
1948		if (V_pf_status.debug >= PF_DEBUG_MISC) {
1949#else
1950		if (pf_status.debug >= PF_DEBUG_MISC) {
1951#endif
1952			DPFPRINTF(("src idled out of PAWS\n"));
1953			pf_print_state(state);
1954			printf("\n");
1955		}
1956		src->scrub->pfss_flags = (src->scrub->pfss_flags & ~PFSS_PAWS)
1957		    | PFSS_PAWS_IDLED;
1958	}
1959	if (dst->scrub && (dst->scrub->pfss_flags & PFSS_PAWS) &&
1960	    uptime.tv_sec - dst->scrub->pfss_last.tv_sec > TS_MAX_IDLE) {
1961#ifdef __FreeBSD__
1962		if (V_pf_status.debug >= PF_DEBUG_MISC) {
1963#else
1964		if (pf_status.debug >= PF_DEBUG_MISC) {
1965#endif
1966			DPFPRINTF(("dst idled out of PAWS\n"));
1967			pf_print_state(state);
1968			printf("\n");
1969		}
1970		dst->scrub->pfss_flags = (dst->scrub->pfss_flags & ~PFSS_PAWS)
1971		    | PFSS_PAWS_IDLED;
1972	}
1973
1974	if (got_ts && src->scrub && dst->scrub &&
1975	    (src->scrub->pfss_flags & PFSS_PAWS) &&
1976	    (dst->scrub->pfss_flags & PFSS_PAWS)) {
1977		/* Validate that the timestamps are "in-window".
1978		 * RFC1323 describes TCP Timestamp options that allow
1979		 * measurement of RTT (round trip time) and PAWS
1980		 * (protection against wrapped sequence numbers).  PAWS
1981		 * gives us a set of rules for rejecting packets on
1982		 * long fat pipes (packets that were somehow delayed
1983		 * in transit longer than the time it took to send the
1984		 * full TCP sequence space of 4Gb).  We can use these
1985		 * rules and infer a few others that will let us treat
1986		 * the 32bit timestamp and the 32bit echoed timestamp
1987		 * as sequence numbers to prevent a blind attacker from
1988		 * inserting packets into a connection.
1989		 *
1990		 * RFC1323 tells us:
1991		 *  - The timestamp on this packet must be greater than
1992		 *    or equal to the last value echoed by the other
1993		 *    endpoint.  The RFC says those will be discarded
1994		 *    since it is a dup that has already been acked.
1995		 *    This gives us a lowerbound on the timestamp.
1996		 *        timestamp >= other last echoed timestamp
1997		 *  - The timestamp will be less than or equal to
1998		 *    the last timestamp plus the time between the
1999		 *    last packet and now.  The RFC defines the max
2000		 *    clock rate as 1ms.  We will allow clocks to be
2001		 *    up to 10% fast and will allow a total difference
2002		 *    or 30 seconds due to a route change.  And this
2003		 *    gives us an upperbound on the timestamp.
2004		 *        timestamp <= last timestamp + max ticks
2005		 *    We have to be careful here.  Windows will send an
2006		 *    initial timestamp of zero and then initialize it
2007		 *    to a random value after the 3whs; presumably to
2008		 *    avoid a DoS by having to call an expensive RNG
2009		 *    during a SYN flood.  Proof MS has at least one
2010		 *    good security geek.
2011		 *
2012		 *  - The TCP timestamp option must also echo the other
2013		 *    endpoints timestamp.  The timestamp echoed is the
2014		 *    one carried on the earliest unacknowledged segment
2015		 *    on the left edge of the sequence window.  The RFC
2016		 *    states that the host will reject any echoed
2017		 *    timestamps that were larger than any ever sent.
2018		 *    This gives us an upperbound on the TS echo.
2019		 *        tescr <= largest_tsval
2020		 *  - The lowerbound on the TS echo is a little more
2021		 *    tricky to determine.  The other endpoint's echoed
2022		 *    values will not decrease.  But there may be
2023		 *    network conditions that re-order packets and
2024		 *    cause our view of them to decrease.  For now the
2025		 *    only lowerbound we can safely determine is that
2026		 *    the TS echo will never be less than the original
2027		 *    TS.  XXX There is probably a better lowerbound.
2028		 *    Remove TS_MAX_CONN with better lowerbound check.
2029		 *        tescr >= other original TS
2030		 *
2031		 * It is also important to note that the fastest
2032		 * timestamp clock of 1ms will wrap its 32bit space in
2033		 * 24 days.  So we just disable TS checking after 24
2034		 * days of idle time.  We actually must use a 12d
2035		 * connection limit until we can come up with a better
2036		 * lowerbound to the TS echo check.
2037		 */
2038		struct timeval delta_ts;
2039		int ts_fudge;
2040
2041
2042		/*
2043		 * PFTM_TS_DIFF is how many seconds of leeway to allow
2044		 * a host's timestamp.  This can happen if the previous
2045		 * packet got delayed in transit for much longer than
2046		 * this packet.
2047		 */
2048		if ((ts_fudge = state->rule.ptr->timeout[PFTM_TS_DIFF]) == 0)
2049#ifdef __FreeBSD__
2050			ts_fudge = V_pf_default_rule.timeout[PFTM_TS_DIFF];
2051#else
2052			ts_fudge = pf_default_rule.timeout[PFTM_TS_DIFF];
2053#endif
2054
2055
2056		/* Calculate max ticks since the last timestamp */
2057#define TS_MAXFREQ	1100		/* RFC max TS freq of 1Khz + 10% skew */
2058#define TS_MICROSECS	1000000		/* microseconds per second */
2059#ifdef __FreeBSD__
2060#ifndef timersub
2061#define	timersub(tvp, uvp, vvp)						\
2062	do {								\
2063		(vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec;		\
2064		(vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec;	\
2065		if ((vvp)->tv_usec < 0) {				\
2066			(vvp)->tv_sec--;				\
2067			(vvp)->tv_usec += 1000000;			\
2068		}							\
2069	} while (0)
2070#endif
2071#endif
2072		timersub(&uptime, &src->scrub->pfss_last, &delta_ts);
2073		tsval_from_last = (delta_ts.tv_sec + ts_fudge) * TS_MAXFREQ;
2074		tsval_from_last += delta_ts.tv_usec / (TS_MICROSECS/TS_MAXFREQ);
2075
2076
2077		if ((src->state >= TCPS_ESTABLISHED &&
2078		    dst->state >= TCPS_ESTABLISHED) &&
2079		    (SEQ_LT(tsval, dst->scrub->pfss_tsecr) ||
2080		    SEQ_GT(tsval, src->scrub->pfss_tsval + tsval_from_last) ||
2081		    (tsecr && (SEQ_GT(tsecr, dst->scrub->pfss_tsval) ||
2082		    SEQ_LT(tsecr, dst->scrub->pfss_tsval0))))) {
2083			/* Bad RFC1323 implementation or an insertion attack.
2084			 *
2085			 * - Solaris 2.6 and 2.7 are known to send another ACK
2086			 *   after the FIN,FIN|ACK,ACK closing that carries
2087			 *   an old timestamp.
2088			 */
2089
2090			DPFPRINTF(("Timestamp failed %c%c%c%c\n",
2091			    SEQ_LT(tsval, dst->scrub->pfss_tsecr) ? '0' : ' ',
2092			    SEQ_GT(tsval, src->scrub->pfss_tsval +
2093			    tsval_from_last) ? '1' : ' ',
2094			    SEQ_GT(tsecr, dst->scrub->pfss_tsval) ? '2' : ' ',
2095			    SEQ_LT(tsecr, dst->scrub->pfss_tsval0)? '3' : ' '));
2096#ifdef __FreeBSD__
2097			DPFPRINTF((" tsval: %u  tsecr: %u  +ticks: %u  "
2098			    "idle: %jus %lums\n",
2099			    tsval, tsecr, tsval_from_last,
2100			    (uintmax_t)delta_ts.tv_sec,
2101			    delta_ts.tv_usec / 1000));
2102			DPFPRINTF((" src->tsval: %u  tsecr: %u\n",
2103			    src->scrub->pfss_tsval, src->scrub->pfss_tsecr));
2104			DPFPRINTF((" dst->tsval: %u  tsecr: %u  tsval0: %u"
2105			    "\n", dst->scrub->pfss_tsval,
2106			    dst->scrub->pfss_tsecr, dst->scrub->pfss_tsval0));
2107#else
2108			DPFPRINTF((" tsval: %lu  tsecr: %lu  +ticks: %lu  "
2109			    "idle: %lus %lums\n",
2110			    tsval, tsecr, tsval_from_last, delta_ts.tv_sec,
2111			    delta_ts.tv_usec / 1000));
2112			DPFPRINTF((" src->tsval: %lu  tsecr: %lu\n",
2113			    src->scrub->pfss_tsval, src->scrub->pfss_tsecr));
2114			DPFPRINTF((" dst->tsval: %lu  tsecr: %lu  tsval0: %lu"
2115			    "\n", dst->scrub->pfss_tsval,
2116			    dst->scrub->pfss_tsecr, dst->scrub->pfss_tsval0));
2117#endif
2118#ifdef __FreeBSD__
2119			if (V_pf_status.debug >= PF_DEBUG_MISC) {
2120#else
2121			if (pf_status.debug >= PF_DEBUG_MISC) {
2122#endif
2123				pf_print_state(state);
2124				pf_print_flags(th->th_flags);
2125				printf("\n");
2126			}
2127			REASON_SET(reason, PFRES_TS);
2128			return (PF_DROP);
2129		}
2130
2131		/* XXX I'd really like to require tsecr but it's optional */
2132
2133	} else if (!got_ts && (th->th_flags & TH_RST) == 0 &&
2134	    ((src->state == TCPS_ESTABLISHED && dst->state == TCPS_ESTABLISHED)
2135	    || pd->p_len > 0 || (th->th_flags & TH_SYN)) &&
2136	    src->scrub && dst->scrub &&
2137	    (src->scrub->pfss_flags & PFSS_PAWS) &&
2138	    (dst->scrub->pfss_flags & PFSS_PAWS)) {
2139		/* Didn't send a timestamp.  Timestamps aren't really useful
2140		 * when:
2141		 *  - connection opening or closing (often not even sent).
2142		 *    but we must not let an attacker to put a FIN on a
2143		 *    data packet to sneak it through our ESTABLISHED check.
2144		 *  - on a TCP reset.  RFC suggests not even looking at TS.
2145		 *  - on an empty ACK.  The TS will not be echoed so it will
2146		 *    probably not help keep the RTT calculation in sync and
2147		 *    there isn't as much danger when the sequence numbers
2148		 *    got wrapped.  So some stacks don't include TS on empty
2149		 *    ACKs :-(
2150		 *
2151		 * To minimize the disruption to mostly RFC1323 conformant
2152		 * stacks, we will only require timestamps on data packets.
2153		 *
2154		 * And what do ya know, we cannot require timestamps on data
2155		 * packets.  There appear to be devices that do legitimate
2156		 * TCP connection hijacking.  There are HTTP devices that allow
2157		 * a 3whs (with timestamps) and then buffer the HTTP request.
2158		 * If the intermediate device has the HTTP response cache, it
2159		 * will spoof the response but not bother timestamping its
2160		 * packets.  So we can look for the presence of a timestamp in
2161		 * the first data packet and if there, require it in all future
2162		 * packets.
2163		 */
2164
2165		if (pd->p_len > 0 && (src->scrub->pfss_flags & PFSS_DATA_TS)) {
2166			/*
2167			 * Hey!  Someone tried to sneak a packet in.  Or the
2168			 * stack changed its RFC1323 behavior?!?!
2169			 */
2170#ifdef __FreeBSD__
2171			if (V_pf_status.debug >= PF_DEBUG_MISC) {
2172#else
2173			if (pf_status.debug >= PF_DEBUG_MISC) {
2174#endif
2175				DPFPRINTF(("Did not receive expected RFC1323 "
2176				    "timestamp\n"));
2177				pf_print_state(state);
2178				pf_print_flags(th->th_flags);
2179				printf("\n");
2180			}
2181			REASON_SET(reason, PFRES_TS);
2182			return (PF_DROP);
2183		}
2184	}
2185
2186
2187	/*
2188	 * We will note if a host sends his data packets with or without
2189	 * timestamps.  And require all data packets to contain a timestamp
2190	 * if the first does.  PAWS implicitly requires that all data packets be
2191	 * timestamped.  But I think there are middle-man devices that hijack
2192	 * TCP streams immediately after the 3whs and don't timestamp their
2193	 * packets (seen in a WWW accelerator or cache).
2194	 */
2195	if (pd->p_len > 0 && src->scrub && (src->scrub->pfss_flags &
2196	    (PFSS_TIMESTAMP|PFSS_DATA_TS|PFSS_DATA_NOTS)) == PFSS_TIMESTAMP) {
2197		if (got_ts)
2198			src->scrub->pfss_flags |= PFSS_DATA_TS;
2199		else {
2200			src->scrub->pfss_flags |= PFSS_DATA_NOTS;
2201#ifdef __FreeBSD__
2202			if (V_pf_status.debug >= PF_DEBUG_MISC && dst->scrub &&
2203#else
2204			if (pf_status.debug >= PF_DEBUG_MISC && dst->scrub &&
2205#endif
2206			    (dst->scrub->pfss_flags & PFSS_TIMESTAMP)) {
2207				/* Don't warn if other host rejected RFC1323 */
2208				DPFPRINTF(("Broken RFC1323 stack did not "
2209				    "timestamp data packet. Disabled PAWS "
2210				    "security.\n"));
2211				pf_print_state(state);
2212				pf_print_flags(th->th_flags);
2213				printf("\n");
2214			}
2215		}
2216	}
2217
2218
2219	/*
2220	 * Update PAWS values
2221	 */
2222	if (got_ts && src->scrub && PFSS_TIMESTAMP == (src->scrub->pfss_flags &
2223	    (PFSS_PAWS_IDLED|PFSS_TIMESTAMP))) {
2224		getmicrouptime(&src->scrub->pfss_last);
2225		if (SEQ_GEQ(tsval, src->scrub->pfss_tsval) ||
2226		    (src->scrub->pfss_flags & PFSS_PAWS) == 0)
2227			src->scrub->pfss_tsval = tsval;
2228
2229		if (tsecr) {
2230			if (SEQ_GEQ(tsecr, src->scrub->pfss_tsecr) ||
2231			    (src->scrub->pfss_flags & PFSS_PAWS) == 0)
2232				src->scrub->pfss_tsecr = tsecr;
2233
2234			if ((src->scrub->pfss_flags & PFSS_PAWS) == 0 &&
2235			    (SEQ_LT(tsval, src->scrub->pfss_tsval0) ||
2236			    src->scrub->pfss_tsval0 == 0)) {
2237				/* tsval0 MUST be the lowest timestamp */
2238				src->scrub->pfss_tsval0 = tsval;
2239			}
2240
2241			/* Only fully initialized after a TS gets echoed */
2242			if ((src->scrub->pfss_flags & PFSS_PAWS) == 0)
2243				src->scrub->pfss_flags |= PFSS_PAWS;
2244		}
2245	}
2246
2247	/* I have a dream....  TCP segment reassembly.... */
2248	return (0);
2249}
2250
2251int
2252pf_normalize_tcpopt(struct pf_rule *r, struct mbuf *m, struct tcphdr *th,
2253    int off, sa_family_t af)
2254{
2255	u_int16_t	*mss;
2256	int		 thoff;
2257	int		 opt, cnt, optlen = 0;
2258	int		 rewrite = 0;
2259#ifdef __FreeBSD__
2260	u_char		 opts[TCP_MAXOLEN];
2261#else
2262	u_char		 opts[MAX_TCPOPTLEN];
2263#endif
2264	u_char		*optp = opts;
2265
2266	thoff = th->th_off << 2;
2267	cnt = thoff - sizeof(struct tcphdr);
2268
2269	if (cnt > 0 && !pf_pull_hdr(m, off + sizeof(*th), opts, cnt,
2270	    NULL, NULL, af))
2271		return (rewrite);
2272
2273	for (; cnt > 0; cnt -= optlen, optp += optlen) {
2274		opt = optp[0];
2275		if (opt == TCPOPT_EOL)
2276			break;
2277		if (opt == TCPOPT_NOP)
2278			optlen = 1;
2279		else {
2280			if (cnt < 2)
2281				break;
2282			optlen = optp[1];
2283			if (optlen < 2 || optlen > cnt)
2284				break;
2285		}
2286		switch (opt) {
2287		case TCPOPT_MAXSEG:
2288			mss = (u_int16_t *)(optp + 2);
2289			if ((ntohs(*mss)) > r->max_mss) {
2290				th->th_sum = pf_proto_cksum_fixup(m,
2291				    th->th_sum, *mss, htons(r->max_mss), 0);
2292				*mss = htons(r->max_mss);
2293				rewrite = 1;
2294			}
2295			break;
2296		default:
2297			break;
2298		}
2299	}
2300
2301	if (rewrite)
2302		m_copyback(m, off + sizeof(*th), thoff - sizeof(*th), opts);
2303
2304	return (rewrite);
2305}
2306
2307void
2308pf_scrub_ip(struct mbuf **m0, u_int32_t flags, u_int8_t min_ttl, u_int8_t tos)
2309{
2310	struct mbuf		*m = *m0;
2311	struct ip		*h = mtod(m, struct ip *);
2312
2313	/* Clear IP_DF if no-df was requested */
2314	if (flags & PFRULE_NODF && h->ip_off & htons(IP_DF)) {
2315		u_int16_t ip_off = h->ip_off;
2316
2317		h->ip_off &= htons(~IP_DF);
2318		h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_off, h->ip_off, 0);
2319	}
2320
2321	/* Enforce a minimum ttl, may cause endless packet loops */
2322	if (min_ttl && h->ip_ttl < min_ttl) {
2323		u_int16_t ip_ttl = h->ip_ttl;
2324
2325		h->ip_ttl = min_ttl;
2326		h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_ttl, h->ip_ttl, 0);
2327	}
2328
2329	/* Enforce tos */
2330	if (flags & PFRULE_SET_TOS) {
2331		u_int16_t	ov, nv;
2332
2333		ov = *(u_int16_t *)h;
2334		h->ip_tos = tos;
2335		nv = *(u_int16_t *)h;
2336
2337		h->ip_sum = pf_cksum_fixup(h->ip_sum, ov, nv, 0);
2338	}
2339
2340	/* random-id, but not for fragments */
2341	if (flags & PFRULE_RANDOMID && !(h->ip_off & ~htons(IP_DF))) {
2342		u_int16_t ip_id = h->ip_id;
2343
2344		h->ip_id = ip_randomid();
2345		h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_id, h->ip_id, 0);
2346	}
2347}
2348
2349#ifdef INET6
2350void
2351pf_scrub_ip6(struct mbuf **m0, u_int8_t min_ttl)
2352{
2353	struct mbuf		*m = *m0;
2354	struct ip6_hdr		*h = mtod(m, struct ip6_hdr *);
2355
2356	/* Enforce a minimum ttl, may cause endless packet loops */
2357	if (min_ttl && h->ip6_hlim < min_ttl)
2358		h->ip6_hlim = min_ttl;
2359}
2360#endif
2361