ip_frag.c revision 89336
/*
 * Copyright (C) 1993-2001 by Darren Reed.
 *
 * See the IPFILTER.LICENCE file for details on licencing.
 */
6249259Sdim#if defined(KERNEL) && !defined(_KERNEL)
7249259Sdim# define      _KERNEL
8249259Sdim#endif
9249259Sdim
10249259Sdim#include <sys/errno.h>
11249259Sdim#include <sys/types.h>
12249259Sdim#include <sys/param.h>
13249259Sdim#include <sys/time.h>
14249259Sdim#include <sys/file.h>
15263508Sdim#if !defined(_KERNEL) && !defined(KERNEL)
16263508Sdim# include <stdio.h>
17263508Sdim# include <string.h>
18263508Sdim# include <stdlib.h>
19249259Sdim#endif
20249259Sdim#if (defined(KERNEL) || defined(_KERNEL)) && (__FreeBSD_version >= 220000)
21# include <sys/filio.h>
22# include <sys/fcntl.h>
23#else
24# include <sys/ioctl.h>
25#endif
26#include <sys/uio.h>
27#ifndef linux
28# include <sys/protosw.h>
29#endif
30#include <sys/socket.h>
31#if defined(_KERNEL) && !defined(linux)
32# include <sys/systm.h>
33#endif
34#if !defined(__SVR4) && !defined(__svr4__)
35# if defined(_KERNEL) && !defined(__sgi)
36#  include <sys/kernel.h>
37# endif
38# ifndef linux
39#  include <sys/mbuf.h>
40# endif
41#else
42# include <sys/byteorder.h>
43# ifdef _KERNEL
44#  include <sys/dditypes.h>
45# endif
46# include <sys/stream.h>
47# include <sys/kmem.h>
48#endif
49#include <net/if.h>
50#ifdef sun
51# include <net/af.h>
52#endif
53#include <net/route.h>
54#include <netinet/in.h>
55#include <netinet/in_systm.h>
56#include <netinet/ip.h>
57#ifndef linux
58# include <netinet/ip_var.h>
59#endif
60#include <netinet/tcp.h>
61#include <netinet/udp.h>
62#include <netinet/ip_icmp.h>
63#include "netinet/ip_compat.h"
64#include <netinet/tcpip.h>
65#include "netinet/ip_fil.h"
66#include "netinet/ip_proxy.h"
67#include "netinet/ip_nat.h"
68#include "netinet/ip_frag.h"
69#include "netinet/ip_state.h"
70#include "netinet/ip_auth.h"
71#if (__FreeBSD_version >= 300000)
72# include <sys/malloc.h>
73# if (defined(KERNEL) || defined(_KERNEL))
74#  ifndef IPFILTER_LKM
75#   include <sys/libkern.h>
76#   include <sys/systm.h>
77#  endif
78extern struct callout_handle ipfr_slowtimer_ch;
79# endif
80#endif
81#if defined(__NetBSD__) && (__NetBSD_Version__ >= 104230000)
82# include <sys/callout.h>
83extern struct callout ipfr_slowtimer_ch;
84#endif
85#if defined(__OpenBSD__)
86# include <sys/timeout.h>
87extern struct timeout ipfr_slowtimer_ch;
88#endif
89
90#if !defined(lint)
91static const char sccsid[] = "@(#)ip_frag.c	1.11 3/24/96 (C) 1993-2000 Darren Reed";
92static const char rcsid[] = "@(#)$FreeBSD: head/sys/contrib/ipfilter/netinet/ip_frag.c 89336 2002-01-14 09:07:15Z alfred $";
93#endif
94
95
96static ipfr_t	*ipfr_heads[IPFT_SIZE];
97static ipfr_t	*ipfr_nattab[IPFT_SIZE];
98static ipfrstat_t ipfr_stats;
99static int	ipfr_inuse = 0;
100
101int	fr_ipfrttl = 120;	/* 60 seconds */
102int	fr_frag_lock = 0;
103
104#ifdef _KERNEL
105# if SOLARIS2 >= 7
106extern	timeout_id_t	ipfr_timer_id;
107# else
108extern	int	ipfr_timer_id;
109# endif
110#endif
111#if	(SOLARIS || defined(__sgi)) && defined(_KERNEL)
112extern	KRWLOCK_T	ipf_frag, ipf_natfrag, ipf_nat, ipf_mutex;
113# if	SOLARIS
114extern	KRWLOCK_T	ipf_solaris;
115# else
116KRWLOCK_T	ipf_solaris;
117# endif
118extern	kmutex_t	ipf_rw;
119#endif
120
121
122static ipfr_t *ipfr_new __P((ip_t *, fr_info_t *, u_int, ipfr_t **));
123static ipfr_t *ipfr_lookup __P((ip_t *, fr_info_t *, ipfr_t **));
124static void ipfr_delete __P((ipfr_t *));
125
126
127ipfrstat_t *ipfr_fragstats()
128{
129	ipfr_stats.ifs_table = ipfr_heads;
130	ipfr_stats.ifs_nattab = ipfr_nattab;
131	ipfr_stats.ifs_inuse = ipfr_inuse;
132	return &ipfr_stats;
133}
134
135
/*
 * add a new entry to the fragment cache, registering it as having come
 * through this box, with the result of the filter operation.
 *
 * Returns the entry newly inserted at the head of its hash chain in
 * table[], or NULL when the cache is full, the packet is not a fragment,
 * a matching entry already exists, or allocation fails.  The caller must
 * hold the write lock protecting table[].
 */
static ipfr_t *ipfr_new(ip, fin, pass, table)
ip_t *ip;
fr_info_t *fin;
u_int pass;
ipfr_t *table[];
{
	ipfr_t **fp, *fra, frag;
	u_int idx, off;

	/* Hard cap: never hold more entries than there are hash buckets. */
	if (ipfr_inuse >= IPFT_SIZE)
		return NULL;

	/* Only fragmented packets belong in the fragment cache. */
	if (!(fin->fin_fl & FI_FRAG))
		return NULL;

	/*
	 * Fill in the comparison key (protocol, id, TOS, both addresses,
	 * interface) and accumulate the hash bucket index as we go.
	 */
	frag.ipfr_p = ip->ip_p;
	idx = ip->ip_p;
	frag.ipfr_id = ip->ip_id;
	idx += ip->ip_id;
	frag.ipfr_tos = ip->ip_tos;
	frag.ipfr_src.s_addr = ip->ip_src.s_addr;
	idx += ip->ip_src.s_addr;
	frag.ipfr_dst.s_addr = ip->ip_dst.s_addr;
	idx += ip->ip_dst.s_addr;
	frag.ipfr_ifp = fin->fin_ifp;
	idx *= 127;
	idx %= IPFT_SIZE;

	frag.ipfr_optmsk = fin->fin_fi.fi_optmsk & IPF_OPTCOPY;
	frag.ipfr_secmsk = fin->fin_fi.fi_secmsk;
	frag.ipfr_auth = fin->fin_fi.fi_auth;

	/*
	 * first, make sure it isn't already there...
	 */
	for (fp = &table[idx]; (fra = *fp); fp = &fra->ipfr_next)
		if (!bcmp((char *)&frag.ipfr_src, (char *)&fra->ipfr_src,
			  IPFR_CMPSZ)) {
			ATOMIC_INCL(ipfr_stats.ifs_exists);
			return NULL;
		}

	/*
	 * allocate some memory, if possible, if not, just record that we
	 * failed to do so.
	 */
	KMALLOC(fra, ipfr_t *);
	if (fra == NULL) {
		ATOMIC_INCL(ipfr_stats.ifs_nomem);
		return NULL;
	}

	/* Take a reference on the matching filter rule, if there is one. */
	if ((fra->ipfr_rule = fin->fin_fr) != NULL) {
		ATOMIC_INC32(fin->fin_fr->fr_ref);
	}


	/*
	 * Insert the fragment into the fragment table, copy the struct used
	 * in the search using bcopy rather than reassign each field.
	 * Set the ttl to the default and mask out logging from "pass"
	 */
	if ((fra->ipfr_next = table[idx]))
		table[idx]->ipfr_prev = fra;
	fra->ipfr_prev = NULL;
	fra->ipfr_data = NULL;
	table[idx] = fra;
	bcopy((char *)&frag.ipfr_src, (char *)&fra->ipfr_src, IPFR_CMPSZ);
	fra->ipfr_ttl = fr_ipfrttl;
	/*
	 * Compute the offset of the expected start of the next packet.
	 */
	off = ip->ip_off & IP_OFFMASK;
	if (!off)
		fra->ipfr_seen0 = 1;	/* zero-offset (first) fragment seen */
	fra->ipfr_off = off + (fin->fin_dlen >> 3);
	ATOMIC_INCL(ipfr_stats.ifs_new);
	ATOMIC_INC32(ipfr_inuse);
	return fra;
}
220
221
222int ipfr_newfrag(ip, fin, pass)
223ip_t *ip;
224fr_info_t *fin;
225u_int pass;
226{
227	ipfr_t	*ipf;
228
229	if ((ip->ip_v != 4) || (fr_frag_lock))
230		return -1;
231	WRITE_ENTER(&ipf_frag);
232	ipf = ipfr_new(ip, fin, pass, ipfr_heads);
233	RWLOCK_EXIT(&ipf_frag);
234	if (ipf == NULL) {
235		ATOMIC_INCL(frstats[fin->fin_out].fr_bnfr);
236		return -1;
237	}
238	ATOMIC_INCL(frstats[fin->fin_out].fr_nfr);
239	return 0;
240}
241
242
243int ipfr_nat_newfrag(ip, fin, pass, nat)
244ip_t *ip;
245fr_info_t *fin;
246u_int pass;
247nat_t *nat;
248{
249	ipfr_t	*ipf;
250	int off;
251
252	if ((ip->ip_v != 4) || (fr_frag_lock))
253		return -1;
254
255	off = fin->fin_off;
256	off <<= 3;
257	if ((off + fin->fin_dlen) > 0xffff || (fin->fin_dlen == 0))
258		return NULL;
259
260	WRITE_ENTER(&ipf_natfrag);
261	ipf = ipfr_new(ip, fin, pass, ipfr_nattab);
262	if (ipf != NULL) {
263		ipf->ipfr_data = nat;
264		nat->nat_data = ipf;
265	}
266	RWLOCK_EXIT(&ipf_natfrag);
267	return ipf ? 0 : -1;
268}
269
270
/*
 * check the fragment cache to see if there is already a record of this packet
 * with its filter result known.
 *
 * On a hit the entry is promoted to the head of its hash chain and
 * returned; the final in-order fragment drops the entry's ttl to 1 so it
 * expires quickly.  Returns NULL when no usable entry matches.
 *
 * NOTE(review): callers invoke this holding only a READ lock on the
 * relevant table, yet chain links and per-entry fields are rewritten
 * below -- confirm the locking macros actually serialise these writes.
 */
static ipfr_t *ipfr_lookup(ip, fin, table)
ip_t *ip;
fr_info_t *fin;
ipfr_t *table[];
{
	ipfr_t	*f, frag;
	u_int	idx;

	/*
	 * For fragments, we record protocol, packet id, TOS and both IP#'s
	 * (these should all be the same for all fragments of a packet).
	 *
	 * build up a hash value to index the table with.
	 */
	frag.ipfr_p = ip->ip_p;
	idx = ip->ip_p;
	frag.ipfr_id = ip->ip_id;
	idx += ip->ip_id;
	frag.ipfr_tos = ip->ip_tos;
	frag.ipfr_src.s_addr = ip->ip_src.s_addr;
	idx += ip->ip_src.s_addr;
	frag.ipfr_dst.s_addr = ip->ip_dst.s_addr;
	idx += ip->ip_dst.s_addr;
	frag.ipfr_ifp = fin->fin_ifp;
	idx *= 127;
	idx %= IPFT_SIZE;

	frag.ipfr_optmsk = fin->fin_fi.fi_optmsk & IPF_OPTCOPY;
	frag.ipfr_secmsk = fin->fin_fi.fi_secmsk;
	frag.ipfr_auth = fin->fin_fi.fi_auth;

	/*
	 * check the table, careful to only compare the right amount of data
	 */
	for (f = table[idx]; f; f = f->ipfr_next)
		if (!bcmp((char *)&frag.ipfr_src, (char *)&f->ipfr_src,
			  IPFR_CMPSZ)) {
			u_short	atoff, off;

			off = fin->fin_off;

			/*
			 * XXX - We really need to be guarding against the
			 * retransmission of (src,dst,id,offset-range) here
			 * because a fragmented packet is never resent with
			 * the same IP ID#.
			 */
			if (f->ipfr_seen0) {
				/* A second zero-offset (or short) fragment
				 * for this key is suspect - keep looking. */
				if (!off || (fin->fin_fl & FI_SHORT))
					continue;
			} else if (!off)
				f->ipfr_seen0 = 1;

			if (f != table[idx]) {
				/*
				 * move fragment info. to the top of the list
				 * to speed up searches.
				 */
				if ((f->ipfr_prev->ipfr_next = f->ipfr_next))
					f->ipfr_next->ipfr_prev = f->ipfr_prev;
				f->ipfr_next = table[idx];
				table[idx]->ipfr_prev = f;
				f->ipfr_prev = NULL;
				table[idx] = f;
			}
			atoff = off + (fin->fin_dlen >> 3);
			/*
			 * If we've followed the fragments, and this is the
			 * last (in order), shrink expiration time.
			 */
			if (off == f->ipfr_off) {
				if (!(ip->ip_off & IP_MF))
					f->ipfr_ttl = 1;
				else
					f->ipfr_off = atoff;
			}
			ATOMIC_INCL(ipfr_stats.ifs_hits);
			return f;
		}
	return NULL;
}
356
357
358/*
359 * functional interface for NAT lookups of the NAT fragment cache
360 */
361nat_t *ipfr_nat_knownfrag(ip, fin)
362ip_t *ip;
363fr_info_t *fin;
364{
365	ipfr_t *ipf;
366	nat_t *nat;
367	int off;
368
369	if ((fin->fin_v != 4) || (fr_frag_lock))
370		return NULL;
371
372	off = fin->fin_off;
373	off <<= 3;
374	if ((off + fin->fin_dlen) > 0xffff || (fin->fin_dlen == 0))
375		return NULL;
376
377	READ_ENTER(&ipf_natfrag);
378	ipf = ipfr_lookup(ip, fin, ipfr_nattab);
379	if (ipf != NULL) {
380		nat = ipf->ipfr_data;
381		/*
382		 * This is the last fragment for this packet.
383		 */
384		if ((ipf->ipfr_ttl == 1) && (nat != NULL)) {
385			nat->nat_data = NULL;
386			ipf->ipfr_data = NULL;
387		}
388	} else
389		nat = NULL;
390	RWLOCK_EXIT(&ipf_natfrag);
391	return nat;
392}
393
394
395/*
396 * functional interface for normal lookups of the fragment cache
397 */
398frentry_t *ipfr_knownfrag(ip, fin)
399ip_t *ip;
400fr_info_t *fin;
401{
402	frentry_t *fr;
403	ipfr_t *fra;
404	int off;
405
406	if ((fin->fin_v != 4) || (fr_frag_lock))
407		return NULL;
408
409	off = fin->fin_off;
410	off <<= 3;
411	if ((off + fin->fin_dlen) > 0xffff || (fin->fin_dlen == 0))
412		return NULL;
413
414	READ_ENTER(&ipf_frag);
415	fra = ipfr_lookup(ip, fin, ipfr_heads);
416	if (fra != NULL)
417		fr = fra->ipfr_rule;
418	else
419		fr = NULL;
420	RWLOCK_EXIT(&ipf_frag);
421	return fr;
422}
423
424
425/*
426 * forget any references to this external object.
427 */
428void ipfr_forget(nat)
429void *nat;
430{
431	ipfr_t	*fr;
432	int	idx;
433
434	WRITE_ENTER(&ipf_natfrag);
435	for (idx = IPFT_SIZE - 1; idx >= 0; idx--)
436		for (fr = ipfr_heads[idx]; fr; fr = fr->ipfr_next)
437			if (fr->ipfr_data == nat)
438				fr->ipfr_data = NULL;
439
440	RWLOCK_EXIT(&ipf_natfrag);
441}
442
443
/*
 * Unlink a fragment cache entry from its hash chain and free it.
 *
 * Only the prev/next links of the neighbours are repaired here; the
 * caller is responsible for updating the table[] head slot (every caller
 * advances *fp before calling).  Also drops the reference taken on the
 * matching filter rule, freeing the rule when the count reaches zero.
 * Caller must hold the write lock on the owning table.
 */
static void ipfr_delete(fra)
ipfr_t *fra;
{
	frentry_t *fr;

	fr = fra->ipfr_rule;
	if (fr != NULL) {
		ATOMIC_DEC32(fr->fr_ref);
		if (fr->fr_ref == 0)
			KFREE(fr);
	}
	if (fra->ipfr_prev)
		fra->ipfr_prev->ipfr_next = fra->ipfr_next;
	if (fra->ipfr_next)
		fra->ipfr_next->ipfr_prev = fra->ipfr_prev;
	KFREE(fra);
}
461
462
/*
 * Free memory in use by fragment state info. kept.
 *
 * Called at unload time: empties both the filter and the NAT fragment
 * tables, severing any NAT back-references first so nat_t structures are
 * not left pointing at freed entries.  ipf_nat is taken before
 * ipf_natfrag, the same order used by ipfr_fragexpire().
 */
void ipfr_unload()
{
	ipfr_t	**fp, *fra;
	nat_t	*nat;
	int	idx;

	WRITE_ENTER(&ipf_frag);
	for (idx = IPFT_SIZE - 1; idx >= 0; idx--)
		for (fp = &ipfr_heads[idx]; (fra = *fp); ) {
			*fp = fra->ipfr_next;
			ipfr_delete(fra);
		}
	RWLOCK_EXIT(&ipf_frag);

	WRITE_ENTER(&ipf_nat);
	WRITE_ENTER(&ipf_natfrag);
	for (idx = IPFT_SIZE - 1; idx >= 0; idx--)
		for (fp = &ipfr_nattab[idx]; (fra = *fp); ) {
			*fp = fra->ipfr_next;
			nat = fra->ipfr_data;
			if (nat != NULL) {
				/* Clear the NAT back-pointer only if it
				 * still refers to this cache entry. */
				if (nat->nat_data == fra)
					nat->nat_data = NULL;
			}
			ipfr_delete(fra);
		}
	RWLOCK_EXIT(&ipf_natfrag);
	RWLOCK_EXIT(&ipf_nat);
}
495
496
497#ifdef	_KERNEL
/*
 * Timer-driven expiry of both fragment caches: decrement each entry's ttl
 * and delete any entry that reaches zero, clearing NAT back-references
 * before the entry is freed.
 */
void ipfr_fragexpire()
{
	ipfr_t	**fp, *fra;
	nat_t	*nat;
	int	idx;
#if defined(_KERNEL)
# if !SOLARIS
	int	s;
# endif
#endif

	/* Expiry is suspended while the fragment tables are locked out. */
	if (fr_frag_lock)
		return;

	SPL_NET(s);
	WRITE_ENTER(&ipf_frag);

	/*
	 * Go through the entire table, looking for entries to expire,
	 * decreasing the ttl by one for each entry.  If it reaches 0,
	 * remove it from the chain and free it.
	 */
	for (idx = IPFT_SIZE - 1; idx >= 0; idx--)
		for (fp = &ipfr_heads[idx]; (fra = *fp); ) {
			--fra->ipfr_ttl;
			if (fra->ipfr_ttl == 0) {
				*fp = fra->ipfr_next;
				ipfr_delete(fra);
				ATOMIC_INCL(ipfr_stats.ifs_expire);
				ATOMIC_DEC32(ipfr_inuse);
			} else
				fp = &fra->ipfr_next;
		}
	RWLOCK_EXIT(&ipf_frag);

	/*
	 * Same again for the NAT table, except that if the structure also
	 * still points to a NAT structure, and the NAT structure points back
	 * at the one to be free'd, NULL the reference from the NAT struct.
	 * NOTE: We need to grab both mutex's early, and in this order so as
	 * to prevent a deadlock if both try to expire at the same time.
	 */
	WRITE_ENTER(&ipf_nat);
	WRITE_ENTER(&ipf_natfrag);
	for (idx = IPFT_SIZE - 1; idx >= 0; idx--)
		for (fp = &ipfr_nattab[idx]; (fra = *fp); ) {
			--fra->ipfr_ttl;
			if (fra->ipfr_ttl == 0) {
				ATOMIC_INCL(ipfr_stats.ifs_expire);
				ATOMIC_DEC32(ipfr_inuse);
				nat = fra->ipfr_data;
				if (nat != NULL) {
					if (nat->nat_data == fra)
						nat->nat_data = NULL;
				}
				*fp = fra->ipfr_next;
				ipfr_delete(fra);
			} else
				fp = &fra->ipfr_next;
		}
	RWLOCK_EXIT(&ipf_natfrag);
	RWLOCK_EXIT(&ipf_nat);
	SPL_X(s);
}
562
563
/*
 * Slowly expire held state for fragments.  Timeouts are set in expectation
 * of this being called twice per second.
 *
 * Runs the fragment, state, NAT and auth expiry routines, then re-arms
 * itself via whichever timer facility the platform provides.
 */
# if (BSD >= 199306) || SOLARIS || defined(__sgi)
#  if defined(SOLARIS2) && (SOLARIS2 < 7)
void ipfr_slowtimer()
#  else
void ipfr_slowtimer __P((void *ptr))
#  endif
# else
int ipfr_slowtimer()
# endif
{
#if defined(_KERNEL) && SOLARIS
	extern	int	fr_running;

	/* Not (yet, or no longer) running on Solaris - do nothing. */
	if (fr_running <= 0)
		return;
#endif

	READ_ENTER(&ipf_solaris);
#ifdef __sgi
	ipfilter_sgi_intfsync();
#endif

	ipfr_fragexpire();
	fr_timeoutstate();
	ip_natexpire();
	fr_authexpire();
# if    SOLARIS
	ipfr_timer_id = timeout(ipfr_slowtimer, NULL, drv_usectohz(500000));
	RWLOCK_EXIT(&ipf_solaris);
# else
#  if defined(__NetBSD__) && (__NetBSD_Version__ >= 104240000)
	callout_reset(&ipfr_slowtimer_ch, hz / 2, ipfr_slowtimer, NULL);
#  else
#   if (__FreeBSD_version >= 300000)
	ipfr_slowtimer_ch = timeout(ipfr_slowtimer, NULL, hz/2);
#   else
	/*
	 * NOTE(review): "__OpenBSD_" (one trailing underscore) never matches
	 * the real __OpenBSD__ macro, so this timeout_add() branch is dead
	 * and OpenBSD falls through to the generic timeout() call below.
	 * Confirm the intended spelling -- and note that OpenBSD's
	 * timeout_add(9) takes only (timeout, ticks), not four arguments.
	 */
#    if defined(__OpenBSD_)
	timeout_add(&ipfr_slowtimer_ch, hz/2, ipfr_slowtimer, NULL);
#    else
	timeout(ipfr_slowtimer, NULL, hz/2);
#    endif
#   endif
#   if (BSD < 199306) && !defined(__sgi)
	return 0;
#   endif /* FreeBSD */
#  endif /* NetBSD */
# endif /* SOLARIS */
}
616#endif /* defined(_KERNEL) */
617