ip_state.c revision 89336
1/*
2 * Copyright (C) 1995-2001 by Darren Reed.
3 *
4 * See the IPFILTER.LICENCE file for details on licencing.
5 */
6
7#include <sys/errno.h>
8#include <sys/types.h>
9#include <sys/param.h>
10#include <sys/file.h>
11#if defined(__NetBSD__) && (NetBSD >= 199905) && !defined(IPFILTER_LKM) && \
12    defined(_KERNEL)
13# include "opt_ipfilter_log.h"
14#endif
15#if defined(_KERNEL) && defined(__FreeBSD_version) && \
16    (__FreeBSD_version >= 400000) && !defined(KLD_MODULE)
17#include "opt_inet6.h"
18#endif
19#if !defined(_KERNEL) && !defined(KERNEL) && !defined(__KERNEL__)
20# include <stdio.h>
21# include <stdlib.h>
22# include <string.h>
23#else
24# ifdef linux
25#  include <linux/kernel.h>
26#  include <linux/module.h>
27# endif
28#endif
29#if (defined(KERNEL) || defined(_KERNEL)) && (__FreeBSD_version >= 220000)
30# include <sys/filio.h>
31# include <sys/fcntl.h>
32# if (__FreeBSD_version >= 300000) && !defined(IPFILTER_LKM)
33#  include "opt_ipfilter.h"
34# endif
35#else
36# include <sys/ioctl.h>
37#endif
38#include <sys/time.h>
39#include <sys/uio.h>
40#ifndef linux
41# include <sys/protosw.h>
42#endif
43#include <sys/socket.h>
44#if (defined(_KERNEL) || defined(KERNEL)) && !defined(linux)
45# include <sys/systm.h>
46#endif
47#if !defined(__SVR4) && !defined(__svr4__)
48# ifndef linux
49#  include <sys/mbuf.h>
50# endif
51#else
52# include <sys/filio.h>
53# include <sys/byteorder.h>
54# ifdef _KERNEL
55#  include <sys/dditypes.h>
56# endif
57# include <sys/stream.h>
58# include <sys/kmem.h>
59#endif
60
61#include <net/if.h>
62#ifdef sun
63# include <net/af.h>
64#endif
65#include <net/route.h>
66#include <netinet/in.h>
67#include <netinet/in_systm.h>
68#include <netinet/ip.h>
69#include <netinet/tcp.h>
70#ifndef linux
71# include <netinet/ip_var.h>
72# include <netinet/tcp_fsm.h>
73#endif
74#include <netinet/udp.h>
75#include <netinet/ip_icmp.h>
76#include "netinet/ip_compat.h"
77#include <netinet/tcpip.h>
78#include "netinet/ip_fil.h"
79#include "netinet/ip_nat.h"
80#include "netinet/ip_frag.h"
81#include "netinet/ip_proxy.h"
82#include "netinet/ip_state.h"
83#ifdef	USE_INET6
84#include <netinet/icmp6.h>
85#endif
86#if (__FreeBSD_version >= 300000)
87# include <sys/malloc.h>
88# if (defined(_KERNEL) || defined(KERNEL)) && !defined(IPFILTER_LKM)
89#  include <sys/libkern.h>
90#  include <sys/systm.h>
91# endif
92#endif
93
94#if !defined(lint)
95static const char sccsid[] = "@(#)ip_state.c	1.8 6/5/96 (C) 1993-2000 Darren Reed";
96/* static const char rcsid[] = "@(#)$Id: ip_state.c,v 2.30.2.38 2001/07/23 13:49:46 darrenr Exp $"; */
97static const char rcsid[] = "@(#)$FreeBSD: head/sys/contrib/ipfilter/netinet/ip_state.c 89336 2002-01-14 09:07:15Z alfred $";
98#endif
99
100#ifndef	MIN
101# define	MIN(a,b)	(((a)<(b))?(a):(b))
102#endif
103
104#define	TCP_CLOSE	(TH_FIN|TH_RST)
105
106static ipstate_t **ips_table = NULL;
107static ipstate_t *ips_list = NULL;
108static int	ips_num = 0;
109static int	ips_wild = 0;
110static ips_stat_t ips_stats;
111#if	(SOLARIS || defined(__sgi)) && defined(_KERNEL)
112extern	KRWLOCK_T	ipf_state, ipf_mutex;
113extern	kmutex_t	ipf_rw;
114#endif
115
116#ifdef	USE_INET6
117static frentry_t *fr_checkicmp6matchingstate __P((ip6_t *, fr_info_t *));
118#endif
119static int fr_matchsrcdst __P((ipstate_t *, union i6addr, union i6addr,
120			       fr_info_t *, tcphdr_t *));
121static frentry_t *fr_checkicmpmatchingstate __P((ip_t *, fr_info_t *));
122static int fr_matchicmpqueryreply __P((int, ipstate_t *, icmphdr_t *));
123static int fr_state_flush __P((int));
124static ips_stat_t *fr_statetstats __P((void));
125static void fr_delstate __P((ipstate_t *));
126static int fr_state_remove __P((caddr_t));
127static void fr_ipsmove __P((ipstate_t **, ipstate_t *, u_int));
128int fr_stputent __P((caddr_t));
129int fr_stgetent __P((caddr_t));
130void fr_stinsert __P((ipstate_t *));
131
132
133#define	FIVE_DAYS	(2 * 5 * 86400)	/* 5 days: half closed session */
134
135#define	TCP_MSL	240			/* 2 minutes */
136u_long	fr_tcpidletimeout = FIVE_DAYS,
137	fr_tcpclosewait = 2 * TCP_MSL,
138	fr_tcplastack = 2 * TCP_MSL,
139	fr_tcptimeout = 2 * TCP_MSL,
140	fr_tcpclosed = 120,
141	fr_tcphalfclosed = 2 * 2 * 3600,    /* 2 hours */
142	fr_udptimeout = 240,
143	fr_udpacktimeout = 24,
144	fr_icmptimeout = 120,
145	fr_icmpacktimeout = 12;
146int	fr_statemax = IPSTATE_MAX,
147	fr_statesize = IPSTATE_SIZE;
148int	fr_state_doflush = 0,
149	fr_state_lock = 0;
150
151static 	int icmpreplytype4[ICMP_MAXTYPE + 1];
152
153int fr_stateinit()
154{
155	int i;
156
157	KMALLOCS(ips_table, ipstate_t **, fr_statesize * sizeof(ipstate_t *));
158	if (ips_table != NULL)
159		bzero((char *)ips_table, fr_statesize * sizeof(ipstate_t *));
160	else
161		return -1;
162
163	/* fill icmp reply type table */
164	for (i = 0; i <= ICMP_MAXTYPE; i++)
165		icmpreplytype4[i] = -1;
166	icmpreplytype4[ICMP_ECHO] = ICMP_ECHOREPLY;
167	icmpreplytype4[ICMP_TSTAMP] = ICMP_TSTAMPREPLY;
168	icmpreplytype4[ICMP_IREQ] = ICMP_IREQREPLY;
169	icmpreplytype4[ICMP_MASKREQ] = ICMP_MASKREPLY;
170
171	return 0;
172}
173
174
175static ips_stat_t *fr_statetstats()
176{
177	ips_stats.iss_active = ips_num;
178	ips_stats.iss_table = ips_table;
179	ips_stats.iss_list = ips_list;
180	return &ips_stats;
181}
182
183
184/*
185 * flush state tables.  two actions currently defined:
186 * which == 0 : flush all state table entries
187 * which == 1 : flush TCP connections which have started to close but are
188 *	        stuck for some reason.
189 */
190static int fr_state_flush(which)
191int which;
192{
193	register ipstate_t *is, **isp;
194#if defined(_KERNEL) && !SOLARIS
195	int s;
196#endif
197	int delete, removed = 0;
198
199	SPL_NET(s);
200	for (isp = &ips_list; (is = *isp); ) {
201		delete = 0;
202
203		switch (which)
204		{
205		case 0 :
206			delete = 1;
207			break;
208		case 1 :
209			if (is->is_p != IPPROTO_TCP)
210				break;
211			if ((is->is_state[0] != TCPS_ESTABLISHED) ||
212			    (is->is_state[1] != TCPS_ESTABLISHED))
213				delete = 1;
214			break;
215		}
216
217		if (delete) {
218			if (is->is_p == IPPROTO_TCP)
219				ips_stats.iss_fin++;
220			else
221				ips_stats.iss_expire++;
222#ifdef	IPFILTER_LOG
223			ipstate_log(is, ISL_FLUSH);
224#endif
225			fr_delstate(is);
226			removed++;
227		} else
228			isp = &is->is_next;
229	}
230	SPL_X(s);
231	return removed;
232}
233
234
235static int fr_state_remove(data)
236caddr_t data;
237{
238	ipstate_t *sp, st;
239	int error;
240
241	sp = &st;
242	error = IRCOPYPTR(data, (caddr_t)&st, sizeof(st));
243	if (error)
244		return EFAULT;
245
246	WRITE_ENTER(&ipf_state);
247	for (sp = ips_list; sp; sp = sp->is_next)
248		if ((sp->is_p == st.is_p) && (sp->is_v == st.is_v) &&
249		    !bcmp((char *)&sp->is_src, (char *)&st.is_src,
250			  sizeof(st.is_src)) &&
251		    !bcmp((char *)&sp->is_dst, (char *)&st.is_src,
252			  sizeof(st.is_dst)) &&
253		    !bcmp((char *)&sp->is_ps, (char *)&st.is_ps,
254			  sizeof(st.is_ps))) {
255#ifdef	IPFILTER_LOG
256			ipstate_log(sp, ISL_REMOVE);
257#endif
258			fr_delstate(sp);
259			RWLOCK_EXIT(&ipf_state);
260			return 0;
261		}
262	RWLOCK_EXIT(&ipf_state);
263	return ESRCH;
264}
265
266
267int fr_state_ioctl(data, cmd, mode)
268caddr_t data;
269#if defined(__NetBSD__) || defined(__OpenBSD__)
270u_long cmd;
271#else
272int cmd;
273#endif
274int mode;
275{
276	int arg, ret, error = 0;
277
278	switch (cmd)
279	{
280	case SIOCDELST :
281		error = fr_state_remove(data);
282		break;
283	case SIOCIPFFL :
284		error = IRCOPY(data, (caddr_t)&arg, sizeof(arg));
285		if (error)
286			break;
287		if (arg == 0 || arg == 1) {
288			WRITE_ENTER(&ipf_state);
289			ret = fr_state_flush(arg);
290			RWLOCK_EXIT(&ipf_state);
291			error = IWCOPY((caddr_t)&ret, data, sizeof(ret));
292		} else
293			error = EINVAL;
294		break;
295#ifdef	IPFILTER_LOG
296	case SIOCIPFFB :
297		if (!(mode & FWRITE))
298			error = EPERM;
299		else {
300			int tmp;
301
302			tmp = ipflog_clear(IPL_LOGSTATE);
303			IWCOPY((char *)&tmp, data, sizeof(tmp));
304		}
305		break;
306#endif
307	case SIOCGETFS :
308		error = IWCOPYPTR((caddr_t)fr_statetstats(), data,
309				  sizeof(ips_stat_t));
310		break;
311	case FIONREAD :
312#ifdef	IPFILTER_LOG
313		arg = (int)iplused[IPL_LOGSTATE];
314		error = IWCOPY((caddr_t)&arg, (caddr_t)data, sizeof(arg));
315#endif
316		break;
317	case SIOCSTLCK :
318		error = fr_lock(data, &fr_state_lock);
319		break;
320	case SIOCSTPUT :
321		if (!fr_state_lock) {
322			error = EACCES;
323			break;
324		}
325		error = fr_stputent(data);
326		break;
327	case SIOCSTGET :
328		if (!fr_state_lock) {
329			error = EACCES;
330			break;
331		}
332		error = fr_stgetent(data);
333		break;
334	default :
335		error = EINVAL;
336		break;
337	}
338	return error;
339}
340
341
342int fr_stgetent(data)
343caddr_t data;
344{
345	register ipstate_t *is, *isn;
346	ipstate_save_t ips, *ipsp;
347	int error;
348
349	error = IRCOPY(data, (caddr_t)&ipsp, sizeof(ipsp));
350	if (error)
351		return EFAULT;
352	error = IRCOPY((caddr_t)ipsp, (caddr_t)&ips, sizeof(ips));
353	if (error)
354		return EFAULT;
355
356	isn = ips.ips_next;
357	if (!isn) {
358		isn = ips_list;
359		if (isn == NULL) {
360			if (ips.ips_next == NULL)
361				return ENOENT;
362			return 0;
363		}
364	} else {
365		/*
366		 * Make sure the pointer we're copying from exists in the
367		 * current list of entries.  Security precaution to prevent
368		 * copying of random kernel data.
369		 */
370		for (is = ips_list; is; is = is->is_next)
371			if (is == isn)
372				break;
373		if (!is)
374			return ESRCH;
375	}
376	ips.ips_next = isn->is_next;
377	bcopy((char *)isn, (char *)&ips.ips_is, sizeof(ips.ips_is));
378	if (isn->is_rule)
379		bcopy((char *)isn->is_rule, (char *)&ips.ips_fr,
380		      sizeof(ips.ips_fr));
381	error = IWCOPY((caddr_t)&ips, ipsp, sizeof(ips));
382	if (error)
383		error = EFAULT;
384	return error;
385}
386
387
388int fr_stputent(data)
389caddr_t data;
390{
391	register ipstate_t *is, *isn;
392	ipstate_save_t ips, *ipsp;
393	int error, out;
394	frentry_t *fr;
395
396	error = IRCOPY(data, (caddr_t)&ipsp, sizeof(ipsp));
397	if (error)
398		return EFAULT;
399	error = IRCOPY((caddr_t)ipsp, (caddr_t)&ips, sizeof(ips));
400	if (error)
401		return EFAULT;
402
403	KMALLOC(isn, ipstate_t *);
404	if (isn == NULL)
405		return ENOMEM;
406
407	bcopy((char *)&ips.ips_is, (char *)isn, sizeof(*isn));
408	fr = isn->is_rule;
409	if (fr != NULL) {
410		if (isn->is_flags & FI_NEWFR) {
411			KMALLOC(fr, frentry_t *);
412			if (fr == NULL) {
413				KFREE(isn);
414				return ENOMEM;
415			}
416			bcopy((char *)&ips.ips_fr, (char *)fr, sizeof(*fr));
417			out = fr->fr_flags & FR_OUTQUE ? 1 : 0;
418			isn->is_rule = fr;
419			ips.ips_is.is_rule = fr;
420			if (*fr->fr_ifname) {
421				fr->fr_ifa = GETUNIT(fr->fr_ifname, fr->fr_v);
422				if (fr->fr_ifa == NULL)
423					fr->fr_ifa = (void *)-1;
424#ifdef	_KERNEL
425				else {
426					strncpy(isn->is_ifname[out],
427						IFNAME(fr->fr_ifa), IFNAMSIZ);
428					isn->is_ifp[out] = fr->fr_ifa;
429				}
430#endif
431			} else
432				fr->fr_ifa = NULL;
433			/*
434			 * send a copy back to userland of what we ended up
435			 * to allow for verification.
436			 */
437			error = IWCOPY((caddr_t)&ips, ipsp, sizeof(ips));
438			if (error) {
439				KFREE(isn);
440				KFREE(fr);
441				return EFAULT;
442			}
443		} else {
444			for (is = ips_list; is; is = is->is_next)
445				if (is->is_rule == fr)
446					break;
447			if (!is) {
448				KFREE(isn);
449				return ESRCH;
450			}
451		}
452	}
453	fr_stinsert(isn);
454	return 0;
455}
456
457
458void fr_stinsert(is)
459register ipstate_t *is;
460{
461	register u_int hv = is->is_hv;
462
463	MUTEX_INIT(&is->is_lock, "ipf state entry", NULL);
464
465	is->is_ifname[0][sizeof(is->is_ifname[0]) - 1] = '\0';
466	if (is->is_ifname[0][0] != '\0') {
467		is->is_ifp[0] = GETUNIT(is->is_ifname[0], is->is_v);
468	}
469	is->is_ifname[1][sizeof(is->is_ifname[0]) - 1] = '\0';
470	if (is->is_ifname[1][0] != '\0') {
471		is->is_ifp[1] = GETUNIT(is->is_ifname[1], is->is_v);
472	}
473
474	/*
475	 * add into list table.
476	 */
477	if (ips_list)
478		ips_list->is_pnext = &is->is_next;
479	is->is_pnext = &ips_list;
480	is->is_next = ips_list;
481	ips_list = is;
482	if (ips_table[hv])
483		ips_table[hv]->is_phnext = &is->is_hnext;
484	else
485		ips_stats.iss_inuse++;
486	is->is_phnext = ips_table + hv;
487	is->is_hnext = ips_table[hv];
488	ips_table[hv] = is;
489	ips_num++;
490}
491
492
493/*
494 * Create a new ipstate structure and hang it off the hash table.
495 */
496ipstate_t *fr_addstate(ip, fin, flags)
497ip_t *ip;
498fr_info_t *fin;
499u_int flags;
500{
501	register tcphdr_t *tcp = NULL;
502	register ipstate_t *is;
503	register u_int hv;
504	ipstate_t ips;
505	u_int pass;
506	int out;
507
508	if (fr_state_lock || (fin->fin_off != 0) || (fin->fin_fl & FI_SHORT))
509		return NULL;
510	if (ips_num == fr_statemax) {
511		ips_stats.iss_max++;
512		fr_state_doflush = 1;
513		return NULL;
514	}
515	out = fin->fin_out;
516	is = &ips;
517	bzero((char *)is, sizeof(*is));
518	ips.is_age = 1;
519	ips.is_state[0] = 0;
520	ips.is_state[1] = 0;
521	/*
522	 * Copy and calculate...
523	 */
524	hv = (is->is_p = fin->fin_fi.fi_p);
525	is->is_src = fin->fin_fi.fi_src;
526	hv += is->is_saddr;
527	is->is_dst = fin->fin_fi.fi_dst;
528	hv += is->is_daddr;
529#ifdef	USE_INET6
530	if (fin->fin_v == 6) {
531		if (is->is_p == IPPROTO_ICMPV6) {
532			if (IN6_IS_ADDR_MULTICAST(&is->is_dst.in6))
533				flags |= FI_W_DADDR;
534			if (out)
535				hv -= is->is_daddr;
536			else
537				hv -= is->is_saddr;
538		}
539	}
540#endif
541
542	switch (is->is_p)
543	{
544#ifdef	USE_INET6
545	case IPPROTO_ICMPV6 :
546#endif
547	case IPPROTO_ICMP :
548	    {
549		struct icmp *ic = (struct icmp *)fin->fin_dp;
550
551#ifdef	USE_INET6
552		if ((is->is_p == IPPROTO_ICMPV6) &&
553		    ((ic->icmp_type & ICMP6_INFOMSG_MASK) == 0))
554			return NULL;
555#endif
556		switch (ic->icmp_type)
557		{
558#ifdef	USE_INET6
559		case ICMP6_ECHO_REQUEST :
560			is->is_icmp.ics_type = ICMP6_ECHO_REPLY;
561			hv += (is->is_icmp.ics_id = ic->icmp_id);
562			hv += (is->is_icmp.ics_seq = ic->icmp_seq);
563			break;
564		case ICMP6_MEMBERSHIP_QUERY :
565		case ND_ROUTER_SOLICIT :
566		case ND_NEIGHBOR_SOLICIT :
567			is->is_icmp.ics_type = ic->icmp_type + 1;
568			break;
569#endif
570		case ICMP_ECHO :
571		case ICMP_TSTAMP :
572		case ICMP_IREQ :
573		case ICMP_MASKREQ :
574			is->is_icmp.ics_type = ic->icmp_type;
575			hv += (is->is_icmp.ics_id = ic->icmp_id);
576			hv += (is->is_icmp.ics_seq = ic->icmp_seq);
577			break;
578		default :
579			return NULL;
580		}
581		ATOMIC_INCL(ips_stats.iss_icmp);
582		is->is_age = fr_icmptimeout;
583		break;
584	    }
585	case IPPROTO_TCP :
586	    {
587		tcp = (tcphdr_t *)fin->fin_dp;
588
589		if (tcp->th_flags & TH_RST)
590			return NULL;
591		/*
592		 * The endian of the ports doesn't matter, but the ack and
593		 * sequence numbers do as we do mathematics on them later.
594		 */
595		is->is_dport = tcp->th_dport;
596		is->is_sport = tcp->th_sport;
597		if ((flags & (FI_W_DPORT|FI_W_SPORT)) == 0) {
598			hv += tcp->th_dport;
599			hv += tcp->th_sport;
600		}
601		is->is_send = ntohl(tcp->th_seq) + fin->fin_dlen -
602			      (tcp->th_off << 2) +
603			      ((tcp->th_flags & TH_SYN) ? 1 : 0) +
604			      ((tcp->th_flags & TH_FIN) ? 1 : 0);
605		is->is_maxsend = is->is_send;
606		is->is_dend = 0;
607		is->is_maxdwin = 1;
608		is->is_maxswin = ntohs(tcp->th_win);
609		if (is->is_maxswin == 0)
610			is->is_maxswin = 1;
611		/*
612		 * If we're creating state for a starting connection, start the
613		 * timer on it as we'll never see an error if it fails to
614		 * connect.
615		 */
616		ATOMIC_INCL(ips_stats.iss_tcp);
617		break;
618	    }
619	case IPPROTO_UDP :
620	    {
621		tcp = (tcphdr_t *)fin->fin_dp;
622
623		is->is_dport = tcp->th_dport;
624		is->is_sport = tcp->th_sport;
625		if ((flags & (FI_W_DPORT|FI_W_SPORT)) == 0) {
626			hv += tcp->th_dport;
627			hv += tcp->th_sport;
628		}
629		ATOMIC_INCL(ips_stats.iss_udp);
630		is->is_age = fr_udptimeout;
631		break;
632	    }
633	default :
634		return NULL;
635	}
636
637	KMALLOC(is, ipstate_t *);
638	if (is == NULL) {
639		ATOMIC_INCL(ips_stats.iss_nomem);
640		return NULL;
641	}
642	bcopy((char *)&ips, (char *)is, sizeof(*is));
643	hv %= fr_statesize;
644	is->is_hv = hv;
645	is->is_rule = fin->fin_fr;
646	if (is->is_rule != NULL) {
647		ATOMIC_INC32(is->is_rule->fr_ref);
648		pass = is->is_rule->fr_flags;
649	} else
650		pass = fr_flags;
651	WRITE_ENTER(&ipf_state);
652
653	is->is_pass = pass;
654	is->is_pkts = 1;
655	is->is_bytes = fin->fin_dlen + fin->fin_hlen;
656	/*
657	 * We want to check everything that is a property of this packet,
658	 * but we don't (automatically) care about it's fragment status as
659	 * this may change.
660	 */
661	is->is_v = fin->fin_fi.fi_v;
662	is->is_opt = fin->fin_fi.fi_optmsk;
663	is->is_optmsk = 0xffffffff;
664	is->is_sec = fin->fin_fi.fi_secmsk;
665	is->is_secmsk = 0xffff;
666	is->is_auth = fin->fin_fi.fi_auth;
667	is->is_authmsk = 0xffff;
668	is->is_flags = fin->fin_fl & FI_CMP;
669	is->is_flags |= FI_CMP << 4;
670	is->is_flags |= flags & (FI_WILDP|FI_WILDA);
671	if (flags & (FI_WILDP|FI_WILDA))
672		ips_wild++;
673	is->is_ifp[1 - out] = NULL;
674	is->is_ifp[out] = fin->fin_ifp;
675#ifdef	_KERNEL
676	strncpy(is->is_ifname[out], IFNAME(fin->fin_ifp), IFNAMSIZ);
677#endif
678	is->is_ifname[1 - out][0] = '\0';
679	if (pass & FR_LOGFIRST)
680		is->is_pass &= ~(FR_LOGFIRST|FR_LOG);
681	fr_stinsert(is);
682	if (is->is_p == IPPROTO_TCP) {
683		MUTEX_ENTER(&is->is_lock);
684		fr_tcp_age(&is->is_age, is->is_state, fin,
685			   0); /* 0 = packet from the source */
686		MUTEX_EXIT(&is->is_lock);
687	}
688#ifdef	IPFILTER_LOG
689	ipstate_log(is, ISL_NEW);
690#endif
691	RWLOCK_EXIT(&ipf_state);
692	fin->fin_rev = IP6NEQ(is->is_dst, fin->fin_fi.fi_dst);
693	if ((fin->fin_fi.fi_fl & FI_FRAG) && (pass & FR_KEEPFRAG))
694		ipfr_newfrag(ip, fin, pass ^ FR_KEEPSTATE);
695	return is;
696}
697
698
699
700/*
701 * check to see if a packet with TCP headers fits within the TCP window.
702 * change timeout depending on whether new packet is a SYN-ACK returning for a
703 * SYN or a RST or FIN which indicate time to close up shop.
704 */
705int fr_tcpstate(is, fin, ip, tcp)
706register ipstate_t *is;
707fr_info_t *fin;
708ip_t *ip;
709tcphdr_t *tcp;
710{
711	register tcp_seq seq, ack, end;
712	register int ackskew;
713	tcpdata_t  *fdata, *tdata;
714	u_short	win, maxwin;
715	int ret = 0;
716	int source;
717
718	/*
719	 * Find difference between last checked packet and this packet.
720	 */
721	source = IP6EQ(fin->fin_fi.fi_src, is->is_src);
722	if (source && (ntohs(is->is_sport) != fin->fin_data[0]))
723		source = 0;
724	fdata = &is->is_tcp.ts_data[!source];
725	tdata = &is->is_tcp.ts_data[source];
726	seq = ntohl(tcp->th_seq);
727	ack = ntohl(tcp->th_ack);
728	win = ntohs(tcp->th_win);
729	end = seq + fin->fin_dlen - (tcp->th_off << 2) +
730	       ((tcp->th_flags & TH_SYN) ? 1 : 0) +
731	       ((tcp->th_flags & TH_FIN) ? 1 : 0);
732
733	MUTEX_ENTER(&is->is_lock);
734	if (fdata->td_end == 0) {
735		/*
736		 * Must be a (outgoing) SYN-ACK in reply to a SYN.
737		 */
738		fdata->td_end = end;
739		fdata->td_maxwin = 1;
740		fdata->td_maxend = end + 1;
741	}
742
743	if (!(tcp->th_flags & TH_ACK)) {  /* Pretend an ack was sent */
744		ack = tdata->td_end;
745	} else if (((tcp->th_flags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) &&
746		   (ack == 0)) {
747		/* gross hack to get around certain broken tcp stacks */
748		ack = tdata->td_end;
749	}
750
751	if (seq == end)
752		seq = end = fdata->td_end;
753
754	maxwin = tdata->td_maxwin;
755	ackskew = tdata->td_end - ack;
756
757#define	SEQ_GE(a,b)	((int)((a) - (b)) >= 0)
758#define	SEQ_GT(a,b)	((int)((a) - (b)) > 0)
759	if ((SEQ_GE(fdata->td_maxend, end)) &&
760	    (SEQ_GE(seq, fdata->td_end - maxwin)) &&
761/* XXX what about big packets */
762#define MAXACKWINDOW 66000
763	    (ackskew >= -MAXACKWINDOW) &&
764	    (ackskew <= MAXACKWINDOW)) {
765		/* if ackskew < 0 then this should be due to fragented
766		 * packets. There is no way to know the length of the
767		 * total packet in advance.
768		 * We do know the total length from the fragment cache though.
769		 * Note however that there might be more sessions with
770		 * exactly the same source and destination paramters in the
771		 * state cache (and source and destination is the only stuff
772		 * that is saved in the fragment cache). Note further that
773		 * some TCP connections in the state cache are hashed with
774		 * sport and dport as well which makes it not worthwhile to
775		 * look for them.
776		 * Thus, when ackskew is negative but still seems to belong
777		 * to this session, we bump up the destinations end value.
778		 */
779		if (ackskew < 0)
780			tdata->td_end = ack;
781
782		/* update max window seen */
783		if (fdata->td_maxwin < win)
784			fdata->td_maxwin = win;
785		if (SEQ_GT(end, fdata->td_end))
786			fdata->td_end = end;
787		if (SEQ_GE(ack + win, tdata->td_maxend)) {
788			tdata->td_maxend = ack + win;
789			if (win == 0)
790				tdata->td_maxend++;
791		}
792
793		ATOMIC_INCL(ips_stats.iss_hits);
794		/*
795		 * Nearing end of connection, start timeout.
796		 */
797		/* source ? 0 : 1 -> !source */
798		fr_tcp_age(&is->is_age, is->is_state, fin, !source);
799		ret = 1;
800	}
801	MUTEX_EXIT(&is->is_lock);
802	return ret;
803}
804
805
806static int fr_matchsrcdst(is, src, dst, fin, tcp)
807ipstate_t *is;
808union i6addr src, dst;
809fr_info_t *fin;
810tcphdr_t *tcp;
811{
812	int ret = 0, rev, out, flags;
813	u_short sp, dp;
814	void *ifp;
815
816	rev = IP6NEQ(is->is_dst, dst);
817	ifp = fin->fin_ifp;
818	out = fin->fin_out;
819
820	if (tcp != NULL) {
821		flags = is->is_flags;
822		sp = tcp->th_sport;
823		dp = tcp->th_dport;
824		if (!rev) {
825			if (!(flags & FI_W_SPORT) && (sp != is->is_sport))
826				rev = 1;
827			else if (!(flags & FI_W_DPORT) && (dp != is->is_dport))
828				rev = 1;
829		}
830	} else {
831		flags = is->is_flags & FI_WILDA;
832		sp = 0;
833		dp = 0;
834	}
835
836	if (rev == 0) {
837		if (!out) {
838			if (is->is_ifpin == NULL || is->is_ifpin == ifp)
839				ret = 1;
840		} else {
841			if (is->is_ifpout == NULL || is->is_ifpout == ifp)
842				ret = 1;
843		}
844	} else {
845		if (out) {
846			if (is->is_ifpin == NULL || is->is_ifpin == ifp)
847				ret = 1;
848		} else {
849			if (is->is_ifpout == NULL || is->is_ifpout == ifp)
850				ret = 1;
851		}
852	}
853	if (ret == 0)
854		return 0;
855	ret = 0;
856
857	if (rev == 0) {
858		if (
859		    (IP6EQ(is->is_dst, dst) || (flags & FI_W_DADDR)) &&
860		    (IP6EQ(is->is_src, src) || (flags & FI_W_SADDR)) &&
861		    (!tcp || ((sp == is->is_sport || flags & FI_W_SPORT) &&
862		     (dp == is->is_dport || flags & FI_W_DPORT)))) {
863			ret = 1;
864		}
865	} else {
866		if (
867		    (IP6EQ(is->is_dst, src) || (flags & FI_W_DADDR)) &&
868		    (IP6EQ(is->is_src, dst) || (flags & FI_W_SADDR)) &&
869		    (!tcp || ((sp == is->is_dport || flags & FI_W_DPORT) &&
870		     (dp == is->is_sport || flags & FI_W_SPORT)))) {
871			ret = 1;
872		}
873	}
874	if (ret == 0)
875		return 0;
876
877	/*
878	 * Whether or not this should be here, is questionable, but the aim
879	 * is to get this out of the main line.
880	 */
881	if (tcp == NULL)
882		flags = is->is_flags & (FI_CMP|(FI_CMP<<4));
883
884	if (((fin->fin_fl & (flags >> 4)) != (flags & FI_CMP)) ||
885	    (fin->fin_fi.fi_optmsk != is->is_opt) ||
886	    (fin->fin_fi.fi_secmsk != is->is_sec) ||
887	    (fin->fin_fi.fi_auth != is->is_auth))
888		return 0;
889
890	if ((flags & (FI_W_SPORT|FI_W_DPORT))) {
891		if ((flags & FI_W_SPORT) != 0) {
892			if (rev == 0) {
893				is->is_sport = sp;
894				is->is_send = htonl(tcp->th_seq);
895			} else {
896				is->is_sport = dp;
897				is->is_send = htonl(tcp->th_ack);
898			}
899			is->is_maxsend = is->is_send + 1;
900		} else if ((flags & FI_W_DPORT) != 0) {
901			if (rev == 0) {
902				is->is_dport = dp;
903				is->is_dend = htonl(tcp->th_ack);
904			} else {
905				is->is_dport = sp;
906				is->is_dend = htonl(tcp->th_seq);
907			}
908			is->is_maxdend = is->is_dend + 1;
909		}
910		is->is_flags &= ~(FI_W_SPORT|FI_W_DPORT);
911		ips_wild--;
912	}
913
914	ret = -1;
915
916	if (!rev) {
917		if (out) {
918			if (!is->is_ifpout)
919				ret = 1;
920		} else {
921			if (!is->is_ifpin)
922				ret = 0;
923		}
924	} else {
925		if (out) {
926			if (!is->is_ifpin)
927				ret = 0;
928		} else {
929			if (!is->is_ifpout)
930				ret = 1;
931		}
932	}
933
934	if (ret >= 0) {
935		is->is_ifp[ret] = ifp;
936#ifdef	_KERNEL
937		strncpy(is->is_ifname[ret], IFNAME(fin->fin_ifp),
938			sizeof(is->is_ifname[ret]));
939#endif
940	}
941	fin->fin_rev = rev;
942	return 1;
943}
944
945static int fr_matchicmpqueryreply(v, is, icmp)
946int v;
947ipstate_t *is;
948icmphdr_t *icmp;
949{
950	if (v == 4) {
951		/*
952		 * If we matched its type on the way in, then when going out
953		 * it will still be the same type.
954		 */
955		if (((icmp->icmp_type == is->is_type) ||
956		     (icmpreplytype4[is->is_type] == icmp->icmp_type)) &&
957		    (icmp->icmp_id == is->is_icmp.ics_id) &&
958		    (icmp->icmp_seq == is->is_icmp.ics_seq)) {
959			return 1;
960		};
961	}
962#ifdef	USE_INET6
963	else if (is->is_v == 6) {
964		if ((is->is_type == ICMP6_ECHO_REPLY) &&
965		    (icmp->icmp_type == ICMP6_ECHO_REQUEST) &&
966		    (icmp->icmp_id == is->is_icmp.ics_id) &&
967		    (icmp->icmp_seq == is->is_icmp.ics_seq)) {
968			return 1;
969		};
970	}
971#endif
972	return 0;
973}
974
975static frentry_t *fr_checkicmpmatchingstate(ip, fin)
976ip_t *ip;
977fr_info_t *fin;
978{
979	register ipstate_t *is, **isp;
980	register u_short sport, dport;
981	register u_char	pr;
982	union i6addr dst, src;
983	struct icmp *ic;
984	u_short savelen;
985	icmphdr_t *icmp;
986	fr_info_t ofin;
987	int type, len;
988	tcphdr_t *tcp;
989	frentry_t *fr;
990	ip_t *oip;
991	u_int hv;
992
993	/*
994	 * Does it at least have the return (basic) IP header ?
995	 * Only a basic IP header (no options) should be with
996	 * an ICMP error header.
997	 */
998	if (((ip->ip_v != 4) || (ip->ip_hl != 5)) ||
999	    (fin->fin_plen < ICMPERR_MINPKTLEN))
1000		return NULL;
1001	ic = (struct icmp *)fin->fin_dp;
1002	type = ic->icmp_type;
1003	/*
1004	 * If it's not an error type, then return
1005	 */
1006	if ((type != ICMP_UNREACH) && (type != ICMP_SOURCEQUENCH) &&
1007    	    (type != ICMP_REDIRECT) && (type != ICMP_TIMXCEED) &&
1008    	    (type != ICMP_PARAMPROB))
1009		return NULL;
1010
1011	oip = (ip_t *)((char *)ic + ICMPERR_ICMPHLEN);
1012	if (fin->fin_plen < ICMPERR_MAXPKTLEN + ((oip->ip_hl - 5) << 2))
1013		return NULL;
1014
1015	/*
1016	 * Sanity checks.
1017	 */
1018	len = fin->fin_dlen - ICMPERR_ICMPHLEN;
1019	if ((len <= 0) || ((oip->ip_hl << 2) > len))
1020		return NULL;
1021
1022	/*
1023	 * Is the buffer big enough for all of it ?  It's the size of the IP
1024	 * header claimed in the encapsulated part which is of concern.  It
1025	 * may be too big to be in this buffer but not so big that it's
1026	 * outside the ICMP packet, leading to TCP deref's causing problems.
1027	 * This is possible because we don't know how big oip_hl is when we
1028	 * do the pullup early in fr_check() and thus can't gaurantee it is
1029	 * all here now.
1030	 */
1031#ifdef  _KERNEL
1032	{
1033	mb_t *m;
1034
1035# if SOLARIS
1036	m = fin->fin_qfm;
1037	if ((char *)oip + len > (char *)m->b_wptr)
1038		return NULL;
1039# else
1040	m = *(mb_t **)fin->fin_mp;
1041	if ((char *)oip + len > (char *)ip + m->m_len)
1042		return NULL;
1043# endif
1044	}
1045#endif
1046
1047	/*
1048	 * in the IPv4 case we must zero the i6addr union otherwise
1049	 * the IP6EQ and IP6NEQ macros produce the wrong results because
1050	 * of the 'junk' in the unused part of the union
1051	 */
1052	bzero((char *)&src, sizeof(src));
1053	bzero((char *)&dst, sizeof(dst));
1054
1055	if (oip->ip_p == IPPROTO_ICMP) {
1056		icmp = (icmphdr_t *)((char *)oip + (oip->ip_hl << 2));
1057
1058		/*
1059		 * a ICMP error can only be generated as a result of an
1060		 * ICMP query, not as the response on an ICMP error
1061		 *
1062		 * XXX theoretically ICMP_ECHOREP and the other reply's are
1063		 * ICMP query's as well, but adding them here seems strange XXX
1064		 */
1065		 if ((icmp->icmp_type != ICMP_ECHO) &&
1066		     (icmp->icmp_type != ICMP_TSTAMP) &&
1067		     (icmp->icmp_type != ICMP_IREQ) &&
1068		     (icmp->icmp_type != ICMP_MASKREQ))
1069		    	return NULL;
1070
1071		/*
1072		 * perform a lookup of the ICMP packet in the state table
1073		 */
1074		hv = (pr = oip->ip_p);
1075		src.in4 = oip->ip_src;
1076		hv += src.in4.s_addr;
1077		dst.in4 = oip->ip_dst;
1078		hv += dst.in4.s_addr;
1079		hv += icmp->icmp_id;
1080		hv += icmp->icmp_seq;
1081		hv %= fr_statesize;
1082
1083		savelen = oip->ip_len;
1084		oip->ip_len = len;
1085		ofin.fin_v = 4;
1086		fr_makefrip(oip->ip_hl << 2, oip, &ofin);
1087		oip->ip_len = savelen;
1088		ofin.fin_ifp = fin->fin_ifp;
1089		ofin.fin_out = !fin->fin_out;
1090		ofin.fin_mp = NULL; /* if dereferenced, panic XXX */
1091
1092		READ_ENTER(&ipf_state);
1093		for (isp = &ips_table[hv]; (is = *isp); isp = &is->is_hnext)
1094			if ((is->is_p == pr) && (is->is_v == 4) &&
1095			    fr_matchsrcdst(is, src, dst, &ofin, NULL) &&
1096			    fr_matchicmpqueryreply(is->is_v, is, icmp)) {
1097				ips_stats.iss_hits++;
1098				is->is_pkts++;
1099				is->is_bytes += ip->ip_len;
1100				fr = is->is_rule;
1101				RWLOCK_EXIT(&ipf_state);
1102				return fr;
1103			}
1104		RWLOCK_EXIT(&ipf_state);
1105		return NULL;
1106	};
1107
1108	if ((oip->ip_p != IPPROTO_TCP) && (oip->ip_p != IPPROTO_UDP))
1109		return NULL;
1110
1111	tcp = (tcphdr_t *)((char *)oip + (oip->ip_hl << 2));
1112	dport = tcp->th_dport;
1113	sport = tcp->th_sport;
1114
1115	hv = (pr = oip->ip_p);
1116	src.in4 = oip->ip_src;
1117	hv += src.in4.s_addr;
1118	dst.in4 = oip->ip_dst;
1119	hv += dst.in4.s_addr;
1120	hv += dport;
1121	hv += sport;
1122	hv %= fr_statesize;
1123	/*
1124	 * we make an fin entry to be able to feed it to
1125	 * matchsrcdst note that not all fields are encessary
1126	 * but this is the cleanest way. Note further we fill
1127	 * in fin_mp such that if someone uses it we'll get
1128	 * a kernel panic. fr_matchsrcdst does not use this.
1129	 *
1130	 * watch out here, as ip is in host order and oip in network
1131	 * order. Any change we make must be undone afterwards.
1132	 */
1133	savelen = oip->ip_len;
1134	oip->ip_len = len;
1135	ofin.fin_v = 4;
1136	fr_makefrip(oip->ip_hl << 2, oip, &ofin);
1137	oip->ip_len = savelen;
1138	ofin.fin_ifp = fin->fin_ifp;
1139	ofin.fin_out = !fin->fin_out;
1140	ofin.fin_mp = NULL; /* if dereferenced, panic XXX */
1141	READ_ENTER(&ipf_state);
1142	for (isp = &ips_table[hv]; (is = *isp); isp = &is->is_hnext) {
1143		/*
1144		 * Only allow this icmp though if the
1145		 * encapsulated packet was allowed through the
1146		 * other way around. Note that the minimal amount
1147		 * of info present does not allow for checking against
1148		 * tcp internals such as seq and ack numbers.
1149		 */
1150		if ((is->is_p == pr) && (is->is_v == 4) &&
1151		    fr_matchsrcdst(is, src, dst, &ofin, tcp)) {
1152			fr = is->is_rule;
1153			ips_stats.iss_hits++;
1154			is->is_pkts++;
1155			is->is_bytes += fin->fin_plen;
1156			/*
1157			 * we deliberately do not touch the timeouts
1158			 * for the accompanying state table entry.
1159			 * It remains to be seen if that is correct. XXX
1160			 */
1161			RWLOCK_EXIT(&ipf_state);
1162			return fr;
1163		}
1164	}
1165	RWLOCK_EXIT(&ipf_state);
1166	return NULL;
1167}
1168
1169
1170static void fr_ipsmove(isp, is, hv)
1171ipstate_t **isp, *is;
1172u_int hv;
1173{
1174	u_int hvm;
1175
1176	hvm = is->is_hv;
1177	/*
1178	 * Remove the hash from the old location...
1179	 */
1180	if (is->is_hnext)
1181		is->is_hnext->is_phnext = isp;
1182	*isp = is->is_hnext;
1183	if (ips_table[hvm] == NULL)
1184		ips_stats.iss_inuse--;
1185
1186	/*
1187	 * ...and put the hash in the new one.
1188	 */
1189	hvm = hv % fr_statesize;
1190	is->is_hv = hvm;
1191	isp = &ips_table[hvm];
1192	if (*isp)
1193		(*isp)->is_phnext = &is->is_hnext;
1194	else
1195		ips_stats.iss_inuse++;
1196	is->is_phnext = isp;
1197	is->is_hnext = *isp;
1198	*isp = is;
1199}
1200
1201
1202/*
1203 * Check if a packet has a registered state.
1204 */
1205frentry_t *fr_checkstate(ip, fin)
1206ip_t *ip;
1207fr_info_t *fin;
1208{
1209	union i6addr dst, src;
1210	register ipstate_t *is, **isp;
1211	register u_char pr;
1212	u_int hv, hvm, hlen, tryagain, pass, v;
1213	struct icmp *ic;
1214	frentry_t *fr;
1215	tcphdr_t *tcp;
1216
1217	if (fr_state_lock || (fin->fin_off != 0) || (fin->fin_fl & FI_SHORT))
1218		return NULL;
1219
1220	is = NULL;
1221	hlen = fin->fin_hlen;
1222	tcp = (tcphdr_t *)((char *)ip + hlen);
1223	ic = (struct icmp *)tcp;
1224	hv = (pr = fin->fin_fi.fi_p);
1225	src = fin->fin_fi.fi_src;
1226	dst = fin->fin_fi.fi_dst;
1227	hv += src.in4.s_addr;
1228	hv += dst.in4.s_addr;
1229
1230	/*
1231	 * Search the hash table for matching packet header info.
1232	 */
1233	v = fin->fin_fi.fi_v;
1234	switch (fin->fin_fi.fi_p)
1235	{
1236#ifdef	USE_INET6
1237	case IPPROTO_ICMPV6 :
1238		if (v == 6) {
1239			if (fin->fin_out)
1240				hv -= dst.in4.s_addr;
1241			else
1242				hv -= src.in4.s_addr;
1243			if ((ic->icmp_type == ICMP6_ECHO_REQUEST) ||
1244			    (ic->icmp_type == ICMP6_ECHO_REPLY)) {
1245				hv += ic->icmp_id;
1246				hv += ic->icmp_seq;
1247			}
1248		}
1249#endif
1250	case IPPROTO_ICMP :
1251		if (v == 4) {
1252			hv += ic->icmp_id;
1253			hv += ic->icmp_seq;
1254		}
1255		hv %= fr_statesize;
1256		READ_ENTER(&ipf_state);
1257		for (isp = &ips_table[hv]; (is = *isp); isp = &is->is_hnext) {
1258			if ((is->is_p == pr) && (is->is_v == v) &&
1259			    fr_matchsrcdst(is, src, dst, fin, NULL) &&
1260			    fr_matchicmpqueryreply(v, is, ic)) {
1261				if (fin->fin_rev)
1262					is->is_age = fr_icmpacktimeout;
1263				else
1264					is->is_age = fr_icmptimeout;
1265				break;
1266			}
1267		}
1268		if (is != NULL)
1269			break;
1270		RWLOCK_EXIT(&ipf_state);
1271		/*
1272		 * No matching icmp state entry. Perhaps this is a
1273		 * response to another state entry.
1274		 */
1275#ifdef	USE_INET6
1276		if (v == 6)
1277			fr = fr_checkicmp6matchingstate((ip6_t *)ip, fin);
1278		else
1279#endif
1280			fr = fr_checkicmpmatchingstate(ip, fin);
1281		if (fr)
1282			return fr;
1283		break;
1284	case IPPROTO_TCP :
1285	    {
1286		register u_short dport, sport;
1287		register int i;
1288
1289		i = tcp->th_flags;
1290		/*
1291		 * Just plain ignore RST flag set with either FIN or SYN.
1292		 */
1293		if ((i & TH_RST) &&
1294		    ((i & (TH_FIN|TH_SYN|TH_RST)) != TH_RST))
1295			break;
1296	case IPPROTO_UDP :
1297		dport = tcp->th_dport;
1298		sport = tcp->th_sport;
1299		tryagain = 0;
1300		hv += dport;
1301		hv += sport;
1302		READ_ENTER(&ipf_state);
1303retry_tcpudp:
1304		hvm = hv % fr_statesize;
1305		for (isp = &ips_table[hvm]; (is = *isp); isp = &is->is_hnext)
1306			if ((is->is_p == pr) && (is->is_v == v) &&
1307			    fr_matchsrcdst(is, src, dst, fin, tcp)) {
1308				if ((pr == IPPROTO_TCP)) {
1309					if (!fr_tcpstate(is, fin, ip, tcp)) {
1310						continue;
1311					}
1312				} else if ((pr == IPPROTO_UDP)) {
1313					if (fin->fin_rev)
1314						is->is_age = fr_udpacktimeout;
1315					else
1316						is->is_age = fr_udptimeout;
1317				}
1318				break;
1319			}
1320		if (is != NULL) {
1321			if (tryagain &&
1322			    !(is->is_flags & (FI_WILDP|FI_WILDA))) {
1323				hv += dport;
1324				hv += sport;
1325				fr_ipsmove(isp, is, hv);
1326				MUTEX_DOWNGRADE(&ipf_state);
1327			}
1328			break;
1329		}
1330		RWLOCK_EXIT(&ipf_state);
1331		if (!tryagain && ips_wild) {
1332			hv -= dport;
1333			hv -= sport;
1334			tryagain = 1;
1335			WRITE_ENTER(&ipf_state);
1336			goto retry_tcpudp;
1337		}
1338		break;
1339	    }
1340	default :
1341		break;
1342	}
1343	if (is == NULL) {
1344		ATOMIC_INCL(ips_stats.iss_miss);
1345		return NULL;
1346	}
1347	MUTEX_ENTER(&is->is_lock);
1348	is->is_bytes += fin->fin_plen;
1349	ips_stats.iss_hits++;
1350	is->is_pkts++;
1351	MUTEX_EXIT(&is->is_lock);
1352	fr = is->is_rule;
1353	fin->fin_fr = fr;
1354	pass = is->is_pass;
1355#ifndef	_KERNEL
1356	if (tcp->th_flags & TCP_CLOSE)
1357		fr_delstate(is);
1358#endif
1359	RWLOCK_EXIT(&ipf_state);
1360	if ((fin->fin_fi.fi_fl & FI_FRAG) && (pass & FR_KEEPFRAG))
1361		ipfr_newfrag(ip, fin, pass ^ FR_KEEPSTATE);
1362	return fr;
1363}
1364
1365
1366void ip_statesync(ifp)
1367void *ifp;
1368{
1369	register ipstate_t *is;
1370
1371	WRITE_ENTER(&ipf_state);
1372	for (is = ips_list; is; is = is->is_next) {
1373		if (is->is_ifpin == ifp) {
1374			is->is_ifpin = GETUNIT(is->is_ifname[0], is->is_v);
1375			if (!is->is_ifpin)
1376				is->is_ifpin = (void *)-1;
1377		}
1378		if (is->is_ifpout == ifp) {
1379			is->is_ifpout = GETUNIT(is->is_ifname[1], is->is_v);
1380			if (!is->is_ifpout)
1381				is->is_ifpout = (void *)-1;
1382		}
1383	}
1384	RWLOCK_EXIT(&ipf_state);
1385}
1386
1387
1388/*
1389 * Must always be called with fr_ipfstate held as a write lock.
1390 */
1391static void fr_delstate(is)
1392ipstate_t *is;
1393{
1394	frentry_t *fr;
1395
1396	if (is->is_flags & (FI_WILDP|FI_WILDA))
1397		ips_wild--;
1398	if (is->is_next)
1399		is->is_next->is_pnext = is->is_pnext;
1400	*is->is_pnext = is->is_next;
1401	if (is->is_hnext)
1402		is->is_hnext->is_phnext = is->is_phnext;
1403	*is->is_phnext = is->is_hnext;
1404	if (ips_table[is->is_hv] == NULL)
1405		ips_stats.iss_inuse--;
1406
1407	fr = is->is_rule;
1408	if (fr != NULL) {
1409		fr->fr_ref--;
1410		if (fr->fr_ref == 0) {
1411			KFREE(fr);
1412		}
1413	}
1414#ifdef	_KERNEL
1415	MUTEX_DESTROY(&is->is_lock);
1416#endif
1417	KFREE(is);
1418	ips_num--;
1419}
1420
1421
1422/*
1423 * Free memory in use by all state info. kept.
1424 */
1425void fr_stateunload()
1426{
1427	register ipstate_t *is;
1428
1429	WRITE_ENTER(&ipf_state);
1430	while ((is = ips_list))
1431		fr_delstate(is);
1432	ips_stats.iss_inuse = 0;
1433	ips_num = 0;
1434	RWLOCK_EXIT(&ipf_state);
1435	if (ips_table)
1436		KFREES(ips_table, fr_statesize * sizeof(ipstate_t *));
1437	ips_table = NULL;
1438}
1439
1440
1441/*
1442 * Slowly expire held state for thingslike UDP and ICMP.  Timeouts are set
1443 * in expectation of this being called twice per second.
1444 */
1445void fr_timeoutstate()
1446{
1447	register ipstate_t *is, **isp;
1448#if defined(_KERNEL) && !SOLARIS
1449	int s;
1450#endif
1451
1452	SPL_NET(s);
1453	WRITE_ENTER(&ipf_state);
1454	for (isp = &ips_list; (is = *isp); )
1455		if (is->is_age && !--is->is_age) {
1456			if (is->is_p == IPPROTO_TCP)
1457				ips_stats.iss_fin++;
1458			else
1459				ips_stats.iss_expire++;
1460#ifdef	IPFILTER_LOG
1461			ipstate_log(is, ISL_EXPIRE);
1462#endif
1463			fr_delstate(is);
1464		} else
1465			isp = &is->is_next;
1466	if (fr_state_doflush) {
1467		(void) fr_state_flush(1);
1468		fr_state_doflush = 0;
1469	}
1470	RWLOCK_EXIT(&ipf_state);
1471	SPL_X(s);
1472}
1473
1474
1475/*
1476 * Original idea freom Pradeep Krishnan for use primarily with NAT code.
1477 * (pkrishna@netcom.com)
1478 *
1479 * Rewritten by Arjan de Vet <Arjan.deVet@adv.iae.nl>, 2000-07-29:
1480 *
1481 * - (try to) base state transitions on real evidence only,
1482 *   i.e. packets that are sent and have been received by ipfilter;
1483 *   diagram 18.12 of TCP/IP volume 1 by W. Richard Stevens was used.
1484 *
1485 * - deal with half-closed connections correctly;
1486 *
1487 * - store the state of the source in state[0] such that ipfstat
1488 *   displays the state as source/dest instead of dest/source; the calls
1489 *   to fr_tcp_age have been changed accordingly.
1490 *
1491 * Parameters:
1492 *
1493 *    state[0] = state of source (host that initiated connection)
1494 *    state[1] = state of dest   (host that accepted the connection)
1495 *
1496 *    dir == 0 : a packet from source to dest
1497 *    dir == 1 : a packet from dest to source
1498 *
1499 */
1500void fr_tcp_age(age, state, fin, dir)
1501u_long *age;
1502u_char *state;
1503fr_info_t *fin;
1504int dir;
1505{
1506	tcphdr_t *tcp = (tcphdr_t *)fin->fin_dp;
1507	u_char flags = tcp->th_flags;
1508	int dlen, ostate;
1509
1510	ostate = state[1 - dir];
1511
1512	dlen = fin->fin_plen - fin->fin_hlen - (tcp->th_off << 2);
1513
1514	if (flags & TH_RST) {
1515		if (!(tcp->th_flags & TH_PUSH) && !dlen) {
1516			*age = fr_tcpclosed;
1517			state[dir] = TCPS_CLOSED;
1518		} else {
1519			*age = fr_tcpclosewait;
1520			state[dir] = TCPS_CLOSE_WAIT;
1521		}
1522		return;
1523	}
1524
1525	*age = fr_tcptimeout; /* default 4 mins */
1526
1527	switch(state[dir])
1528	{
1529	case TCPS_CLOSED: /* 0 */
1530		if ((flags & TH_OPENING) == TH_OPENING) {
1531			/*
1532			 * 'dir' received an S and sends SA in response,
1533			 * CLOSED -> SYN_RECEIVED
1534			 */
1535			state[dir] = TCPS_SYN_RECEIVED;
1536			*age = fr_tcptimeout;
1537		} else if ((flags & (TH_SYN|TH_ACK)) == TH_SYN) {
1538			/* 'dir' sent S, CLOSED -> SYN_SENT */
1539			state[dir] = TCPS_SYN_SENT;
1540			*age = fr_tcptimeout;
1541		}
1542		/*
1543		 * The next piece of code makes it possible to get
1544		 * already established connections into the state table
1545		 * after a restart or reload of the filter rules; this
1546		 * does not work when a strict 'flags S keep state' is
1547		 * used for tcp connections of course
1548		 */
1549		if ((flags & (TH_FIN|TH_SYN|TH_RST|TH_ACK)) == TH_ACK) {
1550			/* we saw an A, guess 'dir' is in ESTABLISHED mode */
1551			state[dir] = TCPS_ESTABLISHED;
1552			*age = fr_tcpidletimeout;
1553		}
1554		/*
1555		 * TODO: besides regular ACK packets we can have other
1556		 * packets as well; it is yet to be determined how we
1557		 * should initialize the states in those cases
1558		 */
1559		break;
1560
1561	case TCPS_LISTEN: /* 1 */
1562		/* NOT USED */
1563		break;
1564
1565	case TCPS_SYN_SENT: /* 2 */
1566		if ((flags & (TH_SYN|TH_FIN|TH_ACK)) == TH_ACK) {
1567			/*
1568			 * We see an A from 'dir' which is in SYN_SENT
1569			 * state: 'dir' sent an A in response to an SA
1570			 * which it received, SYN_SENT -> ESTABLISHED
1571			 */
1572			state[dir] = TCPS_ESTABLISHED;
1573			*age = fr_tcpidletimeout;
1574		} else if (flags & TH_FIN) {
1575			/*
1576			 * We see an F from 'dir' which is in SYN_SENT
1577			 * state and wants to close its side of the
1578			 * connection; SYN_SENT -> FIN_WAIT_1
1579			 */
1580			state[dir] = TCPS_FIN_WAIT_1;
1581			*age = fr_tcpidletimeout; /* or fr_tcptimeout? */
1582		} else if ((flags & TH_OPENING) == TH_OPENING) {
1583			/*
1584			 * We see an SA from 'dir' which is already in
1585			 * SYN_SENT state, this means we have a
1586			 * simultaneous open; SYN_SENT -> SYN_RECEIVED
1587			 */
1588			state[dir] = TCPS_SYN_RECEIVED;
1589			*age = fr_tcptimeout;
1590		}
1591		break;
1592
1593	case TCPS_SYN_RECEIVED: /* 3 */
1594		if ((flags & (TH_SYN|TH_FIN|TH_ACK)) == TH_ACK) {
1595			/*
1596			 * We see an A from 'dir' which was in SYN_RECEIVED
1597			 * state so it must now be in established state,
1598			 * SYN_RECEIVED -> ESTABLISHED
1599			 */
1600			state[dir] = TCPS_ESTABLISHED;
1601			*age = fr_tcpidletimeout;
1602		} else if (flags & TH_FIN) {
1603			/*
1604			 * We see an F from 'dir' which is in SYN_RECEIVED
1605			 * state and wants to close its side of the connection;
1606			 * SYN_RECEIVED -> FIN_WAIT_1
1607			 */
1608			state[dir] = TCPS_FIN_WAIT_1;
1609			*age = fr_tcpidletimeout;
1610		}
1611		break;
1612
1613	case TCPS_ESTABLISHED: /* 4 */
1614		if (flags & TH_FIN) {
1615			/*
1616			 * 'dir' closed its side of the connection; this
1617			 * gives us a half-closed connection;
1618			 * ESTABLISHED -> FIN_WAIT_1
1619			 */
1620			state[dir] = TCPS_FIN_WAIT_1;
1621			*age = fr_tcphalfclosed;
1622		} else if (flags & TH_ACK) {
1623			/* an ACK, should we exclude other flags here? */
1624			if (ostate == TCPS_FIN_WAIT_1) {
1625				/*
1626				 * We know the other side did an active close,
1627				 * so we are ACKing the recvd FIN packet (does
1628				 * the window matching code guarantee this?)
1629				 * and go into CLOSE_WAIT state; this gives us
1630				 * a half-closed connection
1631				 */
1632				state[dir] = TCPS_CLOSE_WAIT;
1633				*age = fr_tcphalfclosed;
1634			} else if (ostate < TCPS_CLOSE_WAIT)
1635				/*
1636				 * Still a fully established connection,
1637				 * reset timeout
1638				 */
1639				*age = fr_tcpidletimeout;
1640		}
1641		break;
1642
1643	case TCPS_CLOSE_WAIT: /* 5 */
1644		if (flags & TH_FIN) {
1645			/*
1646			 * Application closed and 'dir' sent a FIN, we're now
1647			 * going into LAST_ACK state
1648			 */
1649			*age  = fr_tcplastack;
1650			state[dir] = TCPS_LAST_ACK;
1651		} else {
1652			/*
1653			 * We remain in CLOSE_WAIT because the other side has
1654			 * closed already and we did not close our side yet;
1655			 * reset timeout
1656			 */
1657			*age  = fr_tcphalfclosed;
1658		}
1659		break;
1660
1661	case TCPS_FIN_WAIT_1: /* 6 */
1662		if ((flags & TH_ACK) && ostate > TCPS_CLOSE_WAIT) {
1663			/*
1664			 * If the other side is not active anymore it has sent
1665			 * us a FIN packet that we are ack'ing now with an ACK;
1666			 * this means both sides have now closed the connection
1667			 * and we go into TIME_WAIT
1668			 */
1669			/*
1670			 * XXX: how do we know we really are ACKing the FIN
1671			 * packet here? does the window code guarantee that?
1672			 */
1673			state[dir] = TCPS_TIME_WAIT;
1674			*age = fr_tcptimeout;
1675		} else
1676			/*
1677			 * We closed our side of the connection already but the
1678			 * other side is still active (ESTABLISHED/CLOSE_WAIT);
1679			 * continue with this half-closed connection
1680			 */
1681			*age = fr_tcphalfclosed;
1682		break;
1683
1684	case TCPS_CLOSING: /* 7 */
1685		/* NOT USED */
1686		break;
1687
1688	case TCPS_LAST_ACK: /* 8 */
1689		if (flags & TH_ACK) {
1690			if ((flags & TH_PUSH) || dlen)
1691				/*
1692				 * There is still data to be delivered, reset
1693				 * timeout
1694				 */
1695				*age  = fr_tcplastack;
1696		}
1697		/*
1698		 * We cannot detect when we go out of LAST_ACK state to CLOSED
1699		 * because that is based on the reception of ACK packets;
1700		 * ipfilter can only detect that a packet has been sent by a
1701		 * host
1702		 */
1703		break;
1704
1705	case TCPS_FIN_WAIT_2: /* 9 */
1706		/* NOT USED */
1707		break;
1708
1709	case TCPS_TIME_WAIT: /* 10 */
1710		/* we're in 2MSL timeout now */
1711		break;
1712	}
1713}
1714
1715
1716#ifdef	IPFILTER_LOG
1717void ipstate_log(is, type)
1718struct ipstate *is;
1719u_int type;
1720{
1721	struct	ipslog	ipsl;
1722	void *items[1];
1723	size_t sizes[1];
1724	int types[1];
1725
1726	ipsl.isl_type = type;
1727	ipsl.isl_pkts = is->is_pkts;
1728	ipsl.isl_bytes = is->is_bytes;
1729	ipsl.isl_src = is->is_src;
1730	ipsl.isl_dst = is->is_dst;
1731	ipsl.isl_p = is->is_p;
1732	ipsl.isl_v = is->is_v;
1733	ipsl.isl_flags = is->is_flags;
1734	if (ipsl.isl_p == IPPROTO_TCP || ipsl.isl_p == IPPROTO_UDP) {
1735		ipsl.isl_sport = is->is_sport;
1736		ipsl.isl_dport = is->is_dport;
1737		if (ipsl.isl_p == IPPROTO_TCP) {
1738			ipsl.isl_state[0] = is->is_state[0];
1739			ipsl.isl_state[1] = is->is_state[1];
1740		}
1741	} else if (ipsl.isl_p == IPPROTO_ICMP)
1742		ipsl.isl_itype = is->is_icmp.ics_type;
1743	else {
1744		ipsl.isl_ps.isl_filler[0] = 0;
1745		ipsl.isl_ps.isl_filler[1] = 0;
1746	}
1747	items[0] = &ipsl;
1748	sizes[0] = sizeof(ipsl);
1749	types[0] = 0;
1750
1751	(void) ipllog(IPL_LOGSTATE, NULL, items, sizes, types, 1);
1752}
1753#endif
1754
1755
1756#ifdef	USE_INET6
1757frentry_t *fr_checkicmp6matchingstate(ip, fin)
1758ip6_t *ip;
1759fr_info_t *fin;
1760{
1761	register ipstate_t *is, **isp;
1762	register u_short sport, dport;
1763	register u_char	pr;
1764	struct icmp6_hdr *ic, *oic;
1765	union i6addr dst, src;
1766	u_short savelen;
1767	fr_info_t ofin;
1768	tcphdr_t *tcp;
1769	frentry_t *fr;
1770	ip6_t *oip;
1771	int type;
1772	u_int hv;
1773
1774	/*
1775	 * Does it at least have the return (basic) IP header ?
1776	 * Only a basic IP header (no options) should be with
1777	 * an ICMP error header.
1778	 */
1779	if ((fin->fin_v != 6) || (fin->fin_plen < ICMP6ERR_MINPKTLEN))
1780		return NULL;
1781	ic = (struct icmp6_hdr *)fin->fin_dp;
1782	type = ic->icmp6_type;
1783	/*
1784	 * If it's not an error type, then return
1785	 */
1786	if ((type != ICMP6_DST_UNREACH) && (type != ICMP6_PACKET_TOO_BIG) &&
1787	    (type != ICMP6_TIME_EXCEEDED) && (type != ICMP6_PARAM_PROB))
1788		return NULL;
1789
1790	oip = (ip6_t *)((char *)ic + ICMPERR_ICMPHLEN);
1791	if (fin->fin_plen < sizeof(*oip))
1792		return NULL;
1793
1794	if (oip->ip6_nxt == IPPROTO_ICMPV6) {
1795		oic = (struct icmp6_hdr *)(oip + 1);
1796		/*
1797		 * a ICMP error can only be generated as a result of an
1798		 * ICMP query, not as the response on an ICMP error
1799		 *
1800		 * XXX theoretically ICMP_ECHOREP and the other reply's are
1801		 * ICMP query's as well, but adding them here seems strange XXX
1802		 */
1803		 if (!(oic->icmp6_type & ICMP6_INFOMSG_MASK))
1804		    	return NULL;
1805
1806		/*
1807		 * perform a lookup of the ICMP packet in the state table
1808		 */
1809		hv = (pr = oip->ip6_nxt);
1810		src.in6 = oip->ip6_src;
1811		hv += src.in4.s_addr;
1812		dst.in6 = oip->ip6_dst;
1813		hv += dst.in4.s_addr;
1814		hv += oic->icmp6_id;
1815		hv += oic->icmp6_seq;
1816		hv %= fr_statesize;
1817
1818		oip->ip6_plen = ntohs(oip->ip6_plen);
1819		ofin.fin_v = 6;
1820		fr_makefrip(sizeof(*oip), (ip_t *)oip, &ofin);
1821		oip->ip6_plen = htons(oip->ip6_plen);
1822		ofin.fin_ifp = fin->fin_ifp;
1823		ofin.fin_out = !fin->fin_out;
1824		ofin.fin_mp = NULL; /* if dereferenced, panic XXX */
1825
1826		READ_ENTER(&ipf_state);
1827		for (isp = &ips_table[hv]; (is = *isp); isp = &is->is_hnext)
1828			if ((is->is_p == pr) &&
1829			    (oic->icmp6_id == is->is_icmp.ics_id) &&
1830			    (oic->icmp6_seq == is->is_icmp.ics_seq) &&
1831			    fr_matchsrcdst(is, src, dst, &ofin, NULL)) {
1832			    	/*
1833			    	 * in the state table ICMP query's are stored
1834			    	 * with the type of the corresponding ICMP
1835			    	 * response. Correct here
1836			    	 */
1837				if (((is->is_type == ICMP6_ECHO_REPLY) &&
1838				     (oic->icmp6_type == ICMP6_ECHO_REQUEST)) ||
1839				     (is->is_type - 1 == oic->icmp6_type )) {
1840				    	ips_stats.iss_hits++;
1841    					is->is_pkts++;
1842					is->is_bytes += fin->fin_plen;
1843					return is->is_rule;
1844				}
1845			}
1846		RWLOCK_EXIT(&ipf_state);
1847
1848		return NULL;
1849	};
1850
1851	if ((oip->ip6_nxt != IPPROTO_TCP) && (oip->ip6_nxt != IPPROTO_UDP))
1852		return NULL;
1853	tcp = (tcphdr_t *)(oip + 1);
1854	dport = tcp->th_dport;
1855	sport = tcp->th_sport;
1856
1857	hv = (pr = oip->ip6_nxt);
1858	src.in6 = oip->ip6_src;
1859	hv += src.in4.s_addr;
1860	dst.in6 = oip->ip6_dst;
1861	hv += dst.in4.s_addr;
1862	hv += dport;
1863	hv += sport;
1864	hv %= fr_statesize;
1865	/*
1866	 * we make an fin entry to be able to feed it to
1867	 * matchsrcdst note that not all fields are encessary
1868	 * but this is the cleanest way. Note further we fill
1869	 * in fin_mp such that if someone uses it we'll get
1870	 * a kernel panic. fr_matchsrcdst does not use this.
1871	 *
1872	 * watch out here, as ip is in host order and oip in network
1873	 * order. Any change we make must be undone afterwards.
1874	 */
1875	savelen = oip->ip6_plen;
1876	oip->ip6_plen = ip->ip6_plen - sizeof(*ip) - ICMPERR_ICMPHLEN;
1877	ofin.fin_v = 6;
1878	fr_makefrip(sizeof(*oip), (ip_t *)oip, &ofin);
1879	oip->ip6_plen = savelen;
1880	ofin.fin_ifp = fin->fin_ifp;
1881	ofin.fin_out = !fin->fin_out;
1882	ofin.fin_mp = NULL; /* if dereferenced, panic XXX */
1883	READ_ENTER(&ipf_state);
1884	for (isp = &ips_table[hv]; (is = *isp); isp = &is->is_hnext) {
1885		/*
1886		 * Only allow this icmp though if the
1887		 * encapsulated packet was allowed through the
1888		 * other way around. Note that the minimal amount
1889		 * of info present does not allow for checking against
1890		 * tcp internals such as seq and ack numbers.
1891		 */
1892		if ((is->is_p == pr) && (is->is_v == 6) &&
1893		    fr_matchsrcdst(is, src, dst, &ofin, tcp)) {
1894			fr = is->is_rule;
1895			ips_stats.iss_hits++;
1896			/*
1897			 * we must swap src and dst here because the icmp
1898			 * comes the other way around
1899			 */
1900			is->is_pkts++;
1901			is->is_bytes += fin->fin_plen;
1902			/*
1903			 * we deliberately do not touch the timeouts
1904			 * for the accompanying state table entry.
1905			 * It remains to be seen if that is correct. XXX
1906			 */
1907			RWLOCK_EXIT(&ipf_state);
1908			return fr;
1909		}
1910	}
1911	RWLOCK_EXIT(&ipf_state);
1912	return NULL;
1913}
1914#endif
1915