ip_input.c revision 1.115
1/*	$NetBSD: ip_input.c,v 1.115 2000/06/28 03:01:16 mrg Exp $	*/
2
3/*
4 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 * 3. Neither the name of the project nor the names of its contributors
16 *    may be used to endorse or promote products derived from this software
17 *    without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE.
30 */
31
32/*-
33 * Copyright (c) 1998 The NetBSD Foundation, Inc.
34 * All rights reserved.
35 *
36 * This code is derived from software contributed to The NetBSD Foundation
37 * by Public Access Networks Corporation ("Panix").  It was developed under
38 * contract to Panix by Eric Haszlakiewicz and Thor Lancelot Simon.
39 *
40 * Redistribution and use in source and binary forms, with or without
41 * modification, are permitted provided that the following conditions
42 * are met:
43 * 1. Redistributions of source code must retain the above copyright
44 *    notice, this list of conditions and the following disclaimer.
45 * 2. Redistributions in binary form must reproduce the above copyright
46 *    notice, this list of conditions and the following disclaimer in the
47 *    documentation and/or other materials provided with the distribution.
48 * 3. All advertising materials mentioning features or use of this software
49 *    must display the following acknowledgement:
50 *	This product includes software developed by the NetBSD
51 *	Foundation, Inc. and its contributors.
52 * 4. Neither the name of The NetBSD Foundation nor the names of its
53 *    contributors may be used to endorse or promote products derived
54 *    from this software without specific prior written permission.
55 *
56 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
57 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
58 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
59 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
60 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
61 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
62 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
63 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
64 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
65 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
66 * POSSIBILITY OF SUCH DAMAGE.
67 */
68
69/*
70 * Copyright (c) 1982, 1986, 1988, 1993
71 *	The Regents of the University of California.  All rights reserved.
72 *
73 * Redistribution and use in source and binary forms, with or without
74 * modification, are permitted provided that the following conditions
75 * are met:
76 * 1. Redistributions of source code must retain the above copyright
77 *    notice, this list of conditions and the following disclaimer.
78 * 2. Redistributions in binary form must reproduce the above copyright
79 *    notice, this list of conditions and the following disclaimer in the
80 *    documentation and/or other materials provided with the distribution.
81 * 3. All advertising materials mentioning features or use of this software
82 *    must display the following acknowledgement:
83 *	This product includes software developed by the University of
84 *	California, Berkeley and its contributors.
85 * 4. Neither the name of the University nor the names of its contributors
86 *    may be used to endorse or promote products derived from this software
87 *    without specific prior written permission.
88 *
89 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
90 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
91 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
92 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
93 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
94 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
95 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
96 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
97 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
98 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
99 * SUCH DAMAGE.
100 *
101 *	@(#)ip_input.c	8.2 (Berkeley) 1/4/94
102 */
103
104#include "opt_gateway.h"
105#include "opt_pfil_hooks.h"
106#include "opt_ipsec.h"
107#include "opt_mrouting.h"
108
109#include <sys/param.h>
110#include <sys/systm.h>
111#include <sys/malloc.h>
112#include <sys/mbuf.h>
113#include <sys/domain.h>
114#include <sys/protosw.h>
115#include <sys/socket.h>
116#include <sys/socketvar.h>
117#include <sys/errno.h>
118#include <sys/time.h>
119#include <sys/kernel.h>
120#include <sys/proc.h>
121#include <sys/pool.h>
122
123#include <uvm/uvm_extern.h>
124
125#include <sys/sysctl.h>
126
127#include <net/if.h>
128#include <net/if_dl.h>
129#include <net/route.h>
130#include <net/pfil.h>
131
132#include <netinet/in.h>
133#include <netinet/in_systm.h>
134#include <netinet/ip.h>
135#include <netinet/in_pcb.h>
136#include <netinet/in_var.h>
137#include <netinet/ip_var.h>
138#include <netinet/ip_icmp.h>
139/* just for gif_ttl */
140#include <netinet/in_gif.h>
141#include "gif.h"
142
143#ifdef MROUTING
144#include <netinet/ip_mroute.h>
145#endif
146
147#ifdef IPSEC
148#include <netinet6/ipsec.h>
149#include <netkey/key.h>
150#include <netkey/key_debug.h>
151#endif
152
/*
 * Compile-time defaults for the run-time tunables defined further down.
 * Each macro is only given a value here when it has not already been
 * defined, so a pre-existing definition always wins.
 */
#if !defined(IPFORWARDING)
#if defined(GATEWAY)
#define	IPFORWARDING	1	/* forward IP packets not for us */
#else
#define	IPFORWARDING	0	/* don't forward IP packets not for us */
#endif
#endif /* !IPFORWARDING */

#if !defined(IPSENDREDIRECTS)
#define	IPSENDREDIRECTS	1
#endif

#if !defined(IPFORWSRCRT)
#define	IPFORWSRCRT	1	/* forward source-routed packets */
#endif

#if !defined(IPALLOWSRCRT)
#define	IPALLOWSRCRT	1	/* allow source-routed packets */
#endif

#if !defined(IPMTUDISC)
#define	IPMTUDISC	0
#endif

#if !defined(IPMTUDISCTIMEOUT)
#define	IPMTUDISCTIMEOUT (10 * 60)	/* as per RFC 1191 */
#endif

/*
 * Note: the older DIRECTED_BROADCAST option is still honoured so that
 * configurations written for it keep working unchanged: when it is
 * defined, IPDIRECTEDBCAST defaults to 1.
 */
#if !defined(IPDIRECTEDBCAST)
#if defined(DIRECTED_BROADCAST)
#define	IPDIRECTEDBCAST	1
#else
#define	IPDIRECTEDBCAST	0
#endif
#endif /* !IPDIRECTEDBCAST */
187int	ipforwarding = IPFORWARDING;
188int	ipsendredirects = IPSENDREDIRECTS;
189int	ip_defttl = IPDEFTTL;
190int	ip_forwsrcrt = IPFORWSRCRT;
191int	ip_directedbcast = IPDIRECTEDBCAST;
192int	ip_allowsrcrt = IPALLOWSRCRT;
193int	ip_mtudisc = IPMTUDISC;
194u_int	ip_mtudisc_timeout = IPMTUDISCTIMEOUT;
195#ifdef DIAGNOSTIC
196int	ipprintfs = 0;
197#endif
198
199struct rttimer_queue *ip_mtudisc_timeout_q = NULL;
200
201extern	struct domain inetdomain;
202int	ipqmaxlen = IFQ_MAXLEN;
203struct	in_ifaddrhead in_ifaddr;
204struct	in_ifaddrhashhead *in_ifaddrhashtbl;
205struct	ifqueue ipintrq;
206struct	ipstat	ipstat;
207u_int16_t	ip_id;
208
209struct ipqhead ipq;
210int	ipq_locked;
211
212static __inline int ipq_lock_try __P((void));
213static __inline void ipq_unlock __P((void));
214
215static __inline int
216ipq_lock_try()
217{
218	int s;
219
220	s = splimp();
221	if (ipq_locked) {
222		splx(s);
223		return (0);
224	}
225	ipq_locked = 1;
226	splx(s);
227	return (1);
228}
229
230static __inline void
231ipq_unlock()
232{
233	int s;
234
235	s = splimp();
236	ipq_locked = 0;
237	splx(s);
238}
239
/*
 * IPQ_LOCK()        take the ipq lock.  A DIAGNOSTIC kernel panics if the
 *                   lock is already held; otherwise the result of
 *                   ipq_lock_try() is discarded.
 * IPQ_LOCK_CHECK()  DIAGNOSTIC only: panic unless the lock is held.
 * IPQ_UNLOCK()      release the ipq lock.
 */
#ifndef DIAGNOSTIC
#define	IPQ_LOCK()		(void) ipq_lock_try()
#define	IPQ_LOCK_CHECK()	/* nothing */
#else /* DIAGNOSTIC */
#define	IPQ_LOCK()							\
do {									\
	if (!ipq_lock_try()) {						\
		printf("%s:%d: ipq already locked\n", __FILE__, __LINE__); \
		panic("ipq_lock");					\
	}								\
} while (0)
#define	IPQ_LOCK_CHECK()						\
do {									\
	if (!ipq_locked) {						\
		printf("%s:%d: ipq lock not held\n", __FILE__, __LINE__); \
		panic("ipq lock check");				\
	}								\
} while (0)
#endif /* DIAGNOSTIC */

#define	IPQ_UNLOCK()		ipq_unlock()
261
262struct pool ipqent_pool;
263
264/*
265 * We need to save the IP options in case a protocol wants to respond
266 * to an incoming packet over the same route if the packet got here
267 * using IP source routing.  This allows connection establishment and
268 * maintenance when the remote end is on a network that is not known
269 * to us.
270 */
271int	ip_nhops = 0;
272static	struct ip_srcrt {
273	struct	in_addr dst;			/* final destination */
274	char	nop;				/* one NOP to align */
275	char	srcopt[IPOPT_OFFSET + 1];	/* OPTVAL, OLEN and OFFSET */
276	struct	in_addr route[MAX_IPOPTLEN/sizeof(struct in_addr)];
277} ip_srcrt;
278
279static void save_rte __P((u_char *, struct in_addr));
280
281/*
282 * IP initialization: fill in IP protocol switch table.
283 * All protocols not implemented in kernel go to raw IP protocol handler.
284 */
285void
286ip_init()
287{
288	struct protosw *pr;
289	int i;
290
291	pool_init(&ipqent_pool, sizeof(struct ipqent), 0, 0, 0, "ipqepl",
292	    0, NULL, NULL, M_IPQ);
293
294	pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW);
295	if (pr == 0)
296		panic("ip_init");
297	for (i = 0; i < IPPROTO_MAX; i++)
298		ip_protox[i] = pr - inetsw;
299	for (pr = inetdomain.dom_protosw;
300	    pr < inetdomain.dom_protoswNPROTOSW; pr++)
301		if (pr->pr_domain->dom_family == PF_INET &&
302		    pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW)
303			ip_protox[pr->pr_protocol] = pr - inetsw;
304	LIST_INIT(&ipq);
305	ip_id = time.tv_sec & 0xffff;
306	ipintrq.ifq_maxlen = ipqmaxlen;
307	TAILQ_INIT(&in_ifaddr);
308	in_ifaddrhashtbl =
309	    hashinit(IN_IFADDR_HASH_SIZE, M_IFADDR, M_WAITOK, &in_ifaddrhash);
310	if (ip_mtudisc != 0)
311		ip_mtudisc_timeout_q =
312		    rt_timer_queue_create(ip_mtudisc_timeout);
313#ifdef GATEWAY
314	ipflow_init();
315#endif
316}
317
318struct	sockaddr_in ipaddr = { sizeof(ipaddr), AF_INET };
319struct	route ipforward_rt;
320
321/*
322 * IP software interrupt routine
323 */
324void
325ipintr()
326{
327	int s;
328	struct mbuf *m;
329
330	while (1) {
331		s = splimp();
332		IF_DEQUEUE(&ipintrq, m);
333		splx(s);
334		if (m == 0)
335			return;
336		ip_input(m);
337	}
338}
339
340/*
341 * Ip input routine.  Checksum and byte swap header.  If fragmented
342 * try to reassemble.  Process options.  Pass to next level.
343 */
344void
345ip_input(struct mbuf *m)
346{
347	struct ip *ip = NULL;
348	struct ipq *fp;
349	struct in_ifaddr *ia;
350	struct ifaddr *ifa;
351	struct ipqent *ipqe;
352	int hlen = 0, mff, len;
353	int downmatch;
354#ifdef PFIL_HOOKS
355	struct packet_filter_hook *pfh;
356	struct mbuf *m0;
357	int rv;
358#endif /* PFIL_HOOKS */
359
360#ifdef	DIAGNOSTIC
361	if ((m->m_flags & M_PKTHDR) == 0)
362		panic("ipintr no HDR");
363#endif
364#ifdef IPSEC
365	/*
366	 * should the inner packet be considered authentic?
367	 * see comment in ah4_input().
368	 */
369	if (m) {
370		m->m_flags &= ~M_AUTHIPHDR;
371		m->m_flags &= ~M_AUTHIPDGM;
372	}
373#endif
374	/*
375	 * If no IP addresses have been set yet but the interfaces
376	 * are receiving, can't do anything with incoming packets yet.
377	 */
378	if (in_ifaddr.tqh_first == 0)
379		goto bad;
380	ipstat.ips_total++;
381	if (m->m_len < sizeof (struct ip) &&
382	    (m = m_pullup(m, sizeof (struct ip))) == 0) {
383		ipstat.ips_toosmall++;
384		return;
385	}
386	ip = mtod(m, struct ip *);
387	if (ip->ip_v != IPVERSION) {
388		ipstat.ips_badvers++;
389		goto bad;
390	}
391	hlen = ip->ip_hl << 2;
392	if (hlen < sizeof(struct ip)) {	/* minimum header length */
393		ipstat.ips_badhlen++;
394		goto bad;
395	}
396	if (hlen > m->m_len) {
397		if ((m = m_pullup(m, hlen)) == 0) {
398			ipstat.ips_badhlen++;
399			return;
400		}
401		ip = mtod(m, struct ip *);
402	}
403
404	/*
405	 * RFC1122: packets with a multicast source address are
406	 * not allowed.
407	 */
408	if (IN_MULTICAST(ip->ip_src.s_addr)) {
409		/* XXX stat */
410		goto bad;
411	}
412
413	if (in_cksum(m, hlen) != 0) {
414		ipstat.ips_badsum++;
415		goto bad;
416	}
417
418	/*
419	 * Convert fields to host representation.
420	 */
421	NTOHS(ip->ip_len);
422	NTOHS(ip->ip_off);
423	len = ip->ip_len;
424
425	/*
426	 * Check for additional length bogosity
427	 */
428	if (len < hlen) {
429	 	ipstat.ips_badlen++;
430		goto bad;
431	}
432
433	/*
434	 * Check that the amount of data in the buffers
435	 * is as at least much as the IP header would have us expect.
436	 * Trim mbufs if longer than we expect.
437	 * Drop packet if shorter than we expect.
438	 */
439	if (m->m_pkthdr.len < len) {
440		ipstat.ips_tooshort++;
441		goto bad;
442	}
443	if (m->m_pkthdr.len > len) {
444		if (m->m_len == m->m_pkthdr.len) {
445			m->m_len = len;
446			m->m_pkthdr.len = len;
447		} else
448			m_adj(m, len - m->m_pkthdr.len);
449	}
450
451#ifdef IPSEC
452	/* ipflow (IP fast fowarding) is not compatible with IPsec. */
453	m->m_flags &= ~M_CANFASTFWD;
454#else
455	/*
456	 * Assume that we can create a fast-forward IP flow entry
457	 * based on this packet.
458	 */
459	m->m_flags |= M_CANFASTFWD;
460#endif
461
462#ifdef PFIL_HOOKS
463	/*
464	 * Run through list of hooks for input packets.  If there are any
465	 * filters which require that additional packets in the flow are
466	 * not fast-forwarded, they must clear the M_CANFASTFWD flag.
467	 * Note that filters must _never_ set this flag, as another filter
468	 * in the list may have previously cleared it.
469	 */
470	m0 = m;
471	pfh = pfil_hook_get(PFIL_IN, &inetsw[ip_protox[IPPROTO_IP]].pr_pfh);
472	for (; pfh; pfh = pfh->pfil_link.tqe_next)
473		if (pfh->pfil_func) {
474			rv = pfh->pfil_func(ip, hlen,
475					    m->m_pkthdr.rcvif, 0, &m0);
476			if (rv)
477				return;
478			m = m0;
479			if (m == NULL)
480				return;
481			ip = mtod(m, struct ip *);
482		}
483#endif /* PFIL_HOOKS */
484
485	/*
486	 * Process options and, if not destined for us,
487	 * ship it on.  ip_dooptions returns 1 when an
488	 * error was detected (causing an icmp message
489	 * to be sent and the original packet to be freed).
490	 */
491	ip_nhops = 0;		/* for source routed packets */
492	if (hlen > sizeof (struct ip) && ip_dooptions(m))
493		return;
494
495	/*
496	 * Check our list of addresses, to see if the packet is for us.
497	 *
498	 * Traditional 4.4BSD did not consult IFF_UP at all.
499	 * The behavior here is to treat addresses on !IFF_UP interface
500	 * as not mine.
501	 */
502	downmatch = 0;
503	for (ia = IN_IFADDR_HASH(ip->ip_dst.s_addr).lh_first;
504	     ia != NULL;
505	     ia = ia->ia_hash.le_next) {
506		if (in_hosteq(ia->ia_addr.sin_addr, ip->ip_dst)) {
507			if ((ia->ia_ifp->if_flags & IFF_UP) != 0)
508				break;
509			else
510				downmatch++;
511		}
512	}
513	if (ia != NULL)
514		goto ours;
515	if (m->m_pkthdr.rcvif->if_flags & IFF_BROADCAST) {
516		for (ifa = m->m_pkthdr.rcvif->if_addrlist.tqh_first;
517		    ifa != NULL; ifa = ifa->ifa_list.tqe_next) {
518			if (ifa->ifa_addr->sa_family != AF_INET) continue;
519			ia = ifatoia(ifa);
520			if (in_hosteq(ip->ip_dst, ia->ia_broadaddr.sin_addr) ||
521			    in_hosteq(ip->ip_dst, ia->ia_netbroadcast) ||
522			    /*
523			     * Look for all-0's host part (old broadcast addr),
524			     * either for subnet or net.
525			     */
526			    ip->ip_dst.s_addr == ia->ia_subnet ||
527			    ip->ip_dst.s_addr == ia->ia_net)
528				goto ours;
529			/*
530			 * An interface with IP address zero accepts
531			 * all packets that arrive on that interface.
532			 */
533			if (in_nullhost(ia->ia_addr.sin_addr))
534				goto ours;
535		}
536	}
537	if (IN_MULTICAST(ip->ip_dst.s_addr)) {
538		struct in_multi *inm;
539#ifdef MROUTING
540		extern struct socket *ip_mrouter;
541
542		if (m->m_flags & M_EXT) {
543			if ((m = m_pullup(m, hlen)) == 0) {
544				ipstat.ips_toosmall++;
545				return;
546			}
547			ip = mtod(m, struct ip *);
548		}
549
550		if (ip_mrouter) {
551			/*
552			 * If we are acting as a multicast router, all
553			 * incoming multicast packets are passed to the
554			 * kernel-level multicast forwarding function.
555			 * The packet is returned (relatively) intact; if
556			 * ip_mforward() returns a non-zero value, the packet
557			 * must be discarded, else it may be accepted below.
558			 *
559			 * (The IP ident field is put in the same byte order
560			 * as expected when ip_mforward() is called from
561			 * ip_output().)
562			 */
563			if (ip_mforward(m, m->m_pkthdr.rcvif) != 0) {
564				ipstat.ips_cantforward++;
565				m_freem(m);
566				return;
567			}
568
569			/*
570			 * The process-level routing demon needs to receive
571			 * all multicast IGMP packets, whether or not this
572			 * host belongs to their destination groups.
573			 */
574			if (ip->ip_p == IPPROTO_IGMP)
575				goto ours;
576			ipstat.ips_forward++;
577		}
578#endif
579		/*
580		 * See if we belong to the destination multicast group on the
581		 * arrival interface.
582		 */
583		IN_LOOKUP_MULTI(ip->ip_dst, m->m_pkthdr.rcvif, inm);
584		if (inm == NULL) {
585			ipstat.ips_cantforward++;
586			m_freem(m);
587			return;
588		}
589		goto ours;
590	}
591	if (ip->ip_dst.s_addr == INADDR_BROADCAST ||
592	    in_nullhost(ip->ip_dst))
593		goto ours;
594
595	/*
596	 * Not for us; forward if possible and desirable.
597	 */
598	if (ipforwarding == 0) {
599		ipstat.ips_cantforward++;
600		m_freem(m);
601	} else {
602		/*
603		 * If ip_dst matched any of my address on !IFF_UP interface,
604		 * and there's no IFF_UP interface that matches ip_dst,
605		 * send icmp unreach.  Forwarding it will result in in-kernel
606		 * forwarding loop till TTL goes to 0.
607		 */
608		if (downmatch) {
609			icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0);
610			ipstat.ips_cantforward++;
611			return;
612		}
613		ip_forward(m, 0);
614	}
615	return;
616
617ours:
618	/*
619	 * If offset or IP_MF are set, must reassemble.
620	 * Otherwise, nothing need be done.
621	 * (We could look in the reassembly queue to see
622	 * if the packet was previously fragmented,
623	 * but it's not worth the time; just let them time out.)
624	 */
625	if (ip->ip_off & ~(IP_DF|IP_RF)) {
626		/*
627		 * Look for queue of fragments
628		 * of this datagram.
629		 */
630		IPQ_LOCK();
631		for (fp = ipq.lh_first; fp != NULL; fp = fp->ipq_q.le_next)
632			if (ip->ip_id == fp->ipq_id &&
633			    in_hosteq(ip->ip_src, fp->ipq_src) &&
634			    in_hosteq(ip->ip_dst, fp->ipq_dst) &&
635			    ip->ip_p == fp->ipq_p)
636				goto found;
637		fp = 0;
638found:
639
640		/*
641		 * Adjust ip_len to not reflect header,
642		 * set ipqe_mff if more fragments are expected,
643		 * convert offset of this to bytes.
644		 */
645		ip->ip_len -= hlen;
646		mff = (ip->ip_off & IP_MF) != 0;
647		if (mff) {
648		        /*
649		         * Make sure that fragments have a data length
650			 * that's a non-zero multiple of 8 bytes.
651		         */
652			if (ip->ip_len == 0 || (ip->ip_len & 0x7) != 0) {
653				ipstat.ips_badfrags++;
654				IPQ_UNLOCK();
655				goto bad;
656			}
657		}
658		ip->ip_off <<= 3;
659
660		/*
661		 * If datagram marked as having more fragments
662		 * or if this is not the first fragment,
663		 * attempt reassembly; if it succeeds, proceed.
664		 */
665		if (mff || ip->ip_off) {
666			ipstat.ips_fragments++;
667			ipqe = pool_get(&ipqent_pool, PR_NOWAIT);
668			if (ipqe == NULL) {
669				ipstat.ips_rcvmemdrop++;
670				IPQ_UNLOCK();
671				goto bad;
672			}
673			ipqe->ipqe_mff = mff;
674			ipqe->ipqe_m = m;
675			ipqe->ipqe_ip = ip;
676			m = ip_reass(ipqe, fp);
677			if (m == 0) {
678				IPQ_UNLOCK();
679				return;
680			}
681			ipstat.ips_reassembled++;
682			ip = mtod(m, struct ip *);
683			hlen = ip->ip_hl << 2;
684			ip->ip_len += hlen;
685		} else
686			if (fp)
687				ip_freef(fp);
688		IPQ_UNLOCK();
689	}
690
691	/*
692	 * Switch out to protocol's input routine.
693	 */
694#if IFA_STATS
695	ia->ia_ifa.ifa_data.ifad_inbytes += ip->ip_len;
696#endif
697	ipstat.ips_delivered++;
698    {
699	int off = hlen, nh = ip->ip_p;
700
701	(*inetsw[ip_protox[nh]].pr_input)(m, off, nh);
702	return;
703    }
704bad:
705	m_freem(m);
706}
707
708/*
709 * Take incoming datagram fragment and try to
710 * reassemble it into whole datagram.  If a chain for
711 * reassembly of this datagram already exists, then it
712 * is given as fp; otherwise have to make a chain.
713 */
714struct mbuf *
715ip_reass(ipqe, fp)
716	struct ipqent *ipqe;
717	struct ipq *fp;
718{
719	struct mbuf *m = ipqe->ipqe_m;
720	struct ipqent *nq, *p, *q;
721	struct ip *ip;
722	struct mbuf *t;
723	int hlen = ipqe->ipqe_ip->ip_hl << 2;
724	int i, next;
725
726	IPQ_LOCK_CHECK();
727
728	/*
729	 * Presence of header sizes in mbufs
730	 * would confuse code below.
731	 */
732	m->m_data += hlen;
733	m->m_len -= hlen;
734
735	/*
736	 * If first fragment to arrive, create a reassembly queue.
737	 */
738	if (fp == 0) {
739		MALLOC(fp, struct ipq *, sizeof (struct ipq),
740		    M_FTABLE, M_NOWAIT);
741		if (fp == NULL)
742			goto dropfrag;
743		LIST_INSERT_HEAD(&ipq, fp, ipq_q);
744		fp->ipq_ttl = IPFRAGTTL;
745		fp->ipq_p = ipqe->ipqe_ip->ip_p;
746		fp->ipq_id = ipqe->ipqe_ip->ip_id;
747		LIST_INIT(&fp->ipq_fragq);
748		fp->ipq_src = ipqe->ipqe_ip->ip_src;
749		fp->ipq_dst = ipqe->ipqe_ip->ip_dst;
750		p = NULL;
751		goto insert;
752	}
753
754	/*
755	 * Find a segment which begins after this one does.
756	 */
757	for (p = NULL, q = fp->ipq_fragq.lh_first; q != NULL;
758	    p = q, q = q->ipqe_q.le_next)
759		if (q->ipqe_ip->ip_off > ipqe->ipqe_ip->ip_off)
760			break;
761
762	/*
763	 * If there is a preceding segment, it may provide some of
764	 * our data already.  If so, drop the data from the incoming
765	 * segment.  If it provides all of our data, drop us.
766	 */
767	if (p != NULL) {
768		i = p->ipqe_ip->ip_off + p->ipqe_ip->ip_len -
769		    ipqe->ipqe_ip->ip_off;
770		if (i > 0) {
771			if (i >= ipqe->ipqe_ip->ip_len)
772				goto dropfrag;
773			m_adj(ipqe->ipqe_m, i);
774			ipqe->ipqe_ip->ip_off += i;
775			ipqe->ipqe_ip->ip_len -= i;
776		}
777	}
778
779	/*
780	 * While we overlap succeeding segments trim them or,
781	 * if they are completely covered, dequeue them.
782	 */
783	for (; q != NULL && ipqe->ipqe_ip->ip_off + ipqe->ipqe_ip->ip_len >
784	    q->ipqe_ip->ip_off; q = nq) {
785		i = (ipqe->ipqe_ip->ip_off + ipqe->ipqe_ip->ip_len) -
786		    q->ipqe_ip->ip_off;
787		if (i < q->ipqe_ip->ip_len) {
788			q->ipqe_ip->ip_len -= i;
789			q->ipqe_ip->ip_off += i;
790			m_adj(q->ipqe_m, i);
791			break;
792		}
793		nq = q->ipqe_q.le_next;
794		m_freem(q->ipqe_m);
795		LIST_REMOVE(q, ipqe_q);
796		pool_put(&ipqent_pool, q);
797	}
798
799insert:
800	/*
801	 * Stick new segment in its place;
802	 * check for complete reassembly.
803	 */
804	if (p == NULL) {
805		LIST_INSERT_HEAD(&fp->ipq_fragq, ipqe, ipqe_q);
806	} else {
807		LIST_INSERT_AFTER(p, ipqe, ipqe_q);
808	}
809	next = 0;
810	for (p = NULL, q = fp->ipq_fragq.lh_first; q != NULL;
811	    p = q, q = q->ipqe_q.le_next) {
812		if (q->ipqe_ip->ip_off != next)
813			return (0);
814		next += q->ipqe_ip->ip_len;
815	}
816	if (p->ipqe_mff)
817		return (0);
818
819	/*
820	 * Reassembly is complete.  Check for a bogus message size and
821	 * concatenate fragments.
822	 */
823	q = fp->ipq_fragq.lh_first;
824	ip = q->ipqe_ip;
825	if ((next + (ip->ip_hl << 2)) > IP_MAXPACKET) {
826		ipstat.ips_toolong++;
827		ip_freef(fp);
828		return (0);
829	}
830	m = q->ipqe_m;
831	t = m->m_next;
832	m->m_next = 0;
833	m_cat(m, t);
834	nq = q->ipqe_q.le_next;
835	pool_put(&ipqent_pool, q);
836	for (q = nq; q != NULL; q = nq) {
837		t = q->ipqe_m;
838		nq = q->ipqe_q.le_next;
839		pool_put(&ipqent_pool, q);
840		m_cat(m, t);
841	}
842
843	/*
844	 * Create header for new ip packet by
845	 * modifying header of first packet;
846	 * dequeue and discard fragment reassembly header.
847	 * Make header visible.
848	 */
849	ip->ip_len = next;
850	ip->ip_src = fp->ipq_src;
851	ip->ip_dst = fp->ipq_dst;
852	LIST_REMOVE(fp, ipq_q);
853	FREE(fp, M_FTABLE);
854	m->m_len += (ip->ip_hl << 2);
855	m->m_data -= (ip->ip_hl << 2);
856	/* some debugging cruft by sklower, below, will go away soon */
857	if (m->m_flags & M_PKTHDR) { /* XXX this should be done elsewhere */
858		int plen = 0;
859		for (t = m; t; t = t->m_next)
860			plen += t->m_len;
861		m->m_pkthdr.len = plen;
862	}
863	return (m);
864
865dropfrag:
866	ipstat.ips_fragdropped++;
867	m_freem(m);
868	pool_put(&ipqent_pool, ipqe);
869	return (0);
870}
871
872/*
873 * Free a fragment reassembly header and all
874 * associated datagrams.
875 */
876void
877ip_freef(fp)
878	struct ipq *fp;
879{
880	struct ipqent *q, *p;
881
882	IPQ_LOCK_CHECK();
883
884	for (q = fp->ipq_fragq.lh_first; q != NULL; q = p) {
885		p = q->ipqe_q.le_next;
886		m_freem(q->ipqe_m);
887		LIST_REMOVE(q, ipqe_q);
888		pool_put(&ipqent_pool, q);
889	}
890	LIST_REMOVE(fp, ipq_q);
891	FREE(fp, M_FTABLE);
892}
893
894/*
895 * IP timer processing;
896 * if a timer expires on a reassembly
897 * queue, discard it.
898 */
899void
900ip_slowtimo()
901{
902	struct ipq *fp, *nfp;
903	int s = splsoftnet();
904
905	IPQ_LOCK();
906	for (fp = ipq.lh_first; fp != NULL; fp = nfp) {
907		nfp = fp->ipq_q.le_next;
908		if (--fp->ipq_ttl == 0) {
909			ipstat.ips_fragtimeout++;
910			ip_freef(fp);
911		}
912	}
913	IPQ_UNLOCK();
914#ifdef GATEWAY
915	ipflow_slowtimo();
916#endif
917	splx(s);
918}
919
920/*
921 * Drain off all datagram fragments.
922 */
923void
924ip_drain()
925{
926
927	/*
928	 * We may be called from a device's interrupt context.  If
929	 * the ipq is already busy, just bail out now.
930	 */
931	if (ipq_lock_try() == 0)
932		return;
933
934	while (ipq.lh_first != NULL) {
935		ipstat.ips_fragdropped++;
936		ip_freef(ipq.lh_first);
937	}
938
939	IPQ_UNLOCK();
940}
941
942/*
943 * Do option processing on a datagram,
944 * possibly discarding it if bad options are encountered,
945 * or forwarding it if source-routed.
946 * Returns 1 if packet has been forwarded/freed,
947 * 0 if the packet should be processed further.
948 */
949int
950ip_dooptions(m)
951	struct mbuf *m;
952{
953	struct ip *ip = mtod(m, struct ip *);
954	u_char *cp, *cp0;
955	struct ip_timestamp *ipt;
956	struct in_ifaddr *ia;
957	int opt, optlen, cnt, off, code, type = ICMP_PARAMPROB, forward = 0;
958	struct in_addr dst;
959	n_time ntime;
960
961	dst = ip->ip_dst;
962	cp = (u_char *)(ip + 1);
963	cnt = (ip->ip_hl << 2) - sizeof (struct ip);
964	for (; cnt > 0; cnt -= optlen, cp += optlen) {
965		opt = cp[IPOPT_OPTVAL];
966		if (opt == IPOPT_EOL)
967			break;
968		if (opt == IPOPT_NOP)
969			optlen = 1;
970		else {
971			if (cnt < IPOPT_OLEN + sizeof(*cp)) {
972				code = &cp[IPOPT_OLEN] - (u_char *)ip;
973				goto bad;
974			}
975			optlen = cp[IPOPT_OLEN];
976			if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) {
977				code = &cp[IPOPT_OLEN] - (u_char *)ip;
978				goto bad;
979			}
980		}
981		switch (opt) {
982
983		default:
984			break;
985
986		/*
987		 * Source routing with record.
988		 * Find interface with current destination address.
989		 * If none on this machine then drop if strictly routed,
990		 * or do nothing if loosely routed.
991		 * Record interface address and bring up next address
992		 * component.  If strictly routed make sure next
993		 * address is on directly accessible net.
994		 */
995		case IPOPT_LSRR:
996		case IPOPT_SSRR:
997			if (ip_allowsrcrt == 0) {
998				type = ICMP_UNREACH;
999				code = ICMP_UNREACH_NET_PROHIB;
1000				goto bad;
1001			}
1002			if (optlen < IPOPT_OFFSET + sizeof(*cp)) {
1003				code = &cp[IPOPT_OLEN] - (u_char *)ip;
1004				goto bad;
1005			}
1006			if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) {
1007				code = &cp[IPOPT_OFFSET] - (u_char *)ip;
1008				goto bad;
1009			}
1010			ipaddr.sin_addr = ip->ip_dst;
1011			ia = ifatoia(ifa_ifwithaddr(sintosa(&ipaddr)));
1012			if (ia == 0) {
1013				if (opt == IPOPT_SSRR) {
1014					type = ICMP_UNREACH;
1015					code = ICMP_UNREACH_SRCFAIL;
1016					goto bad;
1017				}
1018				/*
1019				 * Loose routing, and not at next destination
1020				 * yet; nothing to do except forward.
1021				 */
1022				break;
1023			}
1024			off--;			/* 0 origin */
1025			if ((off + sizeof(struct in_addr)) > optlen) {
1026				/*
1027				 * End of source route.  Should be for us.
1028				 */
1029				save_rte(cp, ip->ip_src);
1030				break;
1031			}
1032			/*
1033			 * locate outgoing interface
1034			 */
1035			bcopy((caddr_t)(cp + off), (caddr_t)&ipaddr.sin_addr,
1036			    sizeof(ipaddr.sin_addr));
1037			if (opt == IPOPT_SSRR)
1038				ia = ifatoia(ifa_ifwithaddr(sintosa(&ipaddr)));
1039			else
1040				ia = ip_rtaddr(ipaddr.sin_addr);
1041			if (ia == 0) {
1042				type = ICMP_UNREACH;
1043				code = ICMP_UNREACH_SRCFAIL;
1044				goto bad;
1045			}
1046			ip->ip_dst = ipaddr.sin_addr;
1047			bcopy((caddr_t)&ia->ia_addr.sin_addr,
1048			    (caddr_t)(cp + off), sizeof(struct in_addr));
1049			cp[IPOPT_OFFSET] += sizeof(struct in_addr);
1050			/*
1051			 * Let ip_intr's mcast routing check handle mcast pkts
1052			 */
1053			forward = !IN_MULTICAST(ip->ip_dst.s_addr);
1054			break;
1055
1056		case IPOPT_RR:
1057			if (optlen < IPOPT_OFFSET + sizeof(*cp)) {
1058				code = &cp[IPOPT_OLEN] - (u_char *)ip;
1059				goto bad;
1060			}
1061			if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) {
1062				code = &cp[IPOPT_OFFSET] - (u_char *)ip;
1063				goto bad;
1064			}
1065			/*
1066			 * If no space remains, ignore.
1067			 */
1068			off--;			/* 0 origin */
1069			if ((off + sizeof(struct in_addr)) > optlen)
1070				break;
1071			bcopy((caddr_t)(&ip->ip_dst), (caddr_t)&ipaddr.sin_addr,
1072			    sizeof(ipaddr.sin_addr));
1073			/*
1074			 * locate outgoing interface; if we're the destination,
1075			 * use the incoming interface (should be same).
1076			 */
1077			if ((ia = ifatoia(ifa_ifwithaddr(sintosa(&ipaddr))))
1078			    == NULL &&
1079			    (ia = ip_rtaddr(ipaddr.sin_addr)) == NULL) {
1080				type = ICMP_UNREACH;
1081				code = ICMP_UNREACH_HOST;
1082				goto bad;
1083			}
1084			bcopy((caddr_t)&ia->ia_addr.sin_addr,
1085			    (caddr_t)(cp + off), sizeof(struct in_addr));
1086			cp[IPOPT_OFFSET] += sizeof(struct in_addr);
1087			break;
1088
1089		case IPOPT_TS:
1090			code = cp - (u_char *)ip;
1091			ipt = (struct ip_timestamp *)cp;
1092			if (ipt->ipt_len < 4 || ipt->ipt_len > 40) {
1093				code = (u_char *)&ipt->ipt_len - (u_char *)ip;
1094				goto bad;
1095			}
1096			if (ipt->ipt_ptr < 5) {
1097				code = (u_char *)&ipt->ipt_ptr - (u_char *)ip;
1098				goto bad;
1099			}
1100			if (ipt->ipt_ptr > ipt->ipt_len - sizeof (int32_t)) {
1101				if (++ipt->ipt_oflw == 0) {
1102					code = (u_char *)&ipt->ipt_ptr -
1103					    (u_char *)ip;
1104					goto bad;
1105				}
1106				break;
1107			}
1108			cp0 = (cp + ipt->ipt_ptr - 1);
1109			switch (ipt->ipt_flg) {
1110
1111			case IPOPT_TS_TSONLY:
1112				break;
1113
1114			case IPOPT_TS_TSANDADDR:
1115				if (ipt->ipt_ptr - 1 + sizeof(n_time) +
1116				    sizeof(struct in_addr) > ipt->ipt_len) {
1117					code = (u_char *)&ipt->ipt_ptr -
1118					    (u_char *)ip;
1119					goto bad;
1120				}
1121				ipaddr.sin_addr = dst;
1122				ia = ifatoia(ifaof_ifpforaddr(sintosa(&ipaddr),
1123				    m->m_pkthdr.rcvif));
1124				if (ia == 0)
1125					continue;
1126				bcopy(&ia->ia_addr.sin_addr,
1127				    cp0, sizeof(struct in_addr));
1128				ipt->ipt_ptr += sizeof(struct in_addr);
1129				break;
1130
1131			case IPOPT_TS_PRESPEC:
1132				if (ipt->ipt_ptr - 1 + sizeof(n_time) +
1133				    sizeof(struct in_addr) > ipt->ipt_len) {
1134					code = (u_char *)&ipt->ipt_ptr -
1135					    (u_char *)ip;
1136					goto bad;
1137				}
1138				bcopy(cp0, &ipaddr.sin_addr,
1139				    sizeof(struct in_addr));
1140				if (ifatoia(ifa_ifwithaddr(sintosa(&ipaddr)))
1141				    == NULL)
1142					continue;
1143				ipt->ipt_ptr += sizeof(struct in_addr);
1144				break;
1145
1146			default:
1147				/* XXX can't take &ipt->ipt_flg */
1148				code = (u_char *)&ipt->ipt_ptr -
1149				    (u_char *)ip + 1;
1150				goto bad;
1151			}
1152			ntime = iptime();
1153			cp0 = (u_char *) &ntime; /* XXX grumble, GCC... */
1154			bcopy(cp0, (caddr_t)cp + ipt->ipt_ptr - 1,
1155			    sizeof(n_time));
1156			ipt->ipt_ptr += sizeof(n_time);
1157		}
1158	}
1159	if (forward) {
1160		if (ip_forwsrcrt == 0) {
1161			type = ICMP_UNREACH;
1162			code = ICMP_UNREACH_SRCFAIL;
1163			goto bad;
1164		}
1165		ip_forward(m, 1);
1166		return (1);
1167	}
1168	return (0);
1169bad:
1170	icmp_error(m, type, code, 0, 0);
1171	ipstat.ips_badoptions++;
1172	return (1);
1173}
1174
1175/*
1176 * Given address of next destination (final or next hop),
1177 * return internet address info of interface to be used to get there.
1178 */
1179struct in_ifaddr *
1180ip_rtaddr(dst)
1181	 struct in_addr dst;
1182{
1183	struct sockaddr_in *sin;
1184
1185	sin = satosin(&ipforward_rt.ro_dst);
1186
1187	if (ipforward_rt.ro_rt == 0 || !in_hosteq(dst, sin->sin_addr)) {
1188		if (ipforward_rt.ro_rt) {
1189			RTFREE(ipforward_rt.ro_rt);
1190			ipforward_rt.ro_rt = 0;
1191		}
1192		sin->sin_family = AF_INET;
1193		sin->sin_len = sizeof(*sin);
1194		sin->sin_addr = dst;
1195
1196		rtalloc(&ipforward_rt);
1197	}
1198	if (ipforward_rt.ro_rt == 0)
1199		return ((struct in_ifaddr *)0);
1200	return (ifatoia(ipforward_rt.ro_rt->rt_ifa));
1201}
1202
1203/*
1204 * Save incoming source route for use in replies,
1205 * to be picked up later by ip_srcroute if the receiver is interested.
1206 */
1207void
1208save_rte(option, dst)
1209	u_char *option;
1210	struct in_addr dst;
1211{
1212	unsigned olen;
1213
1214	olen = option[IPOPT_OLEN];
1215#ifdef DIAGNOSTIC
1216	if (ipprintfs)
1217		printf("save_rte: olen %d\n", olen);
1218#endif /* 0 */
1219	if (olen > sizeof(ip_srcrt) - (1 + sizeof(dst)))
1220		return;
1221	bcopy((caddr_t)option, (caddr_t)ip_srcrt.srcopt, olen);
1222	ip_nhops = (olen - IPOPT_OFFSET - 1) / sizeof(struct in_addr);
1223	ip_srcrt.dst = dst;
1224}
1225
1226/*
1227 * Retrieve incoming source route for use in replies,
1228 * in the same form used by setsockopt.
1229 * The first hop is placed before the options, will be removed later.
1230 */
1231struct mbuf *
1232ip_srcroute()
1233{
1234	struct in_addr *p, *q;
1235	struct mbuf *m;
1236
1237	if (ip_nhops == 0)
1238		return ((struct mbuf *)0);
1239	m = m_get(M_DONTWAIT, MT_SOOPTS);
1240	if (m == 0)
1241		return ((struct mbuf *)0);
1242
1243#define OPTSIZ	(sizeof(ip_srcrt.nop) + sizeof(ip_srcrt.srcopt))
1244
1245	/* length is (nhops+1)*sizeof(addr) + sizeof(nop + srcrt header) */
1246	m->m_len = ip_nhops * sizeof(struct in_addr) + sizeof(struct in_addr) +
1247	    OPTSIZ;
1248#ifdef DIAGNOSTIC
1249	if (ipprintfs)
1250		printf("ip_srcroute: nhops %d mlen %d", ip_nhops, m->m_len);
1251#endif
1252
1253	/*
1254	 * First save first hop for return route
1255	 */
1256	p = &ip_srcrt.route[ip_nhops - 1];
1257	*(mtod(m, struct in_addr *)) = *p--;
1258#ifdef DIAGNOSTIC
1259	if (ipprintfs)
1260		printf(" hops %x", ntohl(mtod(m, struct in_addr *)->s_addr));
1261#endif
1262
1263	/*
1264	 * Copy option fields and padding (nop) to mbuf.
1265	 */
1266	ip_srcrt.nop = IPOPT_NOP;
1267	ip_srcrt.srcopt[IPOPT_OFFSET] = IPOPT_MINOFF;
1268	bcopy((caddr_t)&ip_srcrt.nop,
1269	    mtod(m, caddr_t) + sizeof(struct in_addr), OPTSIZ);
1270	q = (struct in_addr *)(mtod(m, caddr_t) +
1271	    sizeof(struct in_addr) + OPTSIZ);
1272#undef OPTSIZ
1273	/*
1274	 * Record return path as an IP source route,
1275	 * reversing the path (pointers are now aligned).
1276	 */
1277	while (p >= ip_srcrt.route) {
1278#ifdef DIAGNOSTIC
1279		if (ipprintfs)
1280			printf(" %x", ntohl(q->s_addr));
1281#endif
1282		*q++ = *p--;
1283	}
1284	/*
1285	 * Last hop goes to final destination.
1286	 */
1287	*q = ip_srcrt.dst;
1288#ifdef DIAGNOSTIC
1289	if (ipprintfs)
1290		printf(" %x\n", ntohl(q->s_addr));
1291#endif
1292	return (m);
1293}
1294
1295/*
1296 * Strip out IP options, at higher
1297 * level protocol in the kernel.
1298 * Second argument is buffer to which options
1299 * will be moved, and return value is their length.
1300 * XXX should be deleted; last arg currently ignored.
1301 */
1302void
1303ip_stripoptions(m, mopt)
1304	struct mbuf *m;
1305	struct mbuf *mopt;
1306{
1307	int i;
1308	struct ip *ip = mtod(m, struct ip *);
1309	caddr_t opts;
1310	int olen;
1311
1312	olen = (ip->ip_hl << 2) - sizeof (struct ip);
1313	opts = (caddr_t)(ip + 1);
1314	i = m->m_len - (sizeof (struct ip) + olen);
1315	bcopy(opts  + olen, opts, (unsigned)i);
1316	m->m_len -= olen;
1317	if (m->m_flags & M_PKTHDR)
1318		m->m_pkthdr.len -= olen;
1319	ip->ip_len -= olen;
1320	ip->ip_hl = sizeof (struct ip) >> 2;
1321}
1322
1323int inetctlerrmap[PRC_NCMDS] = {
1324	0,		0,		0,		0,
1325	0,		EMSGSIZE,	EHOSTDOWN,	EHOSTUNREACH,
1326	EHOSTUNREACH,	EHOSTUNREACH,	ECONNREFUSED,	ECONNREFUSED,
1327	EMSGSIZE,	EHOSTUNREACH,	0,		0,
1328	0,		0,		0,		0,
1329	ENOPROTOOPT
1330};
1331
1332/*
1333 * Forward a packet.  If some error occurs return the sender
1334 * an icmp packet.  Note we can't always generate a meaningful
1335 * icmp message because icmp doesn't have a large enough repertoire
1336 * of codes and types.
1337 *
1338 * If not forwarding, just drop the packet.  This could be confusing
1339 * if ipforwarding was zero but some routing protocol was advancing
1340 * us as a gateway to somewhere.  However, we must let the routing
1341 * protocol deal with that.
1342 *
1343 * The srcrt parameter indicates whether the packet is being forwarded
1344 * via a source route.
1345 */
1346void
1347ip_forward(m, srcrt)
1348	struct mbuf *m;
1349	int srcrt;
1350{
1351	struct ip *ip = mtod(m, struct ip *);
1352	struct sockaddr_in *sin;
1353	struct rtentry *rt;
1354	int error, type = 0, code = 0;
1355	struct mbuf *mcopy;
1356	n_long dest;
1357	struct ifnet *destifp;
1358#ifdef IPSEC
1359	struct ifnet dummyifp;
1360#endif
1361
1362	dest = 0;
1363#ifdef DIAGNOSTIC
1364	if (ipprintfs)
1365		printf("forward: src %2.2x dst %2.2x ttl %x\n",
1366		    ntohl(ip->ip_src.s_addr),
1367		    ntohl(ip->ip_dst.s_addr), ip->ip_ttl);
1368#endif
1369	if (m->m_flags & (M_BCAST|M_MCAST) || in_canforward(ip->ip_dst) == 0) {
1370		ipstat.ips_cantforward++;
1371		m_freem(m);
1372		return;
1373	}
1374	if (ip->ip_ttl <= IPTTLDEC) {
1375		icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, dest, 0);
1376		return;
1377	}
1378	ip->ip_ttl -= IPTTLDEC;
1379
1380	sin = satosin(&ipforward_rt.ro_dst);
1381	if ((rt = ipforward_rt.ro_rt) == 0 ||
1382	    !in_hosteq(ip->ip_dst, sin->sin_addr)) {
1383		if (ipforward_rt.ro_rt) {
1384			RTFREE(ipforward_rt.ro_rt);
1385			ipforward_rt.ro_rt = 0;
1386		}
1387		sin->sin_family = AF_INET;
1388		sin->sin_len = sizeof(struct sockaddr_in);
1389		sin->sin_addr = ip->ip_dst;
1390
1391		rtalloc(&ipforward_rt);
1392		if (ipforward_rt.ro_rt == 0) {
1393			icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, dest, 0);
1394			return;
1395		}
1396		rt = ipforward_rt.ro_rt;
1397	}
1398
1399	/*
1400	 * Save at most 68 bytes of the packet in case
1401	 * we need to generate an ICMP message to the src.
1402	 */
1403	mcopy = m_copy(m, 0, imin((int)ip->ip_len, 68));
1404
1405	/*
1406	 * If forwarding packet using same interface that it came in on,
1407	 * perhaps should send a redirect to sender to shortcut a hop.
1408	 * Only send redirect if source is sending directly to us,
1409	 * and if packet was not source routed (or has any options).
1410	 * Also, don't send redirect if forwarding using a default route
1411	 * or a route modified by a redirect.
1412	 */
1413	if (rt->rt_ifp == m->m_pkthdr.rcvif &&
1414	    (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0 &&
1415	    !in_nullhost(satosin(rt_key(rt))->sin_addr) &&
1416	    ipsendredirects && !srcrt) {
1417		if (rt->rt_ifa &&
1418		    (ip->ip_src.s_addr & ifatoia(rt->rt_ifa)->ia_subnetmask) ==
1419		    ifatoia(rt->rt_ifa)->ia_subnet) {
1420			if (rt->rt_flags & RTF_GATEWAY)
1421				dest = satosin(rt->rt_gateway)->sin_addr.s_addr;
1422			else
1423				dest = ip->ip_dst.s_addr;
1424			/*
1425			 * Router requirements says to only send host
1426			 * redirects.
1427			 */
1428			type = ICMP_REDIRECT;
1429			code = ICMP_REDIRECT_HOST;
1430#ifdef DIAGNOSTIC
1431			if (ipprintfs)
1432				printf("redirect (%d) to %x\n", code,
1433				    (u_int32_t)dest);
1434#endif
1435		}
1436	}
1437
1438#ifdef IPSEC
1439	/* Don't lookup socket in forwading case */
1440	ipsec_setsocket(m, NULL);
1441#endif
1442	error = ip_output(m, (struct mbuf *)0, &ipforward_rt,
1443	    (IP_FORWARDING | (ip_directedbcast ? IP_ALLOWBROADCAST : 0)), 0);
1444	if (error)
1445		ipstat.ips_cantforward++;
1446	else {
1447		ipstat.ips_forward++;
1448		if (type)
1449			ipstat.ips_redirectsent++;
1450		else {
1451			if (mcopy) {
1452#ifdef GATEWAY
1453				if (mcopy->m_flags & M_CANFASTFWD)
1454					ipflow_create(&ipforward_rt, mcopy);
1455#endif
1456				m_freem(mcopy);
1457			}
1458			return;
1459		}
1460	}
1461	if (mcopy == NULL)
1462		return;
1463	destifp = NULL;
1464
1465	switch (error) {
1466
1467	case 0:				/* forwarded, but need redirect */
1468		/* type, code set above */
1469		break;
1470
1471	case ENETUNREACH:		/* shouldn't happen, checked above */
1472	case EHOSTUNREACH:
1473	case ENETDOWN:
1474	case EHOSTDOWN:
1475	default:
1476		type = ICMP_UNREACH;
1477		code = ICMP_UNREACH_HOST;
1478		break;
1479
1480	case EMSGSIZE:
1481		type = ICMP_UNREACH;
1482		code = ICMP_UNREACH_NEEDFRAG;
1483#ifndef IPSEC
1484		if (ipforward_rt.ro_rt)
1485			destifp = ipforward_rt.ro_rt->rt_ifp;
1486#else
1487		/*
1488		 * If the packet is routed over IPsec tunnel, tell the
1489		 * originator the tunnel MTU.
1490		 *	tunnel MTU = if MTU - sizeof(IP) - ESP/AH hdrsiz
1491		 * XXX quickhack!!!
1492		 */
1493		if (ipforward_rt.ro_rt) {
1494			struct secpolicy *sp;
1495			int ipsecerror;
1496			size_t ipsechdr;
1497			struct route *ro;
1498
1499			sp = ipsec4_getpolicybyaddr(mcopy,
1500			                            IPSEC_DIR_OUTBOUND,
1501			                            IP_FORWARDING,
1502			                            &ipsecerror);
1503
1504			if (sp == NULL)
1505				destifp = ipforward_rt.ro_rt->rt_ifp;
1506			else {
1507				/* count IPsec header size */
1508				ipsechdr = ipsec4_hdrsiz(mcopy,
1509				                         IPSEC_DIR_OUTBOUND,
1510				                         NULL);
1511
1512				/*
1513				 * find the correct route for outer IPv4
1514				 * header, compute tunnel MTU.
1515				 *
1516				 * XXX BUG ALERT
1517				 * The "dummyifp" code relies upon the fact
1518				 * that icmp_error() touches only ifp->if_mtu.
1519				 */
1520				/*XXX*/
1521				destifp = NULL;
1522				if (sp->req != NULL
1523				 && sp->req->sav != NULL
1524				 && sp->req->sav->sah != NULL) {
1525					ro = &sp->req->sav->sah->sa_route;
1526					if (ro->ro_rt && ro->ro_rt->rt_ifp) {
1527						dummyifp.if_mtu =
1528						    ro->ro_rt->rt_ifp->if_mtu;
1529						dummyifp.if_mtu -= ipsechdr;
1530						destifp = &dummyifp;
1531					}
1532				}
1533
1534				key_freesp(sp);
1535			}
1536		}
1537#endif /*IPSEC*/
1538		ipstat.ips_cantfrag++;
1539		break;
1540
1541	case ENOBUFS:
1542		type = ICMP_SOURCEQUENCH;
1543		code = 0;
1544		break;
1545	}
1546	icmp_error(mcopy, type, code, dest, destifp);
1547}
1548
1549void
1550ip_savecontrol(inp, mp, ip, m)
1551	struct inpcb *inp;
1552	struct mbuf **mp;
1553	struct ip *ip;
1554	struct mbuf *m;
1555{
1556
1557	if (inp->inp_socket->so_options & SO_TIMESTAMP) {
1558		struct timeval tv;
1559
1560		microtime(&tv);
1561		*mp = sbcreatecontrol((caddr_t) &tv, sizeof(tv),
1562		    SCM_TIMESTAMP, SOL_SOCKET);
1563		if (*mp)
1564			mp = &(*mp)->m_next;
1565	}
1566	if (inp->inp_flags & INP_RECVDSTADDR) {
1567		*mp = sbcreatecontrol((caddr_t) &ip->ip_dst,
1568		    sizeof(struct in_addr), IP_RECVDSTADDR, IPPROTO_IP);
1569		if (*mp)
1570			mp = &(*mp)->m_next;
1571	}
1572#ifdef notyet
1573	/*
1574	 * XXX
1575	 * Moving these out of udp_input() made them even more broken
1576	 * than they already were.
1577	 *	- fenner@parc.xerox.com
1578	 */
1579	/* options were tossed already */
1580	if (inp->inp_flags & INP_RECVOPTS) {
1581		*mp = sbcreatecontrol((caddr_t) opts_deleted_above,
1582		    sizeof(struct in_addr), IP_RECVOPTS, IPPROTO_IP);
1583		if (*mp)
1584			mp = &(*mp)->m_next;
1585	}
1586	/* ip_srcroute doesn't do what we want here, need to fix */
1587	if (inp->inp_flags & INP_RECVRETOPTS) {
1588		*mp = sbcreatecontrol((caddr_t) ip_srcroute(),
1589		    sizeof(struct in_addr), IP_RECVRETOPTS, IPPROTO_IP);
1590		if (*mp)
1591			mp = &(*mp)->m_next;
1592	}
1593#endif
1594	if (inp->inp_flags & INP_RECVIF) {
1595		struct sockaddr_dl sdl;
1596
1597		sdl.sdl_len = offsetof(struct sockaddr_dl, sdl_data[0]);
1598		sdl.sdl_family = AF_LINK;
1599		sdl.sdl_index = m->m_pkthdr.rcvif ?
1600		    m->m_pkthdr.rcvif->if_index : 0;
1601		sdl.sdl_nlen = sdl.sdl_alen = sdl.sdl_slen = 0;
1602		*mp = sbcreatecontrol((caddr_t) &sdl, sdl.sdl_len,
1603		    IP_RECVIF, IPPROTO_IP);
1604		if (*mp)
1605			mp = &(*mp)->m_next;
1606	}
1607}
1608
1609int
1610ip_sysctl(name, namelen, oldp, oldlenp, newp, newlen)
1611	int *name;
1612	u_int namelen;
1613	void *oldp;
1614	size_t *oldlenp;
1615	void *newp;
1616	size_t newlen;
1617{
1618	extern int subnetsarelocal, hostzeroisbroadcast;
1619
1620	int error, old;
1621
1622	/* All sysctl names at this level are terminal. */
1623	if (namelen != 1)
1624		return (ENOTDIR);
1625
1626	switch (name[0]) {
1627	case IPCTL_FORWARDING:
1628		return (sysctl_int(oldp, oldlenp, newp, newlen, &ipforwarding));
1629	case IPCTL_SENDREDIRECTS:
1630		return (sysctl_int(oldp, oldlenp, newp, newlen,
1631			&ipsendredirects));
1632	case IPCTL_DEFTTL:
1633		return (sysctl_int(oldp, oldlenp, newp, newlen, &ip_defttl));
1634#ifdef notyet
1635	case IPCTL_DEFMTU:
1636		return (sysctl_int(oldp, oldlenp, newp, newlen, &ip_mtu));
1637#endif
1638	case IPCTL_FORWSRCRT:
1639		/* Don't allow this to change in a secure environment.  */
1640		if (securelevel > 0)
1641			return (sysctl_rdint(oldp, oldlenp, newp,
1642			    ip_forwsrcrt));
1643		else
1644			return (sysctl_int(oldp, oldlenp, newp, newlen,
1645			    &ip_forwsrcrt));
1646	case IPCTL_DIRECTEDBCAST:
1647		return (sysctl_int(oldp, oldlenp, newp, newlen,
1648		    &ip_directedbcast));
1649	case IPCTL_ALLOWSRCRT:
1650		return (sysctl_int(oldp, oldlenp, newp, newlen,
1651		    &ip_allowsrcrt));
1652	case IPCTL_SUBNETSARELOCAL:
1653		return (sysctl_int(oldp, oldlenp, newp, newlen,
1654		    &subnetsarelocal));
1655	case IPCTL_MTUDISC:
1656		error = sysctl_int(oldp, oldlenp, newp, newlen,
1657		    &ip_mtudisc);
1658		if (ip_mtudisc != 0 && ip_mtudisc_timeout_q == NULL) {
1659			ip_mtudisc_timeout_q =
1660			    rt_timer_queue_create(ip_mtudisc_timeout);
1661		} else if (ip_mtudisc == 0 && ip_mtudisc_timeout_q != NULL) {
1662			rt_timer_queue_destroy(ip_mtudisc_timeout_q, TRUE);
1663			ip_mtudisc_timeout_q = NULL;
1664		}
1665		return error;
1666	case IPCTL_ANONPORTMIN:
1667		old = anonportmin;
1668		error = sysctl_int(oldp, oldlenp, newp, newlen, &anonportmin);
1669		if (anonportmin >= anonportmax || anonportmin > 65535
1670#ifndef IPNOPRIVPORTS
1671		    || anonportmin < IPPORT_RESERVED
1672#endif
1673		    ) {
1674			anonportmin = old;
1675			return (EINVAL);
1676		}
1677		return (error);
1678	case IPCTL_ANONPORTMAX:
1679		old = anonportmax;
1680		error = sysctl_int(oldp, oldlenp, newp, newlen, &anonportmax);
1681		if (anonportmin >= anonportmax || anonportmax > 65535
1682#ifndef IPNOPRIVPORTS
1683		    || anonportmax < IPPORT_RESERVED
1684#endif
1685		    ) {
1686			anonportmax = old;
1687			return (EINVAL);
1688		}
1689		return (error);
1690	case IPCTL_MTUDISCTIMEOUT:
1691		error = sysctl_int(oldp, oldlenp, newp, newlen,
1692		   &ip_mtudisc_timeout);
1693		if (ip_mtudisc_timeout_q != NULL)
1694			rt_timer_queue_change(ip_mtudisc_timeout_q,
1695					      ip_mtudisc_timeout);
1696		return (error);
1697#ifdef GATEWAY
1698	case IPCTL_MAXFLOWS:
1699	    {
1700		int s;
1701
1702		error = sysctl_int(oldp, oldlenp, newp, newlen,
1703		   &ip_maxflows);
1704		s = splsoftnet();
1705		ipflow_reap(0);
1706		splx(s);
1707		return (error);
1708	    }
1709#endif
1710	case IPCTL_HOSTZEROBROADCAST:
1711		return (sysctl_int(oldp, oldlenp, newp, newlen,
1712		    &hostzeroisbroadcast));
1713#if NGIF > 0
1714	case IPCTL_GIF_TTL:
1715		return(sysctl_int(oldp, oldlenp, newp, newlen,
1716				  &ip_gif_ttl));
1717#endif
1718
1719	default:
1720		return (EOPNOTSUPP);
1721	}
1722	/* NOTREACHED */
1723}
1724