ip_input.c revision 1.182
1/*	$NetBSD: ip_input.c,v 1.182 2003/11/12 15:00:05 itojun Exp $	*/
2
3/*
4 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 * 3. Neither the name of the project nor the names of its contributors
16 *    may be used to endorse or promote products derived from this software
17 *    without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE.
30 */
31
32/*-
33 * Copyright (c) 1998 The NetBSD Foundation, Inc.
34 * All rights reserved.
35 *
36 * This code is derived from software contributed to The NetBSD Foundation
37 * by Public Access Networks Corporation ("Panix").  It was developed under
38 * contract to Panix by Eric Haszlakiewicz and Thor Lancelot Simon.
39 *
40 * Redistribution and use in source and binary forms, with or without
41 * modification, are permitted provided that the following conditions
42 * are met:
43 * 1. Redistributions of source code must retain the above copyright
44 *    notice, this list of conditions and the following disclaimer.
45 * 2. Redistributions in binary form must reproduce the above copyright
46 *    notice, this list of conditions and the following disclaimer in the
47 *    documentation and/or other materials provided with the distribution.
48 * 3. All advertising materials mentioning features or use of this software
49 *    must display the following acknowledgement:
50 *	This product includes software developed by the NetBSD
51 *	Foundation, Inc. and its contributors.
52 * 4. Neither the name of The NetBSD Foundation nor the names of its
53 *    contributors may be used to endorse or promote products derived
54 *    from this software without specific prior written permission.
55 *
56 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
57 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
58 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
59 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
60 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
61 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
62 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
63 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
64 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
65 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
66 * POSSIBILITY OF SUCH DAMAGE.
67 */
68
69/*
70 * Copyright (c) 1982, 1986, 1988, 1993
71 *	The Regents of the University of California.  All rights reserved.
72 *
73 * Redistribution and use in source and binary forms, with or without
74 * modification, are permitted provided that the following conditions
75 * are met:
76 * 1. Redistributions of source code must retain the above copyright
77 *    notice, this list of conditions and the following disclaimer.
78 * 2. Redistributions in binary form must reproduce the above copyright
79 *    notice, this list of conditions and the following disclaimer in the
80 *    documentation and/or other materials provided with the distribution.
81 * 3. Neither the name of the University nor the names of its contributors
82 *    may be used to endorse or promote products derived from this software
83 *    without specific prior written permission.
84 *
85 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
86 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
87 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
88 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
89 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
90 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
91 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
92 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
93 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
94 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
95 * SUCH DAMAGE.
96 *
97 *	@(#)ip_input.c	8.2 (Berkeley) 1/4/94
98 */
99
100#include <sys/cdefs.h>
101__KERNEL_RCSID(0, "$NetBSD: ip_input.c,v 1.182 2003/11/12 15:00:05 itojun Exp $");
102
103#include "opt_gateway.h"
104#include "opt_pfil_hooks.h"
105#include "opt_ipsec.h"
106#include "opt_mrouting.h"
107#include "opt_mbuftrace.h"
108#include "opt_inet_csum.h"
109
110#include <sys/param.h>
111#include <sys/systm.h>
112#include <sys/malloc.h>
113#include <sys/mbuf.h>
114#include <sys/domain.h>
115#include <sys/protosw.h>
116#include <sys/socket.h>
117#include <sys/socketvar.h>
118#include <sys/errno.h>
119#include <sys/time.h>
120#include <sys/kernel.h>
121#include <sys/pool.h>
122#include <sys/sysctl.h>
123
124#include <net/if.h>
125#include <net/if_dl.h>
126#include <net/route.h>
127#include <net/pfil.h>
128
129#include <netinet/in.h>
130#include <netinet/in_systm.h>
131#include <netinet/ip.h>
132#include <netinet/in_pcb.h>
133#include <netinet/in_var.h>
134#include <netinet/ip_var.h>
135#include <netinet/ip_icmp.h>
136/* just for gif_ttl */
137#include <netinet/in_gif.h>
138#include "gif.h"
139#include <net/if_gre.h>
140#include "gre.h"
141
142#ifdef MROUTING
143#include <netinet/ip_mroute.h>
144#endif
145
146#ifdef IPSEC
147#include <netinet6/ipsec.h>
148#include <netkey/key.h>
149#endif
150#ifdef FAST_IPSEC
151#include <netipsec/ipsec.h>
152#include <netipsec/key.h>
153#endif	/* FAST_IPSEC*/
154
/*
 * Compile-time defaults for the run-time tunables defined below.
 * Each default is only supplied when the corresponding macro has not
 * already been defined (presumably by a kernel configuration option --
 * confirm against the opt_*.h headers).
 */
#ifndef	IPFORWARDING
#ifdef GATEWAY
#define	IPFORWARDING	1	/* forward IP packets not for us */
#else /* GATEWAY */
#define	IPFORWARDING	0	/* don't forward IP packets not for us */
#endif /* GATEWAY */
#endif /* IPFORWARDING */
#ifndef	IPSENDREDIRECTS
#define	IPSENDREDIRECTS	1
#endif
#ifndef IPFORWSRCRT
#define	IPFORWSRCRT	1	/* forward source-routed packets */
#endif
#ifndef IPALLOWSRCRT
#define	IPALLOWSRCRT	1	/* allow source-routed packets */
#endif
#ifndef IPMTUDISC
#define IPMTUDISC	1
#endif
#ifndef IPMTUDISCTIMEOUT
#define IPMTUDISCTIMEOUT (10 * 60)	/* as per RFC 1191 */
#endif

/*
 * Note: DIRECTED_BROADCAST is handled this way so that previous
 * configuration using this option will Just Work.
 */
#ifndef IPDIRECTEDBCAST
#ifdef DIRECTED_BROADCAST
#define IPDIRECTEDBCAST	1
#else
#define	IPDIRECTEDBCAST	0
#endif /* DIRECTED_BROADCAST */
#endif /* IPDIRECTEDBCAST */
/* Run-time tunables, initialised from the compile-time defaults above. */
int	ipforwarding = IPFORWARDING;
int	ipsendredirects = IPSENDREDIRECTS;
int	ip_defttl = IPDEFTTL;
int	ip_forwsrcrt = IPFORWSRCRT;
int	ip_directedbcast = IPDIRECTEDBCAST;
int	ip_allowsrcrt = IPALLOWSRCRT;
int	ip_mtudisc = IPMTUDISC;
/* Passed to rt_timer_queue_create() by ip_init(). */
int	ip_mtudisc_timeout = IPMTUDISCTIMEOUT;
#ifdef DIAGNOSTIC
int	ipprintfs = 0;
#endif
200/*
201 * XXX - Setting ip_checkinterface mostly implements the receive side of
202 * the Strong ES model described in RFC 1122, but since the routing table
203 * and transmit implementation do not implement the Strong ES model,
204 * setting this to 1 results in an odd hybrid.
205 *
206 * XXX - ip_checkinterface currently must be disabled if you use ipnat
207 * to translate the destination address to another local interface.
208 *
209 * XXX - ip_checkinterface must be disabled if you add IP aliases
210 * to the loopback interface instead of the interface where the
211 * packets for those addresses are received.
212 */
213int	ip_checkinterface = 0;
214
215
/* Timer queue created by ip_init() from ip_mtudisc_timeout. */
struct rttimer_queue *ip_mtudisc_timeout_q = NULL;

extern	struct domain inetdomain;
/* Copied into ipintrq.ifq_maxlen by ip_init(). */
int	ipqmaxlen = IFQ_MAXLEN;
u_long	in_ifaddrhash;				/* size of hash table - 1 */
int	in_ifaddrentries;			/* total number of addrs */
struct in_ifaddrhead in_ifaddrhead;
struct	in_ifaddrhashhead *in_ifaddrhashtbl;
u_long	in_multihash;				/* size of hash table - 1 */
int	in_multientries;			/* total number of addrs */
struct	in_multihashhead *in_multihashtbl;
/* IP input queue; drained by ipintr(). */
struct	ifqueue ipintrq;
/* IP statistics counters (the ips_* fields updated by ip_input()). */
struct	ipstat	ipstat;

#ifdef PFIL_HOOKS
/* Packet filter hook head, registered for AF_INET by ip_init(). */
struct pfil_head inet_pfil_hook;
#endif

/* List of fragment reassembly queues; searched by ip_input(). */
struct ipqhead ipq;
/* Non-zero while the ipq lock is held; see ipq_lock_try()/ipq_unlock(). */
int	ipq_locked;
/* Incremented by ip_reass() when a new reassembly queue is created. */
int	ip_nfragpackets = 0;
/*
 * Upper bound checked against ip_nfragpackets in ip_reass():
 * 0 means never accept fragments, a negative value means no limit.
 */
int	ip_maxfragpackets = 200;
238
239static __inline int ipq_lock_try __P((void));
240static __inline void ipq_unlock __P((void));
241
242static __inline int
243ipq_lock_try()
244{
245	int s;
246
247	/*
248	 * Use splvm() -- we're blocking things that would cause
249	 * mbuf allocation.
250	 */
251	s = splvm();
252	if (ipq_locked) {
253		splx(s);
254		return (0);
255	}
256	ipq_locked = 1;
257	splx(s);
258	return (1);
259}
260
261static __inline void
262ipq_unlock()
263{
264	int s;
265
266	s = splvm();
267	ipq_locked = 0;
268	splx(s);
269}
270
/*
 * Locking macros for the fragment reassembly queue list (ipq).
 *
 * With DIAGNOSTIC, IPQ_LOCK() panics if the lock is already held and
 * IPQ_LOCK_CHECK() panics if the lock is not held.  Without DIAGNOSTIC,
 * IPQ_LOCK() discards the result of ipq_lock_try() (so it proceeds even
 * when the lock was already held) and IPQ_LOCK_CHECK() expands to nothing.
 */
#ifdef DIAGNOSTIC
#define	IPQ_LOCK()							\
do {									\
	if (ipq_lock_try() == 0) {					\
		printf("%s:%d: ipq already locked\n", __FILE__, __LINE__); \
		panic("ipq_lock");					\
	}								\
} while (/*CONSTCOND*/ 0)
#define	IPQ_LOCK_CHECK()						\
do {									\
	if (ipq_locked == 0) {						\
		printf("%s:%d: ipq lock not held\n", __FILE__, __LINE__); \
		panic("ipq lock check");				\
	}								\
} while (/*CONSTCOND*/ 0)
#else
#define	IPQ_LOCK()		(void) ipq_lock_try()
#define	IPQ_LOCK_CHECK()	/* nothing */
#endif

/* Unconditionally releases the lock; see ipq_unlock(). */
#define	IPQ_UNLOCK()		ipq_unlock()
292
/*
 * Allocation pools, initialised by ip_init(): inmulti_pool is sized for
 * struct in_multi and ipqent_pool for struct ipqent (fragment queue
 * entries allocated in ip_input()).
 */
struct pool inmulti_pool;
struct pool ipqent_pool;

/*
 * Optional event counters recording how the IP header checksum of each
 * received packet was verified.  They are attached in ip_init() and
 * bumped from ip_input() through INET_CSUM_COUNTER_INCR(), which
 * expands to nothing when INET_CSUM_COUNTERS is not defined.
 */
#ifdef INET_CSUM_COUNTERS
#include <sys/device.h>

struct evcnt ip_hwcsum_bad = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "inet", "hwcsum bad");
struct evcnt ip_hwcsum_ok = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "inet", "hwcsum ok");
struct evcnt ip_swcsum = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "inet", "swcsum");

#define	INET_CSUM_COUNTER_INCR(ev)	(ev)->ev_count++

#else

#define	INET_CSUM_COUNTER_INCR(ev)	/* nothing */

#endif /* INET_CSUM_COUNTERS */
313
314/*
315 * We need to save the IP options in case a protocol wants to respond
316 * to an incoming packet over the same route if the packet got here
317 * using IP source routing.  This allows connection establishment and
318 * maintenance when the remote end is on a network that is not known
319 * to us.
320 */
/* Reset to 0 by ip_input() before the options of each packet are processed. */
int	ip_nhops = 0;
/* Saved source-route option of the packet being processed. */
static	struct ip_srcrt {
	struct	in_addr dst;			/* final destination */
	char	nop;				/* one NOP to align */
	char	srcopt[IPOPT_OFFSET + 1];	/* OPTVAL, OLEN and OFFSET */
	struct	in_addr route[MAX_IPOPTLEN/sizeof(struct in_addr)];
} ip_srcrt;

/* NOTE(review): presumably fills in ip_srcrt; definition not visible here. */
static void save_rte __P((u_char *, struct in_addr));

#ifdef MBUFTRACE
/*
 * Mbuf owners for accounting; attached in ip_init().  ip_rx_mowner is
 * the owner claimed via MCLAIM() in ipintr() and ip_input().
 */
struct mowner ip_rx_mowner = { "internet", "rx" };
struct mowner ip_tx_mowner = { "internet", "tx" };
#endif
335
336/*
337 * IP initialization: fill in IP protocol switch table.
338 * All protocols not implemented in kernel go to raw IP protocol handler.
339 */
340void
341ip_init()
342{
343	struct protosw *pr;
344	int i;
345
346	pool_init(&inmulti_pool, sizeof(struct in_multi), 0, 0, 0, "inmltpl",
347	    NULL);
348	pool_init(&ipqent_pool, sizeof(struct ipqent), 0, 0, 0, "ipqepl",
349	    NULL);
350
351	pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW);
352	if (pr == 0)
353		panic("ip_init");
354	for (i = 0; i < IPPROTO_MAX; i++)
355		ip_protox[i] = pr - inetsw;
356	for (pr = inetdomain.dom_protosw;
357	    pr < inetdomain.dom_protoswNPROTOSW; pr++)
358		if (pr->pr_domain->dom_family == PF_INET &&
359		    pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW)
360			ip_protox[pr->pr_protocol] = pr - inetsw;
361	LIST_INIT(&ipq);
362	ipintrq.ifq_maxlen = ipqmaxlen;
363	TAILQ_INIT(&in_ifaddrhead);
364	in_ifaddrhashtbl = hashinit(IN_IFADDR_HASH_SIZE, HASH_LIST, M_IFADDR,
365	    M_WAITOK, &in_ifaddrhash);
366	in_multihashtbl = hashinit(IN_IFADDR_HASH_SIZE, HASH_LIST, M_IPMADDR,
367	    M_WAITOK, &in_multihash);
368	ip_mtudisc_timeout_q = rt_timer_queue_create(ip_mtudisc_timeout);
369#ifdef GATEWAY
370	ipflow_init();
371#endif
372
373#ifdef PFIL_HOOKS
374	/* Register our Packet Filter hook. */
375	inet_pfil_hook.ph_type = PFIL_TYPE_AF;
376	inet_pfil_hook.ph_af   = AF_INET;
377	i = pfil_head_register(&inet_pfil_hook);
378	if (i != 0)
379		printf("ip_init: WARNING: unable to register pfil hook, "
380		    "error %d\n", i);
381#endif /* PFIL_HOOKS */
382
383#ifdef INET_CSUM_COUNTERS
384	evcnt_attach_static(&ip_hwcsum_bad);
385	evcnt_attach_static(&ip_hwcsum_ok);
386	evcnt_attach_static(&ip_swcsum);
387#endif /* INET_CSUM_COUNTERS */
388
389#ifdef MBUFTRACE
390	MOWNER_ATTACH(&ip_tx_mowner);
391	MOWNER_ATTACH(&ip_rx_mowner);
392#endif /* MBUFTRACE */
393}
394
/* AF_INET socket address; only the length and family are initialised. */
struct	sockaddr_in ipaddr = { sizeof(ipaddr), AF_INET };
/* NOTE(review): presumably the route cached for forwarding; not used in the visible code -- confirm. */
struct	route ipforward_rt;
397
398/*
399 * IP software interrupt routine
400 */
401void
402ipintr()
403{
404	int s;
405	struct mbuf *m;
406
407	while (1) {
408		s = splnet();
409		IF_DEQUEUE(&ipintrq, m);
410		splx(s);
411		if (m == 0)
412			return;
413		MCLAIM(m, &ip_rx_mowner);
414		ip_input(m);
415	}
416}
417
418/*
419 * Ip input routine.  Checksum and byte swap header.  If fragmented
420 * try to reassemble.  Process options.  Pass to next level.
421 */
422void
423ip_input(struct mbuf *m)
424{
425	struct ip *ip = NULL;
426	struct ipq *fp;
427	struct in_ifaddr *ia;
428	struct ifaddr *ifa;
429	struct ipqent *ipqe;
430	int hlen = 0, mff, len;
431	int downmatch;
432	int checkif;
433	int srcrt = 0;
434#ifdef FAST_IPSEC
435	struct m_tag *mtag;
436	struct tdb_ident *tdbi;
437	struct secpolicy *sp;
438	int s, error;
439#endif /* FAST_IPSEC */
440
441	MCLAIM(m, &ip_rx_mowner);
442#ifdef	DIAGNOSTIC
443	if ((m->m_flags & M_PKTHDR) == 0)
444		panic("ipintr no HDR");
445#endif
446
447	/*
448	 * If no IP addresses have been set yet but the interfaces
449	 * are receiving, can't do anything with incoming packets yet.
450	 */
451	if (TAILQ_FIRST(&in_ifaddrhead) == 0)
452		goto bad;
453	ipstat.ips_total++;
454	/*
455	 * If the IP header is not aligned, slurp it up into a new
456	 * mbuf with space for link headers, in the event we forward
457	 * it.  Otherwise, if it is aligned, make sure the entire
458	 * base IP header is in the first mbuf of the chain.
459	 */
460	if (IP_HDR_ALIGNED_P(mtod(m, caddr_t)) == 0) {
461		if ((m = m_copyup(m, sizeof(struct ip),
462				  (max_linkhdr + 3) & ~3)) == NULL) {
463			/* XXXJRT new stat, please */
464			ipstat.ips_toosmall++;
465			return;
466		}
467	} else if (__predict_false(m->m_len < sizeof (struct ip))) {
468		if ((m = m_pullup(m, sizeof (struct ip))) == NULL) {
469			ipstat.ips_toosmall++;
470			return;
471		}
472	}
473	ip = mtod(m, struct ip *);
474	if (ip->ip_v != IPVERSION) {
475		ipstat.ips_badvers++;
476		goto bad;
477	}
478	hlen = ip->ip_hl << 2;
479	if (hlen < sizeof(struct ip)) {	/* minimum header length */
480		ipstat.ips_badhlen++;
481		goto bad;
482	}
483	if (hlen > m->m_len) {
484		if ((m = m_pullup(m, hlen)) == 0) {
485			ipstat.ips_badhlen++;
486			return;
487		}
488		ip = mtod(m, struct ip *);
489	}
490
491	/*
492	 * RFC1122: packets with a multicast source address are
493	 * not allowed.
494	 */
495	if (IN_MULTICAST(ip->ip_src.s_addr)) {
496		ipstat.ips_badaddr++;
497		goto bad;
498	}
499
500	/* 127/8 must not appear on wire - RFC1122 */
501	if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
502	    (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
503		if ((m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) == 0) {
504			ipstat.ips_badaddr++;
505			goto bad;
506		}
507	}
508
509	switch (m->m_pkthdr.csum_flags &
510		((m->m_pkthdr.rcvif->if_csum_flags_rx & M_CSUM_IPv4) |
511		 M_CSUM_IPv4_BAD)) {
512	case M_CSUM_IPv4|M_CSUM_IPv4_BAD:
513		INET_CSUM_COUNTER_INCR(&ip_hwcsum_bad);
514		goto badcsum;
515
516	case M_CSUM_IPv4:
517		/* Checksum was okay. */
518		INET_CSUM_COUNTER_INCR(&ip_hwcsum_ok);
519		break;
520
521	default:
522		/* Must compute it ourselves. */
523		INET_CSUM_COUNTER_INCR(&ip_swcsum);
524		if (in_cksum(m, hlen) != 0)
525			goto bad;
526		break;
527	}
528
529	/* Retrieve the packet length. */
530	len = ntohs(ip->ip_len);
531
532	/*
533	 * Check for additional length bogosity
534	 */
535	if (len < hlen) {
536	 	ipstat.ips_badlen++;
537		goto bad;
538	}
539
540	/*
541	 * Check that the amount of data in the buffers
542	 * is as at least much as the IP header would have us expect.
543	 * Trim mbufs if longer than we expect.
544	 * Drop packet if shorter than we expect.
545	 */
546	if (m->m_pkthdr.len < len) {
547		ipstat.ips_tooshort++;
548		goto bad;
549	}
550	if (m->m_pkthdr.len > len) {
551		if (m->m_len == m->m_pkthdr.len) {
552			m->m_len = len;
553			m->m_pkthdr.len = len;
554		} else
555			m_adj(m, len - m->m_pkthdr.len);
556	}
557
558#ifdef IPSEC
559	/* ipflow (IP fast forwarding) is not compatible with IPsec. */
560	m->m_flags &= ~M_CANFASTFWD;
561#else
562	/*
563	 * Assume that we can create a fast-forward IP flow entry
564	 * based on this packet.
565	 */
566	m->m_flags |= M_CANFASTFWD;
567#endif
568
569#ifdef PFIL_HOOKS
570	/*
571	 * Run through list of hooks for input packets.  If there are any
572	 * filters which require that additional packets in the flow are
573	 * not fast-forwarded, they must clear the M_CANFASTFWD flag.
574	 * Note that filters must _never_ set this flag, as another filter
575	 * in the list may have previously cleared it.
576	 */
577	/*
578	 * let ipfilter look at packet on the wire,
579	 * not the decapsulated packet.
580	 */
581#ifdef IPSEC
582	if (!ipsec_getnhist(m))
583#else
584	if (1)
585#endif
586	{
587		struct in_addr odst;
588
589		odst = ip->ip_dst;
590		if (pfil_run_hooks(&inet_pfil_hook, &m, m->m_pkthdr.rcvif,
591		    PFIL_IN) != 0)
592			return;
593		if (m == NULL)
594			return;
595		ip = mtod(m, struct ip *);
596		hlen = ip->ip_hl << 2;
597		srcrt = (odst.s_addr != ip->ip_dst.s_addr);
598	}
599#endif /* PFIL_HOOKS */
600
601#ifdef ALTQ
602	/* XXX Temporary until ALTQ is changed to use a pfil hook */
603	if (altq_input != NULL && (*altq_input)(m, AF_INET) == 0) {
604		/* packet dropped by traffic conditioner */
605		return;
606	}
607#endif
608
609	/*
610	 * Process options and, if not destined for us,
611	 * ship it on.  ip_dooptions returns 1 when an
612	 * error was detected (causing an icmp message
613	 * to be sent and the original packet to be freed).
614	 */
615	ip_nhops = 0;		/* for source routed packets */
616	if (hlen > sizeof (struct ip) && ip_dooptions(m))
617		return;
618
619	/*
620	 * Enable a consistency check between the destination address
621	 * and the arrival interface for a unicast packet (the RFC 1122
622	 * strong ES model) if IP forwarding is disabled and the packet
623	 * is not locally generated.
624	 *
625	 * XXX - Checking also should be disabled if the destination
626	 * address is ipnat'ed to a different interface.
627	 *
628	 * XXX - Checking is incompatible with IP aliases added
629	 * to the loopback interface instead of the interface where
630	 * the packets are received.
631	 *
632	 * XXX - We need to add a per ifaddr flag for this so that
633	 * we get finer grain control.
634	 */
635	checkif = ip_checkinterface && (ipforwarding == 0) &&
636	    (m->m_pkthdr.rcvif != NULL) &&
637	    ((m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) == 0);
638
639	/*
640	 * Check our list of addresses, to see if the packet is for us.
641	 *
642	 * Traditional 4.4BSD did not consult IFF_UP at all.
643	 * The behavior here is to treat addresses on !IFF_UP interface
644	 * as not mine.
645	 */
646	downmatch = 0;
647	LIST_FOREACH(ia, &IN_IFADDR_HASH(ip->ip_dst.s_addr), ia_hash) {
648		if (in_hosteq(ia->ia_addr.sin_addr, ip->ip_dst)) {
649			if (checkif && ia->ia_ifp != m->m_pkthdr.rcvif)
650				continue;
651			if ((ia->ia_ifp->if_flags & IFF_UP) != 0)
652				break;
653			else
654				downmatch++;
655		}
656	}
657	if (ia != NULL)
658		goto ours;
659	if (m->m_pkthdr.rcvif->if_flags & IFF_BROADCAST) {
660		TAILQ_FOREACH(ifa, &m->m_pkthdr.rcvif->if_addrlist, ifa_list) {
661			if (ifa->ifa_addr->sa_family != AF_INET)
662				continue;
663			ia = ifatoia(ifa);
664			if (in_hosteq(ip->ip_dst, ia->ia_broadaddr.sin_addr) ||
665			    in_hosteq(ip->ip_dst, ia->ia_netbroadcast) ||
666			    /*
667			     * Look for all-0's host part (old broadcast addr),
668			     * either for subnet or net.
669			     */
670			    ip->ip_dst.s_addr == ia->ia_subnet ||
671			    ip->ip_dst.s_addr == ia->ia_net)
672				goto ours;
673			/*
674			 * An interface with IP address zero accepts
675			 * all packets that arrive on that interface.
676			 */
677			if (in_nullhost(ia->ia_addr.sin_addr))
678				goto ours;
679		}
680	}
681	if (IN_MULTICAST(ip->ip_dst.s_addr)) {
682		struct in_multi *inm;
683#ifdef MROUTING
684		extern struct socket *ip_mrouter;
685
686		if (M_READONLY(m)) {
687			if ((m = m_pullup(m, hlen)) == 0) {
688				ipstat.ips_toosmall++;
689				return;
690			}
691			ip = mtod(m, struct ip *);
692		}
693
694		if (ip_mrouter) {
695			/*
696			 * If we are acting as a multicast router, all
697			 * incoming multicast packets are passed to the
698			 * kernel-level multicast forwarding function.
699			 * The packet is returned (relatively) intact; if
700			 * ip_mforward() returns a non-zero value, the packet
701			 * must be discarded, else it may be accepted below.
702			 *
703			 * (The IP ident field is put in the same byte order
704			 * as expected when ip_mforward() is called from
705			 * ip_output().)
706			 */
707			if (ip_mforward(m, m->m_pkthdr.rcvif) != 0) {
708				ipstat.ips_cantforward++;
709				m_freem(m);
710				return;
711			}
712
713			/*
714			 * The process-level routing demon needs to receive
715			 * all multicast IGMP packets, whether or not this
716			 * host belongs to their destination groups.
717			 */
718			if (ip->ip_p == IPPROTO_IGMP)
719				goto ours;
720			ipstat.ips_forward++;
721		}
722#endif
723		/*
724		 * See if we belong to the destination multicast group on the
725		 * arrival interface.
726		 */
727		IN_LOOKUP_MULTI(ip->ip_dst, m->m_pkthdr.rcvif, inm);
728		if (inm == NULL) {
729			ipstat.ips_cantforward++;
730			m_freem(m);
731			return;
732		}
733		goto ours;
734	}
735	if (ip->ip_dst.s_addr == INADDR_BROADCAST ||
736	    in_nullhost(ip->ip_dst))
737		goto ours;
738
739	/*
740	 * Not for us; forward if possible and desirable.
741	 */
742	if (ipforwarding == 0) {
743		ipstat.ips_cantforward++;
744		m_freem(m);
745	} else {
746		/*
747		 * If ip_dst matched any of my address on !IFF_UP interface,
748		 * and there's no IFF_UP interface that matches ip_dst,
749		 * send icmp unreach.  Forwarding it will result in in-kernel
750		 * forwarding loop till TTL goes to 0.
751		 */
752		if (downmatch) {
753			icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0);
754			ipstat.ips_cantforward++;
755			return;
756		}
757#ifdef IPSEC
758		if (ipsec4_in_reject(m, NULL)) {
759			ipsecstat.in_polvio++;
760			goto bad;
761		}
762#endif
763#ifdef FAST_IPSEC
764		mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL);
765		s = splsoftnet();
766		if (mtag != NULL) {
767			tdbi = (struct tdb_ident *)(mtag + 1);
768			sp = ipsec_getpolicy(tdbi, IPSEC_DIR_INBOUND);
769		} else {
770			sp = ipsec_getpolicybyaddr(m, IPSEC_DIR_INBOUND,
771						   IP_FORWARDING, &error);
772		}
773		if (sp == NULL) {	/* NB: can happen if error */
774			splx(s);
775			/*XXX error stat???*/
776			DPRINTF(("ip_input: no SP for forwarding\n"));	/*XXX*/
777			goto bad;
778		}
779
780		/*
781		 * Check security policy against packet attributes.
782		 */
783		error = ipsec_in_reject(sp, m);
784		KEY_FREESP(&sp);
785		splx(s);
786		if (error) {
787			ipstat.ips_cantforward++;
788			goto bad;
789		}
790#endif	/* FAST_IPSEC */
791
792		ip_forward(m, srcrt);
793	}
794	return;
795
796ours:
797	/*
798	 * If offset or IP_MF are set, must reassemble.
799	 * Otherwise, nothing need be done.
800	 * (We could look in the reassembly queue to see
801	 * if the packet was previously fragmented,
802	 * but it's not worth the time; just let them time out.)
803	 */
804	if (ip->ip_off & ~htons(IP_DF|IP_RF)) {
805		if (M_READONLY(m)) {
806			if ((m = m_pullup(m, hlen)) == NULL) {
807				ipstat.ips_toosmall++;
808				goto bad;
809			}
810			ip = mtod(m, struct ip *);
811		}
812
813		/*
814		 * Look for queue of fragments
815		 * of this datagram.
816		 */
817		IPQ_LOCK();
818		LIST_FOREACH(fp, &ipq, ipq_q)
819			if (ip->ip_id == fp->ipq_id &&
820			    in_hosteq(ip->ip_src, fp->ipq_src) &&
821			    in_hosteq(ip->ip_dst, fp->ipq_dst) &&
822			    ip->ip_p == fp->ipq_p)
823				goto found;
824		fp = 0;
825found:
826
827		/*
828		 * Adjust ip_len to not reflect header,
829		 * set ipqe_mff if more fragments are expected,
830		 * convert offset of this to bytes.
831		 */
832		ip->ip_len = htons(ntohs(ip->ip_len) - hlen);
833		mff = (ip->ip_off & htons(IP_MF)) != 0;
834		if (mff) {
835		        /*
836		         * Make sure that fragments have a data length
837			 * that's a non-zero multiple of 8 bytes.
838		         */
839			if (ntohs(ip->ip_len) == 0 ||
840			    (ntohs(ip->ip_len) & 0x7) != 0) {
841				ipstat.ips_badfrags++;
842				IPQ_UNLOCK();
843				goto bad;
844			}
845		}
846		ip->ip_off = htons((ntohs(ip->ip_off) & IP_OFFMASK) << 3);
847
848		/*
849		 * If datagram marked as having more fragments
850		 * or if this is not the first fragment,
851		 * attempt reassembly; if it succeeds, proceed.
852		 */
853		if (mff || ip->ip_off != htons(0)) {
854			ipstat.ips_fragments++;
855			ipqe = pool_get(&ipqent_pool, PR_NOWAIT);
856			if (ipqe == NULL) {
857				ipstat.ips_rcvmemdrop++;
858				IPQ_UNLOCK();
859				goto bad;
860			}
861			ipqe->ipqe_mff = mff;
862			ipqe->ipqe_m = m;
863			ipqe->ipqe_ip = ip;
864			m = ip_reass(ipqe, fp);
865			if (m == 0) {
866				IPQ_UNLOCK();
867				return;
868			}
869			ipstat.ips_reassembled++;
870			ip = mtod(m, struct ip *);
871			hlen = ip->ip_hl << 2;
872			ip->ip_len = htons(ntohs(ip->ip_len) + hlen);
873		} else
874			if (fp)
875				ip_freef(fp);
876		IPQ_UNLOCK();
877	}
878
879#if defined(IPSEC)
880	/*
881	 * enforce IPsec policy checking if we are seeing last header.
882	 * note that we do not visit this with protocols with pcb layer
883	 * code - like udp/tcp/raw ip.
884	 */
885	if ((inetsw[ip_protox[ip->ip_p]].pr_flags & PR_LASTHDR) != 0 &&
886	    ipsec4_in_reject(m, NULL)) {
887		ipsecstat.in_polvio++;
888		goto bad;
889	}
890#endif
891#if FAST_IPSEC
892	/*
893	 * enforce IPsec policy checking if we are seeing last header.
894	 * note that we do not visit this with protocols with pcb layer
895	 * code - like udp/tcp/raw ip.
896	 */
897	if ((inetsw[ip_protox[ip->ip_p]].pr_flags & PR_LASTHDR) != 0) {
898		/*
899		 * Check if the packet has already had IPsec processing
900		 * done.  If so, then just pass it along.  This tag gets
901		 * set during AH, ESP, etc. input handling, before the
902		 * packet is returned to the ip input queue for delivery.
903		 */
904		mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL);
905		s = splsoftnet();
906		if (mtag != NULL) {
907			tdbi = (struct tdb_ident *)(mtag + 1);
908			sp = ipsec_getpolicy(tdbi, IPSEC_DIR_INBOUND);
909		} else {
910			sp = ipsec_getpolicybyaddr(m, IPSEC_DIR_INBOUND,
911						   IP_FORWARDING, &error);
912		}
913		if (sp != NULL) {
914			/*
915			 * Check security policy against packet attributes.
916			 */
917			error = ipsec_in_reject(sp, m);
918			KEY_FREESP(&sp);
919		} else {
920			/* XXX error stat??? */
921			error = EINVAL;
922DPRINTF(("ip_input: no SP, packet discarded\n"));/*XXX*/
923			goto bad;
924		}
925		splx(s);
926		if (error)
927			goto bad;
928	}
929#endif /* FAST_IPSEC */
930
931	/*
932	 * Switch out to protocol's input routine.
933	 */
934#if IFA_STATS
935	if (ia && ip)
936		ia->ia_ifa.ifa_data.ifad_inbytes += ntohs(ip->ip_len);
937#endif
938	ipstat.ips_delivered++;
939    {
940	int off = hlen, nh = ip->ip_p;
941
942	(*inetsw[ip_protox[nh]].pr_input)(m, off, nh);
943	return;
944    }
945bad:
946	m_freem(m);
947	return;
948
949badcsum:
950	ipstat.ips_badsum++;
951	m_freem(m);
952}
953
954/*
955 * Take incoming datagram fragment and try to
956 * reassemble it into whole datagram.  If a chain for
957 * reassembly of this datagram already exists, then it
958 * is given as fp; otherwise have to make a chain.
959 */
960struct mbuf *
961ip_reass(ipqe, fp)
962	struct ipqent *ipqe;
963	struct ipq *fp;
964{
965	struct mbuf *m = ipqe->ipqe_m;
966	struct ipqent *nq, *p, *q;
967	struct ip *ip;
968	struct mbuf *t;
969	int hlen = ipqe->ipqe_ip->ip_hl << 2;
970	int i, next;
971
972	IPQ_LOCK_CHECK();
973
974	/*
975	 * Presence of header sizes in mbufs
976	 * would confuse code below.
977	 */
978	m->m_data += hlen;
979	m->m_len -= hlen;
980
981	/*
982	 * If first fragment to arrive, create a reassembly queue.
983	 */
984	if (fp == 0) {
985		/*
986		 * Enforce upper bound on number of fragmented packets
987		 * for which we attempt reassembly;
988		 * If maxfrag is 0, never accept fragments.
989		 * If maxfrag is -1, accept all fragments without limitation.
990		 */
991		if (ip_maxfragpackets < 0)
992			;
993		else if (ip_nfragpackets >= ip_maxfragpackets)
994			goto dropfrag;
995		ip_nfragpackets++;
996		MALLOC(fp, struct ipq *, sizeof (struct ipq),
997		    M_FTABLE, M_NOWAIT);
998		if (fp == NULL)
999			goto dropfrag;
1000		LIST_INSERT_HEAD(&ipq, fp, ipq_q);
1001		fp->ipq_ttl = IPFRAGTTL;
1002		fp->ipq_p = ipqe->ipqe_ip->ip_p;
1003		fp->ipq_id = ipqe->ipqe_ip->ip_id;
1004		TAILQ_INIT(&fp->ipq_fragq);
1005		fp->ipq_src = ipqe->ipqe_ip->ip_src;
1006		fp->ipq_dst = ipqe->ipqe_ip->ip_dst;
1007		p = NULL;
1008		goto insert;
1009	}
1010
1011	/*
1012	 * Find a segment which begins after this one does.
1013	 */
1014	for (p = NULL, q = TAILQ_FIRST(&fp->ipq_fragq); q != NULL;
1015	    p = q, q = TAILQ_NEXT(q, ipqe_q))
1016		if (ntohs(q->ipqe_ip->ip_off) > ntohs(ipqe->ipqe_ip->ip_off))
1017			break;
1018
1019	/*
1020	 * If there is a preceding segment, it may provide some of
1021	 * our data already.  If so, drop the data from the incoming
1022	 * segment.  If it provides all of our data, drop us.
1023	 */
1024	if (p != NULL) {
1025		i = ntohs(p->ipqe_ip->ip_off) + ntohs(p->ipqe_ip->ip_len) -
1026		    ntohs(ipqe->ipqe_ip->ip_off);
1027		if (i > 0) {
1028			if (i >= ntohs(ipqe->ipqe_ip->ip_len))
1029				goto dropfrag;
1030			m_adj(ipqe->ipqe_m, i);
1031			ipqe->ipqe_ip->ip_off =
1032			    htons(ntohs(ipqe->ipqe_ip->ip_off) + i);
1033			ipqe->ipqe_ip->ip_len =
1034			    htons(ntohs(ipqe->ipqe_ip->ip_len) - i);
1035		}
1036	}
1037
1038	/*
1039	 * While we overlap succeeding segments trim them or,
1040	 * if they are completely covered, dequeue them.
1041	 */
1042	for (; q != NULL &&
1043	    ntohs(ipqe->ipqe_ip->ip_off) + ntohs(ipqe->ipqe_ip->ip_len) >
1044	    ntohs(q->ipqe_ip->ip_off); q = nq) {
1045		i = (ntohs(ipqe->ipqe_ip->ip_off) +
1046		    ntohs(ipqe->ipqe_ip->ip_len)) - ntohs(q->ipqe_ip->ip_off);
1047		if (i < ntohs(q->ipqe_ip->ip_len)) {
1048			q->ipqe_ip->ip_len =
1049			    htons(ntohs(q->ipqe_ip->ip_len) - i);
1050			q->ipqe_ip->ip_off =
1051			    htons(ntohs(q->ipqe_ip->ip_off) + i);
1052			m_adj(q->ipqe_m, i);
1053			break;
1054		}
1055		nq = TAILQ_NEXT(q, ipqe_q);
1056		m_freem(q->ipqe_m);
1057		TAILQ_REMOVE(&fp->ipq_fragq, q, ipqe_q);
1058		pool_put(&ipqent_pool, q);
1059	}
1060
1061insert:
1062	/*
1063	 * Stick new segment in its place;
1064	 * check for complete reassembly.
1065	 */
1066	if (p == NULL) {
1067		TAILQ_INSERT_HEAD(&fp->ipq_fragq, ipqe, ipqe_q);
1068	} else {
1069		TAILQ_INSERT_AFTER(&fp->ipq_fragq, p, ipqe, ipqe_q);
1070	}
1071	next = 0;
1072	for (p = NULL, q = TAILQ_FIRST(&fp->ipq_fragq); q != NULL;
1073	    p = q, q = TAILQ_NEXT(q, ipqe_q)) {
1074		if (ntohs(q->ipqe_ip->ip_off) != next)
1075			return (0);
1076		next += ntohs(q->ipqe_ip->ip_len);
1077	}
1078	if (p->ipqe_mff)
1079		return (0);
1080
1081	/*
1082	 * Reassembly is complete.  Check for a bogus message size and
1083	 * concatenate fragments.
1084	 */
1085	q = TAILQ_FIRST(&fp->ipq_fragq);
1086	ip = q->ipqe_ip;
1087	if ((next + (ip->ip_hl << 2)) > IP_MAXPACKET) {
1088		ipstat.ips_toolong++;
1089		ip_freef(fp);
1090		return (0);
1091	}
1092	m = q->ipqe_m;
1093	t = m->m_next;
1094	m->m_next = 0;
1095	m_cat(m, t);
1096	nq = TAILQ_NEXT(q, ipqe_q);
1097	pool_put(&ipqent_pool, q);
1098	for (q = nq; q != NULL; q = nq) {
1099		t = q->ipqe_m;
1100		nq = TAILQ_NEXT(q, ipqe_q);
1101		pool_put(&ipqent_pool, q);
1102		m_cat(m, t);
1103	}
1104
1105	/*
1106	 * Create header for new ip packet by
1107	 * modifying header of first packet;
1108	 * dequeue and discard fragment reassembly header.
1109	 * Make header visible.
1110	 */
1111	ip->ip_len = htons(next);
1112	ip->ip_src = fp->ipq_src;
1113	ip->ip_dst = fp->ipq_dst;
1114	LIST_REMOVE(fp, ipq_q);
1115	FREE(fp, M_FTABLE);
1116	ip_nfragpackets--;
1117	m->m_len += (ip->ip_hl << 2);
1118	m->m_data -= (ip->ip_hl << 2);
1119	/* some debugging cruft by sklower, below, will go away soon */
1120	if (m->m_flags & M_PKTHDR) { /* XXX this should be done elsewhere */
1121		int plen = 0;
1122		for (t = m; t; t = t->m_next)
1123			plen += t->m_len;
1124		m->m_pkthdr.len = plen;
1125	}
1126	return (m);
1127
1128dropfrag:
1129	ipstat.ips_fragdropped++;
1130	m_freem(m);
1131	pool_put(&ipqent_pool, ipqe);
1132	return (0);
1133}
1134
1135/*
1136 * Free a fragment reassembly header and all
1137 * associated datagrams.
1138 */
1139void
1140ip_freef(fp)
1141	struct ipq *fp;
1142{
1143	struct ipqent *q, *p;
1144
1145	IPQ_LOCK_CHECK();
1146
1147	for (q = TAILQ_FIRST(&fp->ipq_fragq); q != NULL; q = p) {
1148		p = TAILQ_NEXT(q, ipqe_q);
1149		m_freem(q->ipqe_m);
1150		TAILQ_REMOVE(&fp->ipq_fragq, q, ipqe_q);
1151		pool_put(&ipqent_pool, q);
1152	}
1153	LIST_REMOVE(fp, ipq_q);
1154	FREE(fp, M_FTABLE);
1155	ip_nfragpackets--;
1156}
1157
1158/*
1159 * IP timer processing;
1160 * if a timer expires on a reassembly
1161 * queue, discard it.
1162 */
1163void
1164ip_slowtimo()
1165{
1166	struct ipq *fp, *nfp;
1167	int s = splsoftnet();
1168
1169	IPQ_LOCK();
1170	for (fp = LIST_FIRST(&ipq); fp != NULL; fp = nfp) {
1171		nfp = LIST_NEXT(fp, ipq_q);
1172		if (--fp->ipq_ttl == 0) {
1173			ipstat.ips_fragtimeout++;
1174			ip_freef(fp);
1175		}
1176	}
1177	/*
1178	 * If we are over the maximum number of fragments
1179	 * (due to the limit being lowered), drain off
1180	 * enough to get down to the new limit.
1181	 */
1182	if (ip_maxfragpackets < 0)
1183		;
1184	else {
1185		while (ip_nfragpackets > ip_maxfragpackets && LIST_FIRST(&ipq))
1186			ip_freef(LIST_FIRST(&ipq));
1187	}
1188	IPQ_UNLOCK();
1189#ifdef GATEWAY
1190	ipflow_slowtimo();
1191#endif
1192	splx(s);
1193}
1194
1195/*
1196 * Drain off all datagram fragments.
1197 */
1198void
1199ip_drain()
1200{
1201
1202	/*
1203	 * We may be called from a device's interrupt context.  If
1204	 * the ipq is already busy, just bail out now.
1205	 */
1206	if (ipq_lock_try() == 0)
1207		return;
1208
1209	while (LIST_FIRST(&ipq) != NULL) {
1210		ipstat.ips_fragdropped++;
1211		ip_freef(LIST_FIRST(&ipq));
1212	}
1213
1214	IPQ_UNLOCK();
1215}
1216
1217/*
1218 * Do option processing on a datagram,
1219 * possibly discarding it if bad options are encountered,
1220 * or forwarding it if source-routed.
1221 * Returns 1 if packet has been forwarded/freed,
1222 * 0 if the packet should be processed further.
1223 */
1224int
1225ip_dooptions(m)
1226	struct mbuf *m;
1227{
1228	struct ip *ip = mtod(m, struct ip *);
1229	u_char *cp, *cp0;
1230	struct ip_timestamp *ipt;
1231	struct in_ifaddr *ia;
1232	int opt, optlen, cnt, off, code, type = ICMP_PARAMPROB, forward = 0;
1233	struct in_addr dst;
1234	n_time ntime;
1235
1236	dst = ip->ip_dst;
1237	cp = (u_char *)(ip + 1);
1238	cnt = (ip->ip_hl << 2) - sizeof (struct ip);
1239	for (; cnt > 0; cnt -= optlen, cp += optlen) {
1240		opt = cp[IPOPT_OPTVAL];
1241		if (opt == IPOPT_EOL)
1242			break;
1243		if (opt == IPOPT_NOP)
1244			optlen = 1;
1245		else {
1246			if (cnt < IPOPT_OLEN + sizeof(*cp)) {
1247				code = &cp[IPOPT_OLEN] - (u_char *)ip;
1248				goto bad;
1249			}
1250			optlen = cp[IPOPT_OLEN];
1251			if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) {
1252				code = &cp[IPOPT_OLEN] - (u_char *)ip;
1253				goto bad;
1254			}
1255		}
1256		switch (opt) {
1257
1258		default:
1259			break;
1260
1261		/*
1262		 * Source routing with record.
1263		 * Find interface with current destination address.
1264		 * If none on this machine then drop if strictly routed,
1265		 * or do nothing if loosely routed.
1266		 * Record interface address and bring up next address
1267		 * component.  If strictly routed make sure next
1268		 * address is on directly accessible net.
1269		 */
1270		case IPOPT_LSRR:
1271		case IPOPT_SSRR:
1272			if (ip_allowsrcrt == 0) {
1273				type = ICMP_UNREACH;
1274				code = ICMP_UNREACH_NET_PROHIB;
1275				goto bad;
1276			}
1277			if (optlen < IPOPT_OFFSET + sizeof(*cp)) {
1278				code = &cp[IPOPT_OLEN] - (u_char *)ip;
1279				goto bad;
1280			}
1281			if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) {
1282				code = &cp[IPOPT_OFFSET] - (u_char *)ip;
1283				goto bad;
1284			}
1285			ipaddr.sin_addr = ip->ip_dst;
1286			ia = ifatoia(ifa_ifwithaddr(sintosa(&ipaddr)));
1287			if (ia == 0) {
1288				if (opt == IPOPT_SSRR) {
1289					type = ICMP_UNREACH;
1290					code = ICMP_UNREACH_SRCFAIL;
1291					goto bad;
1292				}
1293				/*
1294				 * Loose routing, and not at next destination
1295				 * yet; nothing to do except forward.
1296				 */
1297				break;
1298			}
1299			off--;			/* 0 origin */
1300			if ((off + sizeof(struct in_addr)) > optlen) {
1301				/*
1302				 * End of source route.  Should be for us.
1303				 */
1304				save_rte(cp, ip->ip_src);
1305				break;
1306			}
1307			/*
1308			 * locate outgoing interface
1309			 */
1310			bcopy((caddr_t)(cp + off), (caddr_t)&ipaddr.sin_addr,
1311			    sizeof(ipaddr.sin_addr));
1312			if (opt == IPOPT_SSRR)
1313				ia = ifatoia(ifa_ifwithaddr(sintosa(&ipaddr)));
1314			else
1315				ia = ip_rtaddr(ipaddr.sin_addr);
1316			if (ia == 0) {
1317				type = ICMP_UNREACH;
1318				code = ICMP_UNREACH_SRCFAIL;
1319				goto bad;
1320			}
1321			ip->ip_dst = ipaddr.sin_addr;
1322			bcopy((caddr_t)&ia->ia_addr.sin_addr,
1323			    (caddr_t)(cp + off), sizeof(struct in_addr));
1324			cp[IPOPT_OFFSET] += sizeof(struct in_addr);
1325			/*
1326			 * Let ip_intr's mcast routing check handle mcast pkts
1327			 */
1328			forward = !IN_MULTICAST(ip->ip_dst.s_addr);
1329			break;
1330
1331		case IPOPT_RR:
1332			if (optlen < IPOPT_OFFSET + sizeof(*cp)) {
1333				code = &cp[IPOPT_OLEN] - (u_char *)ip;
1334				goto bad;
1335			}
1336			if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) {
1337				code = &cp[IPOPT_OFFSET] - (u_char *)ip;
1338				goto bad;
1339			}
1340			/*
1341			 * If no space remains, ignore.
1342			 */
1343			off--;			/* 0 origin */
1344			if ((off + sizeof(struct in_addr)) > optlen)
1345				break;
1346			bcopy((caddr_t)(&ip->ip_dst), (caddr_t)&ipaddr.sin_addr,
1347			    sizeof(ipaddr.sin_addr));
1348			/*
1349			 * locate outgoing interface; if we're the destination,
1350			 * use the incoming interface (should be same).
1351			 */
1352			if ((ia = ifatoia(ifa_ifwithaddr(sintosa(&ipaddr))))
1353			    == NULL &&
1354			    (ia = ip_rtaddr(ipaddr.sin_addr)) == NULL) {
1355				type = ICMP_UNREACH;
1356				code = ICMP_UNREACH_HOST;
1357				goto bad;
1358			}
1359			bcopy((caddr_t)&ia->ia_addr.sin_addr,
1360			    (caddr_t)(cp + off), sizeof(struct in_addr));
1361			cp[IPOPT_OFFSET] += sizeof(struct in_addr);
1362			break;
1363
1364		case IPOPT_TS:
1365			code = cp - (u_char *)ip;
1366			ipt = (struct ip_timestamp *)cp;
1367			if (ipt->ipt_len < 4 || ipt->ipt_len > 40) {
1368				code = (u_char *)&ipt->ipt_len - (u_char *)ip;
1369				goto bad;
1370			}
1371			if (ipt->ipt_ptr < 5) {
1372				code = (u_char *)&ipt->ipt_ptr - (u_char *)ip;
1373				goto bad;
1374			}
1375			if (ipt->ipt_ptr > ipt->ipt_len - sizeof (int32_t)) {
1376				if (++ipt->ipt_oflw == 0) {
1377					code = (u_char *)&ipt->ipt_ptr -
1378					    (u_char *)ip;
1379					goto bad;
1380				}
1381				break;
1382			}
1383			cp0 = (cp + ipt->ipt_ptr - 1);
1384			switch (ipt->ipt_flg) {
1385
1386			case IPOPT_TS_TSONLY:
1387				break;
1388
1389			case IPOPT_TS_TSANDADDR:
1390				if (ipt->ipt_ptr - 1 + sizeof(n_time) +
1391				    sizeof(struct in_addr) > ipt->ipt_len) {
1392					code = (u_char *)&ipt->ipt_ptr -
1393					    (u_char *)ip;
1394					goto bad;
1395				}
1396				ipaddr.sin_addr = dst;
1397				ia = ifatoia(ifaof_ifpforaddr(sintosa(&ipaddr),
1398				    m->m_pkthdr.rcvif));
1399				if (ia == 0)
1400					continue;
1401				bcopy(&ia->ia_addr.sin_addr,
1402				    cp0, sizeof(struct in_addr));
1403				ipt->ipt_ptr += sizeof(struct in_addr);
1404				break;
1405
1406			case IPOPT_TS_PRESPEC:
1407				if (ipt->ipt_ptr - 1 + sizeof(n_time) +
1408				    sizeof(struct in_addr) > ipt->ipt_len) {
1409					code = (u_char *)&ipt->ipt_ptr -
1410					    (u_char *)ip;
1411					goto bad;
1412				}
1413				bcopy(cp0, &ipaddr.sin_addr,
1414				    sizeof(struct in_addr));
1415				if (ifatoia(ifa_ifwithaddr(sintosa(&ipaddr)))
1416				    == NULL)
1417					continue;
1418				ipt->ipt_ptr += sizeof(struct in_addr);
1419				break;
1420
1421			default:
1422				/* XXX can't take &ipt->ipt_flg */
1423				code = (u_char *)&ipt->ipt_ptr -
1424				    (u_char *)ip + 1;
1425				goto bad;
1426			}
1427			ntime = iptime();
1428			cp0 = (u_char *) &ntime; /* XXX grumble, GCC... */
1429			bcopy(cp0, (caddr_t)cp + ipt->ipt_ptr - 1,
1430			    sizeof(n_time));
1431			ipt->ipt_ptr += sizeof(n_time);
1432		}
1433	}
1434	if (forward) {
1435		if (ip_forwsrcrt == 0) {
1436			type = ICMP_UNREACH;
1437			code = ICMP_UNREACH_SRCFAIL;
1438			goto bad;
1439		}
1440		ip_forward(m, 1);
1441		return (1);
1442	}
1443	return (0);
1444bad:
1445	icmp_error(m, type, code, 0, 0);
1446	ipstat.ips_badoptions++;
1447	return (1);
1448}
1449
1450/*
1451 * Given address of next destination (final or next hop),
1452 * return internet address info of interface to be used to get there.
1453 */
1454struct in_ifaddr *
1455ip_rtaddr(dst)
1456	 struct in_addr dst;
1457{
1458	struct sockaddr_in *sin;
1459
1460	sin = satosin(&ipforward_rt.ro_dst);
1461
1462	if (ipforward_rt.ro_rt == 0 || !in_hosteq(dst, sin->sin_addr)) {
1463		if (ipforward_rt.ro_rt) {
1464			RTFREE(ipforward_rt.ro_rt);
1465			ipforward_rt.ro_rt = 0;
1466		}
1467		sin->sin_family = AF_INET;
1468		sin->sin_len = sizeof(*sin);
1469		sin->sin_addr = dst;
1470
1471		rtalloc(&ipforward_rt);
1472	}
1473	if (ipforward_rt.ro_rt == 0)
1474		return ((struct in_ifaddr *)0);
1475	return (ifatoia(ipforward_rt.ro_rt->rt_ifa));
1476}
1477
1478/*
1479 * Save incoming source route for use in replies,
1480 * to be picked up later by ip_srcroute if the receiver is interested.
1481 */
1482void
1483save_rte(option, dst)
1484	u_char *option;
1485	struct in_addr dst;
1486{
1487	unsigned olen;
1488
1489	olen = option[IPOPT_OLEN];
1490#ifdef DIAGNOSTIC
1491	if (ipprintfs)
1492		printf("save_rte: olen %d\n", olen);
1493#endif /* 0 */
1494	if (olen > sizeof(ip_srcrt) - (1 + sizeof(dst)))
1495		return;
1496	bcopy((caddr_t)option, (caddr_t)ip_srcrt.srcopt, olen);
1497	ip_nhops = (olen - IPOPT_OFFSET - 1) / sizeof(struct in_addr);
1498	ip_srcrt.dst = dst;
1499}
1500
1501/*
1502 * Retrieve incoming source route for use in replies,
1503 * in the same form used by setsockopt.
1504 * The first hop is placed before the options, will be removed later.
1505 */
1506struct mbuf *
1507ip_srcroute()
1508{
1509	struct in_addr *p, *q;
1510	struct mbuf *m;
1511
1512	if (ip_nhops == 0)
1513		return ((struct mbuf *)0);
1514	m = m_get(M_DONTWAIT, MT_SOOPTS);
1515	if (m == 0)
1516		return ((struct mbuf *)0);
1517
1518	MCLAIM(m, &inetdomain.dom_mowner);
1519#define OPTSIZ	(sizeof(ip_srcrt.nop) + sizeof(ip_srcrt.srcopt))
1520
1521	/* length is (nhops+1)*sizeof(addr) + sizeof(nop + srcrt header) */
1522	m->m_len = ip_nhops * sizeof(struct in_addr) + sizeof(struct in_addr) +
1523	    OPTSIZ;
1524#ifdef DIAGNOSTIC
1525	if (ipprintfs)
1526		printf("ip_srcroute: nhops %d mlen %d", ip_nhops, m->m_len);
1527#endif
1528
1529	/*
1530	 * First save first hop for return route
1531	 */
1532	p = &ip_srcrt.route[ip_nhops - 1];
1533	*(mtod(m, struct in_addr *)) = *p--;
1534#ifdef DIAGNOSTIC
1535	if (ipprintfs)
1536		printf(" hops %x", ntohl(mtod(m, struct in_addr *)->s_addr));
1537#endif
1538
1539	/*
1540	 * Copy option fields and padding (nop) to mbuf.
1541	 */
1542	ip_srcrt.nop = IPOPT_NOP;
1543	ip_srcrt.srcopt[IPOPT_OFFSET] = IPOPT_MINOFF;
1544	bcopy((caddr_t)&ip_srcrt.nop,
1545	    mtod(m, caddr_t) + sizeof(struct in_addr), OPTSIZ);
1546	q = (struct in_addr *)(mtod(m, caddr_t) +
1547	    sizeof(struct in_addr) + OPTSIZ);
1548#undef OPTSIZ
1549	/*
1550	 * Record return path as an IP source route,
1551	 * reversing the path (pointers are now aligned).
1552	 */
1553	while (p >= ip_srcrt.route) {
1554#ifdef DIAGNOSTIC
1555		if (ipprintfs)
1556			printf(" %x", ntohl(q->s_addr));
1557#endif
1558		*q++ = *p--;
1559	}
1560	/*
1561	 * Last hop goes to final destination.
1562	 */
1563	*q = ip_srcrt.dst;
1564#ifdef DIAGNOSTIC
1565	if (ipprintfs)
1566		printf(" %x\n", ntohl(q->s_addr));
1567#endif
1568	return (m);
1569}
1570
1571/*
1572 * Strip out IP options, at higher
1573 * level protocol in the kernel.
1574 * Second argument is buffer to which options
1575 * will be moved, and return value is their length.
1576 * XXX should be deleted; last arg currently ignored.
1577 */
1578void
1579ip_stripoptions(m, mopt)
1580	struct mbuf *m;
1581	struct mbuf *mopt;
1582{
1583	int i;
1584	struct ip *ip = mtod(m, struct ip *);
1585	caddr_t opts;
1586	int olen;
1587
1588	olen = (ip->ip_hl << 2) - sizeof (struct ip);
1589	opts = (caddr_t)(ip + 1);
1590	i = m->m_len - (sizeof (struct ip) + olen);
1591	bcopy(opts  + olen, opts, (unsigned)i);
1592	m->m_len -= olen;
1593	if (m->m_flags & M_PKTHDR)
1594		m->m_pkthdr.len -= olen;
1595	ip->ip_len = htons(ntohs(ip->ip_len) - olen);
1596	ip->ip_hl = sizeof (struct ip) >> 2;
1597}
1598
1599const int inetctlerrmap[PRC_NCMDS] = {
1600	0,		0,		0,		0,
1601	0,		EMSGSIZE,	EHOSTDOWN,	EHOSTUNREACH,
1602	EHOSTUNREACH,	EHOSTUNREACH,	ECONNREFUSED,	ECONNREFUSED,
1603	EMSGSIZE,	EHOSTUNREACH,	0,		0,
1604	0,		0,		0,		0,
1605	ENOPROTOOPT
1606};
1607
1608/*
1609 * Forward a packet.  If some error occurs return the sender
1610 * an icmp packet.  Note we can't always generate a meaningful
1611 * icmp message because icmp doesn't have a large enough repertoire
1612 * of codes and types.
1613 *
1614 * If not forwarding, just drop the packet.  This could be confusing
1615 * if ipforwarding was zero but some routing protocol was advancing
1616 * us as a gateway to somewhere.  However, we must let the routing
1617 * protocol deal with that.
1618 *
1619 * The srcrt parameter indicates whether the packet is being forwarded
1620 * via a source route.
1621 */
1622void
1623ip_forward(m, srcrt)
1624	struct mbuf *m;
1625	int srcrt;
1626{
1627	struct ip *ip = mtod(m, struct ip *);
1628	struct sockaddr_in *sin;
1629	struct rtentry *rt;
1630	int error, type = 0, code = 0;
1631	struct mbuf *mcopy;
1632	n_long dest;
1633	struct ifnet *destifp;
1634#if defined(IPSEC) || defined(FAST_IPSEC)
1635	struct ifnet dummyifp;
1636#endif
1637
1638	/*
1639	 * We are now in the output path.
1640	 */
1641	MCLAIM(m, &ip_tx_mowner);
1642
1643	/*
1644	 * Clear any in-bound checksum flags for this packet.
1645	 */
1646	m->m_pkthdr.csum_flags = 0;
1647
1648	dest = 0;
1649#ifdef DIAGNOSTIC
1650	if (ipprintfs)
1651		printf("forward: src %2.2x dst %2.2x ttl %x\n",
1652		    ntohl(ip->ip_src.s_addr),
1653		    ntohl(ip->ip_dst.s_addr), ip->ip_ttl);
1654#endif
1655	if (m->m_flags & (M_BCAST|M_MCAST) || in_canforward(ip->ip_dst) == 0) {
1656		ipstat.ips_cantforward++;
1657		m_freem(m);
1658		return;
1659	}
1660	if (ip->ip_ttl <= IPTTLDEC) {
1661		icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, dest, 0);
1662		return;
1663	}
1664	ip->ip_ttl -= IPTTLDEC;
1665
1666	sin = satosin(&ipforward_rt.ro_dst);
1667	if ((rt = ipforward_rt.ro_rt) == 0 ||
1668	    !in_hosteq(ip->ip_dst, sin->sin_addr)) {
1669		if (ipforward_rt.ro_rt) {
1670			RTFREE(ipforward_rt.ro_rt);
1671			ipforward_rt.ro_rt = 0;
1672		}
1673		sin->sin_family = AF_INET;
1674		sin->sin_len = sizeof(struct sockaddr_in);
1675		sin->sin_addr = ip->ip_dst;
1676
1677		rtalloc(&ipforward_rt);
1678		if (ipforward_rt.ro_rt == 0) {
1679			icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, dest, 0);
1680			return;
1681		}
1682		rt = ipforward_rt.ro_rt;
1683	}
1684
1685	/*
1686	 * Save at most 68 bytes of the packet in case
1687	 * we need to generate an ICMP message to the src.
1688	 * Pullup to avoid sharing mbuf cluster between m and mcopy.
1689	 */
1690	mcopy = m_copym(m, 0, imin(ntohs(ip->ip_len), 68), M_DONTWAIT);
1691	if (mcopy)
1692		mcopy = m_pullup(mcopy, ip->ip_hl << 2);
1693
1694	/*
1695	 * If forwarding packet using same interface that it came in on,
1696	 * perhaps should send a redirect to sender to shortcut a hop.
1697	 * Only send redirect if source is sending directly to us,
1698	 * and if packet was not source routed (or has any options).
1699	 * Also, don't send redirect if forwarding using a default route
1700	 * or a route modified by a redirect.
1701	 */
1702	if (rt->rt_ifp == m->m_pkthdr.rcvif &&
1703	    (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0 &&
1704	    !in_nullhost(satosin(rt_key(rt))->sin_addr) &&
1705	    ipsendredirects && !srcrt) {
1706		if (rt->rt_ifa &&
1707		    (ip->ip_src.s_addr & ifatoia(rt->rt_ifa)->ia_subnetmask) ==
1708		    ifatoia(rt->rt_ifa)->ia_subnet) {
1709			if (rt->rt_flags & RTF_GATEWAY)
1710				dest = satosin(rt->rt_gateway)->sin_addr.s_addr;
1711			else
1712				dest = ip->ip_dst.s_addr;
1713			/*
1714			 * Router requirements says to only send host
1715			 * redirects.
1716			 */
1717			type = ICMP_REDIRECT;
1718			code = ICMP_REDIRECT_HOST;
1719#ifdef DIAGNOSTIC
1720			if (ipprintfs)
1721				printf("redirect (%d) to %x\n", code,
1722				    (u_int32_t)dest);
1723#endif
1724		}
1725	}
1726
1727	error = ip_output(m, (struct mbuf *)0, &ipforward_rt,
1728	    (IP_FORWARDING | (ip_directedbcast ? IP_ALLOWBROADCAST : 0)),
1729	    (struct ip_moptions *)NULL, (struct socket *)NULL);
1730
1731	if (error)
1732		ipstat.ips_cantforward++;
1733	else {
1734		ipstat.ips_forward++;
1735		if (type)
1736			ipstat.ips_redirectsent++;
1737		else {
1738			if (mcopy) {
1739#ifdef GATEWAY
1740				if (mcopy->m_flags & M_CANFASTFWD)
1741					ipflow_create(&ipforward_rt, mcopy);
1742#endif
1743				m_freem(mcopy);
1744			}
1745			return;
1746		}
1747	}
1748	if (mcopy == NULL)
1749		return;
1750	destifp = NULL;
1751
1752	switch (error) {
1753
1754	case 0:				/* forwarded, but need redirect */
1755		/* type, code set above */
1756		break;
1757
1758	case ENETUNREACH:		/* shouldn't happen, checked above */
1759	case EHOSTUNREACH:
1760	case ENETDOWN:
1761	case EHOSTDOWN:
1762	default:
1763		type = ICMP_UNREACH;
1764		code = ICMP_UNREACH_HOST;
1765		break;
1766
1767	case EMSGSIZE:
1768		type = ICMP_UNREACH;
1769		code = ICMP_UNREACH_NEEDFRAG;
1770#if !defined(IPSEC) && !defined(FAST_IPSEC)
1771		if (ipforward_rt.ro_rt)
1772			destifp = ipforward_rt.ro_rt->rt_ifp;
1773#else
1774		/*
1775		 * If the packet is routed over IPsec tunnel, tell the
1776		 * originator the tunnel MTU.
1777		 *	tunnel MTU = if MTU - sizeof(IP) - ESP/AH hdrsiz
1778		 * XXX quickhack!!!
1779		 */
1780		if (ipforward_rt.ro_rt) {
1781			struct secpolicy *sp;
1782			int ipsecerror;
1783			size_t ipsechdr;
1784			struct route *ro;
1785
1786			sp = ipsec4_getpolicybyaddr(mcopy,
1787			    IPSEC_DIR_OUTBOUND, IP_FORWARDING,
1788			    &ipsecerror);
1789
1790			if (sp == NULL)
1791				destifp = ipforward_rt.ro_rt->rt_ifp;
1792			else {
1793				/* count IPsec header size */
1794				ipsechdr = ipsec4_hdrsiz(mcopy,
1795				    IPSEC_DIR_OUTBOUND, NULL);
1796
1797				/*
1798				 * find the correct route for outer IPv4
1799				 * header, compute tunnel MTU.
1800				 *
1801				 * XXX BUG ALERT
1802				 * The "dummyifp" code relies upon the fact
1803				 * that icmp_error() touches only ifp->if_mtu.
1804				 */
1805				/*XXX*/
1806				destifp = NULL;
1807				if (sp->req != NULL
1808				 && sp->req->sav != NULL
1809				 && sp->req->sav->sah != NULL) {
1810					ro = &sp->req->sav->sah->sa_route;
1811					if (ro->ro_rt && ro->ro_rt->rt_ifp) {
1812						dummyifp.if_mtu =
1813						    ro->ro_rt->rt_rmx.rmx_mtu ?
1814						    ro->ro_rt->rt_rmx.rmx_mtu :
1815						    ro->ro_rt->rt_ifp->if_mtu;
1816						dummyifp.if_mtu -= ipsechdr;
1817						destifp = &dummyifp;
1818					}
1819				}
1820
1821#ifdef	IPSEC
1822				key_freesp(sp);
1823#else
1824				KEY_FREESP(&sp);
1825#endif
1826			}
1827		}
1828#endif /*IPSEC*/
1829		ipstat.ips_cantfrag++;
1830		break;
1831
1832	case ENOBUFS:
1833#if 1
1834		/*
1835		 * a router should not generate ICMP_SOURCEQUENCH as
1836		 * required in RFC1812 Requirements for IP Version 4 Routers.
1837		 * source quench could be a big problem under DoS attacks,
1838		 * or if the underlying interface is rate-limited.
1839		 */
1840		if (mcopy)
1841			m_freem(mcopy);
1842		return;
1843#else
1844		type = ICMP_SOURCEQUENCH;
1845		code = 0;
1846		break;
1847#endif
1848	}
1849	icmp_error(mcopy, type, code, dest, destifp);
1850}
1851
1852void
1853ip_savecontrol(inp, mp, ip, m)
1854	struct inpcb *inp;
1855	struct mbuf **mp;
1856	struct ip *ip;
1857	struct mbuf *m;
1858{
1859
1860	if (inp->inp_socket->so_options & SO_TIMESTAMP) {
1861		struct timeval tv;
1862
1863		microtime(&tv);
1864		*mp = sbcreatecontrol((caddr_t) &tv, sizeof(tv),
1865		    SCM_TIMESTAMP, SOL_SOCKET);
1866		if (*mp)
1867			mp = &(*mp)->m_next;
1868	}
1869	if (inp->inp_flags & INP_RECVDSTADDR) {
1870		*mp = sbcreatecontrol((caddr_t) &ip->ip_dst,
1871		    sizeof(struct in_addr), IP_RECVDSTADDR, IPPROTO_IP);
1872		if (*mp)
1873			mp = &(*mp)->m_next;
1874	}
1875#ifdef notyet
1876	/*
1877	 * XXX
1878	 * Moving these out of udp_input() made them even more broken
1879	 * than they already were.
1880	 *	- fenner@parc.xerox.com
1881	 */
1882	/* options were tossed already */
1883	if (inp->inp_flags & INP_RECVOPTS) {
1884		*mp = sbcreatecontrol((caddr_t) opts_deleted_above,
1885		    sizeof(struct in_addr), IP_RECVOPTS, IPPROTO_IP);
1886		if (*mp)
1887			mp = &(*mp)->m_next;
1888	}
1889	/* ip_srcroute doesn't do what we want here, need to fix */
1890	if (inp->inp_flags & INP_RECVRETOPTS) {
1891		*mp = sbcreatecontrol((caddr_t) ip_srcroute(),
1892		    sizeof(struct in_addr), IP_RECVRETOPTS, IPPROTO_IP);
1893		if (*mp)
1894			mp = &(*mp)->m_next;
1895	}
1896#endif
1897	if (inp->inp_flags & INP_RECVIF) {
1898		struct sockaddr_dl sdl;
1899
1900		sdl.sdl_len = offsetof(struct sockaddr_dl, sdl_data[0]);
1901		sdl.sdl_family = AF_LINK;
1902		sdl.sdl_index = m->m_pkthdr.rcvif ?
1903		    m->m_pkthdr.rcvif->if_index : 0;
1904		sdl.sdl_nlen = sdl.sdl_alen = sdl.sdl_slen = 0;
1905		*mp = sbcreatecontrol((caddr_t) &sdl, sdl.sdl_len,
1906		    IP_RECVIF, IPPROTO_IP);
1907		if (*mp)
1908			mp = &(*mp)->m_next;
1909	}
1910}
1911
1912int
1913ip_sysctl(name, namelen, oldp, oldlenp, newp, newlen)
1914	int *name;
1915	u_int namelen;
1916	void *oldp;
1917	size_t *oldlenp;
1918	void *newp;
1919	size_t newlen;
1920{
1921	extern int subnetsarelocal, hostzeroisbroadcast;
1922
1923	int error, old;
1924
1925	/* All sysctl names (except ifq.*) at this level are terminal. */
1926	if ((namelen != 1) && !(namelen == 2 && name[0] == IPCTL_IFQ))
1927		return (ENOTDIR);
1928
1929	switch (name[0]) {
1930	case IPCTL_FORWARDING:
1931		return (sysctl_int(oldp, oldlenp, newp, newlen, &ipforwarding));
1932	case IPCTL_SENDREDIRECTS:
1933		return (sysctl_int(oldp, oldlenp, newp, newlen,
1934			&ipsendredirects));
1935	case IPCTL_DEFTTL:
1936		return (sysctl_int(oldp, oldlenp, newp, newlen, &ip_defttl));
1937#ifdef notyet
1938	case IPCTL_DEFMTU:
1939		return (sysctl_int(oldp, oldlenp, newp, newlen, &ip_mtu));
1940#endif
1941	case IPCTL_FORWSRCRT:
1942		/* Don't allow this to change in a secure environment.  */
1943		if (securelevel > 0)
1944			return (sysctl_rdint(oldp, oldlenp, newp,
1945			    ip_forwsrcrt));
1946		else
1947			return (sysctl_int(oldp, oldlenp, newp, newlen,
1948			    &ip_forwsrcrt));
1949	case IPCTL_DIRECTEDBCAST:
1950		return (sysctl_int(oldp, oldlenp, newp, newlen,
1951		    &ip_directedbcast));
1952	case IPCTL_ALLOWSRCRT:
1953		return (sysctl_int(oldp, oldlenp, newp, newlen,
1954		    &ip_allowsrcrt));
1955	case IPCTL_SUBNETSARELOCAL:
1956		return (sysctl_int(oldp, oldlenp, newp, newlen,
1957		    &subnetsarelocal));
1958	case IPCTL_MTUDISC:
1959		error = sysctl_int(oldp, oldlenp, newp, newlen,
1960		    &ip_mtudisc);
1961		if (error == 0 && ip_mtudisc == 0)
1962			rt_timer_queue_remove_all(ip_mtudisc_timeout_q, TRUE);
1963		return error;
1964	case IPCTL_ANONPORTMIN:
1965		old = anonportmin;
1966		error = sysctl_int(oldp, oldlenp, newp, newlen, &anonportmin);
1967		if (anonportmin >= anonportmax || anonportmin < 0
1968		    || anonportmin > 65535
1969#ifndef IPNOPRIVPORTS
1970		    || anonportmin < IPPORT_RESERVED
1971#endif
1972		    ) {
1973			anonportmin = old;
1974			return (EINVAL);
1975		}
1976		return (error);
1977	case IPCTL_ANONPORTMAX:
1978		old = anonportmax;
1979		error = sysctl_int(oldp, oldlenp, newp, newlen, &anonportmax);
1980		if (anonportmin >= anonportmax || anonportmax < 0
1981		    || anonportmax > 65535
1982#ifndef IPNOPRIVPORTS
1983		    || anonportmax < IPPORT_RESERVED
1984#endif
1985		    ) {
1986			anonportmax = old;
1987			return (EINVAL);
1988		}
1989		return (error);
1990	case IPCTL_MTUDISCTIMEOUT:
1991		old = ip_mtudisc_timeout;
1992		error = sysctl_int(oldp, oldlenp, newp, newlen,
1993		   &ip_mtudisc_timeout);
1994		if (ip_mtudisc_timeout < 0) {
1995			ip_mtudisc_timeout = old;
1996			return (EINVAL);
1997		}
1998		if (error == 0)
1999			rt_timer_queue_change(ip_mtudisc_timeout_q,
2000					      ip_mtudisc_timeout);
2001		return (error);
2002#ifdef GATEWAY
2003	case IPCTL_MAXFLOWS:
2004	    {
2005		int s;
2006
2007		error = sysctl_int(oldp, oldlenp, newp, newlen,
2008		   &ip_maxflows);
2009		s = splsoftnet();
2010		ipflow_reap(0);
2011		splx(s);
2012		return (error);
2013	    }
2014#endif
2015	case IPCTL_HOSTZEROBROADCAST:
2016		return (sysctl_int(oldp, oldlenp, newp, newlen,
2017		    &hostzeroisbroadcast));
2018#if NGIF > 0
2019	case IPCTL_GIF_TTL:
2020		return (sysctl_int(oldp, oldlenp, newp, newlen,
2021				  &ip_gif_ttl));
2022#endif
2023
2024#if NGRE > 0
2025	case IPCTL_GRE_TTL:
2026		return (sysctl_int(oldp, oldlenp, newp, newlen,
2027				  &ip_gre_ttl));
2028#endif
2029
2030#ifndef IPNOPRIVPORTS
2031	case IPCTL_LOWPORTMIN:
2032		old = lowportmin;
2033		error = sysctl_int(oldp, oldlenp, newp, newlen, &lowportmin);
2034		if (lowportmin >= lowportmax
2035		    || lowportmin > IPPORT_RESERVEDMAX
2036		    || lowportmin < IPPORT_RESERVEDMIN
2037		    ) {
2038			lowportmin = old;
2039			return (EINVAL);
2040		}
2041		return (error);
2042	case IPCTL_LOWPORTMAX:
2043		old = lowportmax;
2044		error = sysctl_int(oldp, oldlenp, newp, newlen, &lowportmax);
2045		if (lowportmin >= lowportmax
2046		    || lowportmax > IPPORT_RESERVEDMAX
2047		    || lowportmax < IPPORT_RESERVEDMIN
2048		    ) {
2049			lowportmax = old;
2050			return (EINVAL);
2051		}
2052		return (error);
2053#endif
2054
2055	case IPCTL_MAXFRAGPACKETS:
2056		return (sysctl_int(oldp, oldlenp, newp, newlen,
2057		    &ip_maxfragpackets));
2058
2059	case IPCTL_CHECKINTERFACE:
2060		return (sysctl_int(oldp, oldlenp, newp, newlen,
2061		    &ip_checkinterface));
2062
2063	case IPCTL_IFQ:
2064		return (sysctl_ifq(name + 1, namelen - 1, oldp, oldlenp,
2065		    newp, newlen, &ipintrq));
2066
2067	default:
2068		return (EOPNOTSUPP);
2069	}
2070	/* NOTREACHED */
2071}
2072