ip_reass.c revision 271293
1/*-
2 * Copyright (c) 1982, 1986, 1988, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 4. Neither the name of the University nor the names of its contributors
14 *    may be used to endorse or promote products derived from this software
15 *    without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 *
29 *	@(#)ip_input.c	8.2 (Berkeley) 1/4/94
30 */
31
32#include <sys/cdefs.h>
33__FBSDID("$FreeBSD: head/sys/netinet/ip_input.c 271293 2014-09-09 01:45:39Z adrian $");
34
35#include "opt_bootp.h"
36#include "opt_ipfw.h"
37#include "opt_ipstealth.h"
38#include "opt_ipsec.h"
39#include "opt_route.h"
40
41#include <sys/param.h>
42#include <sys/systm.h>
43#include <sys/mbuf.h>
44#include <sys/malloc.h>
45#include <sys/domain.h>
46#include <sys/protosw.h>
47#include <sys/socket.h>
48#include <sys/time.h>
49#include <sys/kernel.h>
50#include <sys/lock.h>
51#include <sys/rwlock.h>
52#include <sys/sdt.h>
53#include <sys/syslog.h>
54#include <sys/sysctl.h>
55
56#include <net/pfil.h>
57#include <net/if.h>
58#include <net/if_types.h>
59#include <net/if_var.h>
60#include <net/if_dl.h>
61#include <net/route.h>
62#include <net/netisr.h>
63#include <net/vnet.h>
64
65#include <netinet/in.h>
66#include <netinet/in_kdtrace.h>
67#include <netinet/in_systm.h>
68#include <netinet/in_var.h>
69#include <netinet/ip.h>
70#include <netinet/in_pcb.h>
71#include <netinet/ip_var.h>
72#include <netinet/ip_fw.h>
73#include <netinet/ip_icmp.h>
74#include <netinet/ip_options.h>
75#include <machine/in_cksum.h>
76#include <netinet/ip_carp.h>
77#ifdef IPSEC
78#include <netinet/ip_ipsec.h>
79#endif /* IPSEC */
80
81#include <sys/socketvar.h>
82
83#include <security/mac/mac_framework.h>
84
/* Compile-time check that struct ip is exactly 20 bytes. */
85#ifdef CTASSERT
86CTASSERT(sizeof(struct ip) == 20);
87#endif
88
/* Reader/writer lock named "in_ifaddr_lock"; set up through RW_SYSINIT. */
89struct	rwlock in_ifaddr_lock;
90RW_SYSINIT(in_ifaddr_lock, &in_ifaddr_lock, "in_ifaddr_lock");
91
/*
 * Tested in ip_input(): when non-zero, IPPROTO_RSVP packets are always
 * delivered locally.
 */
92VNET_DEFINE(int, rsvp_on);
93
/* Tested in ip_input(): zero means packets not for this host are dropped. */
94VNET_DEFINE(int, ipforwarding);
95SYSCTL_VNET_INT(_net_inet_ip, IPCTL_FORWARDING, forwarding, CTLFLAG_RW,
96    &VNET_NAME(ipforwarding), 0,
97    "Enable IP forwarding between interfaces");
98
99static VNET_DEFINE(int, ipsendredirects) = 1;	/* XXX */
100#define	V_ipsendredirects	VNET(ipsendredirects)
101SYSCTL_VNET_INT(_net_inet_ip, IPCTL_SENDREDIRECTS, redirect, CTLFLAG_RW,
102    &VNET_NAME(ipsendredirects), 0,
103    "Enable sending IP redirects");
104
/*
 * Tested in ip_input() for packets arriving on an IFT_FAITH interface:
 * when set, TCP and ICMP packets are delivered locally instead of freed.
 */
105static VNET_DEFINE(int, ip_keepfaith);
106#define	V_ip_keepfaith		VNET(ip_keepfaith)
107SYSCTL_VNET_INT(_net_inet_ip, IPCTL_KEEPFAITH, keepfaith, CTLFLAG_RW,
108    &VNET_NAME(ip_keepfaith), 0,
109    "Enable packet capture for FAITH IPv4->IPv6 translater daemon");
110
111static VNET_DEFINE(int, ip_sendsourcequench);
112#define	V_ip_sendsourcequench	VNET(ip_sendsourcequench)
113SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, sendsourcequench, CTLFLAG_RW,
114    &VNET_NAME(ip_sendsourcequench), 0,
115    "Enable the transmission of source quench packets");
116
117VNET_DEFINE(int, ip_do_randomid);
118SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, random_id, CTLFLAG_RW,
119    &VNET_NAME(ip_do_randomid), 0,
120    "Assign random ip_id values");
121
122/*
123 * XXX - Setting ip_checkinterface mostly implements the receive side of
124 * the Strong ES model described in RFC 1122, but since the routing table
125 * and transmit implementation do not implement the Strong ES model,
126 * setting this to 1 results in an odd hybrid.
127 *
128 * XXX - ip_checkinterface currently must be disabled if you use ipnat
129 * to translate the destination address to another local interface.
130 *
131 * XXX - ip_checkinterface must be disabled if you add IP aliases
132 * to the loopback interface instead of the interface where the
133 * packets for those addresses are received.
134 */
135static VNET_DEFINE(int, ip_checkinterface);
136#define	V_ip_checkinterface	VNET(ip_checkinterface)
137SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, check_interface, CTLFLAG_RW,
138    &VNET_NAME(ip_checkinterface), 0,
139    "Verify packet arrives on correct interface");
140
141VNET_DEFINE(struct pfil_head, inet_pfil_hook);	/* Packet filter hooks */
142
/*
 * netisr handler description for IPv4: ip_input() is the handler for
 * NETISR_IP.  It is registered by ip_init() via netisr_register().
 */
143static struct netisr_handler ip_nh = {
144	.nh_name = "ip",
145	.nh_handler = ip_input,
146	.nh_proto = NETISR_IP,
147	.nh_policy = NETISR_POLICY_FLOW,
148};
149
150extern	struct domain inetdomain;
151extern	struct protosw inetsw[];
/*
 * Maps an IP protocol number to an index into inetsw[].  Filled in by
 * ip_init() and consulted by ip_input() when dispatching to pr_input.
 */
152u_char	ip_protox[IPPROTO_MAX];
153VNET_DEFINE(struct in_ifaddrhead, in_ifaddrhead);  /* first inet address */
154VNET_DEFINE(struct in_ifaddrhashhead *, in_ifaddrhashtbl); /* inet addr hash table  */
155VNET_DEFINE(u_long, in_ifaddrhmask);		/* mask for hash table */
156
/*
 * Fragment reassembly state: the UMA zone that struct ipq entries are
 * allocated from, and IPREASS_NHASH list heads of reassembly queues.
 * Both are per-VNET.  ipqlock is a single plain static mutex (not a
 * VNET variable); ip_reass() holds it while it works on the queues.
 */
157static VNET_DEFINE(uma_zone_t, ipq_zone);
158static VNET_DEFINE(TAILQ_HEAD(ipqhead, ipq), ipq[IPREASS_NHASH]);
159static struct mtx ipqlock;
160
161#define	V_ipq_zone		VNET(ipq_zone)
162#define	V_ipq			VNET(ipq)
163
/* Thin wrappers around the mtx(9) calls operating on ipqlock. */
164#define	IPQ_LOCK()	mtx_lock(&ipqlock)
165#define	IPQ_UNLOCK()	mtx_unlock(&ipqlock)
166#define	IPQ_LOCK_INIT()	mtx_init(&ipqlock, "ipqlock", NULL, MTX_DEF)
167#define	IPQ_LOCK_ASSERT()	mtx_assert(&ipqlock, MA_OWNED)
168
169static void	maxnipq_update(void);
170static void	ipq_zone_change(void *);
171static void	ip_drain_locked(void);
172
173static VNET_DEFINE(int, maxnipq);  /* Administrative limit on # reass queues. */
174static VNET_DEFINE(int, nipq);			/* Total # of reass queues */
175#define	V_maxnipq		VNET(maxnipq)
176#define	V_nipq			VNET(nipq)
177SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, fragpackets, CTLFLAG_RD,
178    &VNET_NAME(nipq), 0,
179    "Current number of IPv4 fragment reassembly queue entries");
180
/*
 * Per-datagram fragment limit enforced in ip_reass(); a value of 0 makes
 * ip_reass() refuse all fragments.  ip_init() sets it to 16.
 */
181static VNET_DEFINE(int, maxfragsperpacket);
182#define	V_maxfragsperpacket	VNET(maxfragsperpacket)
183SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, maxfragsperpacket, CTLFLAG_RW,
184    &VNET_NAME(maxfragsperpacket), 0,
185    "Maximum number of IPv4 fragments allowed per packet");
186
187#ifdef IPCTL_DEFMTU
188SYSCTL_INT(_net_inet_ip, IPCTL_DEFMTU, mtu, CTLFLAG_RW,
189    &ip_mtu, 0, "Default MTU");
190#endif
191
192#ifdef IPSTEALTH
193VNET_DEFINE(int, ipstealth);
194SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, stealth, CTLFLAG_RW,
195    &VNET_NAME(ipstealth), 0,
196    "IP stealth mode, no TTL decrementation on forwarding");
197#endif
198
/* Defined later in the file; frees one reassembly queue (see its own header). */
199static void	ip_freef(struct ipqhead *, struct ipq *);
200
201/*
202 * IP statistics are stored in the "array" of counter(9)s.
203 */
204VNET_PCPUSTAT_DEFINE(struct ipstat, ipstat);
205VNET_PCPUSTAT_SYSINIT(ipstat);
206SYSCTL_VNET_PCPUSTAT(_net_inet_ip, IPCTL_STATS, stats, struct ipstat, ipstat,
207    "IP statistics (struct ipstat, netinet/ip_var.h)");
208
209#ifdef VIMAGE
210VNET_PCPUSTAT_SYSUNINIT(ipstat);
211#endif /* VIMAGE */
212
213/*
214 * Kernel module interface for updating ipstat.  The argument is an index
215 * into ipstat treated as an array.
216 */
217void
218kmod_ipstat_inc(int statnum)
219{
220
221	counter_u64_add(VNET(ipstat)[statnum], 1);
222}
223
224void
225kmod_ipstat_dec(int statnum)
226{
227
228	counter_u64_add(VNET(ipstat)[statnum], -1);
229}
230
231static int
232sysctl_netinet_intr_queue_maxlen(SYSCTL_HANDLER_ARGS)
233{
234	int error, qlimit;
235
236	netisr_getqlimit(&ip_nh, &qlimit);
237	error = sysctl_handle_int(oidp, &qlimit, 0, req);
238	if (error || !req->newptr)
239		return (error);
240	if (qlimit < 1)
241		return (EINVAL);
242	return (netisr_setqlimit(&ip_nh, qlimit));
243}
244SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQMAXLEN, intr_queue_maxlen,
245    CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_netinet_intr_queue_maxlen, "I",
246    "Maximum size of the IP input queue");
247
248static int
249sysctl_netinet_intr_queue_drops(SYSCTL_HANDLER_ARGS)
250{
251	u_int64_t qdrops_long;
252	int error, qdrops;
253
254	netisr_getqdrops(&ip_nh, &qdrops_long);
255	qdrops = qdrops_long;
256	error = sysctl_handle_int(oidp, &qdrops, 0, req);
257	if (error || !req->newptr)
258		return (error);
259	if (qdrops != 0)
260		return (EINVAL);
261	netisr_clearqdrops(&ip_nh);
262	return (0);
263}
264
265SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQDROPS, intr_queue_drops,
266    CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_netinet_intr_queue_drops, "I",
267    "Number of packets dropped from the IP input queue");
268
269/*
270 * IP initialization: fill in IP protocol switch table.
271 * All protocols not implemented in kernel go to raw IP protocol handler.
272 */
273void
274ip_init(void)
275{
276	struct protosw *pr;
277	int i;
278
279	V_ip_id = time_second & 0xffff;
280
281	TAILQ_INIT(&V_in_ifaddrhead);
282	V_in_ifaddrhashtbl = hashinit(INADDR_NHASH, M_IFADDR, &V_in_ifaddrhmask);
283
284	/* Initialize IP reassembly queue. */
285	for (i = 0; i < IPREASS_NHASH; i++)
286		TAILQ_INIT(&V_ipq[i]);
287	V_maxnipq = nmbclusters / 32;
288	V_maxfragsperpacket = 16;
289	V_ipq_zone = uma_zcreate("ipq", sizeof(struct ipq), NULL, NULL, NULL,
290	    NULL, UMA_ALIGN_PTR, 0);
291	maxnipq_update();
292
293	/* Initialize packet filter hooks. */
294	V_inet_pfil_hook.ph_type = PFIL_TYPE_AF;
295	V_inet_pfil_hook.ph_af = AF_INET;
296	if ((i = pfil_head_register(&V_inet_pfil_hook)) != 0)
297		printf("%s: WARNING: unable to register pfil hook, "
298			"error %d\n", __func__, i);
299
300	/* Skip initialization of globals for non-default instances. */
301	if (!IS_DEFAULT_VNET(curvnet))
302		return;
303
304	pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW);
305	if (pr == NULL)
306		panic("ip_init: PF_INET not found");
307
308	/* Initialize the entire ip_protox[] array to IPPROTO_RAW. */
309	for (i = 0; i < IPPROTO_MAX; i++)
310		ip_protox[i] = pr - inetsw;
311	/*
312	 * Cycle through IP protocols and put them into the appropriate place
313	 * in ip_protox[].
314	 */
315	for (pr = inetdomain.dom_protosw;
316	    pr < inetdomain.dom_protoswNPROTOSW; pr++)
317		if (pr->pr_domain->dom_family == PF_INET &&
318		    pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW) {
319			/* Be careful to only index valid IP protocols. */
320			if (pr->pr_protocol < IPPROTO_MAX)
321				ip_protox[pr->pr_protocol] = pr - inetsw;
322		}
323
324	EVENTHANDLER_REGISTER(nmbclusters_change, ipq_zone_change,
325		NULL, EVENTHANDLER_PRI_ANY);
326
327	/* Initialize various other remaining things. */
328	IPQ_LOCK_INIT();
329	netisr_register(&ip_nh);
330}
331
#ifdef VIMAGE
/*
 * Tear down the per-instance IP state created by ip_init(): unregister
 * the packet filter head, destroy the interface address hash, call
 * ip_drain_locked() with the reassembly lock held, and destroy the
 * reassembly zone.
 */
void
ip_destroy(void)
{
	int error;

	error = pfil_head_unregister(&V_inet_pfil_hook);
	if (error != 0)
		printf("%s: WARNING: unable to unregister pfil hook, "
		    "error %d\n", __func__, error);

	/* The in_ifaddr hash table should already be empty here. */
	hashdestroy(V_in_ifaddrhashtbl, M_IFADDR, V_in_ifaddrhmask);

	IPQ_LOCK();
	ip_drain_locked();
	IPQ_UNLOCK();

	uma_zdestroy(V_ipq_zone);
}
#endif
352
353/*
354 * Ip input routine.  Checksum and byte swap header.  If fragmented
355 * try to reassemble.  Process options.  Pass to next level.
 *
 * m is the received packet; it must carry a packet header (asserted
 * below).  The function returns nothing: the packet is either freed,
 * handed to ip_forward(), or passed to the input routine of the protocol
 * selected through ip_protox[].
 *
 * Header fields are not byte swapped in place here; they are converted
 * with ntohs()/ntohl() where they are read.
356 */
357void
358ip_input(struct mbuf *m)
359{
360	struct ip *ip = NULL;
361	struct in_ifaddr *ia = NULL;
362	struct ifaddr *ifa;
363	struct ifnet *ifp;
364	int    checkif, hlen = 0;
365	uint16_t sum, ip_len;
366	int dchg = 0;				/* dest changed after fw */
367	struct in_addr odst;			/* original dst address */
368
369	M_ASSERTPKTHDR(m);
370
	/*
	 * A packet already flagged M_FASTFWD_OURS skips all of the
	 * validation and filtering below and goes straight to local
	 * delivery.
	 */
371	if (m->m_flags & M_FASTFWD_OURS) {
372		m->m_flags &= ~M_FASTFWD_OURS;
373		/* Set up some basics that will be used later. */
374		ip = mtod(m, struct ip *);
375		hlen = ip->ip_hl << 2;
376		ip_len = ntohs(ip->ip_len);
377		goto ours;
378	}
379
380	IPSTAT_INC(ips_total);
381
382	if (m->m_pkthdr.len < sizeof(struct ip))
383		goto tooshort;
384
	/* Make the fixed 20-byte header contiguous in the first mbuf. */
385	if (m->m_len < sizeof (struct ip) &&
386	    (m = m_pullup(m, sizeof (struct ip))) == NULL) {
387		IPSTAT_INC(ips_toosmall);
388		return;
389	}
390	ip = mtod(m, struct ip *);
391
392	if (ip->ip_v != IPVERSION) {
393		IPSTAT_INC(ips_badvers);
394		goto bad;
395	}
396
397	hlen = ip->ip_hl << 2;
398	if (hlen < sizeof(struct ip)) {	/* minimum header length */
399		IPSTAT_INC(ips_badhlen);
400		goto bad;
401	}
	/* Header with options: pull the whole header into the first mbuf. */
402	if (hlen > m->m_len) {
403		if ((m = m_pullup(m, hlen)) == NULL) {
404			IPSTAT_INC(ips_badhlen);
405			return;
406		}
407		ip = mtod(m, struct ip *);
408	}
409
410	IP_PROBE(receive, NULL, NULL, ip, m->m_pkthdr.rcvif, ip, NULL);
411
412	/* 127/8 must not appear on wire - RFC1122 */
413	ifp = m->m_pkthdr.rcvif;
414	if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
415	    (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
416		if ((ifp->if_flags & IFF_LOOPBACK) == 0) {
417			IPSTAT_INC(ips_badaddr);
418			goto bad;
419		}
420	}
421
	/*
	 * Header checksum.  If csum_flags says it was already checked, use
	 * the recorded verdict (sum becomes non-zero when CSUM_IP_VALID is
	 * clear); otherwise compute it here.  Non-zero sum means bad.
	 */
422	if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) {
423		sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID);
424	} else {
425		if (hlen == sizeof(struct ip)) {
426			sum = in_cksum_hdr(ip);
427		} else {
428			sum = in_cksum(m, hlen);
429		}
430	}
431	if (sum) {
432		IPSTAT_INC(ips_badsum);
433		goto bad;
434	}
435
436#ifdef ALTQ
437	if (altq_input != NULL && (*altq_input)(m, AF_INET) == 0)
438		/* packet is dropped by traffic conditioner */
439		return;
440#endif
441
442	ip_len = ntohs(ip->ip_len);
443	if (ip_len < hlen) {
444		IPSTAT_INC(ips_badlen);
445		goto bad;
446	}
447
448	/*
449	 * Check that the amount of data in the buffers
450	 * is at least as much as the IP header would have us expect.
451	 * Trim mbufs if longer than we expect.
452	 * Drop packet if shorter than we expect.
453	 */
454	if (m->m_pkthdr.len < ip_len) {
	/*
	 * tooshort: is also the target of the goto taken above when the
	 * packet is smaller than a minimal IP header.
	 */
455tooshort:
456		IPSTAT_INC(ips_tooshort);
457		goto bad;
458	}
459	if (m->m_pkthdr.len > ip_len) {
460		if (m->m_len == m->m_pkthdr.len) {
461			m->m_len = ip_len;
462			m->m_pkthdr.len = ip_len;
463		} else
			/*
			 * The count passed here is negative; presumably
			 * m_adj() trims from the tail in that case --
			 * confirm against mbuf(9).
			 */
464			m_adj(m, ip_len - m->m_pkthdr.len);
465	}
466#ifdef IPSEC
467	/*
468	 * Bypass packet filtering for packets previously handled by IPsec.
469	 */
470	if (ip_ipsec_filtertunnel(m))
471		goto passin;
472#endif /* IPSEC */
473
474	/*
475	 * Run through list of hooks for input packets.
476	 *
477	 * NB: Beware of the destination address changing (e.g.
478	 *     by NAT rewriting).  When this happens, tell
479	 *     ip_forward to do the right thing.
480	 */
481
482	/* Jump over all PFIL processing if hooks are not active. */
483	if (!PFIL_HOOKED(&V_inet_pfil_hook))
484		goto passin;
485
486	odst = ip->ip_dst;
487	if (pfil_run_hooks(&V_inet_pfil_hook, &m, ifp, PFIL_IN, NULL) != 0)
488		return;
489	if (m == NULL)			/* consumed by filter */
490		return;
491
	/*
	 * The hooks were given &m, so the mbuf may have been replaced:
	 * reload the header pointer and receive interface from it.
	 */
492	ip = mtod(m, struct ip *);
493	dchg = (odst.s_addr != ip->ip_dst.s_addr);
494	ifp = m->m_pkthdr.rcvif;
495
496	if (m->m_flags & M_FASTFWD_OURS) {
497		m->m_flags &= ~M_FASTFWD_OURS;
498		goto ours;
499	}
500	if (m->m_flags & M_IP_NEXTHOP) {
501		dchg = (m_tag_find(m, PACKET_TAG_IPFORWARD, NULL) != NULL);
502		if (dchg != 0) {
503			/*
504			 * Directly ship the packet on.  This allows
505			 * forwarding packets originally destined to us
506			 * to some other directly connected host.
507			 */
508			ip_forward(m, 1);
509			return;
510		}
511	}
512passin:
513
514	/*
515	 * Process options and, if not destined for us,
516	 * ship it on.  ip_dooptions returns 1 when an
517	 * error was detected (causing an icmp message
518	 * to be sent and the original packet to be freed).
519	 */
520	if (hlen > sizeof (struct ip) && ip_dooptions(m, 0))
521		return;
522
523        /* greedy RSVP, snatches any PATH packet of the RSVP protocol and no
524         * matter if it is destined to another node, or whether it is
525         * a multicast one, RSVP wants it! and prevents it from being forwarded
526         * anywhere else. Also checks if the rsvp daemon is running before
527	 * grabbing the packet.
528         */
529	if (V_rsvp_on && ip->ip_p==IPPROTO_RSVP)
530		goto ours;
531
532	/*
533	 * Check our list of addresses, to see if the packet is for us.
534	 * If we don't have any addresses, assume any unicast packet
535	 * we receive might be for us (and let the upper layers deal
536	 * with it).
537	 */
538	if (TAILQ_EMPTY(&V_in_ifaddrhead) &&
539	    (m->m_flags & (M_MCAST|M_BCAST)) == 0)
540		goto ours;
541
542	/*
543	 * Enable a consistency check between the destination address
544	 * and the arrival interface for a unicast packet (the RFC 1122
545	 * strong ES model) if IP forwarding is disabled and the packet
546	 * is not locally generated and the packet is not subject to
547	 * 'ipfw fwd'.
548	 *
549	 * XXX - Checking also should be disabled if the destination
550	 * address is ipnat'ed to a different interface.
551	 *
552	 * XXX - Checking is incompatible with IP aliases added
553	 * to the loopback interface instead of the interface where
554	 * the packets are received.
555	 *
556	 * XXX - This is the case for carp vhost IPs as well so we
557	 * insert a workaround. If the packet got here, we already
558	 * checked with carp_iamatch() and carp_forus().
559	 */
560	checkif = V_ip_checkinterface && (V_ipforwarding == 0) &&
561	    ifp != NULL && ((ifp->if_flags & IFF_LOOPBACK) == 0) &&
562	    ifp->if_carp == NULL && (dchg == 0);
563
564	/*
565	 * Check for exact addresses in the hash bucket.
566	 */
	/* NOTE(review): the IN_IFADDR_RLOCK()/RUNLOCK() calls around this walk are commented out. */
567	/* IN_IFADDR_RLOCK(); */
568	LIST_FOREACH(ia, INADDR_HASH(ip->ip_dst.s_addr), ia_hash) {
569		/*
570		 * If the address matches, verify that the packet
571		 * arrived via the correct interface if checking is
572		 * enabled.
573		 */
574		if (IA_SIN(ia)->sin_addr.s_addr == ip->ip_dst.s_addr &&
575		    (!checkif || ia->ia_ifp == ifp)) {
576			counter_u64_add(ia->ia_ifa.ifa_ipackets, 1);
577			counter_u64_add(ia->ia_ifa.ifa_ibytes,
578			    m->m_pkthdr.len);
579			/* IN_IFADDR_RUNLOCK(); */
580			goto ours;
581		}
582	}
583	/* IN_IFADDR_RUNLOCK(); */
584
585	/*
586	 * Check for broadcast addresses.
587	 *
588	 * Only accept broadcast packets that arrive via the matching
589	 * interface.  Reception of forwarded directed broadcasts would
590	 * be handled via ip_forward() and ether_output() with the loopback
591	 * into the stack for SIMPLEX interfaces handled by ether_output().
592	 */
593	if (ifp != NULL && ifp->if_flags & IFF_BROADCAST) {
594		IF_ADDR_RLOCK(ifp);
595	        TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
596			if (ifa->ifa_addr->sa_family != AF_INET)
597				continue;
598			ia = ifatoia(ifa);
599			if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr ==
600			    ip->ip_dst.s_addr) {
601				counter_u64_add(ia->ia_ifa.ifa_ipackets, 1);
602				counter_u64_add(ia->ia_ifa.ifa_ibytes,
603				    m->m_pkthdr.len);
604				IF_ADDR_RUNLOCK(ifp);
605				goto ours;
606			}
607#ifdef BOOTP_COMPAT
608			if (IA_SIN(ia)->sin_addr.s_addr == INADDR_ANY) {
609				counter_u64_add(ia->ia_ifa.ifa_ipackets, 1);
610				counter_u64_add(ia->ia_ifa.ifa_ibytes,
611				    m->m_pkthdr.len);
612				IF_ADDR_RUNLOCK(ifp);
613				goto ours;
614			}
615#endif
616		}
617		IF_ADDR_RUNLOCK(ifp);
618		ia = NULL;
619	}
620	/* RFC 3927 2.7: Do not forward datagrams for 169.254.0.0/16. */
621	if (IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr))) {
622		IPSTAT_INC(ips_cantforward);
623		m_freem(m);
624		return;
625	}
626	if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
627		if (V_ip_mrouter) {
628			/*
629			 * If we are acting as a multicast router, all
630			 * incoming multicast packets are passed to the
631			 * kernel-level multicast forwarding function.
632			 * The packet is returned (relatively) intact; if
633			 * ip_mforward() returns a non-zero value, the packet
634			 * must be discarded, else it may be accepted below.
635			 */
636			if (ip_mforward && ip_mforward(ip, ifp, m, 0) != 0) {
637				IPSTAT_INC(ips_cantforward);
638				m_freem(m);
639				return;
640			}
641
642			/*
643			 * The process-level routing daemon needs to receive
644			 * all multicast IGMP packets, whether or not this
645			 * host belongs to their destination groups.
646			 */
647			if (ip->ip_p == IPPROTO_IGMP)
648				goto ours;
649			IPSTAT_INC(ips_forward);
650		}
651		/*
652		 * Assume the packet is for us, to avoid prematurely taking
653		 * a lock on the in_multi hash. Protocols must perform
654		 * their own filtering and update statistics accordingly.
655		 */
656		goto ours;
657	}
658	if (ip->ip_dst.s_addr == (u_long)INADDR_BROADCAST)
659		goto ours;
660	if (ip->ip_dst.s_addr == INADDR_ANY)
661		goto ours;
662
663	/*
664	 * FAITH(Firewall Aided Internet Translator)
665	 */
666	if (ifp && ifp->if_type == IFT_FAITH) {
667		if (V_ip_keepfaith) {
668			if (ip->ip_p == IPPROTO_TCP || ip->ip_p == IPPROTO_ICMP)
669				goto ours;
670		}
671		m_freem(m);
672		return;
673	}
674
675	/*
676	 * Not for us; forward if possible and desirable.
677	 */
678	if (V_ipforwarding == 0) {
679		IPSTAT_INC(ips_cantforward);
680		m_freem(m);
681	} else {
682#ifdef IPSEC
683		if (ip_ipsec_fwd(m))
684			goto bad;
685#endif /* IPSEC */
686		ip_forward(m, dchg);
687	}
688	return;
689
690ours:
691#ifdef IPSTEALTH
692	/*
693	 * IPSTEALTH: Process non-routing options only
694	 * if the packet is destined for us.
695	 */
696	if (V_ipstealth && hlen > sizeof (struct ip) && ip_dooptions(m, 1))
697		return;
698#endif /* IPSTEALTH */
699
700	/*
701	 * Attempt reassembly; if it succeeds, proceed.
702	 * ip_reass() will return a different mbuf.
703	 */
	/* Any of MF or a non-zero fragment offset marks this as a fragment. */
704	if (ip->ip_off & htons(IP_MF | IP_OFFMASK)) {
705		/* XXXGL: shouldn't we save & set m_flags? */
706		m = ip_reass(m);
707		if (m == NULL)
708			return;
709		ip = mtod(m, struct ip *);
710		/* Get the header length of the reassembled packet */
711		hlen = ip->ip_hl << 2;
712	}
713
714#ifdef IPSEC
715	/*
716	 * enforce IPsec policy checking if we are seeing last header.
717	 * note that we do not visit this with protocols with pcb layer
718	 * code - like udp/tcp/raw ip.
719	 */
720	if (ip_ipsec_input(m))
721		goto bad;
722#endif /* IPSEC */
723
724	/*
725	 * Switch out to protocol's input routine.
726	 */
727	IPSTAT_INC(ips_delivered);
728
	/* ip_protox[] turns the protocol number into an inetsw[] slot. */
729	(*inetsw[ip_protox[ip->ip_p]].pr_input)(&m, &hlen, ip->ip_p);
730	return;
731bad:
732	m_freem(m);
733}
734
735/*
736 * After maxnipq has been updated, propagate the change to UMA.  The UMA zone
737 * max has slightly different semantics than the sysctl, for historical
738 * reasons.
739 */
740static void
741maxnipq_update(void)
742{
743
744	/*
745	 * -1 for unlimited allocation.
746	 */
747	if (V_maxnipq < 0)
748		uma_zone_set_max(V_ipq_zone, 0);
749	/*
750	 * Positive number for specific bound.
751	 */
752	if (V_maxnipq > 0)
753		uma_zone_set_max(V_ipq_zone, V_maxnipq);
754	/*
755	 * Zero specifies no further fragment queue allocation -- set the
756	 * bound very low, but rely on implementation elsewhere to actually
757	 * prevent allocation and reclaim current queues.
758	 */
759	if (V_maxnipq == 0)
760		uma_zone_set_max(V_ipq_zone, 1);
761}
762
763static void
764ipq_zone_change(void *tag)
765{
766
767	if (V_maxnipq > 0 && V_maxnipq < (nmbclusters / 32)) {
768		V_maxnipq = nmbclusters / 32;
769		maxnipq_update();
770	}
771}
772
773static int
774sysctl_maxnipq(SYSCTL_HANDLER_ARGS)
775{
776	int error, i;
777
778	i = V_maxnipq;
779	error = sysctl_handle_int(oidp, &i, 0, req);
780	if (error || !req->newptr)
781		return (error);
782
783	/*
784	 * XXXRW: Might be a good idea to sanity check the argument and place
785	 * an extreme upper bound.
786	 */
787	if (i < -1)
788		return (EINVAL);
789	V_maxnipq = i;
790	maxnipq_update();
791	return (0);
792}
793
794SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragpackets, CTLTYPE_INT|CTLFLAG_RW,
795    NULL, 0, sysctl_maxnipq, "I",
796    "Maximum number of IPv4 fragment reassembly queue entries");
797
798#define	M_IP_FRAG	M_PROTO9
799
800/*
801 * Take incoming datagram fragment and try to reassemble it into
802 * whole datagram.  If the argument is the first fragment or one
803 * in between the function will return NULL and store the mbuf
804 * in the fragment chain.  If the argument is the last fragment
805 * the packet will be reassembled and the pointer to the new
806 * mbuf returned for further processing.  Only m_tags attached
807 * to the first packet/fragment are preserved.
808 * The IP header is *NOT* adjusted out of iplen.
809 */
810struct mbuf *
811ip_reass(struct mbuf *m)
812{
813	struct ip *ip;
814	struct mbuf *p, *q, *nq, *t;
815	struct ipq *fp = NULL;
816	struct ipqhead *head;
817	int i, hlen, next;
818	u_int8_t ecn, ecn0;
819	u_short hash;
820
821	/* If maxnipq or maxfragsperpacket are 0, never accept fragments. */
822	if (V_maxnipq == 0 || V_maxfragsperpacket == 0) {
823		IPSTAT_INC(ips_fragments);
824		IPSTAT_INC(ips_fragdropped);
825		m_freem(m);
826		return (NULL);
827	}
828
829	ip = mtod(m, struct ip *);
830	hlen = ip->ip_hl << 2;
831
832	hash = IPREASS_HASH(ip->ip_src.s_addr, ip->ip_id);
833	head = &V_ipq[hash];
834	IPQ_LOCK();
835
836	/*
837	 * Look for queue of fragments
838	 * of this datagram.
839	 */
840	TAILQ_FOREACH(fp, head, ipq_list)
841		if (ip->ip_id == fp->ipq_id &&
842		    ip->ip_src.s_addr == fp->ipq_src.s_addr &&
843		    ip->ip_dst.s_addr == fp->ipq_dst.s_addr &&
844#ifdef MAC
845		    mac_ipq_match(m, fp) &&
846#endif
847		    ip->ip_p == fp->ipq_p)
848			goto found;
849
850	fp = NULL;
851
852	/*
853	 * Attempt to trim the number of allocated fragment queues if it
854	 * exceeds the administrative limit.
855	 */
856	if ((V_nipq > V_maxnipq) && (V_maxnipq > 0)) {
857		/*
858		 * drop something from the tail of the current queue
859		 * before proceeding further
860		 */
861		struct ipq *q = TAILQ_LAST(head, ipqhead);
862		if (q == NULL) {   /* gak */
863			for (i = 0; i < IPREASS_NHASH; i++) {
864				struct ipq *r = TAILQ_LAST(&V_ipq[i], ipqhead);
865				if (r) {
866					IPSTAT_ADD(ips_fragtimeout,
867					    r->ipq_nfrags);
868					ip_freef(&V_ipq[i], r);
869					break;
870				}
871			}
872		} else {
873			IPSTAT_ADD(ips_fragtimeout, q->ipq_nfrags);
874			ip_freef(head, q);
875		}
876	}
877
878found:
879	/*
880	 * Adjust ip_len to not reflect header,
881	 * convert offset of this to bytes.
882	 */
883	ip->ip_len = htons(ntohs(ip->ip_len) - hlen);
884	if (ip->ip_off & htons(IP_MF)) {
885		/*
886		 * Make sure that fragments have a data length
887		 * that's a non-zero multiple of 8 bytes.
888		 */
889		if (ip->ip_len == htons(0) || (ntohs(ip->ip_len) & 0x7) != 0) {
890			IPSTAT_INC(ips_toosmall); /* XXX */
891			goto dropfrag;
892		}
893		m->m_flags |= M_IP_FRAG;
894	} else
895		m->m_flags &= ~M_IP_FRAG;
896	ip->ip_off = htons(ntohs(ip->ip_off) << 3);
897
898	/*
899	 * Attempt reassembly; if it succeeds, proceed.
900	 * ip_reass() will return a different mbuf.
901	 */
902	IPSTAT_INC(ips_fragments);
903	m->m_pkthdr.PH_loc.ptr = ip;
904
905	/* Previous ip_reass() started here. */
906	/*
907	 * Presence of header sizes in mbufs
908	 * would confuse code below.
909	 */
910	m->m_data += hlen;
911	m->m_len -= hlen;
912
913	/*
914	 * If first fragment to arrive, create a reassembly queue.
915	 */
916	if (fp == NULL) {
917		fp = uma_zalloc(V_ipq_zone, M_NOWAIT);
918		if (fp == NULL)
919			goto dropfrag;
920#ifdef MAC
921		if (mac_ipq_init(fp, M_NOWAIT) != 0) {
922			uma_zfree(V_ipq_zone, fp);
923			fp = NULL;
924			goto dropfrag;
925		}
926		mac_ipq_create(m, fp);
927#endif
928		TAILQ_INSERT_HEAD(head, fp, ipq_list);
929		V_nipq++;
930		fp->ipq_nfrags = 1;
931		fp->ipq_ttl = IPFRAGTTL;
932		fp->ipq_p = ip->ip_p;
933		fp->ipq_id = ip->ip_id;
934		fp->ipq_src = ip->ip_src;
935		fp->ipq_dst = ip->ip_dst;
936		fp->ipq_frags = m;
937		m->m_nextpkt = NULL;
938		goto done;
939	} else {
940		fp->ipq_nfrags++;
941#ifdef MAC
942		mac_ipq_update(m, fp);
943#endif
944	}
945
946#define GETIP(m)	((struct ip*)((m)->m_pkthdr.PH_loc.ptr))
947
948	/*
949	 * Handle ECN by comparing this segment with the first one;
950	 * if CE is set, do not lose CE.
951	 * drop if CE and not-ECT are mixed for the same packet.
952	 */
953	ecn = ip->ip_tos & IPTOS_ECN_MASK;
954	ecn0 = GETIP(fp->ipq_frags)->ip_tos & IPTOS_ECN_MASK;
955	if (ecn == IPTOS_ECN_CE) {
956		if (ecn0 == IPTOS_ECN_NOTECT)
957			goto dropfrag;
958		if (ecn0 != IPTOS_ECN_CE)
959			GETIP(fp->ipq_frags)->ip_tos |= IPTOS_ECN_CE;
960	}
961	if (ecn == IPTOS_ECN_NOTECT && ecn0 != IPTOS_ECN_NOTECT)
962		goto dropfrag;
963
964	/*
965	 * Find a segment which begins after this one does.
966	 */
967	for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt)
968		if (ntohs(GETIP(q)->ip_off) > ntohs(ip->ip_off))
969			break;
970
971	/*
972	 * If there is a preceding segment, it may provide some of
973	 * our data already.  If so, drop the data from the incoming
974	 * segment.  If it provides all of our data, drop us, otherwise
975	 * stick new segment in the proper place.
976	 *
977	 * If some of the data is dropped from the preceding
978	 * segment, then it's checksum is invalidated.
979	 */
980	if (p) {
981		i = ntohs(GETIP(p)->ip_off) + ntohs(GETIP(p)->ip_len) -
982		    ntohs(ip->ip_off);
983		if (i > 0) {
984			if (i >= ntohs(ip->ip_len))
985				goto dropfrag;
986			m_adj(m, i);
987			m->m_pkthdr.csum_flags = 0;
988			ip->ip_off = htons(ntohs(ip->ip_off) + i);
989			ip->ip_len = htons(ntohs(ip->ip_len) - i);
990		}
991		m->m_nextpkt = p->m_nextpkt;
992		p->m_nextpkt = m;
993	} else {
994		m->m_nextpkt = fp->ipq_frags;
995		fp->ipq_frags = m;
996	}
997
998	/*
999	 * While we overlap succeeding segments trim them or,
1000	 * if they are completely covered, dequeue them.
1001	 */
1002	for (; q != NULL && ntohs(ip->ip_off) + ntohs(ip->ip_len) >
1003	    ntohs(GETIP(q)->ip_off); q = nq) {
1004		i = (ntohs(ip->ip_off) + ntohs(ip->ip_len)) -
1005		    ntohs(GETIP(q)->ip_off);
1006		if (i < ntohs(GETIP(q)->ip_len)) {
1007			GETIP(q)->ip_len = htons(ntohs(GETIP(q)->ip_len) - i);
1008			GETIP(q)->ip_off = htons(ntohs(GETIP(q)->ip_off) + i);
1009			m_adj(q, i);
1010			q->m_pkthdr.csum_flags = 0;
1011			break;
1012		}
1013		nq = q->m_nextpkt;
1014		m->m_nextpkt = nq;
1015		IPSTAT_INC(ips_fragdropped);
1016		fp->ipq_nfrags--;
1017		m_freem(q);
1018	}
1019
1020	/*
1021	 * Check for complete reassembly and perform frag per packet
1022	 * limiting.
1023	 *
1024	 * Frag limiting is performed here so that the nth frag has
1025	 * a chance to complete the packet before we drop the packet.
1026	 * As a result, n+1 frags are actually allowed per packet, but
1027	 * only n will ever be stored. (n = maxfragsperpacket.)
1028	 *
1029	 */
1030	next = 0;
1031	for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) {
1032		if (ntohs(GETIP(q)->ip_off) != next) {
1033			if (fp->ipq_nfrags > V_maxfragsperpacket) {
1034				IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags);
1035				ip_freef(head, fp);
1036			}
1037			goto done;
1038		}
1039		next += ntohs(GETIP(q)->ip_len);
1040	}
1041	/* Make sure the last packet didn't have the IP_MF flag */
1042	if (p->m_flags & M_IP_FRAG) {
1043		if (fp->ipq_nfrags > V_maxfragsperpacket) {
1044			IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags);
1045			ip_freef(head, fp);
1046		}
1047		goto done;
1048	}
1049
1050	/*
1051	 * Reassembly is complete.  Make sure the packet is a sane size.
1052	 */
1053	q = fp->ipq_frags;
1054	ip = GETIP(q);
1055	if (next + (ip->ip_hl << 2) > IP_MAXPACKET) {
1056		IPSTAT_INC(ips_toolong);
1057		IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags);
1058		ip_freef(head, fp);
1059		goto done;
1060	}
1061
1062	/*
1063	 * Concatenate fragments.
1064	 */
1065	m = q;
1066	t = m->m_next;
1067	m->m_next = NULL;
1068	m_cat(m, t);
1069	nq = q->m_nextpkt;
1070	q->m_nextpkt = NULL;
1071	for (q = nq; q != NULL; q = nq) {
1072		nq = q->m_nextpkt;
1073		q->m_nextpkt = NULL;
1074		m->m_pkthdr.csum_flags &= q->m_pkthdr.csum_flags;
1075		m->m_pkthdr.csum_data += q->m_pkthdr.csum_data;
1076		m_cat(m, q);
1077	}
1078	/*
1079	 * In order to do checksumming faster we do 'end-around carry' here
1080	 * (and not in for{} loop), though it implies we are not going to
1081	 * reassemble more than 64k fragments.
1082	 */
1083	while (m->m_pkthdr.csum_data & 0xffff0000)
1084		m->m_pkthdr.csum_data = (m->m_pkthdr.csum_data & 0xffff) +
1085		    (m->m_pkthdr.csum_data >> 16);
1086#ifdef MAC
1087	mac_ipq_reassemble(fp, m);
1088	mac_ipq_destroy(fp);
1089#endif
1090
1091	/*
1092	 * Create header for new ip packet by modifying header of first
1093	 * packet;  dequeue and discard fragment reassembly header.
1094	 * Make header visible.
1095	 */
1096	ip->ip_len = htons((ip->ip_hl << 2) + next);
1097	ip->ip_src = fp->ipq_src;
1098	ip->ip_dst = fp->ipq_dst;
1099	TAILQ_REMOVE(head, fp, ipq_list);
1100	V_nipq--;
1101	uma_zfree(V_ipq_zone, fp);
1102	m->m_len += (ip->ip_hl << 2);
1103	m->m_data -= (ip->ip_hl << 2);
1104	/* some debugging cruft by sklower, below, will go away soon */
1105	if (m->m_flags & M_PKTHDR)	/* XXX this should be done elsewhere */
1106		m_fixhdr(m);
1107	IPSTAT_INC(ips_reassembled);
1108	IPQ_UNLOCK();
1109	return (m);
1110
1111dropfrag:
1112	IPSTAT_INC(ips_fragdropped);
1113	if (fp != NULL)
1114		fp->ipq_nfrags--;
1115	m_freem(m);
1116done:
1117	IPQ_UNLOCK();
1118	return (NULL);
1119
1120#undef GETIP
1121}
1122
1123/*
1124 * Free a fragment reassembly header and all
1125 * associated datagrams.
1126 */
1127static void
1128ip_freef(struct ipqhead *fhp, struct ipq *fp)
1129{
1130	struct mbuf *q;
1131
1132	IPQ_LOCK_ASSERT();
1133
1134	while (fp->ipq_frags) {
1135		q = fp->ipq_frags;
1136		fp->ipq_frags = q->m_nextpkt;
1137		m_freem(q);
1138	}
1139	TAILQ_REMOVE(fhp, fp, ipq_list);
1140	uma_zfree(V_ipq_zone, fp);
1141	V_nipq--;
1142}
1143
1144/*
1145 * IP timer processing;
1146 * if a timer expires on a reassembly
1147 * queue, discard it.
1148 */
1149void
1150ip_slowtimo(void)
1151{
1152	VNET_ITERATOR_DECL(vnet_iter);
1153	struct ipq *fp;
1154	int i;
1155
1156	VNET_LIST_RLOCK_NOSLEEP();
1157	IPQ_LOCK();
1158	VNET_FOREACH(vnet_iter) {
1159		CURVNET_SET(vnet_iter);
1160		for (i = 0; i < IPREASS_NHASH; i++) {
1161			for(fp = TAILQ_FIRST(&V_ipq[i]); fp;) {
1162				struct ipq *fpp;
1163
1164				fpp = fp;
1165				fp = TAILQ_NEXT(fp, ipq_list);
1166				if(--fpp->ipq_ttl == 0) {
1167					IPSTAT_ADD(ips_fragtimeout,
1168					    fpp->ipq_nfrags);
1169					ip_freef(&V_ipq[i], fpp);
1170				}
1171			}
1172		}
1173		/*
1174		 * If we are over the maximum number of fragments
1175		 * (due to the limit being lowered), drain off
1176		 * enough to get down to the new limit.
1177		 */
1178		if (V_maxnipq >= 0 && V_nipq > V_maxnipq) {
1179			for (i = 0; i < IPREASS_NHASH; i++) {
1180				while (V_nipq > V_maxnipq &&
1181				    !TAILQ_EMPTY(&V_ipq[i])) {
1182					IPSTAT_ADD(ips_fragdropped,
1183					    TAILQ_FIRST(&V_ipq[i])->ipq_nfrags);
1184					ip_freef(&V_ipq[i],
1185					    TAILQ_FIRST(&V_ipq[i]));
1186				}
1187			}
1188		}
1189		CURVNET_RESTORE();
1190	}
1191	IPQ_UNLOCK();
1192	VNET_LIST_RUNLOCK_NOSLEEP();
1193}
1194
1195/*
1196 * Drain off all datagram fragments.
1197 */
1198static void
1199ip_drain_locked(void)
1200{
1201	int     i;
1202
1203	IPQ_LOCK_ASSERT();
1204
1205	for (i = 0; i < IPREASS_NHASH; i++) {
1206		while(!TAILQ_EMPTY(&V_ipq[i])) {
1207			IPSTAT_ADD(ips_fragdropped,
1208			    TAILQ_FIRST(&V_ipq[i])->ipq_nfrags);
1209			ip_freef(&V_ipq[i], TAILQ_FIRST(&V_ipq[i]));
1210		}
1211	}
1212}
1213
1214void
1215ip_drain(void)
1216{
1217	VNET_ITERATOR_DECL(vnet_iter);
1218
1219	VNET_LIST_RLOCK_NOSLEEP();
1220	IPQ_LOCK();
1221	VNET_FOREACH(vnet_iter) {
1222		CURVNET_SET(vnet_iter);
1223		ip_drain_locked();
1224		CURVNET_RESTORE();
1225	}
1226	IPQ_UNLOCK();
1227	VNET_LIST_RUNLOCK_NOSLEEP();
1228	in_rtqdrain();
1229}
1230
1231/*
1232 * The protocol to be inserted into ip_protox[] must be already registered
1233 * in inetsw[], either statically or through pf_proto_register().
1234 */
1235int
1236ipproto_register(short ipproto)
1237{
1238	struct protosw *pr;
1239
1240	/* Sanity checks. */
1241	if (ipproto <= 0 || ipproto >= IPPROTO_MAX)
1242		return (EPROTONOSUPPORT);
1243
1244	/*
1245	 * The protocol slot must not be occupied by another protocol
1246	 * already.  An index pointing to IPPROTO_RAW is unused.
1247	 */
1248	pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW);
1249	if (pr == NULL)
1250		return (EPFNOSUPPORT);
1251	if (ip_protox[ipproto] != pr - inetsw)	/* IPPROTO_RAW */
1252		return (EEXIST);
1253
1254	/* Find the protocol position in inetsw[] and set the index. */
1255	for (pr = inetdomain.dom_protosw;
1256	     pr < inetdomain.dom_protoswNPROTOSW; pr++) {
1257		if (pr->pr_domain->dom_family == PF_INET &&
1258		    pr->pr_protocol && pr->pr_protocol == ipproto) {
1259			ip_protox[pr->pr_protocol] = pr - inetsw;
1260			return (0);
1261		}
1262	}
1263	return (EPROTONOSUPPORT);
1264}
1265
1266int
1267ipproto_unregister(short ipproto)
1268{
1269	struct protosw *pr;
1270
1271	/* Sanity checks. */
1272	if (ipproto <= 0 || ipproto >= IPPROTO_MAX)
1273		return (EPROTONOSUPPORT);
1274
1275	/* Check if the protocol was indeed registered. */
1276	pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW);
1277	if (pr == NULL)
1278		return (EPFNOSUPPORT);
1279	if (ip_protox[ipproto] == pr - inetsw)  /* IPPROTO_RAW */
1280		return (ENOENT);
1281
1282	/* Reset the protocol slot to IPPROTO_RAW. */
1283	ip_protox[ipproto] = pr - inetsw;
1284	return (0);
1285}
1286
1287/*
1288 * Given address of next destination (final or next hop), return (referenced)
1289 * internet address info of interface to be used to get there.
1290 */
1291struct in_ifaddr *
1292ip_rtaddr(struct in_addr dst, u_int fibnum)
1293{
1294	struct route sro;
1295	struct sockaddr_in *sin;
1296	struct in_ifaddr *ia;
1297
1298	bzero(&sro, sizeof(sro));
1299	sin = (struct sockaddr_in *)&sro.ro_dst;
1300	sin->sin_family = AF_INET;
1301	sin->sin_len = sizeof(*sin);
1302	sin->sin_addr = dst;
1303	in_rtalloc_ign(&sro, 0, fibnum);
1304
1305	if (sro.ro_rt == NULL)
1306		return (NULL);
1307
1308	ia = ifatoia(sro.ro_rt->rt_ifa);
1309	ifa_ref(&ia->ia_ifa);
1310	RTFREE(sro.ro_rt);
1311	return (ia);
1312}
1313
/*
 * Table of errno values with PRC_NCMDS slots; a zero entry means no
 * errno is associated with that slot.
 * NOTE(review): the index is presumably a PRC_* control command code;
 * confirm against the PRC_* definitions before relying on slot meanings.
 */
u_char inetctlerrmap[PRC_NCMDS] = {
	0,		/*  0 */
	0,		/*  1 */
	0,		/*  2 */
	0,		/*  3 */
	0,		/*  4 */
	EMSGSIZE,	/*  5 */
	EHOSTDOWN,	/*  6 */
	EHOSTUNREACH,	/*  7 */
	EHOSTUNREACH,	/*  8 */
	EHOSTUNREACH,	/*  9 */
	ECONNREFUSED,	/* 10 */
	ECONNREFUSED,	/* 11 */
	EMSGSIZE,	/* 12 */
	EHOSTUNREACH,	/* 13 */
	0,		/* 14 */
	0,		/* 15 */
	0,		/* 16 */
	0,		/* 17 */
	EHOSTUNREACH,	/* 18 */
	0,		/* 19 */
	ENOPROTOOPT,	/* 20 */
	ECONNREFUSED	/* 21 */
};
1322
1323/*
1324 * Forward a packet.  If some error occurs return the sender
1325 * an icmp packet.  Note we can't always generate a meaningful
1326 * icmp message because icmp doesn't have a large enough repertoire
1327 * of codes and types.
1328 *
1329 * If not forwarding, just drop the packet.  This could be confusing
1330 * if ipforwarding was zero but some routing protocol was advancing
1331 * us as a gateway to somewhere.  However, we must let the routing
1332 * protocol deal with that.
1333 *
1334 * The srcrt parameter indicates whether the packet is being forwarded
1335 * via a source route.
1336 */
1337void
1338ip_forward(struct mbuf *m, int srcrt)
1339{
1340	struct ip *ip = mtod(m, struct ip *);
1341	struct in_ifaddr *ia;
1342	struct mbuf *mcopy;
1343	struct in_addr dest;
1344	struct route ro;
1345	int error, type = 0, code = 0, mtu = 0;
1346
1347	if (m->m_flags & (M_BCAST|M_MCAST) || in_canforward(ip->ip_dst) == 0) {
1348		IPSTAT_INC(ips_cantforward);
1349		m_freem(m);
1350		return;
1351	}
1352#ifdef IPSTEALTH
1353	if (!V_ipstealth) {
1354#endif
1355		if (ip->ip_ttl <= IPTTLDEC) {
1356			icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS,
1357			    0, 0);
1358			return;
1359		}
1360#ifdef IPSTEALTH
1361	}
1362#endif
1363
1364	ia = ip_rtaddr(ip->ip_dst, M_GETFIB(m));
1365#ifndef IPSEC
1366	/*
1367	 * 'ia' may be NULL if there is no route for this destination.
1368	 * In case of IPsec, Don't discard it just yet, but pass it to
1369	 * ip_output in case of outgoing IPsec policy.
1370	 */
1371	if (!srcrt && ia == NULL) {
1372		icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0);
1373		return;
1374	}
1375#endif
1376
1377	/*
1378	 * Save the IP header and at most 8 bytes of the payload,
1379	 * in case we need to generate an ICMP message to the src.
1380	 *
1381	 * XXX this can be optimized a lot by saving the data in a local
1382	 * buffer on the stack (72 bytes at most), and only allocating the
1383	 * mbuf if really necessary. The vast majority of the packets
1384	 * are forwarded without having to send an ICMP back (either
1385	 * because unnecessary, or because rate limited), so we are
1386	 * really we are wasting a lot of work here.
1387	 *
1388	 * We don't use m_copy() because it might return a reference
1389	 * to a shared cluster. Both this function and ip_output()
1390	 * assume exclusive access to the IP header in `m', so any
1391	 * data in a cluster may change before we reach icmp_error().
1392	 */
1393	mcopy = m_gethdr(M_NOWAIT, m->m_type);
1394	if (mcopy != NULL && !m_dup_pkthdr(mcopy, m, M_NOWAIT)) {
1395		/*
1396		 * It's probably ok if the pkthdr dup fails (because
1397		 * the deep copy of the tag chain failed), but for now
1398		 * be conservative and just discard the copy since
1399		 * code below may some day want the tags.
1400		 */
1401		m_free(mcopy);
1402		mcopy = NULL;
1403	}
1404	if (mcopy != NULL) {
1405		mcopy->m_len = min(ntohs(ip->ip_len), M_TRAILINGSPACE(mcopy));
1406		mcopy->m_pkthdr.len = mcopy->m_len;
1407		m_copydata(m, 0, mcopy->m_len, mtod(mcopy, caddr_t));
1408	}
1409
1410#ifdef IPSTEALTH
1411	if (!V_ipstealth) {
1412#endif
1413		ip->ip_ttl -= IPTTLDEC;
1414#ifdef IPSTEALTH
1415	}
1416#endif
1417
1418	/*
1419	 * If forwarding packet using same interface that it came in on,
1420	 * perhaps should send a redirect to sender to shortcut a hop.
1421	 * Only send redirect if source is sending directly to us,
1422	 * and if packet was not source routed (or has any options).
1423	 * Also, don't send redirect if forwarding using a default route
1424	 * or a route modified by a redirect.
1425	 */
1426	dest.s_addr = 0;
1427	if (!srcrt && V_ipsendredirects &&
1428	    ia != NULL && ia->ia_ifp == m->m_pkthdr.rcvif) {
1429		struct sockaddr_in *sin;
1430		struct rtentry *rt;
1431
1432		bzero(&ro, sizeof(ro));
1433		sin = (struct sockaddr_in *)&ro.ro_dst;
1434		sin->sin_family = AF_INET;
1435		sin->sin_len = sizeof(*sin);
1436		sin->sin_addr = ip->ip_dst;
1437		in_rtalloc_ign(&ro, 0, M_GETFIB(m));
1438
1439		rt = ro.ro_rt;
1440
1441		if (rt && (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0 &&
1442		    satosin(rt_key(rt))->sin_addr.s_addr != 0) {
1443#define	RTA(rt)	((struct in_ifaddr *)(rt->rt_ifa))
1444			u_long src = ntohl(ip->ip_src.s_addr);
1445
1446			if (RTA(rt) &&
1447			    (src & RTA(rt)->ia_subnetmask) == RTA(rt)->ia_subnet) {
1448				if (rt->rt_flags & RTF_GATEWAY)
1449					dest.s_addr = satosin(rt->rt_gateway)->sin_addr.s_addr;
1450				else
1451					dest.s_addr = ip->ip_dst.s_addr;
1452				/* Router requirements says to only send host redirects */
1453				type = ICMP_REDIRECT;
1454				code = ICMP_REDIRECT_HOST;
1455			}
1456		}
1457		if (rt)
1458			RTFREE(rt);
1459	}
1460
1461	/*
1462	 * Try to cache the route MTU from ip_output so we can consider it for
1463	 * the ICMP_UNREACH_NEEDFRAG "Next-Hop MTU" field described in RFC1191.
1464	 */
1465	bzero(&ro, sizeof(ro));
1466
1467	error = ip_output(m, NULL, &ro, IP_FORWARDING, NULL, NULL);
1468
1469	if (error == EMSGSIZE && ro.ro_rt)
1470		mtu = ro.ro_rt->rt_mtu;
1471	RO_RTFREE(&ro);
1472
1473	if (error)
1474		IPSTAT_INC(ips_cantforward);
1475	else {
1476		IPSTAT_INC(ips_forward);
1477		if (type)
1478			IPSTAT_INC(ips_redirectsent);
1479		else {
1480			if (mcopy)
1481				m_freem(mcopy);
1482			if (ia != NULL)
1483				ifa_free(&ia->ia_ifa);
1484			return;
1485		}
1486	}
1487	if (mcopy == NULL) {
1488		if (ia != NULL)
1489			ifa_free(&ia->ia_ifa);
1490		return;
1491	}
1492
1493	switch (error) {
1494
1495	case 0:				/* forwarded, but need redirect */
1496		/* type, code set above */
1497		break;
1498
1499	case ENETUNREACH:
1500	case EHOSTUNREACH:
1501	case ENETDOWN:
1502	case EHOSTDOWN:
1503	default:
1504		type = ICMP_UNREACH;
1505		code = ICMP_UNREACH_HOST;
1506		break;
1507
1508	case EMSGSIZE:
1509		type = ICMP_UNREACH;
1510		code = ICMP_UNREACH_NEEDFRAG;
1511
1512#ifdef IPSEC
1513		/*
1514		 * If IPsec is configured for this path,
1515		 * override any possibly mtu value set by ip_output.
1516		 */
1517		mtu = ip_ipsec_mtu(mcopy, mtu);
1518#endif /* IPSEC */
1519		/*
1520		 * If the MTU was set before make sure we are below the
1521		 * interface MTU.
1522		 * If the MTU wasn't set before use the interface mtu or
1523		 * fall back to the next smaller mtu step compared to the
1524		 * current packet size.
1525		 */
1526		if (mtu != 0) {
1527			if (ia != NULL)
1528				mtu = min(mtu, ia->ia_ifp->if_mtu);
1529		} else {
1530			if (ia != NULL)
1531				mtu = ia->ia_ifp->if_mtu;
1532			else
1533				mtu = ip_next_mtu(ntohs(ip->ip_len), 0);
1534		}
1535		IPSTAT_INC(ips_cantfrag);
1536		break;
1537
1538	case ENOBUFS:
1539		/*
1540		 * A router should not generate ICMP_SOURCEQUENCH as
1541		 * required in RFC1812 Requirements for IP Version 4 Routers.
1542		 * Source quench could be a big problem under DoS attacks,
1543		 * or if the underlying interface is rate-limited.
1544		 * Those who need source quench packets may re-enable them
1545		 * via the net.inet.ip.sendsourcequench sysctl.
1546		 */
1547		if (V_ip_sendsourcequench == 0) {
1548			m_freem(mcopy);
1549			if (ia != NULL)
1550				ifa_free(&ia->ia_ifa);
1551			return;
1552		} else {
1553			type = ICMP_SOURCEQUENCH;
1554			code = 0;
1555		}
1556		break;
1557
1558	case EACCES:			/* ipfw denied packet */
1559		m_freem(mcopy);
1560		if (ia != NULL)
1561			ifa_free(&ia->ia_ifa);
1562		return;
1563	}
1564	if (ia != NULL)
1565		ifa_free(&ia->ia_ifa);
1566	icmp_error(mcopy, type, code, dest.s_addr, mtu);
1567}
1568
1569void
1570ip_savecontrol(struct inpcb *inp, struct mbuf **mp, struct ip *ip,
1571    struct mbuf *m)
1572{
1573
1574	if (inp->inp_socket->so_options & (SO_BINTIME | SO_TIMESTAMP)) {
1575		struct bintime bt;
1576
1577		bintime(&bt);
1578		if (inp->inp_socket->so_options & SO_BINTIME) {
1579			*mp = sbcreatecontrol((caddr_t)&bt, sizeof(bt),
1580			    SCM_BINTIME, SOL_SOCKET);
1581			if (*mp)
1582				mp = &(*mp)->m_next;
1583		}
1584		if (inp->inp_socket->so_options & SO_TIMESTAMP) {
1585			struct timeval tv;
1586
1587			bintime2timeval(&bt, &tv);
1588			*mp = sbcreatecontrol((caddr_t)&tv, sizeof(tv),
1589			    SCM_TIMESTAMP, SOL_SOCKET);
1590			if (*mp)
1591				mp = &(*mp)->m_next;
1592		}
1593	}
1594	if (inp->inp_flags & INP_RECVDSTADDR) {
1595		*mp = sbcreatecontrol((caddr_t)&ip->ip_dst,
1596		    sizeof(struct in_addr), IP_RECVDSTADDR, IPPROTO_IP);
1597		if (*mp)
1598			mp = &(*mp)->m_next;
1599	}
1600	if (inp->inp_flags & INP_RECVTTL) {
1601		*mp = sbcreatecontrol((caddr_t)&ip->ip_ttl,
1602		    sizeof(u_char), IP_RECVTTL, IPPROTO_IP);
1603		if (*mp)
1604			mp = &(*mp)->m_next;
1605	}
1606#ifdef notyet
1607	/* XXX
1608	 * Moving these out of udp_input() made them even more broken
1609	 * than they already were.
1610	 */
1611	/* options were tossed already */
1612	if (inp->inp_flags & INP_RECVOPTS) {
1613		*mp = sbcreatecontrol((caddr_t)opts_deleted_above,
1614		    sizeof(struct in_addr), IP_RECVOPTS, IPPROTO_IP);
1615		if (*mp)
1616			mp = &(*mp)->m_next;
1617	}
1618	/* ip_srcroute doesn't do what we want here, need to fix */
1619	if (inp->inp_flags & INP_RECVRETOPTS) {
1620		*mp = sbcreatecontrol((caddr_t)ip_srcroute(m),
1621		    sizeof(struct in_addr), IP_RECVRETOPTS, IPPROTO_IP);
1622		if (*mp)
1623			mp = &(*mp)->m_next;
1624	}
1625#endif
1626	if (inp->inp_flags & INP_RECVIF) {
1627		struct ifnet *ifp;
1628		struct sdlbuf {
1629			struct sockaddr_dl sdl;
1630			u_char	pad[32];
1631		} sdlbuf;
1632		struct sockaddr_dl *sdp;
1633		struct sockaddr_dl *sdl2 = &sdlbuf.sdl;
1634
1635		if ((ifp = m->m_pkthdr.rcvif) &&
1636		    ifp->if_index && ifp->if_index <= V_if_index) {
1637			sdp = (struct sockaddr_dl *)ifp->if_addr->ifa_addr;
1638			/*
1639			 * Change our mind and don't try copy.
1640			 */
1641			if (sdp->sdl_family != AF_LINK ||
1642			    sdp->sdl_len > sizeof(sdlbuf)) {
1643				goto makedummy;
1644			}
1645			bcopy(sdp, sdl2, sdp->sdl_len);
1646		} else {
1647makedummy:
1648			sdl2->sdl_len =
1649			    offsetof(struct sockaddr_dl, sdl_data[0]);
1650			sdl2->sdl_family = AF_LINK;
1651			sdl2->sdl_index = 0;
1652			sdl2->sdl_nlen = sdl2->sdl_alen = sdl2->sdl_slen = 0;
1653		}
1654		*mp = sbcreatecontrol((caddr_t)sdl2, sdl2->sdl_len,
1655		    IP_RECVIF, IPPROTO_IP);
1656		if (*mp)
1657			mp = &(*mp)->m_next;
1658	}
1659	if (inp->inp_flags & INP_RECVTOS) {
1660		*mp = sbcreatecontrol((caddr_t)&ip->ip_tos,
1661		    sizeof(u_char), IP_RECVTOS, IPPROTO_IP);
1662		if (*mp)
1663			mp = &(*mp)->m_next;
1664	}
1665
1666	if (inp->inp_flags2 & INP_RECVFLOWID) {
1667		uint32_t flowid, flow_type;
1668
1669		flowid = m->m_pkthdr.flowid;
1670		flow_type = M_HASHTYPE_GET(m);
1671
1672		/*
1673		 * XXX should handle the failure of one or the
1674		 * other - don't populate both?
1675		 */
1676		*mp = sbcreatecontrol((caddr_t) &flowid,
1677		    sizeof(uint32_t), IP_FLOWID, IPPROTO_IP);
1678		if (*mp)
1679			mp = &(*mp)->m_next;
1680		*mp = sbcreatecontrol((caddr_t) &flow_type,
1681		    sizeof(uint32_t), IP_FLOWTYPE, IPPROTO_IP);
1682		if (*mp)
1683			mp = &(*mp)->m_next;
1684	}
1685
1686#ifdef	RSS
1687	if (inp->inp_flags2 & INP_RECVRSSBUCKETID) {
1688		uint32_t flowid, flow_type;
1689		uint32_t rss_bucketid;
1690
1691		flowid = m->m_pkthdr.flowid;
1692		flow_type = M_HASHTYPE_GET(m);
1693
1694		if (rss_hash2bucket(flowid, flow_type, &rss_bucketid) == 0) {
1695			*mp = sbcreatecontrol((caddr_t) &rss_bucketid,
1696			   sizeof(uint32_t), IP_RSSBUCKETID, IPPROTO_IP);
1697			if (*mp)
1698				mp = &(*mp)->m_next;
1699		}
1700	}
1701#endif
1702}
1703
1704/*
1705 * XXXRW: Multicast routing code in ip_mroute.c is generally MPSAFE, but the
1706 * ip_rsvp and ip_rsvp_on variables need to be interlocked with rsvp_on
1707 * locking.  This code remains in ip_input.c as ip_mroute.c is optionally
1708 * compiled.
1709 */
1710static VNET_DEFINE(int, ip_rsvp_on);
1711VNET_DEFINE(struct socket *, ip_rsvpd);
1712
1713#define	V_ip_rsvp_on		VNET(ip_rsvp_on)
1714
1715int
1716ip_rsvp_init(struct socket *so)
1717{
1718
1719	if (so->so_type != SOCK_RAW ||
1720	    so->so_proto->pr_protocol != IPPROTO_RSVP)
1721		return EOPNOTSUPP;
1722
1723	if (V_ip_rsvpd != NULL)
1724		return EADDRINUSE;
1725
1726	V_ip_rsvpd = so;
1727	/*
1728	 * This may seem silly, but we need to be sure we don't over-increment
1729	 * the RSVP counter, in case something slips up.
1730	 */
1731	if (!V_ip_rsvp_on) {
1732		V_ip_rsvp_on = 1;
1733		V_rsvp_on++;
1734	}
1735
1736	return 0;
1737}
1738
1739int
1740ip_rsvp_done(void)
1741{
1742
1743	V_ip_rsvpd = NULL;
1744	/*
1745	 * This may seem silly, but we need to be sure we don't over-decrement
1746	 * the RSVP counter, in case something slips up.
1747	 */
1748	if (V_ip_rsvp_on) {
1749		V_ip_rsvp_on = 0;
1750		V_rsvp_on--;
1751	}
1752	return 0;
1753}
1754
1755int
1756rsvp_input(struct mbuf **mp, int *offp, int proto)
1757{
1758	struct mbuf *m;
1759
1760	m = *mp;
1761	*mp = NULL;
1762
1763	if (rsvp_input_p) { /* call the real one if loaded */
1764		*mp = m;
1765		rsvp_input_p(mp, offp, proto);
1766		return (IPPROTO_DONE);
1767	}
1768
1769	/* Can still get packets with rsvp_on = 0 if there is a local member
1770	 * of the group to which the RSVP packet is addressed.  But in this
1771	 * case we want to throw the packet away.
1772	 */
1773
1774	if (!V_rsvp_on) {
1775		m_freem(m);
1776		return (IPPROTO_DONE);
1777	}
1778
1779	if (V_ip_rsvpd != NULL) {
1780		*mp = m;
1781		rip_input(mp, offp, proto);
1782		return (IPPROTO_DONE);
1783	}
1784	/* Drop the packet */
1785	m_freem(m);
1786	return (IPPROTO_DONE);
1787}
1788