ip_output.c revision 196234
/*-
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ip_output.c	8.3 (Berkeley) 1/21/94
 */
31218822Sdim
32218822Sdim#include <sys/cdefs.h>
33218822Sdim__FBSDID("$FreeBSD: head/sys/netinet/ip_output.c 196234 2009-08-14 23:44:59Z qingli $");
34218822Sdim
35218822Sdim#include "opt_ipfw.h"
36218822Sdim#include "opt_ipsec.h"
37218822Sdim#include "opt_route.h"
38218822Sdim#include "opt_mbuf_stress_test.h"
39218822Sdim#include "opt_mpath.h"
40218822Sdim#include "opt_sctp.h"
41218822Sdim
42218822Sdim#include <sys/param.h>
43218822Sdim#include <sys/systm.h>
44218822Sdim#include <sys/kernel.h>
45218822Sdim#include <sys/malloc.h>
46218822Sdim#include <sys/mbuf.h>
47218822Sdim#include <sys/priv.h>
48218822Sdim#include <sys/proc.h>
49218822Sdim#include <sys/protosw.h>
50218822Sdim#include <sys/socket.h>
5133965Sjdp#include <sys/socketvar.h>
5233965Sjdp#include <sys/sysctl.h>
5360484Sobrien#include <sys/ucred.h>
5460484Sobrien
5533965Sjdp#include <net/if.h>
5633965Sjdp#include <net/if_llatbl.h>
5733965Sjdp#include <net/netisr.h>
5833965Sjdp#include <net/pfil.h>
5933965Sjdp#include <net/route.h>
6077298Sobrien#include <net/flowtable.h>
6133965Sjdp#ifdef RADIX_MPATH
6233965Sjdp#include <net/radix_mpath.h>
63218822Sdim#endif
6489857Sobrien#include <net/vnet.h>
6533965Sjdp
66130561Sobrien#include <netinet/in.h>
6733965Sjdp#include <netinet/in_systm.h>
68218822Sdim#include <netinet/ip.h>
69218822Sdim#include <netinet/in_pcb.h>
70218822Sdim#include <netinet/in_var.h>
71218822Sdim#include <netinet/ip_var.h>
7260484Sobrien#include <netinet/ip_options.h>
73130561Sobrien#ifdef SCTP
74130561Sobrien#include <netinet/sctp.h>
7533965Sjdp#include <netinet/sctp_crc32.h>
7633965Sjdp#endif
7733965Sjdp
7833965Sjdp#ifdef IPSEC
79130561Sobrien#include <netinet/ip_ipsec.h>
80130561Sobrien#include <netipsec/ipsec.h>
8133965Sjdp#endif /* IPSEC*/
8233965Sjdp
8333965Sjdp#include <machine/in_cksum.h>
8489857Sobrien
8533965Sjdp#include <security/mac/mac_framework.h>
8633965Sjdp
/*
 * Debug helper: print the string x, a space, the IPv4 address a in
 * dotted-quad form, and then the string y.
 *
 *	x, y	strings printed before and after the address.
 *	a	any expression yielding an object with an s_addr member that
 *		ntohl() converts to host order.  It is evaluated four times,
 *		so it must be free of side effects.
 *
 * Every argument is parenthesized so that expressions such as *p expand
 * correctly, and each octet is cast to int to match its %d conversion.
 * The expansion keeps its historical trailing semicolon for existing
 * callers.
 */
#define print_ip(x, a, y)	 printf("%s %d.%d.%d.%d%s",\
				(x), (int)((ntohl((a).s_addr)>>24)&0xFF),\
				  (int)((ntohl((a).s_addr)>>16)&0xFF),\
				  (int)((ntohl((a).s_addr)>>8)&0xFF),\
				  (int)((ntohl((a).s_addr))&0xFF), (y));
92130561Sobrien
93130561SobrienVNET_DEFINE(u_short, ip_id);
94130561Sobrien
95130561Sobrien#ifdef MBUF_STRESS_TEST
96130561Sobrienint mbuf_frag_size = 0;
9733965SjdpSYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW,
98130561Sobrien	&mbuf_frag_size, 0, "Fragment outgoing mbufs to this size");
99130561Sobrien#endif
10033965Sjdp
101218822Sdimstatic void	ip_mloopback
102218822Sdim	(struct ifnet *, struct mbuf *, struct sockaddr_in *, int);
10333965Sjdp
10433965Sjdp
10533965Sjdpextern int in_mcast_loop;
10677298Sobrienextern	struct protosw inetsw[];
10733965Sjdp
108218822Sdim/*
109218822Sdim * IP output.  The packet in mbuf chain m contains a skeletal IP
11033965Sjdp * header (with len, off, ttl, proto, tos, src, dst).
111104834Sobrien * The mbuf chain containing the packet will be freed.
112218822Sdim * The mbuf opt, if present, will not be freed.
113218822Sdim * In the IP forwarding case, the packet will arrive with options already
114218822Sdim * inserted, so must have a NULL opt pointer.
115218822Sdim */
116218822Sdimint
117218822Sdimip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags,
118218822Sdim    struct ip_moptions *imo, struct inpcb *inp)
119218822Sdim{
120218822Sdim	struct ip *ip;
121218822Sdim	struct ifnet *ifp = NULL;	/* keep compiler happy */
122218822Sdim	struct mbuf *m0;
123218822Sdim	int hlen = sizeof (struct ip);
124218822Sdim	int mtu;
125218822Sdim	int len, error = 0;
126218822Sdim	int nortfree = 0;
127218822Sdim	struct sockaddr_in *dst = NULL;	/* keep compiler happy */
128218822Sdim	struct in_ifaddr *ia = NULL;
129218822Sdim	int isbroadcast, sw_csum;
130218822Sdim	struct route iproute;
131218822Sdim	struct in_addr odst;
132218822Sdim#ifdef IPFIREWALL_FORWARD
133218822Sdim	struct m_tag *fwd_tag = NULL;
134218822Sdim#endif
13560484Sobrien#ifdef IPSEC
13689857Sobrien	int no_route_but_check_spd = 0;
13789857Sobrien#endif
13889857Sobrien	M_ASSERTPKTHDR(m);
13989857Sobrien
140104834Sobrien	if (inp != NULL) {
141104834Sobrien		INP_LOCK_ASSERT(inp);
142218822Sdim		M_SETFIB(m, inp->inp_inc.inc_fibnum);
14360484Sobrien		if (inp->inp_flags & (INP_HW_FLOWID|INP_SW_FLOWID)) {
144104834Sobrien			m->m_pkthdr.flowid = inp->inp_flowid;
14589857Sobrien			m->m_flags |= M_FLOWID;
146130561Sobrien		}
14733965Sjdp	}
14833965Sjdp
14933965Sjdp	if (ro == NULL) {
15033965Sjdp		ro = &iproute;
15133965Sjdp		bzero(ro, sizeof (*ro));
152130561Sobrien
15333965Sjdp#ifdef FLOWTABLE
15433965Sjdp		/*
15533965Sjdp		 * The flow table returns route entries valid for up to 30
15633965Sjdp		 * seconds; we rely on the remainder of ip_output() taking no
15733965Sjdp		 * longer than that long for the stability of ro_rt.  The
15838889Sjdp		 * flow ID assignment must have happened before this point.
15938889Sjdp		 */
16033965Sjdp		if (flowtable_lookup(V_ip_ft, m, ro) == 0)
16138889Sjdp			nortfree = 1;
16238889Sjdp#endif
16377298Sobrien	}
16438889Sjdp
16577298Sobrien	if (opt) {
16638889Sjdp		len = 0;
16777298Sobrien		m = ip_insertoptions(m, opt, &len);
16877298Sobrien		if (len != 0)
16977298Sobrien			hlen = len;
17077298Sobrien	}
17138889Sjdp	ip = mtod(m, struct ip *);
17233965Sjdp
17333965Sjdp	/*
17433965Sjdp	 * Fill in IP header.  If we are not allowing fragmentation,
17533965Sjdp	 * then the ip_id field is meaningless, but we don't set it
17633965Sjdp	 * to zero.  Doing so causes various problems when devices along
17733965Sjdp	 * the path (routers, load balancers, firewalls, etc.) illegally
17877298Sobrien	 * disable DF on our packet.  Note that a 16-bit counter
17977298Sobrien	 * will wrap around in less than 10 seconds at 100 Mbit/s on a
18033965Sjdp	 * medium with MTU 1500.  See Steven M. Bellovin, "A Technique
18177298Sobrien	 * for Counting NATted Hosts", Proc. IMW'02, available at
18277298Sobrien	 * <http://www.cs.columbia.edu/~smb/papers/fnat.pdf>.
18333965Sjdp	 */
18433965Sjdp	if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
18533965Sjdp		ip->ip_v = IPVERSION;
186130561Sobrien		ip->ip_hl = hlen >> 2;
18733965Sjdp		ip->ip_id = ip_newid();
18877298Sobrien		IPSTAT_INC(ips_localout);
18933965Sjdp	} else {
19033965Sjdp		hlen = ip->ip_hl << 2;
19177298Sobrien	}
19233965Sjdp
19333965Sjdp	dst = (struct sockaddr_in *)&ro->ro_dst;
19477298Sobrienagain:
19533965Sjdp	/*
19633965Sjdp	 * If there is a cached route,
19777298Sobrien	 * check that it is to the same destination
19833965Sjdp	 * and is still up.  If not, free it and try again.
19977298Sobrien	 * The address family should also be checked in case of sharing the
20077298Sobrien	 * cache with IPv6.
20133965Sjdp	 */
20233965Sjdp	if (ro->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 ||
20333965Sjdp			  dst->sin_family != AF_INET ||
20477298Sobrien			  dst->sin_addr.s_addr != ip->ip_dst.s_addr)) {
20533965Sjdp		if (!nortfree) {
20633965Sjdp			RTFREE(ro->ro_rt);
20733965Sjdp			LLE_FREE(ro->ro_lle);
20833965Sjdp		}
20933965Sjdp		ro->ro_rt = (struct rtentry *)NULL;
21033965Sjdp		ro->ro_lle = (struct llentry *)NULL;
21133965Sjdp	}
21233965Sjdp#ifdef IPFIREWALL_FORWARD
21333965Sjdp	if (ro->ro_rt == NULL && fwd_tag == NULL) {
21433965Sjdp#else
21533965Sjdp	if (ro->ro_rt == NULL) {
21633965Sjdp#endif
21733965Sjdp		bzero(dst, sizeof(*dst));
21833965Sjdp		dst->sin_family = AF_INET;
21933965Sjdp		dst->sin_len = sizeof(*dst);
22033965Sjdp		dst->sin_addr = ip->ip_dst;
22133965Sjdp	}
22233965Sjdp	/*
22333965Sjdp	 * If routing to interface only, short circuit routing lookup.
22433965Sjdp	 * The use of an all-ones broadcast address implies this; an
22533965Sjdp	 * interface is specified by the broadcast address of an interface,
22633965Sjdp	 * or the destination address of a ptp interface.
22733965Sjdp	 */
22833965Sjdp	if (flags & IP_SENDONES) {
22933965Sjdp		if ((ia = ifatoia(ifa_ifwithbroadaddr(sintosa(dst)))) == NULL &&
23033965Sjdp		    (ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL) {
23138889Sjdp			IPSTAT_INC(ips_noroute);
23233965Sjdp			error = ENETUNREACH;
23338889Sjdp			goto bad;
23433965Sjdp		}
23533965Sjdp		ip->ip_dst.s_addr = INADDR_BROADCAST;
236130561Sobrien		dst->sin_addr = ip->ip_dst;
237130561Sobrien		ifp = ia->ia_ifp;
23833965Sjdp		ip->ip_ttl = 1;
239130561Sobrien		isbroadcast = 1;
24077298Sobrien	} else if (flags & IP_ROUTETOIF) {
241130561Sobrien		if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL &&
24260484Sobrien		    (ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == NULL) {
24377298Sobrien			IPSTAT_INC(ips_noroute);
24477298Sobrien			error = ENETUNREACH;
245130561Sobrien			goto bad;
246130561Sobrien		}
24760484Sobrien		ifp = ia->ia_ifp;
248130561Sobrien		ip->ip_ttl = 1;
249130561Sobrien		isbroadcast = in_broadcast(dst->sin_addr, ifp);
25033965Sjdp	} else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) &&
25177298Sobrien	    imo != NULL && imo->imo_multicast_ifp != NULL) {
25233965Sjdp		/*
25333965Sjdp		 * Bypass the normal routing lookup for multicast
25433965Sjdp		 * packets if the interface is specified.
25533965Sjdp		 */
25633965Sjdp		ifp = imo->imo_multicast_ifp;
25733965Sjdp		IFP_TO_IA(ifp, ia);
25833965Sjdp		isbroadcast = 0;	/* fool gcc */
25933965Sjdp	} else {
26033965Sjdp		/*
26133965Sjdp		 * We want to do any cloning requested by the link layer,
26233965Sjdp		 * as this is probably required in all cases for correct
26333965Sjdp		 * operation (as it is for ARP).
26433965Sjdp		 */
26533965Sjdp		if (ro->ro_rt == NULL)
26633965Sjdp#ifdef RADIX_MPATH
26733965Sjdp			rtalloc_mpath_fib(ro,
26833965Sjdp			    ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr),
26933965Sjdp			    inp ? inp->inp_inc.inc_fibnum : M_GETFIB(m));
27033965Sjdp#else
27133965Sjdp			in_rtalloc_ign(ro, 0,
27233965Sjdp			    inp ? inp->inp_inc.inc_fibnum : M_GETFIB(m));
273130561Sobrien#endif
27433965Sjdp		if (ro->ro_rt == NULL) {
27533965Sjdp#ifdef IPSEC
27633965Sjdp			/*
277130561Sobrien			 * There is no route for this packet, but it is
27833965Sjdp			 * possible that a matching SPD entry exists.
27933965Sjdp			 */
280130561Sobrien			no_route_but_check_spd = 1;
28133965Sjdp			mtu = 0; /* Silence GCC warning. */
28233965Sjdp			goto sendit;
28333965Sjdp#endif
284130561Sobrien			IPSTAT_INC(ips_noroute);
28533965Sjdp			error = EHOSTUNREACH;
28633965Sjdp			goto bad;
28733965Sjdp		}
28833965Sjdp		ia = ifatoia(ro->ro_rt->rt_ifa);
28933965Sjdp		ifa_ref(&ia->ia_ifa);
29033965Sjdp		ifp = ro->ro_rt->rt_ifp;
291130561Sobrien		ro->ro_rt->rt_rmx.rmx_pksent++;
29233965Sjdp		if (ro->ro_rt->rt_flags & RTF_GATEWAY)
29333965Sjdp			dst = (struct sockaddr_in *)ro->ro_rt->rt_gateway;
294130561Sobrien		if (ro->ro_rt->rt_flags & RTF_HOST)
29533965Sjdp			isbroadcast = (ro->ro_rt->rt_flags & RTF_BROADCAST);
29633965Sjdp		else
297130561Sobrien			isbroadcast = in_broadcast(dst->sin_addr, ifp);
29833965Sjdp	}
29933965Sjdp	/*
30033965Sjdp	 * Calculate MTU.  If we have a route that is up, use that,
30133965Sjdp	 * otherwise use the interface's MTU.
30233965Sjdp	 */
30333965Sjdp	if (ro->ro_rt != NULL && (ro->ro_rt->rt_flags & (RTF_UP|RTF_HOST))) {
30433965Sjdp		/*
30533965Sjdp		 * This case can happen if the user changed the MTU
30633965Sjdp		 * of an interface after enabling IP on it.  Because
30733965Sjdp		 * most netifs don't keep track of routes pointing to
30877298Sobrien		 * them, there is no way for one to update all its
30933965Sjdp		 * routes when the MTU is changed.
31033965Sjdp		 */
31133965Sjdp		if (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)
31233965Sjdp			ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu;
31333965Sjdp		mtu = ro->ro_rt->rt_rmx.rmx_mtu;
31433965Sjdp	} else {
31533965Sjdp		mtu = ifp->if_mtu;
31633965Sjdp	}
31733965Sjdp	if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
31833965Sjdp		m->m_flags |= M_MCAST;
319130561Sobrien		/*
32033965Sjdp		 * IP destination address is multicast.  Make sure "dst"
32133965Sjdp		 * still points to the address in "ro".  (It may have been
32233965Sjdp		 * changed to point to a gateway address, above.)
32333965Sjdp		 */
324130561Sobrien		dst = (struct sockaddr_in *)&ro->ro_dst;
32533965Sjdp		/*
32633965Sjdp		 * See if the caller provided any multicast options
327130561Sobrien		 */
32833965Sjdp		if (imo != NULL) {
32933965Sjdp			ip->ip_ttl = imo->imo_multicast_ttl;
330130561Sobrien			if (imo->imo_multicast_vif != -1)
33133965Sjdp				ip->ip_src.s_addr =
33233965Sjdp				    ip_mcast_src ?
333130561Sobrien				    ip_mcast_src(imo->imo_multicast_vif) :
33433965Sjdp				    INADDR_ANY;
33533965Sjdp		} else
33633965Sjdp			ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL;
33733965Sjdp		/*
33833965Sjdp		 * Confirm that the outgoing interface supports multicast.
33933965Sjdp		 */
34033965Sjdp		if ((imo == NULL) || (imo->imo_multicast_vif == -1)) {
34133965Sjdp			if ((ifp->if_flags & IFF_MULTICAST) == 0) {
34233965Sjdp				IPSTAT_INC(ips_noroute);
34333965Sjdp				error = ENETUNREACH;
34433965Sjdp				goto bad;
34533965Sjdp			}
34633965Sjdp		}
34733965Sjdp		/*
34833965Sjdp		 * If source address not specified yet, use address
34933965Sjdp		 * of outgoing interface.
35033965Sjdp		 */
35133965Sjdp		if (ip->ip_src.s_addr == INADDR_ANY) {
35233965Sjdp			/* Interface may have no addresses. */
35333965Sjdp			if (ia != NULL)
35433965Sjdp				ip->ip_src = IA_SIN(ia)->sin_addr;
35533965Sjdp		}
35660484Sobrien
35760484Sobrien		if ((imo == NULL && in_mcast_loop) ||
35877298Sobrien		    (imo && imo->imo_multicast_loop)) {
35960484Sobrien			/*
36060484Sobrien			 * Loop back multicast datagram if not expressly
36138889Sjdp			 * forbidden to do so, even if we are not a member
36238889Sjdp			 * of the group; ip_input() will filter it later,
36338889Sjdp			 * thus deferring a hash lookup and mutex acquisition
36438889Sjdp			 * at the expense of a cheap copy using m_copym().
36538889Sjdp			 */
36638889Sjdp			ip_mloopback(ifp, m, dst, hlen);
36733965Sjdp		} else {
36833965Sjdp			/*
369218822Sdim			 * If we are acting as a multicast router, perform
370218822Sdim			 * multicast forwarding as if the packet had just
371218822Sdim			 * arrived on the interface to which we are about
37233965Sjdp			 * to send.  The multicast forwarding function
373218822Sdim			 * recursively calls this function, using the
374218822Sdim			 * IP_FORWARDING flag to prevent infinite recursion.
37533965Sjdp			 *
37633965Sjdp			 * Multicasts that are looped back by ip_mloopback(),
37733965Sjdp			 * above, will be forwarded by the ip_input() routine,
37833965Sjdp			 * if necessary.
37933965Sjdp			 */
38033965Sjdp			if (V_ip_mrouter && (flags & IP_FORWARDING) == 0) {
38133965Sjdp				/*
38233965Sjdp				 * If rsvp daemon is not running, do not
38333965Sjdp				 * set ip_moptions. This ensures that the packet
38433965Sjdp				 * is multicast and not just sent down one link
38533965Sjdp				 * as prescribed by rsvpd.
38633965Sjdp				 */
38733965Sjdp				if (!V_rsvp_on)
388130561Sobrien					imo = NULL;
38933965Sjdp				if (ip_mforward &&
39033965Sjdp				    ip_mforward(ip, ifp, m, imo) != 0) {
39133965Sjdp					m_freem(m);
39260484Sobrien					goto done;
39360484Sobrien				}
39460484Sobrien			}
39560484Sobrien		}
39660484Sobrien
39760484Sobrien		/*
39889857Sobrien		 * Multicasts with a time-to-live of zero may be looped-
39960484Sobrien		 * back, above, but must not be transmitted on a network.
40089857Sobrien		 * Also, multicasts addressed to the loopback interface
40160484Sobrien		 * are not sent -- the above call to ip_mloopback() will
40289857Sobrien		 * loop back a copy. ip_input() will drop the copy if
40389857Sobrien		 * this host does not belong to the destination group on
40460484Sobrien		 * the loopback interface.
405130561Sobrien		 */
406130561Sobrien		if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) {
407218822Sdim			m_freem(m);
408218822Sdim			goto done;
409218822Sdim		}
410218822Sdim
411218822Sdim		goto sendit;
412218822Sdim	}
413130561Sobrien
414130561Sobrien	/*
415130561Sobrien	 * If the source address is not specified yet, use the address
416218822Sdim	 * of the outoing interface.
417218822Sdim	 */
418218822Sdim	if (ip->ip_src.s_addr == INADDR_ANY) {
419218822Sdim		/* Interface may have no addresses. */
420218822Sdim		if (ia != NULL) {
421218822Sdim			ip->ip_src = IA_SIN(ia)->sin_addr;
422218822Sdim		}
423218822Sdim	}
424218822Sdim
425218822Sdim	/*
426218822Sdim	 * Verify that we have any chance at all of being able to queue the
427218822Sdim	 * packet or packet fragments, unless ALTQ is enabled on the given
428218822Sdim	 * interface in which case packetdrop should be done by queueing.
429218822Sdim	 */
430218822Sdim#ifdef ALTQ
431218822Sdim	if ((!ALTQ_IS_ENABLED(&ifp->if_snd)) &&
432218822Sdim	    ((ifp->if_snd.ifq_len + ip->ip_len / mtu + 1) >=
433218822Sdim	    ifp->if_snd.ifq_maxlen))
434218822Sdim#else
435218822Sdim	if ((ifp->if_snd.ifq_len + ip->ip_len / mtu + 1) >=
436218822Sdim	    ifp->if_snd.ifq_maxlen)
437218822Sdim#endif /* ALTQ */
438218822Sdim	{
43989857Sobrien		error = ENOBUFS;
44060484Sobrien		IPSTAT_INC(ips_odropped);
44189857Sobrien		ifp->if_snd.ifq_drops += (ip->ip_len / ifp->if_mtu + 1);
44260484Sobrien		goto bad;
44360484Sobrien	}
44477298Sobrien
44589857Sobrien	/*
44660484Sobrien	 * Look for broadcast address and
447218822Sdim	 * verify user is allowed to send
44860484Sobrien	 * such a packet.
44960484Sobrien	 */
450130561Sobrien	if (isbroadcast) {
45189857Sobrien		if ((ifp->if_flags & IFF_BROADCAST) == 0) {
45260484Sobrien			error = EADDRNOTAVAIL;
453218822Sdim			goto bad;
454218822Sdim		}
455218822Sdim		if ((flags & IP_ALLOWBROADCAST) == 0) {
456218822Sdim			error = EACCES;
457218822Sdim			goto bad;
458218822Sdim		}
459218822Sdim		/* don't allow broadcast messages to be fragmented */
460104834Sobrien		if (ip->ip_len > mtu) {
461104834Sobrien			error = EMSGSIZE;
462104834Sobrien			goto bad;
463218822Sdim		}
464218822Sdim		m->m_flags |= M_BCAST;
46560484Sobrien	} else {
46660484Sobrien		m->m_flags &= ~M_BCAST;
467218822Sdim	}
46860484Sobrien
46989857Sobriensendit:
47089857Sobrien#ifdef IPSEC
47189857Sobrien	switch(ip_ipsec_output(&m, inp, &flags, &error, &ifp)) {
47260484Sobrien	case 1:
47338889Sjdp		goto bad;
47438889Sjdp	case -1:
475130561Sobrien		goto done;
476130561Sobrien	case 0:
47738889Sjdp	default:
47838889Sjdp		break;	/* Continue with packet processing. */
479130561Sobrien	}
480130561Sobrien	/*
481130561Sobrien	 * Check if there was a route for this packet; return error if not.
48238889Sjdp	 */
48377298Sobrien	if (no_route_but_check_spd) {
48438889Sjdp		IPSTAT_INC(ips_noroute);
48538889Sjdp		error = EHOSTUNREACH;
48638889Sjdp		goto bad;
48738889Sjdp	}
48838889Sjdp	/* Update variables that are affected by ipsec4_output(). */
48938889Sjdp	ip = mtod(m, struct ip *);
49038889Sjdp	hlen = ip->ip_hl << 2;
49138889Sjdp#endif /* IPSEC */
49238889Sjdp
49338889Sjdp	/* Jump over all PFIL processing if hooks are not active. */
49438889Sjdp	if (!PFIL_HOOKED(&inet_pfil_hook))
49560484Sobrien		goto passout;
49660484Sobrien
497104834Sobrien	/* Run through list of hooks for output packets. */
498104834Sobrien	odst.s_addr = ip->ip_dst.s_addr;
499104834Sobrien	error = pfil_run_hooks(&inet_pfil_hook, &m, ifp, PFIL_OUT, inp);
500104834Sobrien	if (error != 0 || m == NULL)
50138889Sjdp		goto done;
50238889Sjdp
50338889Sjdp	ip = mtod(m, struct ip *);
50438889Sjdp
50538889Sjdp	/* See if destination IP address was changed by packet filter. */
50638889Sjdp	if (odst.s_addr != ip->ip_dst.s_addr) {
50738889Sjdp		m->m_flags |= M_SKIP_FIREWALL;
50860484Sobrien		/* If destination is now ourself drop to ip_input(). */
509104834Sobrien		if (in_localip(ip->ip_dst)) {
510104834Sobrien			m->m_flags |= M_FASTFWD_OURS;
51138889Sjdp			if (m->m_pkthdr.rcvif == NULL)
51238889Sjdp				m->m_pkthdr.rcvif = V_loif;
51360484Sobrien			if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
51438889Sjdp				m->m_pkthdr.csum_flags |=
51538889Sjdp				    CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
516218822Sdim				m->m_pkthdr.csum_data = 0xffff;
517218822Sdim			}
518218822Sdim			m->m_pkthdr.csum_flags |=
519218822Sdim			    CSUM_IP_CHECKED | CSUM_IP_VALID;
520218822Sdim#ifdef SCTP
521218822Sdim			if (m->m_pkthdr.csum_flags & CSUM_SCTP)
522218822Sdim				m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
523218822Sdim#endif
524218822Sdim			error = netisr_queue(NETISR_IP, m);
525218822Sdim			goto done;
52633965Sjdp		} else
527218822Sdim			goto again;	/* Redo the routing table lookup. */
528218822Sdim	}
529218822Sdim
53033965Sjdp#ifdef IPFIREWALL_FORWARD
53189857Sobrien	/* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */
532218822Sdim	if (m->m_flags & M_FASTFWD_OURS) {
53389857Sobrien		if (m->m_pkthdr.rcvif == NULL)
53489857Sobrien			m->m_pkthdr.rcvif = V_loif;
53589857Sobrien		if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
53660484Sobrien			m->m_pkthdr.csum_flags |=
53789857Sobrien			    CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
53889857Sobrien			m->m_pkthdr.csum_data = 0xffff;
53989857Sobrien		}
54089857Sobrien#ifdef SCTP
54189857Sobrien		if (m->m_pkthdr.csum_flags & CSUM_SCTP)
54289857Sobrien			m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
54389857Sobrien#endif
54489857Sobrien		m->m_pkthdr.csum_flags |=
545218822Sdim			    CSUM_IP_CHECKED | CSUM_IP_VALID;
54689857Sobrien
54789857Sobrien		error = netisr_queue(NETISR_IP, m);
54889857Sobrien		goto done;
54989857Sobrien	}
550218822Sdim	/* Or forward to some other address? */
55189857Sobrien	fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL);
55289857Sobrien	if (fwd_tag) {
55389857Sobrien		dst = (struct sockaddr_in *)&ro->ro_dst;
55489857Sobrien		bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in));
55589857Sobrien		m->m_flags |= M_SKIP_FIREWALL;
55689857Sobrien		m_tag_delete(m, fwd_tag);
55789857Sobrien		goto again;
55860484Sobrien	}
559218822Sdim#endif /* IPFIREWALL_FORWARD */
560218822Sdim
561218822Sdimpassout:
56260484Sobrien	/* 127/8 must not appear on wire - RFC1122. */
56360484Sobrien	if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
56460484Sobrien	    (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
56560484Sobrien		if ((ifp->if_flags & IFF_LOOPBACK) == 0) {
56660484Sobrien			IPSTAT_INC(ips_badaddr);
56789857Sobrien			error = EADDRNOTAVAIL;
56860484Sobrien			goto bad;
56977298Sobrien		}
57077298Sobrien	}
57177298Sobrien
57260484Sobrien	m->m_pkthdr.csum_flags |= CSUM_IP;
57360484Sobrien	sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_hwassist;
57460484Sobrien	if (sw_csum & CSUM_DELAY_DATA) {
57560484Sobrien		in_delayed_cksum(m);
57660484Sobrien		sw_csum &= ~CSUM_DELAY_DATA;
57760484Sobrien	}
57860484Sobrien#ifdef SCTP
579130561Sobrien	if (sw_csum & CSUM_SCTP) {
58089857Sobrien		sctp_delayed_cksum(m);
58189857Sobrien		sw_csum &= ~CSUM_SCTP;
58260484Sobrien	}
58360484Sobrien#endif
58460484Sobrien	m->m_pkthdr.csum_flags &= ifp->if_hwassist;
58577298Sobrien
58660484Sobrien	/*
58760484Sobrien	 * If small enough for interface, or the interface will take
58860484Sobrien	 * care of the fragmentation for us, we can just send directly.
58989857Sobrien	 */
59089857Sobrien	if (ip->ip_len <= mtu ||
59189857Sobrien	    (m->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0 ||
59289857Sobrien	    ((ip->ip_off & IP_DF) == 0 && (ifp->if_hwassist & CSUM_FRAGMENT))) {
59389857Sobrien		ip->ip_len = htons(ip->ip_len);
594130561Sobrien		ip->ip_off = htons(ip->ip_off);
595104834Sobrien		ip->ip_sum = 0;
596104834Sobrien		if (sw_csum & CSUM_DELAY_IP)
597104834Sobrien			ip->ip_sum = in_cksum(m, hlen);
598104834Sobrien
599104834Sobrien		/*
600104834Sobrien		 * Record statistics for this interface address.
601104834Sobrien		 * With CSUM_TSO the byte/packet count will be slightly
602104834Sobrien		 * incorrect because we count the IP+TCP headers only
603104834Sobrien		 * once instead of for every generated packet.
604104834Sobrien		 */
605104834Sobrien		if (!(flags & IP_FORWARDING) && ia) {
606104834Sobrien			if (m->m_pkthdr.csum_flags & CSUM_TSO)
607104834Sobrien				ia->ia_ifa.if_opackets +=
608104834Sobrien				    m->m_pkthdr.len / m->m_pkthdr.tso_segsz;
609130561Sobrien			else
61060484Sobrien				ia->ia_ifa.if_opackets++;
61160484Sobrien			ia->ia_ifa.if_obytes += m->m_pkthdr.len;
61289857Sobrien		}
61389857Sobrien#ifdef MBUF_STRESS_TEST
61460484Sobrien		if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size)
61589857Sobrien			m = m_fragment(m, M_DONTWAIT, mbuf_frag_size);
61660484Sobrien#endif
61760484Sobrien		/*
61889857Sobrien		 * Reset layer specific mbuf flags
619104834Sobrien		 * to avoid confusing lower layers.
620104834Sobrien		 */
621104834Sobrien		m->m_flags &= ~(M_PROTOFLAGS);
622104834Sobrien		error = (*ifp->if_output)(ifp, m,
623104834Sobrien		    		(struct sockaddr *)dst, ro);
624104834Sobrien		goto done;
625104834Sobrien	}
62689857Sobrien
62789857Sobrien	/* Balk when DF bit is set or the interface didn't support TSO. */
62889857Sobrien	if ((ip->ip_off & IP_DF) || (m->m_pkthdr.csum_flags & CSUM_TSO)) {
62989857Sobrien		error = EMSGSIZE;
63060484Sobrien		IPSTAT_INC(ips_cantfrag);
63160484Sobrien		goto bad;
63260484Sobrien	}
633104834Sobrien
634104834Sobrien	/*
635104834Sobrien	 * Too large for interface; fragment if possible. If successful,
636104834Sobrien	 * on return, m will point to a list of packets to be sent.
63789857Sobrien	 */
63889857Sobrien	error = ip_fragment(ip, &m, mtu, ifp->if_hwassist, sw_csum);
63989857Sobrien	if (error)
64089857Sobrien		goto bad;
64189857Sobrien	for (; m; m = m0) {
64289857Sobrien		m0 = m->m_nextpkt;
64389857Sobrien		m->m_nextpkt = 0;
64489857Sobrien		if (error == 0) {
64589857Sobrien			/* Record statistics for this interface address. */
64689857Sobrien			if (ia != NULL) {
64789857Sobrien				ia->ia_ifa.if_opackets++;
64889857Sobrien				ia->ia_ifa.if_obytes += m->m_pkthdr.len;
64989857Sobrien			}
65089857Sobrien			/*
65189857Sobrien			 * Reset layer specific mbuf flags
65260484Sobrien			 * to avoid confusing upper layers.
65389857Sobrien			 */
65489857Sobrien			m->m_flags &= ~(M_PROTOFLAGS);
65560484Sobrien
656104834Sobrien			error = (*ifp->if_output)(ifp, m,
657104834Sobrien			    (struct sockaddr *)dst, ro);
658104834Sobrien		} else
659104834Sobrien			m_freem(m);
660104834Sobrien	}
661104834Sobrien
662104834Sobrien	if (error == 0)
663104834Sobrien		IPSTAT_INC(ips_fragmented);
664104834Sobrien
665104834Sobriendone:
666104834Sobrien	if (ro == &iproute && ro->ro_rt && !nortfree) {
667104834Sobrien		RTFREE(ro->ro_rt);
668104834Sobrien	}
669104834Sobrien	if (ia != NULL)
670104834Sobrien		ifa_free(&ia->ia_ifa);
671104834Sobrien	return (error);
672104834Sobrienbad:
673104834Sobrien	m_freem(m);
674104834Sobrien	goto done;
675218822Sdim}
676104834Sobrien
67789857Sobrien/*
678104834Sobrien * Create a chain of fragments which fit the given mtu. m_frag points to the
679104834Sobrien * mbuf to be fragmented; on return it points to the chain with the fragments.
68089857Sobrien * Return 0 if no error. If error, m_frag may contain a partially built
68160484Sobrien * chain of fragments that should be freed by the caller.
68289857Sobrien *
68389857Sobrien * if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist)
68489857Sobrien * sw_csum contains the delayed checksums flags (e.g., CSUM_DELAY_IP).
68560484Sobrien */
68689857Sobrienint
68789857Sobrienip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu,
68860484Sobrien    u_long if_hwassist_flags, int sw_csum)
68989857Sobrien{
69089857Sobrien	int error = 0;
69160484Sobrien	int hlen = ip->ip_hl << 2;
69289857Sobrien	int len = (mtu - hlen) & ~7;	/* size of payload in each fragment */
69389857Sobrien	int off;
69489857Sobrien	struct mbuf *m0 = *m_frag;	/* the original packet		*/
69589857Sobrien	int firstlen;
69689857Sobrien	struct mbuf **mnext;
69789857Sobrien	int nfrags;
69889857Sobrien
69989857Sobrien	if (ip->ip_off & IP_DF) {	/* Fragmentation not allowed */
70089857Sobrien		IPSTAT_INC(ips_cantfrag);
70189857Sobrien		return EMSGSIZE;
70260484Sobrien	}
70389857Sobrien
70489857Sobrien	/*
70589857Sobrien	 * Must be able to put at least 8 bytes per fragment.
70689857Sobrien	 */
70760484Sobrien	if (len < 8)
70860484Sobrien		return EMSGSIZE;
70960484Sobrien
710218822Sdim	/*
711218822Sdim	 * If the interface will not calculate checksums on
712218822Sdim	 * fragmented packets, then do it here.
713218822Sdim	 */
714218822Sdim	if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA &&
715218822Sdim	    (if_hwassist_flags & CSUM_IP_FRAGS) == 0) {
716218822Sdim		in_delayed_cksum(m0);
717218822Sdim		m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
718218822Sdim	}
719218822Sdim#ifdef SCTP
720218822Sdim	if (m0->m_pkthdr.csum_flags & CSUM_SCTP &&
72160484Sobrien	    (if_hwassist_flags & CSUM_IP_FRAGS) == 0) {
72260484Sobrien		sctp_delayed_cksum(m0);
72360484Sobrien		m0->m_pkthdr.csum_flags &= ~CSUM_SCTP;
724218822Sdim	}
72560484Sobrien#endif
72660484Sobrien	if (len > PAGE_SIZE) {
72760484Sobrien		/*
72860484Sobrien		 * Fragment large datagrams such that each segment
72960484Sobrien		 * contains a multiple of PAGE_SIZE amount of data,
73060484Sobrien		 * plus headers. This enables a receiver to perform
73160484Sobrien		 * page-flipping zero-copy optimizations.
73260484Sobrien		 *
733130561Sobrien		 * XXX When does this help given that sender and receiver
73460484Sobrien		 * could have different page sizes, and also mtu could
73560484Sobrien		 * be less than the receiver's page size ?
73660484Sobrien		 */
73760484Sobrien		int newlen;
73860484Sobrien		struct mbuf *m;
73960484Sobrien
74060484Sobrien		for (m = m0, off = 0; m && (off+m->m_len) <= mtu; m = m->m_next)
74160484Sobrien			off += m->m_len;
74260484Sobrien
74360484Sobrien		/*
74460484Sobrien		 * firstlen (off - hlen) must be aligned on an
74560484Sobrien		 * 8-byte boundary
74660484Sobrien		 */
74760484Sobrien		if (off < hlen)
74860484Sobrien			goto smart_frag_failure;
74960484Sobrien		off = ((off - hlen) & ~7) + hlen;
75060484Sobrien		newlen = (~PAGE_MASK) & mtu;
75160484Sobrien		if ((newlen + sizeof (struct ip)) > mtu) {
75260484Sobrien			/* we failed, go back the default */
75360484Sobriensmart_frag_failure:
75460484Sobrien			newlen = len;
75560484Sobrien			off = hlen + len;
75660484Sobrien		}
75760484Sobrien		len = newlen;
75860484Sobrien
75960484Sobrien	} else {
76060484Sobrien		off = hlen + len;
76160484Sobrien	}
76260484Sobrien
76360484Sobrien	firstlen = off - hlen;
76460484Sobrien	mnext = &m0->m_nextpkt;		/* pointer to next packet */
76560484Sobrien
76660484Sobrien	/*
76760484Sobrien	 * Loop through length of segment after first fragment,
76860484Sobrien	 * make new header and copy data of each part and link onto chain.
76960484Sobrien	 * Here, m0 is the original packet, m is the fragment being created.
77091041Sobrien	 * The fragments are linked off the m_nextpkt of the original
77160484Sobrien	 * packet, which after processing serves as the first fragment.
77260484Sobrien	 */
77360484Sobrien	for (nfrags = 1; off < ip->ip_len; off += len, nfrags++) {
774104834Sobrien		struct ip *mhip;	/* ip header on the fragment */
775104834Sobrien		struct mbuf *m;
776218822Sdim		int mhlen = sizeof (struct ip);
77760484Sobrien
77860484Sobrien		MGETHDR(m, M_DONTWAIT, MT_DATA);
77960484Sobrien		if (m == NULL) {
78089857Sobrien			error = ENOBUFS;
78191041Sobrien			IPSTAT_INC(ips_odropped);
782104834Sobrien			goto done;
783104834Sobrien		}
784104834Sobrien		m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG;
785104834Sobrien		/*
786104834Sobrien		 * In the first mbuf, leave room for the link header, then
787104834Sobrien		 * copy the original IP header including options. The payload
788104834Sobrien		 * goes into an additional mbuf chain returned by m_copym().
789104834Sobrien		 */
790104834Sobrien		m->m_data += max_linkhdr;
791104834Sobrien		mhip = mtod(m, struct ip *);
79260484Sobrien		*mhip = *ip;
79360484Sobrien		if (hlen > sizeof (struct ip)) {
79460484Sobrien			mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip);
79560484Sobrien			mhip->ip_v = IPVERSION;
79660484Sobrien			mhip->ip_hl = mhlen >> 2;
79777298Sobrien		}
79860484Sobrien		m->m_len = mhlen;
79960484Sobrien		/* XXX do we need to add ip->ip_off below ? */
80060484Sobrien		mhip->ip_off = ((off - hlen) >> 3) + ip->ip_off;
80160484Sobrien		if (off + len >= ip->ip_len) {	/* last fragment */
80260484Sobrien			len = ip->ip_len - off;
80360484Sobrien			m->m_flags |= M_LASTFRAG;
80433965Sjdp		} else
80533965Sjdp			mhip->ip_off |= IP_MF;
80633965Sjdp		mhip->ip_len = htons((u_short)(len + mhlen));
807218822Sdim		m->m_next = m_copym(m0, off, len, M_DONTWAIT);
808218822Sdim		if (m->m_next == NULL) {	/* copy failed */
809218822Sdim			m_free(m);
81033965Sjdp			error = ENOBUFS;	/* ??? */
81133965Sjdp			IPSTAT_INC(ips_odropped);
81233965Sjdp			goto done;
81333965Sjdp		}
81460484Sobrien		m->m_pkthdr.len = mhlen + len;
81560484Sobrien		m->m_pkthdr.rcvif = NULL;
81660484Sobrien#ifdef MAC
817130561Sobrien		mac_netinet_fragment(m0, m);
81860484Sobrien#endif
81960484Sobrien		m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags;
82060484Sobrien		mhip->ip_off = htons(mhip->ip_off);
82160484Sobrien		mhip->ip_sum = 0;
82260484Sobrien		if (sw_csum & CSUM_DELAY_IP)
823130561Sobrien			mhip->ip_sum = in_cksum(m, mhlen);
82433965Sjdp		*mnext = m;
82533965Sjdp		mnext = &m->m_nextpkt;
82633965Sjdp	}
827130561Sobrien	IPSTAT_ADD(ips_ofragments, nfrags);
82833965Sjdp
82933965Sjdp	/* set first marker for fragment chain */
83033965Sjdp	m0->m_flags |= M_FIRSTFRAG | M_FRAG;
83189857Sobrien	m0->m_pkthdr.csum_data = nfrags;
83289857Sobrien
83389857Sobrien	/*
83489857Sobrien	 * Update first fragment by trimming what's been copied out
83589857Sobrien	 * and updating header.
83633965Sjdp	 */
83733965Sjdp	m_adj(m0, hlen + firstlen - ip->ip_len);
83833965Sjdp	m0->m_pkthdr.len = hlen + firstlen;
83933965Sjdp	ip->ip_len = htons((u_short)m0->m_pkthdr.len);
84033965Sjdp	ip->ip_off |= IP_MF;
84133965Sjdp	ip->ip_off = htons(ip->ip_off);
84233965Sjdp	ip->ip_sum = 0;
84333965Sjdp	if (sw_csum & CSUM_DELAY_IP)
84460484Sobrien		ip->ip_sum = in_cksum(m0, hlen);
84560484Sobrien
84660484Sobriendone:
84760484Sobrien	*m_frag = m0;
84860484Sobrien	return error;
84960484Sobrien}
850130561Sobrien
851130561Sobrienvoid
85260484Sobrienin_delayed_cksum(struct mbuf *m)
85360484Sobrien{
85460484Sobrien	struct ip *ip;
85560484Sobrien	u_short csum, offset;
85660484Sobrien
85760484Sobrien	ip = mtod(m, struct ip *);
85860484Sobrien	offset = ip->ip_hl << 2 ;
85989857Sobrien	csum = in_cksum_skip(m, ip->ip_len, offset);
860104834Sobrien	if (m->m_pkthdr.csum_flags & CSUM_UDP && csum == 0)
86189857Sobrien		csum = 0xffff;
86289857Sobrien	offset += m->m_pkthdr.csum_data;	/* checksum offset */
86360484Sobrien
864130561Sobrien	if (offset + sizeof(u_short) > m->m_len) {
86589857Sobrien		printf("delayed m_pullup, m->len: %d  off: %d  p: %d\n",
86660484Sobrien		    m->m_len, offset, ip->ip_p);
86760484Sobrien		/*
86860484Sobrien		 * XXX
86960484Sobrien		 * this shouldn't happen, but if it does, the
87060484Sobrien		 * correct behavior may be to insert the checksum
87160484Sobrien		 * in the appropriate next mbuf in the chain.
87289857Sobrien		 */
87389857Sobrien		return;
87489857Sobrien	}
87589857Sobrien	*(u_short *)(m->m_data + offset) = csum;
87689857Sobrien}
87760484Sobrien
87889857Sobrien/*
87960484Sobrien * IP socket option processing.
88060484Sobrien */
88160484Sobrienint
88289857Sobrienip_ctloutput(struct socket *so, struct sockopt *sopt)
883104834Sobrien{
88489857Sobrien	struct	inpcb *inp = sotoinpcb(so);
88560484Sobrien	int	error, optval;
88689857Sobrien
88789857Sobrien	error = optval = 0;
88889857Sobrien	if (sopt->sopt_level != IPPROTO_IP) {
88989857Sobrien		if ((sopt->sopt_level == SOL_SOCKET) &&
89089857Sobrien		    (sopt->sopt_name == SO_SETFIB)) {
89189857Sobrien			inp->inp_inc.inc_fibnum = so->so_fibnum;
89289857Sobrien			return (0);
89360484Sobrien		}
89489857Sobrien		return (EINVAL);
89560484Sobrien	}
89689857Sobrien
89789857Sobrien	switch (sopt->sopt_dir) {
89889857Sobrien	case SOPT_SET:
89989857Sobrien		switch (sopt->sopt_name) {
90089857Sobrien		case IP_OPTIONS:
90160484Sobrien#ifdef notyet
90289857Sobrien		case IP_RETOPTS:
90360484Sobrien#endif
904130561Sobrien		{
90589857Sobrien			struct mbuf *m;
90689857Sobrien			if (sopt->sopt_valsize > MLEN) {
90789857Sobrien				error = EMSGSIZE;
90889857Sobrien				break;
90989857Sobrien			}
91089857Sobrien			MGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT, MT_DATA);
91189857Sobrien			if (m == NULL) {
91260484Sobrien				error = ENOBUFS;
91360484Sobrien				break;
91460484Sobrien			}
91560484Sobrien			m->m_len = sopt->sopt_valsize;
91660484Sobrien			error = sooptcopyin(sopt, mtod(m, char *), m->m_len,
91760484Sobrien					    m->m_len);
91889857Sobrien			if (error) {
91960484Sobrien				m_free(m);
92033965Sjdp				break;
92133965Sjdp			}
92233965Sjdp			INP_WLOCK(inp);
92333965Sjdp			error = ip_pcbopts(inp, sopt->sopt_name, m);
92433965Sjdp			INP_WUNLOCK(inp);
92533965Sjdp			return (error);
92633965Sjdp		}
92733965Sjdp
92833965Sjdp		case IP_BINDANY:
92933965Sjdp			if (sopt->sopt_td != NULL) {
93033965Sjdp				error = priv_check(sopt->sopt_td,
93177298Sobrien				    PRIV_NETINET_BINDANY);
93233965Sjdp				if (error)
93333965Sjdp					break;
93433965Sjdp			}
93533965Sjdp			/* FALLTHROUGH */
93633965Sjdp		case IP_TOS:
93733965Sjdp		case IP_TTL:
93833965Sjdp		case IP_MINTTL:
93933965Sjdp		case IP_RECVOPTS:
94033965Sjdp		case IP_RECVRETOPTS:
94133965Sjdp		case IP_RECVDSTADDR:
94233965Sjdp		case IP_RECVTTL:
943130561Sobrien		case IP_RECVIF:
94433965Sjdp		case IP_FAITH:
94533965Sjdp		case IP_ONESBCAST:
94633965Sjdp		case IP_DONTFRAG:
94733965Sjdp			error = sooptcopyin(sopt, &optval, sizeof optval,
94833965Sjdp					    sizeof optval);
94933965Sjdp			if (error)
95060484Sobrien				break;
95133965Sjdp
95233965Sjdp			switch (sopt->sopt_name) {
95333965Sjdp			case IP_TOS:
95433965Sjdp				inp->inp_ip_tos = optval;
95533965Sjdp				break;
95633965Sjdp
95733965Sjdp			case IP_TTL:
95833965Sjdp				inp->inp_ip_ttl = optval;
95933965Sjdp				break;
96033965Sjdp
96133965Sjdp			case IP_MINTTL:
96233965Sjdp				if (optval >= 0 && optval <= MAXTTL)
96333965Sjdp					inp->inp_ip_minttl = optval;
96433965Sjdp				else
96533965Sjdp					error = EINVAL;
96660484Sobrien				break;
96760484Sobrien
96860484Sobrien#define	OPTSET(bit) do {						\
96960484Sobrien	INP_WLOCK(inp);							\
97089857Sobrien	if (optval)							\
97189857Sobrien		inp->inp_flags |= bit;					\
97289857Sobrien	else								\
97360484Sobrien		inp->inp_flags &= ~bit;					\
97433965Sjdp	INP_WUNLOCK(inp);						\
97533965Sjdp} while (0)
97633965Sjdp
97789857Sobrien			case IP_RECVOPTS:
97889857Sobrien				OPTSET(INP_RECVOPTS);
97989857Sobrien				break;
98033965Sjdp
98160484Sobrien			case IP_RECVRETOPTS:
98260484Sobrien				OPTSET(INP_RECVRETOPTS);
983130561Sobrien				break;
98489857Sobrien
98589857Sobrien			case IP_RECVDSTADDR:
98689857Sobrien				OPTSET(INP_RECVDSTADDR);
98760484Sobrien				break;
98860484Sobrien
98933965Sjdp			case IP_RECVTTL:
99033965Sjdp				OPTSET(INP_RECVTTL);
99133965Sjdp				break;
99233965Sjdp
99333965Sjdp			case IP_RECVIF:
99433965Sjdp				OPTSET(INP_RECVIF);
995218822Sdim				break;
996218822Sdim
997218822Sdim			case IP_FAITH:
998218822Sdim				OPTSET(INP_FAITH);
99933965Sjdp				break;
100033965Sjdp
100133965Sjdp			case IP_ONESBCAST:
100233965Sjdp				OPTSET(INP_ONESBCAST);
100333965Sjdp				break;
100433965Sjdp			case IP_DONTFRAG:
100533965Sjdp				OPTSET(INP_DONTFRAG);
100633965Sjdp				break;
100733965Sjdp			case IP_BINDANY:
100833965Sjdp				OPTSET(INP_BINDANY);
1009130561Sobrien				break;
101033965Sjdp			}
101133965Sjdp			break;
101233965Sjdp#undef OPTSET
101333965Sjdp
101433965Sjdp		/*
101533965Sjdp		 * Multicast socket options are processed by the in_mcast
101633965Sjdp		 * module.
101733965Sjdp		 */
101833965Sjdp		case IP_MULTICAST_IF:
1019130561Sobrien		case IP_MULTICAST_VIF:
102033965Sjdp		case IP_MULTICAST_TTL:
102133965Sjdp		case IP_MULTICAST_LOOP:
102233965Sjdp		case IP_ADD_MEMBERSHIP:
1023130561Sobrien		case IP_DROP_MEMBERSHIP:
102433965Sjdp		case IP_ADD_SOURCE_MEMBERSHIP:
102533965Sjdp		case IP_DROP_SOURCE_MEMBERSHIP:
102633965Sjdp		case IP_BLOCK_SOURCE:
102733965Sjdp		case IP_UNBLOCK_SOURCE:
102833965Sjdp		case IP_MSFILTER:
102933965Sjdp		case MCAST_JOIN_GROUP:
103033965Sjdp		case MCAST_LEAVE_GROUP:
1031130561Sobrien		case MCAST_JOIN_SOURCE_GROUP:
103233965Sjdp		case MCAST_LEAVE_SOURCE_GROUP:
103333965Sjdp		case MCAST_BLOCK_SOURCE:
103433965Sjdp		case MCAST_UNBLOCK_SOURCE:
103533965Sjdp			error = inp_setmoptions(inp, sopt);
103677298Sobrien			break;
103733965Sjdp
103833965Sjdp		case IP_PORTRANGE:
103933965Sjdp			error = sooptcopyin(sopt, &optval, sizeof optval,
104033965Sjdp					    sizeof optval);
104177298Sobrien			if (error)
104233965Sjdp				break;
104333965Sjdp
104477298Sobrien			INP_WLOCK(inp);
104533965Sjdp			switch (optval) {
104677298Sobrien			case IP_PORTRANGE_DEFAULT:
104733965Sjdp				inp->inp_flags &= ~(INP_LOWPORT);
104833965Sjdp				inp->inp_flags &= ~(INP_HIGHPORT);
104977298Sobrien				break;
105033965Sjdp
105177298Sobrien			case IP_PORTRANGE_HIGH:
105233965Sjdp				inp->inp_flags &= ~(INP_LOWPORT);
105333965Sjdp				inp->inp_flags |= INP_HIGHPORT;
105433965Sjdp				break;
105533965Sjdp
105633965Sjdp			case IP_PORTRANGE_LOW:
105733965Sjdp				inp->inp_flags &= ~(INP_HIGHPORT);
105833965Sjdp				inp->inp_flags |= INP_LOWPORT;
105933965Sjdp				break;
106033965Sjdp
106133965Sjdp			default:
106233965Sjdp				error = EINVAL;
106333965Sjdp				break;
106433965Sjdp			}
106533965Sjdp			INP_WUNLOCK(inp);
106660484Sobrien			break;
106760484Sobrien
106860484Sobrien#ifdef IPSEC
106933965Sjdp		case IP_IPSEC_POLICY:
107033965Sjdp		{
107133965Sjdp			caddr_t req;
107233965Sjdp			struct mbuf *m;
1073130561Sobrien
107477298Sobrien			if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */
107533965Sjdp				break;
107633965Sjdp			if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */
107733965Sjdp				break;
107833965Sjdp			req = mtod(m, caddr_t);
1079130561Sobrien			error = ipsec_set_policy(inp, sopt->sopt_name, req,
108033965Sjdp			    m->m_len, (sopt->sopt_td != NULL) ?
108133965Sjdp			    sopt->sopt_td->td_ucred : NULL);
108233965Sjdp			m_freem(m);
108333965Sjdp			break;
108433965Sjdp		}
108533965Sjdp#endif /* IPSEC */
108633965Sjdp
108733965Sjdp		default:
1088130561Sobrien			error = ENOPROTOOPT;
108977298Sobrien			break;
109033965Sjdp		}
109133965Sjdp		break;
109277298Sobrien
109333965Sjdp	case SOPT_GET:
109433965Sjdp		switch (sopt->sopt_name) {
109533965Sjdp		case IP_OPTIONS:
109633965Sjdp		case IP_RETOPTS:
109733965Sjdp			if (inp->inp_options)
109833965Sjdp				error = sooptcopyout(sopt,
109933965Sjdp						     mtod(inp->inp_options,
110033965Sjdp							  char *),
110133965Sjdp						     inp->inp_options->m_len);
110233965Sjdp			else
110333965Sjdp				sopt->sopt_valsize = 0;
1104218822Sdim			break;
1105218822Sdim
1106218822Sdim		case IP_TOS:
1107218822Sdim		case IP_TTL:
110833965Sjdp		case IP_MINTTL:
110933965Sjdp		case IP_RECVOPTS:
111033965Sjdp		case IP_RECVRETOPTS:
1111218822Sdim		case IP_RECVDSTADDR:
111233965Sjdp		case IP_RECVTTL:
111333965Sjdp		case IP_RECVIF:
111433965Sjdp		case IP_PORTRANGE:
111533965Sjdp		case IP_FAITH:
111633965Sjdp		case IP_ONESBCAST:
111733965Sjdp		case IP_DONTFRAG:
111833965Sjdp			switch (sopt->sopt_name) {
111989857Sobrien
112033965Sjdp			case IP_TOS:
112133965Sjdp				optval = inp->inp_ip_tos;
112233965Sjdp				break;
112360484Sobrien
112433965Sjdp			case IP_TTL:
112560484Sobrien				optval = inp->inp_ip_ttl;
112677298Sobrien				break;
112733965Sjdp
112833965Sjdp			case IP_MINTTL:
112960484Sobrien				optval = inp->inp_ip_minttl;
113033965Sjdp				break;
113133965Sjdp
1132218822Sdim#define	OPTBIT(bit)	(inp->inp_flags & bit ? 1 : 0)
1133218822Sdim
113433965Sjdp			case IP_RECVOPTS:
113533965Sjdp				optval = OPTBIT(INP_RECVOPTS);
1136218822Sdim				break;
1137218822Sdim
1138218822Sdim			case IP_RECVRETOPTS:
113933965Sjdp				optval = OPTBIT(INP_RECVRETOPTS);
114033965Sjdp				break;
114133965Sjdp
1142218822Sdim			case IP_RECVDSTADDR:
114333965Sjdp				optval = OPTBIT(INP_RECVDSTADDR);
1144218822Sdim				break;
1145218822Sdim
1146218822Sdim			case IP_RECVTTL:
1147218822Sdim				optval = OPTBIT(INP_RECVTTL);
1148218822Sdim				break;
1149218822Sdim
115033965Sjdp			case IP_RECVIF:
1151218822Sdim				optval = OPTBIT(INP_RECVIF);
115233965Sjdp				break;
115333965Sjdp
115433965Sjdp			case IP_PORTRANGE:
115533965Sjdp				if (inp->inp_flags & INP_HIGHPORT)
115633965Sjdp					optval = IP_PORTRANGE_HIGH;
115733965Sjdp				else if (inp->inp_flags & INP_LOWPORT)
115833965Sjdp					optval = IP_PORTRANGE_LOW;
115933965Sjdp				else
116033965Sjdp					optval = 0;
116133965Sjdp				break;
116233965Sjdp
116333965Sjdp			case IP_FAITH:
116433965Sjdp				optval = OPTBIT(INP_FAITH);
116533965Sjdp				break;
116633965Sjdp
116733965Sjdp			case IP_ONESBCAST:
116860484Sobrien				optval = OPTBIT(INP_ONESBCAST);
1169130561Sobrien				break;
117060484Sobrien			case IP_DONTFRAG:
117189857Sobrien				optval = OPTBIT(INP_DONTFRAG);
117289857Sobrien				break;
117389857Sobrien			}
117433965Sjdp			error = sooptcopyout(sopt, &optval, sizeof optval);
117589857Sobrien			break;
1176218822Sdim
1177218822Sdim		/*
1178218822Sdim		 * Multicast socket options are processed by the in_mcast
1179218822Sdim		 * module.
118033965Sjdp		 */
118189857Sobrien		case IP_MULTICAST_IF:
118233965Sjdp		case IP_MULTICAST_VIF:
118333965Sjdp		case IP_MULTICAST_TTL:
118433965Sjdp		case IP_MULTICAST_LOOP:
118533965Sjdp		case IP_MSFILTER:
118633965Sjdp			error = inp_getmoptions(inp, sopt);
118733965Sjdp			break;
118833965Sjdp
118933965Sjdp#ifdef IPSEC
119033965Sjdp		case IP_IPSEC_POLICY:
119133965Sjdp		{
119233965Sjdp			struct mbuf *m = NULL;
119333965Sjdp			caddr_t req = NULL;
1194130561Sobrien			size_t len = 0;
119533965Sjdp
119633965Sjdp			if (m != 0) {
119733965Sjdp				req = mtod(m, caddr_t);
119833965Sjdp				len = m->m_len;
119933965Sjdp			}
120033965Sjdp			error = ipsec_get_policy(sotoinpcb(so), req, len, &m);
120133965Sjdp			if (error == 0)
120233965Sjdp				error = soopt_mcopyout(sopt, m); /* XXX */
120333965Sjdp			if (error == 0)
120433965Sjdp				m_freem(m);
1205218822Sdim			break;
1206218822Sdim		}
120733965Sjdp#endif /* IPSEC */
120833965Sjdp
120960484Sobrien		default:
121033965Sjdp			error = ENOPROTOOPT;
1211218822Sdim			break;
121233965Sjdp		}
121333965Sjdp		break;
121433965Sjdp	}
121533965Sjdp	return (error);
121660484Sobrien}
121733965Sjdp
121860484Sobrien/*
121933965Sjdp * Routine called from ip_output() to loop back a copy of an IP multicast
122033965Sjdp * packet to the input queue of a specified interface.  Note that this
122133965Sjdp * calls the output routine of the loopback "driver", but with an interface
122233965Sjdp * pointer that might NOT be a loopback interface -- evil, but easier than
122333965Sjdp * replicating that code here.
1224218822Sdim */
122533965Sjdpstatic void
122633965Sjdpip_mloopback(struct ifnet *ifp, struct mbuf *m, struct sockaddr_in *dst,
122733965Sjdp    int hlen)
122833965Sjdp{
122933965Sjdp	register struct ip *ip;
123033965Sjdp	struct mbuf *copym;
123133965Sjdp
123289857Sobrien	/*
123333965Sjdp	 * Make a deep copy of the packet because we're going to
123433965Sjdp	 * modify the pack in order to generate checksums.
1235218822Sdim	 */
123633965Sjdp	copym = m_dup(m, M_DONTWAIT);
123733965Sjdp	if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < hlen))
123833965Sjdp		copym = m_pullup(copym, hlen);
123933965Sjdp	if (copym != NULL) {
124060484Sobrien		/* If needed, compute the checksum and mark it as valid. */
124133965Sjdp		if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
124260484Sobrien			in_delayed_cksum(copym);
124333965Sjdp			copym->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
124433965Sjdp			copym->m_pkthdr.csum_flags |=
124533965Sjdp			    CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
124633965Sjdp			copym->m_pkthdr.csum_data = 0xffff;
124733965Sjdp		}
124833965Sjdp		/*
124938889Sjdp		 * We don't bother to fragment if the IP length is greater
125038889Sjdp		 * than the interface's MTU.  Can this possibly matter?
125138889Sjdp		 */
125277298Sobrien		ip = mtod(copym, struct ip *);
125377298Sobrien		ip->ip_len = htons(ip->ip_len);
125477298Sobrien		ip->ip_off = htons(ip->ip_off);
125533965Sjdp		ip->ip_sum = 0;
125633965Sjdp		ip->ip_sum = in_cksum(copym, hlen);
125733965Sjdp#if 1 /* XXX */
125833965Sjdp		if (dst->sin_family != AF_INET) {
125933965Sjdp			printf("ip_mloopback: bad address family %d\n",
1260218822Sdim						dst->sin_family);
126133965Sjdp			dst->sin_family = AF_INET;
126233965Sjdp		}
126333965Sjdp#endif
126433965Sjdp		if_simloop(ifp, copym, dst->sin_family, 0);
126533965Sjdp	}
126633965Sjdp}
126789857Sobrien