ip_output.c revision 191621
/*-
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ip_output.c	8.3 (Berkeley) 1/21/94
 */
31
32#include <sys/cdefs.h>
33__FBSDID("$FreeBSD: head/sys/netinet/ip_output.c 191621 2009-04-28 11:10:33Z trasz $");
34
35#include "opt_ipfw.h"
36#include "opt_inet.h"
37#include "opt_ipsec.h"
38#include "opt_route.h"
39#include "opt_mac.h"
40#include "opt_mbuf_stress_test.h"
41#include "opt_mpath.h"
42#include "opt_sctp.h"
43
44#include <sys/param.h>
45#include <sys/systm.h>
46#include <sys/kernel.h>
47#include <sys/malloc.h>
48#include <sys/mbuf.h>
49#include <sys/priv.h>
50#include <sys/proc.h>
51#include <sys/protosw.h>
52#include <sys/socket.h>
53#include <sys/socketvar.h>
54#include <sys/sysctl.h>
55#include <sys/ucred.h>
56#include <sys/vimage.h>
57
58#include <net/if.h>
59#include <net/netisr.h>
60#include <net/pfil.h>
61#include <net/route.h>
62#include <net/flowtable.h>
63#ifdef RADIX_MPATH
64#include <net/radix_mpath.h>
65#endif
66#include <net/vnet.h>
67
68#include <netinet/in.h>
69#include <netinet/in_systm.h>
70#include <netinet/ip.h>
71#include <netinet/in_pcb.h>
72#include <netinet/in_var.h>
73#include <netinet/ip_var.h>
74#include <netinet/ip_options.h>
75#include <netinet/vinet.h>
76#ifdef SCTP
77#include <netinet/sctp.h>
78#include <netinet/sctp_crc32.h>
79#endif
80
81#ifdef IPSEC
82#include <netinet/ip_ipsec.h>
83#include <netipsec/ipsec.h>
84#endif /* IPSEC*/
85
86#include <machine/in_cksum.h>
87
88#include <security/mac/mac_framework.h>
89
/*
 * Debugging aid: print the string x, then the IPv4 address held in the
 * struct in_addr a (s_addr is converted with ntohl()) as a dotted quad,
 * then the string y.
 *
 * Every argument is parenthesized so that arbitrary expressions can be
 * passed safely; a is evaluated four times, so it must be free of side
 * effects.  The expansion deliberately still ends in ';' so existing
 * call sites keep compiling unchanged; as a consequence the macro must
 * not be used as the unbraced body of an if/else.
 */
#define print_ip(x, a, y)	 printf("%s %d.%d.%d.%d%s",\
				(x), (int)((ntohl((a).s_addr)>>24)&0xFF),\
				  (int)((ntohl((a).s_addr)>>16)&0xFF),\
				  (int)((ntohl((a).s_addr)>>8)&0xFF),\
				  (int)((ntohl((a).s_addr))&0xFF), (y));
95
96#ifdef VIMAGE_GLOBALS
97u_short ip_id;
98#endif
99
100#ifdef MBUF_STRESS_TEST
101int mbuf_frag_size = 0;
102SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW,
103	&mbuf_frag_size, 0, "Fragment outgoing mbufs to this size");
104#endif
105
106#if defined(IP_NONLOCALBIND)
107static int ip_nonlocalok = 0;
108SYSCTL_INT(_net_inet_ip, OID_AUTO, nonlocalok,
109	CTLFLAG_RW|CTLFLAG_SECURE, &ip_nonlocalok, 0, "");
110#endif
111
112static void	ip_mloopback
113	(struct ifnet *, struct mbuf *, struct sockaddr_in *, int);
114
115
116extern int in_mcast_loop;
117extern	struct protosw inetsw[];
118
119/*
120 * IP output.  The packet in mbuf chain m contains a skeletal IP
121 * header (with len, off, ttl, proto, tos, src, dst).
122 * The mbuf chain containing the packet will be freed.
123 * The mbuf opt, if present, will not be freed.
124 * In the IP forwarding case, the packet will arrive with options already
125 * inserted, so must have a NULL opt pointer.
126 */
127int
128ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags,
129    struct ip_moptions *imo, struct inpcb *inp)
130{
131	INIT_VNET_NET(curvnet);
132	INIT_VNET_INET(curvnet);
133	struct ip *ip;
134	struct ifnet *ifp = NULL;	/* keep compiler happy */
135	struct mbuf *m0;
136	int hlen = sizeof (struct ip);
137	int mtu;
138	int len, error = 0;
139	int nortfree = 0;
140	struct sockaddr_in *dst = NULL;	/* keep compiler happy */
141	struct in_ifaddr *ia = NULL;
142	int isbroadcast, sw_csum;
143	struct route iproute;
144	struct in_addr odst;
145#ifdef IPFIREWALL_FORWARD
146	struct m_tag *fwd_tag = NULL;
147#endif
148#ifdef IPSEC
149	int no_route_but_check_spd = 0;
150#endif
151	M_ASSERTPKTHDR(m);
152
153	if (ro == NULL) {
154		ro = &iproute;
155		bzero(ro, sizeof (*ro));
156	}
157
158	if (inp != NULL) {
159		M_SETFIB(m, inp->inp_inc.inc_fibnum);
160		INP_LOCK_ASSERT(inp);
161		if (inp->inp_flags & (INP_HW_FLOWID|INP_SW_FLOWID)) {
162			m->m_pkthdr.flowid = inp->inp_flowid;
163			m->m_flags |= M_FLOWID;
164		}
165	}
166	if ((ro == &iproute) && (ro->ro_rt == NULL) && (ro->ro_lle == NULL)) {
167		if (flowtable_lookup(ip_ft, m, ro) == 0)
168			nortfree = 1;
169	}
170
171	if (opt) {
172		len = 0;
173		m = ip_insertoptions(m, opt, &len);
174		if (len != 0)
175			hlen = len;
176	}
177	ip = mtod(m, struct ip *);
178
179	/*
180	 * Fill in IP header.  If we are not allowing fragmentation,
181	 * then the ip_id field is meaningless, but we don't set it
182	 * to zero.  Doing so causes various problems when devices along
183	 * the path (routers, load balancers, firewalls, etc.) illegally
184	 * disable DF on our packet.  Note that a 16-bit counter
185	 * will wrap around in less than 10 seconds at 100 Mbit/s on a
186	 * medium with MTU 1500.  See Steven M. Bellovin, "A Technique
187	 * for Counting NATted Hosts", Proc. IMW'02, available at
188	 * <http://www.cs.columbia.edu/~smb/papers/fnat.pdf>.
189	 */
190	if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
191		ip->ip_v = IPVERSION;
192		ip->ip_hl = hlen >> 2;
193		ip->ip_id = ip_newid();
194		IPSTAT_INC(ips_localout);
195	} else {
196		hlen = ip->ip_hl << 2;
197	}
198
199	dst = (struct sockaddr_in *)&ro->ro_dst;
200again:
201	/*
202	 * If there is a cached route,
203	 * check that it is to the same destination
204	 * and is still up.  If not, free it and try again.
205	 * The address family should also be checked in case of sharing the
206	 * cache with IPv6.
207	 */
208	if (ro->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 ||
209			  dst->sin_family != AF_INET ||
210			  dst->sin_addr.s_addr != ip->ip_dst.s_addr)) {
211		if (!nortfree)
212			RTFREE(ro->ro_rt);
213		ro->ro_rt = (struct rtentry *)NULL;
214	}
215#ifdef IPFIREWALL_FORWARD
216	if (ro->ro_rt == NULL && fwd_tag == NULL) {
217#else
218	if (ro->ro_rt == NULL) {
219#endif
220		bzero(dst, sizeof(*dst));
221		dst->sin_family = AF_INET;
222		dst->sin_len = sizeof(*dst);
223		dst->sin_addr = ip->ip_dst;
224	}
225	/*
226	 * If routing to interface only, short circuit routing lookup.
227	 * The use of an all-ones broadcast address implies this; an
228	 * interface is specified by the broadcast address of an interface,
229	 * or the destination address of a ptp interface.
230	 */
231	if (flags & IP_SENDONES) {
232		if ((ia = ifatoia(ifa_ifwithbroadaddr(sintosa(dst)))) == NULL &&
233		    (ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL) {
234			IPSTAT_INC(ips_noroute);
235			error = ENETUNREACH;
236			goto bad;
237		}
238		ip->ip_dst.s_addr = INADDR_BROADCAST;
239		dst->sin_addr = ip->ip_dst;
240		ifp = ia->ia_ifp;
241		ip->ip_ttl = 1;
242		isbroadcast = 1;
243	} else if (flags & IP_ROUTETOIF) {
244		if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL &&
245		    (ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == NULL) {
246			IPSTAT_INC(ips_noroute);
247			error = ENETUNREACH;
248			goto bad;
249		}
250		ifp = ia->ia_ifp;
251		ip->ip_ttl = 1;
252		isbroadcast = in_broadcast(dst->sin_addr, ifp);
253	} else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) &&
254	    imo != NULL && imo->imo_multicast_ifp != NULL) {
255		/*
256		 * Bypass the normal routing lookup for multicast
257		 * packets if the interface is specified.
258		 */
259		ifp = imo->imo_multicast_ifp;
260		IFP_TO_IA(ifp, ia);
261		isbroadcast = 0;	/* fool gcc */
262	} else {
263		/*
264		 * We want to do any cloning requested by the link layer,
265		 * as this is probably required in all cases for correct
266		 * operation (as it is for ARP).
267		 */
268		if (ro->ro_rt == NULL)
269#ifdef RADIX_MPATH
270			rtalloc_mpath_fib(ro,
271			    ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr),
272			    inp ? inp->inp_inc.inc_fibnum : M_GETFIB(m));
273#else
274			in_rtalloc_ign(ro, 0,
275			    inp ? inp->inp_inc.inc_fibnum : M_GETFIB(m));
276#endif
277		if (ro->ro_rt == NULL) {
278#ifdef IPSEC
279			/*
280			 * There is no route for this packet, but it is
281			 * possible that a matching SPD entry exists.
282			 */
283			no_route_but_check_spd = 1;
284			mtu = 0; /* Silence GCC warning. */
285			goto sendit;
286#endif
287			IPSTAT_INC(ips_noroute);
288			error = EHOSTUNREACH;
289			goto bad;
290		}
291		ia = ifatoia(ro->ro_rt->rt_ifa);
292		ifp = ro->ro_rt->rt_ifp;
293		ro->ro_rt->rt_rmx.rmx_pksent++;
294		if (ro->ro_rt->rt_flags & RTF_GATEWAY)
295			dst = (struct sockaddr_in *)ro->ro_rt->rt_gateway;
296		if (ro->ro_rt->rt_flags & RTF_HOST)
297			isbroadcast = (ro->ro_rt->rt_flags & RTF_BROADCAST);
298		else
299			isbroadcast = in_broadcast(dst->sin_addr, ifp);
300	}
301	/*
302	 * Calculate MTU.  If we have a route that is up, use that,
303	 * otherwise use the interface's MTU.
304	 */
305	if (ro->ro_rt != NULL && (ro->ro_rt->rt_flags & (RTF_UP|RTF_HOST))) {
306		/*
307		 * This case can happen if the user changed the MTU
308		 * of an interface after enabling IP on it.  Because
309		 * most netifs don't keep track of routes pointing to
310		 * them, there is no way for one to update all its
311		 * routes when the MTU is changed.
312		 */
313		if (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)
314			ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu;
315		mtu = ro->ro_rt->rt_rmx.rmx_mtu;
316	} else {
317		mtu = ifp->if_mtu;
318	}
319	if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
320		m->m_flags |= M_MCAST;
321		/*
322		 * IP destination address is multicast.  Make sure "dst"
323		 * still points to the address in "ro".  (It may have been
324		 * changed to point to a gateway address, above.)
325		 */
326		dst = (struct sockaddr_in *)&ro->ro_dst;
327		/*
328		 * See if the caller provided any multicast options
329		 */
330		if (imo != NULL) {
331			ip->ip_ttl = imo->imo_multicast_ttl;
332			if (imo->imo_multicast_vif != -1)
333				ip->ip_src.s_addr =
334				    ip_mcast_src ?
335				    ip_mcast_src(imo->imo_multicast_vif) :
336				    INADDR_ANY;
337		} else
338			ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL;
339		/*
340		 * Confirm that the outgoing interface supports multicast.
341		 */
342		if ((imo == NULL) || (imo->imo_multicast_vif == -1)) {
343			if ((ifp->if_flags & IFF_MULTICAST) == 0) {
344				IPSTAT_INC(ips_noroute);
345				error = ENETUNREACH;
346				goto bad;
347			}
348		}
349		/*
350		 * If source address not specified yet, use address
351		 * of outgoing interface.
352		 */
353		if (ip->ip_src.s_addr == INADDR_ANY) {
354			/* Interface may have no addresses. */
355			if (ia != NULL)
356				ip->ip_src = IA_SIN(ia)->sin_addr;
357		}
358
359		if ((imo == NULL && in_mcast_loop) ||
360		    (imo && imo->imo_multicast_loop)) {
361			/*
362			 * Loop back multicast datagram if not expressly
363			 * forbidden to do so, even if we are not a member
364			 * of the group; ip_input() will filter it later,
365			 * thus deferring a hash lookup and mutex acquisition
366			 * at the expense of a cheap copy using m_copym().
367			 */
368			ip_mloopback(ifp, m, dst, hlen);
369		} else {
370			/*
371			 * If we are acting as a multicast router, perform
372			 * multicast forwarding as if the packet had just
373			 * arrived on the interface to which we are about
374			 * to send.  The multicast forwarding function
375			 * recursively calls this function, using the
376			 * IP_FORWARDING flag to prevent infinite recursion.
377			 *
378			 * Multicasts that are looped back by ip_mloopback(),
379			 * above, will be forwarded by the ip_input() routine,
380			 * if necessary.
381			 */
382			if (V_ip_mrouter && (flags & IP_FORWARDING) == 0) {
383				/*
384				 * If rsvp daemon is not running, do not
385				 * set ip_moptions. This ensures that the packet
386				 * is multicast and not just sent down one link
387				 * as prescribed by rsvpd.
388				 */
389				if (!V_rsvp_on)
390					imo = NULL;
391				if (ip_mforward &&
392				    ip_mforward(ip, ifp, m, imo) != 0) {
393					m_freem(m);
394					goto done;
395				}
396			}
397		}
398
399		/*
400		 * Multicasts with a time-to-live of zero may be looped-
401		 * back, above, but must not be transmitted on a network.
402		 * Also, multicasts addressed to the loopback interface
403		 * are not sent -- the above call to ip_mloopback() will
404		 * loop back a copy. ip_input() will drop the copy if
405		 * this host does not belong to the destination group on
406		 * the loopback interface.
407		 */
408		if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) {
409			m_freem(m);
410			goto done;
411		}
412
413		goto sendit;
414	}
415
416	/*
417	 * If the source address is not specified yet, use the address
418	 * of the outoing interface.
419	 */
420	if (ip->ip_src.s_addr == INADDR_ANY) {
421		/* Interface may have no addresses. */
422		if (ia != NULL) {
423			ip->ip_src = IA_SIN(ia)->sin_addr;
424		}
425	}
426
427	/*
428	 * Verify that we have any chance at all of being able to queue the
429	 * packet or packet fragments, unless ALTQ is enabled on the given
430	 * interface in which case packetdrop should be done by queueing.
431	 */
432#ifdef ALTQ
433	if ((!ALTQ_IS_ENABLED(&ifp->if_snd)) &&
434	    ((ifp->if_snd.ifq_len + ip->ip_len / mtu + 1) >=
435	    ifp->if_snd.ifq_maxlen))
436#else
437	if ((ifp->if_snd.ifq_len + ip->ip_len / mtu + 1) >=
438	    ifp->if_snd.ifq_maxlen)
439#endif /* ALTQ */
440	{
441		error = ENOBUFS;
442		IPSTAT_INC(ips_odropped);
443		ifp->if_snd.ifq_drops += (ip->ip_len / ifp->if_mtu + 1);
444		goto bad;
445	}
446
447	/*
448	 * Look for broadcast address and
449	 * verify user is allowed to send
450	 * such a packet.
451	 */
452	if (isbroadcast) {
453		if ((ifp->if_flags & IFF_BROADCAST) == 0) {
454			error = EADDRNOTAVAIL;
455			goto bad;
456		}
457		if ((flags & IP_ALLOWBROADCAST) == 0) {
458			error = EACCES;
459			goto bad;
460		}
461		/* don't allow broadcast messages to be fragmented */
462		if (ip->ip_len > mtu) {
463			error = EMSGSIZE;
464			goto bad;
465		}
466		m->m_flags |= M_BCAST;
467	} else {
468		m->m_flags &= ~M_BCAST;
469	}
470
471sendit:
472#ifdef IPSEC
473	switch(ip_ipsec_output(&m, inp, &flags, &error, &ro, &iproute, &dst, &ia, &ifp)) {
474	case 1:
475		goto bad;
476	case -1:
477		goto done;
478	case 0:
479	default:
480		break;	/* Continue with packet processing. */
481	}
482	/*
483	 * Check if there was a route for this packet; return error if not.
484	 */
485	if (no_route_but_check_spd) {
486		IPSTAT_INC(ips_noroute);
487		error = EHOSTUNREACH;
488		goto bad;
489	}
490	/* Update variables that are affected by ipsec4_output(). */
491	ip = mtod(m, struct ip *);
492	hlen = ip->ip_hl << 2;
493#endif /* IPSEC */
494
495	/* Jump over all PFIL processing if hooks are not active. */
496	if (!PFIL_HOOKED(&inet_pfil_hook))
497		goto passout;
498
499	/* Run through list of hooks for output packets. */
500	odst.s_addr = ip->ip_dst.s_addr;
501	error = pfil_run_hooks(&inet_pfil_hook, &m, ifp, PFIL_OUT, inp);
502	if (error != 0 || m == NULL)
503		goto done;
504
505	ip = mtod(m, struct ip *);
506
507	/* See if destination IP address was changed by packet filter. */
508	if (odst.s_addr != ip->ip_dst.s_addr) {
509		m->m_flags |= M_SKIP_FIREWALL;
510		/* If destination is now ourself drop to ip_input(). */
511		if (in_localip(ip->ip_dst)) {
512			m->m_flags |= M_FASTFWD_OURS;
513			if (m->m_pkthdr.rcvif == NULL)
514				m->m_pkthdr.rcvif = V_loif;
515			if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
516				m->m_pkthdr.csum_flags |=
517				    CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
518				m->m_pkthdr.csum_data = 0xffff;
519			}
520			m->m_pkthdr.csum_flags |=
521			    CSUM_IP_CHECKED | CSUM_IP_VALID;
522#ifdef SCTP
523			if (m->m_pkthdr.csum_flags & CSUM_SCTP)
524				m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
525#endif
526			error = netisr_queue(NETISR_IP, m);
527			goto done;
528		} else
529			goto again;	/* Redo the routing table lookup. */
530	}
531
532#ifdef IPFIREWALL_FORWARD
533	/* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */
534	if (m->m_flags & M_FASTFWD_OURS) {
535		if (m->m_pkthdr.rcvif == NULL)
536			m->m_pkthdr.rcvif = V_loif;
537		if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
538			m->m_pkthdr.csum_flags |=
539			    CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
540			m->m_pkthdr.csum_data = 0xffff;
541		}
542#ifdef SCTP
543		if (m->m_pkthdr.csum_flags & CSUM_SCTP)
544			m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
545#endif
546		m->m_pkthdr.csum_flags |=
547			    CSUM_IP_CHECKED | CSUM_IP_VALID;
548
549		error = netisr_queue(NETISR_IP, m);
550		goto done;
551	}
552	/* Or forward to some other address? */
553	fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL);
554	if (fwd_tag) {
555		dst = (struct sockaddr_in *)&ro->ro_dst;
556		bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in));
557		m->m_flags |= M_SKIP_FIREWALL;
558		m_tag_delete(m, fwd_tag);
559		goto again;
560	}
561#endif /* IPFIREWALL_FORWARD */
562
563passout:
564	/* 127/8 must not appear on wire - RFC1122. */
565	if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
566	    (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
567		if ((ifp->if_flags & IFF_LOOPBACK) == 0) {
568			IPSTAT_INC(ips_badaddr);
569			error = EADDRNOTAVAIL;
570			goto bad;
571		}
572	}
573
574	m->m_pkthdr.csum_flags |= CSUM_IP;
575	sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_hwassist;
576	if (sw_csum & CSUM_DELAY_DATA) {
577		in_delayed_cksum(m);
578		sw_csum &= ~CSUM_DELAY_DATA;
579	}
580#ifdef SCTP
581	if (sw_csum & CSUM_SCTP) {
582		sctp_delayed_cksum(m);
583		sw_csum &= ~CSUM_SCTP;
584	}
585#endif
586	m->m_pkthdr.csum_flags &= ifp->if_hwassist;
587
588	/*
589	 * If small enough for interface, or the interface will take
590	 * care of the fragmentation for us, we can just send directly.
591	 */
592	if (ip->ip_len <= mtu ||
593	    (m->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0 ||
594	    ((ip->ip_off & IP_DF) == 0 && (ifp->if_hwassist & CSUM_FRAGMENT))) {
595		ip->ip_len = htons(ip->ip_len);
596		ip->ip_off = htons(ip->ip_off);
597		ip->ip_sum = 0;
598		if (sw_csum & CSUM_DELAY_IP)
599			ip->ip_sum = in_cksum(m, hlen);
600
601		/*
602		 * Record statistics for this interface address.
603		 * With CSUM_TSO the byte/packet count will be slightly
604		 * incorrect because we count the IP+TCP headers only
605		 * once instead of for every generated packet.
606		 */
607		if (!(flags & IP_FORWARDING) && ia) {
608			if (m->m_pkthdr.csum_flags & CSUM_TSO)
609				ia->ia_ifa.if_opackets +=
610				    m->m_pkthdr.len / m->m_pkthdr.tso_segsz;
611			else
612				ia->ia_ifa.if_opackets++;
613			ia->ia_ifa.if_obytes += m->m_pkthdr.len;
614		}
615#ifdef MBUF_STRESS_TEST
616		if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size)
617			m = m_fragment(m, M_DONTWAIT, mbuf_frag_size);
618#endif
619		/*
620		 * Reset layer specific mbuf flags
621		 * to avoid confusing lower layers.
622		 */
623		m->m_flags &= ~(M_PROTOFLAGS);
624		error = (*ifp->if_output)(ifp, m,
625		    		(struct sockaddr *)dst, ro);
626		goto done;
627	}
628
629	/* Balk when DF bit is set or the interface didn't support TSO. */
630	if ((ip->ip_off & IP_DF) || (m->m_pkthdr.csum_flags & CSUM_TSO)) {
631		error = EMSGSIZE;
632		IPSTAT_INC(ips_cantfrag);
633		goto bad;
634	}
635
636	/*
637	 * Too large for interface; fragment if possible. If successful,
638	 * on return, m will point to a list of packets to be sent.
639	 */
640	error = ip_fragment(ip, &m, mtu, ifp->if_hwassist, sw_csum);
641	if (error)
642		goto bad;
643	for (; m; m = m0) {
644		m0 = m->m_nextpkt;
645		m->m_nextpkt = 0;
646		if (error == 0) {
647			/* Record statistics for this interface address. */
648			if (ia != NULL) {
649				ia->ia_ifa.if_opackets++;
650				ia->ia_ifa.if_obytes += m->m_pkthdr.len;
651			}
652			/*
653			 * Reset layer specific mbuf flags
654			 * to avoid confusing upper layers.
655			 */
656			m->m_flags &= ~(M_PROTOFLAGS);
657
658			error = (*ifp->if_output)(ifp, m,
659			    (struct sockaddr *)dst, ro);
660		} else
661			m_freem(m);
662	}
663
664	if (error == 0)
665		IPSTAT_INC(ips_fragmented);
666
667done:
668	if (ro == &iproute && ro->ro_rt && !nortfree) {
669		RTFREE(ro->ro_rt);
670	}
671	return (error);
672bad:
673	m_freem(m);
674	goto done;
675}
676
677/*
678 * Create a chain of fragments which fit the given mtu. m_frag points to the
679 * mbuf to be fragmented; on return it points to the chain with the fragments.
680 * Return 0 if no error. If error, m_frag may contain a partially built
681 * chain of fragments that should be freed by the caller.
682 *
683 * if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist)
684 * sw_csum contains the delayed checksums flags (e.g., CSUM_DELAY_IP).
685 */
686int
687ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu,
688    u_long if_hwassist_flags, int sw_csum)
689{
690	INIT_VNET_INET(curvnet);
691	int error = 0;
692	int hlen = ip->ip_hl << 2;
693	int len = (mtu - hlen) & ~7;	/* size of payload in each fragment */
694	int off;
695	struct mbuf *m0 = *m_frag;	/* the original packet		*/
696	int firstlen;
697	struct mbuf **mnext;
698	int nfrags;
699
700	if (ip->ip_off & IP_DF) {	/* Fragmentation not allowed */
701		IPSTAT_INC(ips_cantfrag);
702		return EMSGSIZE;
703	}
704
705	/*
706	 * Must be able to put at least 8 bytes per fragment.
707	 */
708	if (len < 8)
709		return EMSGSIZE;
710
711	/*
712	 * If the interface will not calculate checksums on
713	 * fragmented packets, then do it here.
714	 */
715	if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA &&
716	    (if_hwassist_flags & CSUM_IP_FRAGS) == 0) {
717		in_delayed_cksum(m0);
718		m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
719	}
720#ifdef SCTP
721	if (m0->m_pkthdr.csum_flags & CSUM_SCTP &&
722	    (if_hwassist_flags & CSUM_IP_FRAGS) == 0) {
723		sctp_delayed_cksum(m0);
724		m0->m_pkthdr.csum_flags &= ~CSUM_SCTP;
725	}
726#endif
727	if (len > PAGE_SIZE) {
728		/*
729		 * Fragment large datagrams such that each segment
730		 * contains a multiple of PAGE_SIZE amount of data,
731		 * plus headers. This enables a receiver to perform
732		 * page-flipping zero-copy optimizations.
733		 *
734		 * XXX When does this help given that sender and receiver
735		 * could have different page sizes, and also mtu could
736		 * be less than the receiver's page size ?
737		 */
738		int newlen;
739		struct mbuf *m;
740
741		for (m = m0, off = 0; m && (off+m->m_len) <= mtu; m = m->m_next)
742			off += m->m_len;
743
744		/*
745		 * firstlen (off - hlen) must be aligned on an
746		 * 8-byte boundary
747		 */
748		if (off < hlen)
749			goto smart_frag_failure;
750		off = ((off - hlen) & ~7) + hlen;
751		newlen = (~PAGE_MASK) & mtu;
752		if ((newlen + sizeof (struct ip)) > mtu) {
753			/* we failed, go back the default */
754smart_frag_failure:
755			newlen = len;
756			off = hlen + len;
757		}
758		len = newlen;
759
760	} else {
761		off = hlen + len;
762	}
763
764	firstlen = off - hlen;
765	mnext = &m0->m_nextpkt;		/* pointer to next packet */
766
767	/*
768	 * Loop through length of segment after first fragment,
769	 * make new header and copy data of each part and link onto chain.
770	 * Here, m0 is the original packet, m is the fragment being created.
771	 * The fragments are linked off the m_nextpkt of the original
772	 * packet, which after processing serves as the first fragment.
773	 */
774	for (nfrags = 1; off < ip->ip_len; off += len, nfrags++) {
775		struct ip *mhip;	/* ip header on the fragment */
776		struct mbuf *m;
777		int mhlen = sizeof (struct ip);
778
779		MGETHDR(m, M_DONTWAIT, MT_DATA);
780		if (m == NULL) {
781			error = ENOBUFS;
782			IPSTAT_INC(ips_odropped);
783			goto done;
784		}
785		m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG;
786		/*
787		 * In the first mbuf, leave room for the link header, then
788		 * copy the original IP header including options. The payload
789		 * goes into an additional mbuf chain returned by m_copym().
790		 */
791		m->m_data += max_linkhdr;
792		mhip = mtod(m, struct ip *);
793		*mhip = *ip;
794		if (hlen > sizeof (struct ip)) {
795			mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip);
796			mhip->ip_v = IPVERSION;
797			mhip->ip_hl = mhlen >> 2;
798		}
799		m->m_len = mhlen;
800		/* XXX do we need to add ip->ip_off below ? */
801		mhip->ip_off = ((off - hlen) >> 3) + ip->ip_off;
802		if (off + len >= ip->ip_len) {	/* last fragment */
803			len = ip->ip_len - off;
804			m->m_flags |= M_LASTFRAG;
805		} else
806			mhip->ip_off |= IP_MF;
807		mhip->ip_len = htons((u_short)(len + mhlen));
808		m->m_next = m_copym(m0, off, len, M_DONTWAIT);
809		if (m->m_next == NULL) {	/* copy failed */
810			m_free(m);
811			error = ENOBUFS;	/* ??? */
812			IPSTAT_INC(ips_odropped);
813			goto done;
814		}
815		m->m_pkthdr.len = mhlen + len;
816		m->m_pkthdr.rcvif = NULL;
817#ifdef MAC
818		mac_netinet_fragment(m0, m);
819#endif
820		m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags;
821		mhip->ip_off = htons(mhip->ip_off);
822		mhip->ip_sum = 0;
823		if (sw_csum & CSUM_DELAY_IP)
824			mhip->ip_sum = in_cksum(m, mhlen);
825		*mnext = m;
826		mnext = &m->m_nextpkt;
827	}
828	IPSTAT_ADD(ips_ofragments, nfrags);
829
830	/* set first marker for fragment chain */
831	m0->m_flags |= M_FIRSTFRAG | M_FRAG;
832	m0->m_pkthdr.csum_data = nfrags;
833
834	/*
835	 * Update first fragment by trimming what's been copied out
836	 * and updating header.
837	 */
838	m_adj(m0, hlen + firstlen - ip->ip_len);
839	m0->m_pkthdr.len = hlen + firstlen;
840	ip->ip_len = htons((u_short)m0->m_pkthdr.len);
841	ip->ip_off |= IP_MF;
842	ip->ip_off = htons(ip->ip_off);
843	ip->ip_sum = 0;
844	if (sw_csum & CSUM_DELAY_IP)
845		ip->ip_sum = in_cksum(m0, hlen);
846
847done:
848	*m_frag = m0;
849	return error;
850}
851
852void
853in_delayed_cksum(struct mbuf *m)
854{
855	struct ip *ip;
856	u_short csum, offset;
857
858	ip = mtod(m, struct ip *);
859	offset = ip->ip_hl << 2 ;
860	csum = in_cksum_skip(m, ip->ip_len, offset);
861	if (m->m_pkthdr.csum_flags & CSUM_UDP && csum == 0)
862		csum = 0xffff;
863	offset += m->m_pkthdr.csum_data;	/* checksum offset */
864
865	if (offset + sizeof(u_short) > m->m_len) {
866		printf("delayed m_pullup, m->len: %d  off: %d  p: %d\n",
867		    m->m_len, offset, ip->ip_p);
868		/*
869		 * XXX
870		 * this shouldn't happen, but if it does, the
871		 * correct behavior may be to insert the checksum
872		 * in the appropriate next mbuf in the chain.
873		 */
874		return;
875	}
876	*(u_short *)(m->m_data + offset) = csum;
877}
878
879/*
880 * IP socket option processing.
881 */
882int
883ip_ctloutput(struct socket *so, struct sockopt *sopt)
884{
885	struct	inpcb *inp = sotoinpcb(so);
886	int	error, optval;
887
888	error = optval = 0;
889	if (sopt->sopt_level != IPPROTO_IP) {
890		if ((sopt->sopt_level == SOL_SOCKET) &&
891		    (sopt->sopt_name == SO_SETFIB)) {
892			inp->inp_inc.inc_fibnum = so->so_fibnum;
893			return (0);
894		}
895		return (EINVAL);
896	}
897
898	switch (sopt->sopt_dir) {
899	case SOPT_SET:
900		switch (sopt->sopt_name) {
901		case IP_OPTIONS:
902#ifdef notyet
903		case IP_RETOPTS:
904#endif
905		{
906			struct mbuf *m;
907			if (sopt->sopt_valsize > MLEN) {
908				error = EMSGSIZE;
909				break;
910			}
911			MGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT, MT_DATA);
912			if (m == NULL) {
913				error = ENOBUFS;
914				break;
915			}
916			m->m_len = sopt->sopt_valsize;
917			error = sooptcopyin(sopt, mtod(m, char *), m->m_len,
918					    m->m_len);
919			if (error) {
920				m_free(m);
921				break;
922			}
923			INP_WLOCK(inp);
924			error = ip_pcbopts(inp, sopt->sopt_name, m);
925			INP_WUNLOCK(inp);
926			return (error);
927		}
928
929#if defined(IP_NONLOCALBIND)
930		case IP_NONLOCALOK:
931			if (! ip_nonlocalok) {
932				error = ENOPROTOOPT;
933				break;
934			}
935			/* FALLTHROUGH */
936#endif
937		case IP_TOS:
938		case IP_TTL:
939		case IP_MINTTL:
940		case IP_RECVOPTS:
941		case IP_RECVRETOPTS:
942		case IP_RECVDSTADDR:
943		case IP_RECVTTL:
944		case IP_RECVIF:
945		case IP_FAITH:
946		case IP_ONESBCAST:
947		case IP_DONTFRAG:
948			error = sooptcopyin(sopt, &optval, sizeof optval,
949					    sizeof optval);
950			if (error)
951				break;
952
953			switch (sopt->sopt_name) {
954			case IP_TOS:
955				inp->inp_ip_tos = optval;
956				break;
957
958			case IP_TTL:
959				inp->inp_ip_ttl = optval;
960				break;
961
962			case IP_MINTTL:
963				if (optval >= 0 && optval <= MAXTTL)
964					inp->inp_ip_minttl = optval;
965				else
966					error = EINVAL;
967				break;
968
969#define	OPTSET(bit) do {						\
970	INP_WLOCK(inp);							\
971	if (optval)							\
972		inp->inp_flags |= bit;					\
973	else								\
974		inp->inp_flags &= ~bit;					\
975	INP_WUNLOCK(inp);						\
976} while (0)
977
978			case IP_RECVOPTS:
979				OPTSET(INP_RECVOPTS);
980				break;
981
982			case IP_RECVRETOPTS:
983				OPTSET(INP_RECVRETOPTS);
984				break;
985
986			case IP_RECVDSTADDR:
987				OPTSET(INP_RECVDSTADDR);
988				break;
989
990			case IP_RECVTTL:
991				OPTSET(INP_RECVTTL);
992				break;
993
994			case IP_RECVIF:
995				OPTSET(INP_RECVIF);
996				break;
997
998			case IP_FAITH:
999				OPTSET(INP_FAITH);
1000				break;
1001
1002			case IP_ONESBCAST:
1003				OPTSET(INP_ONESBCAST);
1004				break;
1005			case IP_DONTFRAG:
1006				OPTSET(INP_DONTFRAG);
1007				break;
1008#if defined(IP_NONLOCALBIND)
1009			case IP_NONLOCALOK:
1010				OPTSET(INP_NONLOCALOK);
1011				break;
1012#endif
1013			}
1014			break;
1015#undef OPTSET
1016
1017		/*
1018		 * Multicast socket options are processed by the in_mcast
1019		 * module.
1020		 */
1021		case IP_MULTICAST_IF:
1022		case IP_MULTICAST_VIF:
1023		case IP_MULTICAST_TTL:
1024		case IP_MULTICAST_LOOP:
1025		case IP_ADD_MEMBERSHIP:
1026		case IP_DROP_MEMBERSHIP:
1027		case IP_ADD_SOURCE_MEMBERSHIP:
1028		case IP_DROP_SOURCE_MEMBERSHIP:
1029		case IP_BLOCK_SOURCE:
1030		case IP_UNBLOCK_SOURCE:
1031		case IP_MSFILTER:
1032		case MCAST_JOIN_GROUP:
1033		case MCAST_LEAVE_GROUP:
1034		case MCAST_JOIN_SOURCE_GROUP:
1035		case MCAST_LEAVE_SOURCE_GROUP:
1036		case MCAST_BLOCK_SOURCE:
1037		case MCAST_UNBLOCK_SOURCE:
1038			error = inp_setmoptions(inp, sopt);
1039			break;
1040
1041		case IP_PORTRANGE:
1042			error = sooptcopyin(sopt, &optval, sizeof optval,
1043					    sizeof optval);
1044			if (error)
1045				break;
1046
1047			INP_WLOCK(inp);
1048			switch (optval) {
1049			case IP_PORTRANGE_DEFAULT:
1050				inp->inp_flags &= ~(INP_LOWPORT);
1051				inp->inp_flags &= ~(INP_HIGHPORT);
1052				break;
1053
1054			case IP_PORTRANGE_HIGH:
1055				inp->inp_flags &= ~(INP_LOWPORT);
1056				inp->inp_flags |= INP_HIGHPORT;
1057				break;
1058
1059			case IP_PORTRANGE_LOW:
1060				inp->inp_flags &= ~(INP_HIGHPORT);
1061				inp->inp_flags |= INP_LOWPORT;
1062				break;
1063
1064			default:
1065				error = EINVAL;
1066				break;
1067			}
1068			INP_WUNLOCK(inp);
1069			break;
1070
1071#ifdef IPSEC
1072		case IP_IPSEC_POLICY:
1073		{
1074			caddr_t req;
1075			struct mbuf *m;
1076
1077			if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */
1078				break;
1079			if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */
1080				break;
1081			req = mtod(m, caddr_t);
1082			error = ipsec_set_policy(inp, sopt->sopt_name, req,
1083			    m->m_len, (sopt->sopt_td != NULL) ?
1084			    sopt->sopt_td->td_ucred : NULL);
1085			m_freem(m);
1086			break;
1087		}
1088#endif /* IPSEC */
1089
1090		default:
1091			error = ENOPROTOOPT;
1092			break;
1093		}
1094		break;
1095
1096	case SOPT_GET:
1097		switch (sopt->sopt_name) {
1098		case IP_OPTIONS:
1099		case IP_RETOPTS:
1100			if (inp->inp_options)
1101				error = sooptcopyout(sopt,
1102						     mtod(inp->inp_options,
1103							  char *),
1104						     inp->inp_options->m_len);
1105			else
1106				sopt->sopt_valsize = 0;
1107			break;
1108
1109		case IP_TOS:
1110		case IP_TTL:
1111		case IP_MINTTL:
1112		case IP_RECVOPTS:
1113		case IP_RECVRETOPTS:
1114		case IP_RECVDSTADDR:
1115		case IP_RECVTTL:
1116		case IP_RECVIF:
1117		case IP_PORTRANGE:
1118		case IP_FAITH:
1119		case IP_ONESBCAST:
1120		case IP_DONTFRAG:
1121			switch (sopt->sopt_name) {
1122
1123			case IP_TOS:
1124				optval = inp->inp_ip_tos;
1125				break;
1126
1127			case IP_TTL:
1128				optval = inp->inp_ip_ttl;
1129				break;
1130
1131			case IP_MINTTL:
1132				optval = inp->inp_ip_minttl;
1133				break;
1134
1135#define	OPTBIT(bit)	(inp->inp_flags & bit ? 1 : 0)
1136
1137			case IP_RECVOPTS:
1138				optval = OPTBIT(INP_RECVOPTS);
1139				break;
1140
1141			case IP_RECVRETOPTS:
1142				optval = OPTBIT(INP_RECVRETOPTS);
1143				break;
1144
1145			case IP_RECVDSTADDR:
1146				optval = OPTBIT(INP_RECVDSTADDR);
1147				break;
1148
1149			case IP_RECVTTL:
1150				optval = OPTBIT(INP_RECVTTL);
1151				break;
1152
1153			case IP_RECVIF:
1154				optval = OPTBIT(INP_RECVIF);
1155				break;
1156
1157			case IP_PORTRANGE:
1158				if (inp->inp_flags & INP_HIGHPORT)
1159					optval = IP_PORTRANGE_HIGH;
1160				else if (inp->inp_flags & INP_LOWPORT)
1161					optval = IP_PORTRANGE_LOW;
1162				else
1163					optval = 0;
1164				break;
1165
1166			case IP_FAITH:
1167				optval = OPTBIT(INP_FAITH);
1168				break;
1169
1170			case IP_ONESBCAST:
1171				optval = OPTBIT(INP_ONESBCAST);
1172				break;
1173			case IP_DONTFRAG:
1174				optval = OPTBIT(INP_DONTFRAG);
1175				break;
1176			}
1177			error = sooptcopyout(sopt, &optval, sizeof optval);
1178			break;
1179
1180		/*
1181		 * Multicast socket options are processed by the in_mcast
1182		 * module.
1183		 */
1184		case IP_MULTICAST_IF:
1185		case IP_MULTICAST_VIF:
1186		case IP_MULTICAST_TTL:
1187		case IP_MULTICAST_LOOP:
1188		case IP_MSFILTER:
1189			error = inp_getmoptions(inp, sopt);
1190			break;
1191
1192#ifdef IPSEC
1193		case IP_IPSEC_POLICY:
1194		{
1195			struct mbuf *m = NULL;
1196			caddr_t req = NULL;
1197			size_t len = 0;
1198
1199			if (m != 0) {
1200				req = mtod(m, caddr_t);
1201				len = m->m_len;
1202			}
1203			error = ipsec_get_policy(sotoinpcb(so), req, len, &m);
1204			if (error == 0)
1205				error = soopt_mcopyout(sopt, m); /* XXX */
1206			if (error == 0)
1207				m_freem(m);
1208			break;
1209		}
1210#endif /* IPSEC */
1211
1212		default:
1213			error = ENOPROTOOPT;
1214			break;
1215		}
1216		break;
1217	}
1218	return (error);
1219}
1220
1221/*
1222 * Routine called from ip_output() to loop back a copy of an IP multicast
1223 * packet to the input queue of a specified interface.  Note that this
1224 * calls the output routine of the loopback "driver", but with an interface
1225 * pointer that might NOT be a loopback interface -- evil, but easier than
1226 * replicating that code here.
1227 */
1228static void
1229ip_mloopback(struct ifnet *ifp, struct mbuf *m, struct sockaddr_in *dst,
1230    int hlen)
1231{
1232	register struct ip *ip;
1233	struct mbuf *copym;
1234
1235	/*
1236	 * Make a deep copy of the packet because we're going to
1237	 * modify the pack in order to generate checksums.
1238	 */
1239	copym = m_dup(m, M_DONTWAIT);
1240	if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < hlen))
1241		copym = m_pullup(copym, hlen);
1242	if (copym != NULL) {
1243		/* If needed, compute the checksum and mark it as valid. */
1244		if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
1245			in_delayed_cksum(copym);
1246			copym->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
1247			copym->m_pkthdr.csum_flags |=
1248			    CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
1249			copym->m_pkthdr.csum_data = 0xffff;
1250		}
1251		/*
1252		 * We don't bother to fragment if the IP length is greater
1253		 * than the interface's MTU.  Can this possibly matter?
1254		 */
1255		ip = mtod(copym, struct ip *);
1256		ip->ip_len = htons(ip->ip_len);
1257		ip->ip_off = htons(ip->ip_off);
1258		ip->ip_sum = 0;
1259		ip->ip_sum = in_cksum(copym, hlen);
1260#if 1 /* XXX */
1261		if (dst->sin_family != AF_INET) {
1262			printf("ip_mloopback: bad address family %d\n",
1263						dst->sin_family);
1264			dst->sin_family = AF_INET;
1265		}
1266#endif
1267		if_simloop(ifp, copym, dst->sin_family, 0);
1268	}
1269}
1270