ip_output.c revision 188306
1/*-
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 4. Neither the name of the University nor the names of its contributors
14 *    may be used to endorse or promote products derived from this software
15 *    without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 *
29 *	@(#)ip_output.c	8.3 (Berkeley) 1/21/94
30 */
31
32#include <sys/cdefs.h>
33__FBSDID("$FreeBSD: head/sys/netinet/ip_output.c 188306 2009-02-08 09:27:07Z bz $");
34
35#include "opt_ipfw.h"
36#include "opt_inet.h"
37#include "opt_ipsec.h"
38#include "opt_mac.h"
39#include "opt_mbuf_stress_test.h"
40#include "opt_mpath.h"
41#include "opt_sctp.h"
42
43#include <sys/param.h>
44#include <sys/systm.h>
45#include <sys/kernel.h>
46#include <sys/malloc.h>
47#include <sys/mbuf.h>
48#include <sys/priv.h>
49#include <sys/proc.h>
50#include <sys/protosw.h>
51#include <sys/socket.h>
52#include <sys/socketvar.h>
53#include <sys/sysctl.h>
54#include <sys/ucred.h>
55#include <sys/vimage.h>
56
57#include <net/if.h>
58#include <net/netisr.h>
59#include <net/pfil.h>
60#include <net/route.h>
61#ifdef RADIX_MPATH
62#include <net/radix_mpath.h>
63#endif
64#include <net/vnet.h>
65
66#include <netinet/in.h>
67#include <netinet/in_systm.h>
68#include <netinet/ip.h>
69#include <netinet/in_pcb.h>
70#include <netinet/in_var.h>
71#include <netinet/ip_var.h>
72#include <netinet/ip_options.h>
73#include <netinet/vinet.h>
74#ifdef SCTP
75#include <netinet/sctp.h>
76#include <netinet/sctp_crc32.h>
77#endif
78
79#ifdef IPSEC
80#include <netinet/ip_ipsec.h>
81#include <netipsec/ipsec.h>
82#endif /* IPSEC*/
83
84#include <machine/in_cksum.h>
85
86#include <security/mac/mac_framework.h>
87
/*
 * Debug helper: print the string x, then the IPv4 address held in a.s_addr
 * (converted with ntohl(), so a network-byte-order value is expected) as a
 * dotted quad, then the string y.
 *
 * Every macro argument is parenthesized so that expression arguments such
 * as "*ptr" or "cond ? a1 : a2" expand with the intended precedence.  The
 * expansion deliberately keeps its historical trailing semicolon so that
 * existing call sites, with or without their own ';', keep compiling.
 */
#define print_ip(x, a, y)	 printf("%s %d.%d.%d.%d%s",\
				(x), (ntohl((a).s_addr)>>24)&0xFF,\
				  (ntohl((a).s_addr)>>16)&0xFF,\
				  (ntohl((a).s_addr)>>8)&0xFF,\
				  (ntohl((a).s_addr))&0xFF, (y));
93
/*
 * File-scope IP identification variable; only defined here when the
 * kernel is built with VIMAGE_GLOBALS.
 */
#ifdef VIMAGE_GLOBALS
u_short ip_id;
#endif

/*
 * Stress-test knob (net.inet.ip.mbuf_frag_size).  When non-zero,
 * ip_output() passes any unfragmented outgoing packet longer than this
 * value through m_fragment() before handing it to the interface.
 */
#ifdef MBUF_STRESS_TEST
int mbuf_frag_size = 0;
SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW,
	&mbuf_frag_size, 0, "Fragment outgoing mbufs to this size");
#endif

/*
 * Administrative switch (net.inet.ip.nonlocalok).  While it is zero,
 * ip_ctloutput() rejects setting the IP_NONLOCALOK socket option with
 * ENOPROTOOPT.
 */
#if defined(IP_NONLOCALBIND)
static int ip_nonlocalok = 0;
SYSCTL_INT(_net_inet_ip, OID_AUTO, nonlocalok,
	CTLFLAG_RW|CTLFLAG_SECURE, &ip_nonlocalok, 0, "");
#endif

/*
 * Loops a copy of an outgoing multicast packet back to this host; called
 * from ip_output() as ip_mloopback(ifp, m, dst, hlen).  The definition is
 * further down in this file.
 */
static void	ip_mloopback
	(struct ifnet *, struct mbuf *, struct sockaddr_in *, int);


/* Protocol switch table, defined elsewhere. */
extern	struct protosw inetsw[];
115
/*
 * IP output.  The packet in mbuf chain m contains a skeletal IP
 * header (with len, off, ttl, proto, tos, src, dst).
 * The mbuf chain containing the packet will be freed.
 * The mbuf opt, if present, will not be freed.
 * In the IP forwarding case, the packet will arrive with options already
 * inserted, so must have a NULL opt pointer.
 *
 * Arguments:
 *	m	packet to send; must carry a packet header (M_ASSERTPKTHDR).
 *		ip_len and ip_off are handled in host byte order here and
 *		are converted with htons() just before transmission.
 *	opt	optional IP options to insert, or NULL.
 *	ro	optional cached route; when NULL a local, zeroed route is
 *		used and released again before returning.
 *	flags	IP_FORWARDING, IP_RAWOUTPUT, IP_SENDONES, IP_ROUTETOIF and
 *		IP_ALLOWBROADCAST are examined in this function.
 *	imo	optional multicast options, or NULL.
 *	inp	optional pcb, or NULL; when given its lock must be held
 *		(INP_LOCK_ASSERT) and its FIB number is used for routing.
 *
 * Returns 0 or an errno value.  Errors set directly in this function are
 * ENETUNREACH, EHOSTUNREACH, ENOBUFS, EADDRNOTAVAIL, EACCES and EMSGSIZE;
 * errors from the pfil hooks, netisr_queue(), ip_fragment() and the
 * interface's if_output routine are passed through.
 */
int
ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags,
    struct ip_moptions *imo, struct inpcb *inp)
{
	INIT_VNET_NET(curvnet);
	INIT_VNET_INET(curvnet);
	struct ip *ip;
	struct ifnet *ifp = NULL;	/* keep compiler happy */
	struct mbuf *m0;
	int hlen = sizeof (struct ip);
	int mtu;
	int len, error = 0;
	struct sockaddr_in *dst = NULL;	/* keep compiler happy */
	struct in_ifaddr *ia = NULL;
	int isbroadcast, sw_csum;
	struct route iproute;
	struct in_addr odst;
#ifdef IPFIREWALL_FORWARD
	struct m_tag *fwd_tag = NULL;
#endif
	M_ASSERTPKTHDR(m);

	/*
	 * No route supplied by the caller: work on a local, zeroed one.
	 * It is released at "done".
	 */
	if (ro == NULL) {
		ro = &iproute;
		bzero(ro, sizeof (*ro));
	}

	/* Stamp the pcb's FIB number on the mbuf. */
	if (inp != NULL) {
		M_SETFIB(m, inp->inp_inc.inc_fibnum);
		INP_LOCK_ASSERT(inp);
	}

	/*
	 * Insert the caller's options; ip_insertoptions() may return a
	 * different mbuf and reports the new header length through len
	 * (len stays 0 when the header length is unchanged).
	 */
	if (opt) {
		len = 0;
		m = ip_insertoptions(m, opt, &len);
		if (len != 0)
			hlen = len;
	}
	ip = mtod(m, struct ip *);

	/*
	 * Fill in IP header.  If we are not allowing fragmentation,
	 * then the ip_id field is meaningless, but we don't set it
	 * to zero.  Doing so causes various problems when devices along
	 * the path (routers, load balancers, firewalls, etc.) illegally
	 * disable DF on our packet.  Note that a 16-bit counter
	 * will wrap around in less than 10 seconds at 100 Mbit/s on a
	 * medium with MTU 1500.  See Steven M. Bellovin, "A Technique
	 * for Counting NATted Hosts", Proc. IMW'02, available at
	 * <http://www.cs.columbia.edu/~smb/papers/fnat.pdf>.
	 */
	if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
		ip->ip_v = IPVERSION;
		ip->ip_hl = hlen >> 2;
		ip->ip_id = ip_newid();
		V_ipstat.ips_localout++;
	} else {
		/* Header is already complete; take its length as given. */
		hlen = ip->ip_hl << 2;
	}

	dst = (struct sockaddr_in *)&ro->ro_dst;
	/*
	 * Re-entered from the packet filter section below when the
	 * destination has been rewritten or a forward tag was found.
	 */
again:
	/*
	 * If there is a cached route,
	 * check that it is to the same destination
	 * and is still up.  If not, free it and try again.
	 * The address family should also be checked in case of sharing the
	 * cache with IPv6.
	 */
	if (ro->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 ||
			  dst->sin_family != AF_INET ||
			  dst->sin_addr.s_addr != ip->ip_dst.s_addr)) {
		RTFREE(ro->ro_rt);
		ro->ro_rt = (struct rtentry *)NULL;
	}
	/*
	 * Without a usable cached route, (re)initialize the route
	 * destination from the packet.  With IPFIREWALL_FORWARD this is
	 * skipped while a forward tag is pending, because the destination
	 * was already copied from the tag.
	 */
#ifdef IPFIREWALL_FORWARD
	if (ro->ro_rt == NULL && fwd_tag == NULL) {
#else
	if (ro->ro_rt == NULL) {
#endif
		bzero(dst, sizeof(*dst));
		dst->sin_family = AF_INET;
		dst->sin_len = sizeof(*dst);
		dst->sin_addr = ip->ip_dst;
	}
	/*
	 * If routing to interface only, short circuit routing lookup.
	 * The use of an all-ones broadcast address implies this; an
	 * interface is specified by the broadcast address of an interface,
	 * or the destination address of a ptp interface.
	 */
	if (flags & IP_SENDONES) {
		if ((ia = ifatoia(ifa_ifwithbroadaddr(sintosa(dst)))) == NULL &&
		    (ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL) {
			V_ipstat.ips_noroute++;
			error = ENETUNREACH;
			goto bad;
		}
		/* Rewrite the destination to the all-ones broadcast. */
		ip->ip_dst.s_addr = INADDR_BROADCAST;
		dst->sin_addr = ip->ip_dst;
		ifp = ia->ia_ifp;
		ip->ip_ttl = 1;
		isbroadcast = 1;
	} else if (flags & IP_ROUTETOIF) {
		if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL &&
		    (ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == NULL) {
			V_ipstat.ips_noroute++;
			error = ENETUNREACH;
			goto bad;
		}
		ifp = ia->ia_ifp;
		ip->ip_ttl = 1;
		isbroadcast = in_broadcast(dst->sin_addr, ifp);
	} else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) &&
	    imo != NULL && imo->imo_multicast_ifp != NULL) {
		/*
		 * Bypass the normal routing lookup for multicast
		 * packets if the interface is specified.
		 */
		ifp = imo->imo_multicast_ifp;
		IFP_TO_IA(ifp, ia);
		isbroadcast = 0;	/* fool gcc */
	} else {
		/*
		 * We want to do any cloning requested by the link layer,
		 * as this is probably required in all cases for correct
		 * operation (as it is for ARP).
		 */
		if (ro->ro_rt == NULL)
#ifdef RADIX_MPATH
			rtalloc_mpath_fib(ro,
			    ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr),
			    inp ? inp->inp_inc.inc_fibnum : M_GETFIB(m));
#else
			in_rtalloc_ign(ro, 0,
			    inp ? inp->inp_inc.inc_fibnum : M_GETFIB(m));
#endif
		if (ro->ro_rt == NULL) {
			V_ipstat.ips_noroute++;
			error = EHOSTUNREACH;
			goto bad;
		}
		ia = ifatoia(ro->ro_rt->rt_ifa);
		ifp = ro->ro_rt->rt_ifp;
		ro->ro_rt->rt_rmx.rmx_pksent++;
		/* For a gateway route the next hop is the gateway address. */
		if (ro->ro_rt->rt_flags & RTF_GATEWAY)
			dst = (struct sockaddr_in *)ro->ro_rt->rt_gateway;
		if (ro->ro_rt->rt_flags & RTF_HOST)
			isbroadcast = (ro->ro_rt->rt_flags & RTF_BROADCAST);
		else
			isbroadcast = in_broadcast(dst->sin_addr, ifp);
	}
	/*
	 * Calculate MTU.  If we have a route that is up, use that,
	 * otherwise use the interface's MTU.
	 */
	if (ro->ro_rt != NULL && (ro->ro_rt->rt_flags & (RTF_UP|RTF_HOST))) {
		/*
		 * This case can happen if the user changed the MTU
		 * of an interface after enabling IP on it.  Because
		 * most netifs don't keep track of routes pointing to
		 * them, there is no way for one to update all its
		 * routes when the MTU is changed.
		 */
		if (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)
			ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu;
		mtu = ro->ro_rt->rt_rmx.rmx_mtu;
	} else {
		mtu = ifp->if_mtu;
	}
	if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
		struct in_multi *inm;

		m->m_flags |= M_MCAST;
		/*
		 * IP destination address is multicast.  Make sure "dst"
		 * still points to the address in "ro".  (It may have been
		 * changed to point to a gateway address, above.)
		 */
		dst = (struct sockaddr_in *)&ro->ro_dst;
		/*
		 * See if the caller provided any multicast options
		 */
		if (imo != NULL) {
			ip->ip_ttl = imo->imo_multicast_ttl;
			/*
			 * A multicast vif was selected: take the source
			 * address from ip_mcast_src() when that hook is
			 * set, otherwise fall back to INADDR_ANY.
			 */
			if (imo->imo_multicast_vif != -1)
				ip->ip_src.s_addr =
				    ip_mcast_src ?
				    ip_mcast_src(imo->imo_multicast_vif) :
				    INADDR_ANY;
		} else
			ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL;
		/*
		 * Confirm that the outgoing interface supports multicast.
		 */
		if ((imo == NULL) || (imo->imo_multicast_vif == -1)) {
			if ((ifp->if_flags & IFF_MULTICAST) == 0) {
				V_ipstat.ips_noroute++;
				error = ENETUNREACH;
				goto bad;
			}
		}
		/*
		 * If source address not specified yet, use address
		 * of outgoing interface.
		 */
		if (ip->ip_src.s_addr == INADDR_ANY) {
			/* Interface may have no addresses. */
			if (ia != NULL)
				ip->ip_src = IA_SIN(ia)->sin_addr;
		}

		IN_MULTI_LOCK();
		IN_LOOKUP_MULTI(ip->ip_dst, ifp, inm);
		if (inm != NULL &&
		   (imo == NULL || imo->imo_multicast_loop)) {
			IN_MULTI_UNLOCK();
			/*
			 * If we belong to the destination multicast group
			 * on the outgoing interface, and the caller did not
			 * forbid loopback, loop back a copy.
			 */
			ip_mloopback(ifp, m, dst, hlen);
		}
		else {
			IN_MULTI_UNLOCK();
			/*
			 * If we are acting as a multicast router, perform
			 * multicast forwarding as if the packet had just
			 * arrived on the interface to which we are about
			 * to send.  The multicast forwarding function
			 * recursively calls this function, using the
			 * IP_FORWARDING flag to prevent infinite recursion.
			 *
			 * Multicasts that are looped back by ip_mloopback(),
			 * above, will be forwarded by the ip_input() routine,
			 * if necessary.
			 */
			if (V_ip_mrouter && (flags & IP_FORWARDING) == 0) {
				/*
				 * If rsvp daemon is not running, do not
				 * set ip_moptions. This ensures that the packet
				 * is multicast and not just sent down one link
				 * as prescribed by rsvpd.
				 */
				if (!V_rsvp_on)
					imo = NULL;
				/*
				 * A non-zero return from ip_mforward() means
				 * the packet is dropped here; error is left
				 * unchanged.
				 */
				if (ip_mforward &&
				    ip_mforward(ip, ifp, m, imo) != 0) {
					m_freem(m);
					goto done;
				}
			}
		}

		/*
		 * Multicasts with a time-to-live of zero may be looped-
		 * back, above, but must not be transmitted on a network.
		 * Also, multicasts addressed to the loopback interface
		 * are not sent -- the above call to ip_mloopback() will
		 * loop back a copy if this host actually belongs to the
		 * destination group on the loopback interface.
		 */
		if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) {
			m_freem(m);
			goto done;
		}

		/*
		 * Multicast packets skip the unicast source selection,
		 * queue-space and broadcast checks below.
		 */
		goto sendit;
	}

	/*
	 * If the source address is not specified yet, use the address
	 * of the outgoing interface.
	 */
	if (ip->ip_src.s_addr == INADDR_ANY) {
		/* Interface may have no addresses. */
		if (ia != NULL) {
			ip->ip_src = IA_SIN(ia)->sin_addr;
		}
	}

	/*
	 * Verify that we have any chance at all of being able to queue the
	 * packet or packet fragments, unless ALTQ is enabled on the given
	 * interface in which case packetdrop should be done by queueing.
	 */
#ifdef ALTQ
	if ((!ALTQ_IS_ENABLED(&ifp->if_snd)) &&
	    ((ifp->if_snd.ifq_len + ip->ip_len / mtu + 1) >=
	    ifp->if_snd.ifq_maxlen))
#else
	if ((ifp->if_snd.ifq_len + ip->ip_len / mtu + 1) >=
	    ifp->if_snd.ifq_maxlen)
#endif /* ALTQ */
	{
		error = ENOBUFS;
		V_ipstat.ips_odropped++;
		/*
		 * NOTE(review): the drop count is estimated with if_mtu
		 * while the check above uses the (possibly smaller) route
		 * mtu -- confirm this difference is intended.
		 */
		ifp->if_snd.ifq_drops += (ip->ip_len / ifp->if_mtu + 1);
		goto bad;
	}

	/*
	 * Look for broadcast address and
	 * verify user is allowed to send
	 * such a packet.
	 */
	if (isbroadcast) {
		if ((ifp->if_flags & IFF_BROADCAST) == 0) {
			error = EADDRNOTAVAIL;
			goto bad;
		}
		if ((flags & IP_ALLOWBROADCAST) == 0) {
			error = EACCES;
			goto bad;
		}
		/* don't allow broadcast messages to be fragmented */
		if (ip->ip_len > mtu) {
			error = EMSGSIZE;
			goto bad;
		}
		m->m_flags |= M_BCAST;
	} else {
		m->m_flags &= ~M_BCAST;
	}

sendit:
#ifdef IPSEC
	/*
	 * ip_ipsec_output() result: 1 drops the packet through "bad",
	 * -1 leaves through "done" without freeing m here, anything else
	 * continues with normal output.
	 */
	switch(ip_ipsec_output(&m, inp, &flags, &error, &ro, &iproute, &dst, &ia, &ifp)) {
	case 1:
		goto bad;
	case -1:
		goto done;
	case 0:
	default:
		break;	/* Continue with packet processing. */
	}
	/* Update variables that are affected by ipsec4_output(). */
	ip = mtod(m, struct ip *);
	hlen = ip->ip_hl << 2;
#endif /* IPSEC */

	/* Jump over all PFIL processing if hooks are not active. */
	if (!PFIL_HOOKED(&inet_pfil_hook))
		goto passout;

	/* Run through list of hooks for output packets. */
	odst.s_addr = ip->ip_dst.s_addr;
	error = pfil_run_hooks(&inet_pfil_hook, &m, ifp, PFIL_OUT, inp);
	/* On a hook error or a NULL mbuf there is nothing to free here. */
	if (error != 0 || m == NULL)
		goto done;

	/* The hooks may have replaced the mbuf; reload the header pointer. */
	ip = mtod(m, struct ip *);

	/* See if destination IP address was changed by packet filter. */
	if (odst.s_addr != ip->ip_dst.s_addr) {
		m->m_flags |= M_SKIP_FIREWALL;
		/* If destination is now ourself drop to ip_input(). */
		if (in_localip(ip->ip_dst)) {
			m->m_flags |= M_FASTFWD_OURS;
			if (m->m_pkthdr.rcvif == NULL)
				m->m_pkthdr.rcvif = V_loif;
			/*
			 * The packet never leaves the host, so mark the
			 * still-pending checksums as already valid instead
			 * of computing them.
			 */
			if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
				m->m_pkthdr.csum_flags |=
				    CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
				m->m_pkthdr.csum_data = 0xffff;
			}
			m->m_pkthdr.csum_flags |=
			    CSUM_IP_CHECKED | CSUM_IP_VALID;
#ifdef SCTP
			if (m->m_pkthdr.csum_flags & CSUM_SCTP)
				m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
#endif
			error = netisr_queue(NETISR_IP, m);
			goto done;
		} else
			goto again;	/* Redo the routing table lookup. */
	}

#ifdef IPFIREWALL_FORWARD
	/* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */
	if (m->m_flags & M_FASTFWD_OURS) {
		if (m->m_pkthdr.rcvif == NULL)
			m->m_pkthdr.rcvif = V_loif;
		if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
			m->m_pkthdr.csum_flags |=
			    CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
			m->m_pkthdr.csum_data = 0xffff;
		}
#ifdef SCTP
		if (m->m_pkthdr.csum_flags & CSUM_SCTP)
			m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
#endif
		m->m_pkthdr.csum_flags |=
			    CSUM_IP_CHECKED | CSUM_IP_VALID;

		error = netisr_queue(NETISR_IP, m);
		goto done;
	}
	/* Or forward to some other address? */
	fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL);
	if (fwd_tag) {
		/*
		 * The sockaddr_in stored right behind the tag header
		 * becomes the new route destination; then route again.
		 */
		dst = (struct sockaddr_in *)&ro->ro_dst;
		bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in));
		m->m_flags |= M_SKIP_FIREWALL;
		m_tag_delete(m, fwd_tag);
		goto again;
	}
#endif /* IPFIREWALL_FORWARD */

passout:
	/* 127/8 must not appear on wire - RFC1122. */
	if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
	    (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
		if ((ifp->if_flags & IFF_LOOPBACK) == 0) {
			V_ipstat.ips_badaddr++;
			error = EADDRNOTAVAIL;
			goto bad;
		}
	}

	/*
	 * sw_csum collects the checksum work the interface does not
	 * offload (csum_flags not covered by if_hwassist).  Delayed
	 * payload checksums are computed right away; the IP header
	 * checksum is computed later, once the header is final.
	 */
	m->m_pkthdr.csum_flags |= CSUM_IP;
	sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_hwassist;
	if (sw_csum & CSUM_DELAY_DATA) {
		in_delayed_cksum(m);
		sw_csum &= ~CSUM_DELAY_DATA;
	}
#ifdef SCTP
	if (sw_csum & CSUM_SCTP) {
		sctp_delayed_cksum(m);
		sw_csum &= ~CSUM_SCTP;
	}
#endif
	/* Leave only the flags the interface will act on. */
	m->m_pkthdr.csum_flags &= ifp->if_hwassist;

	/*
	 * If small enough for interface, or the interface will take
	 * care of the fragmentation for us, we can just send directly.
	 */
	if (ip->ip_len <= mtu ||
	    (m->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0 ||
	    ((ip->ip_off & IP_DF) == 0 && (ifp->if_hwassist & CSUM_FRAGMENT))) {
		/* Header is final: switch to network byte order. */
		ip->ip_len = htons(ip->ip_len);
		ip->ip_off = htons(ip->ip_off);
		ip->ip_sum = 0;
		if (sw_csum & CSUM_DELAY_IP)
			ip->ip_sum = in_cksum(m, hlen);

		/*
		 * Record statistics for this interface address.
		 * With CSUM_TSO the byte/packet count will be slightly
		 * incorrect because we count the IP+TCP headers only
		 * once instead of for every generated packet.
		 */
		if (!(flags & IP_FORWARDING) && ia) {
			if (m->m_pkthdr.csum_flags & CSUM_TSO)
				ia->ia_ifa.if_opackets +=
				    m->m_pkthdr.len / m->m_pkthdr.tso_segsz;
			else
				ia->ia_ifa.if_opackets++;
			ia->ia_ifa.if_obytes += m->m_pkthdr.len;
		}
#ifdef MBUF_STRESS_TEST
		if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size)
			m = m_fragment(m, M_DONTWAIT, mbuf_frag_size);
#endif
		/*
		 * Reset layer specific mbuf flags
		 * to avoid confusing lower layers.
		 */
		m->m_flags &= ~(M_PROTOFLAGS);
		error = (*ifp->if_output)(ifp, m,
				(struct sockaddr *)dst, ro->ro_rt);
		goto done;
	}

	/* Balk when DF bit is set or the interface didn't support TSO. */
	if ((ip->ip_off & IP_DF) || (m->m_pkthdr.csum_flags & CSUM_TSO)) {
		error = EMSGSIZE;
		V_ipstat.ips_cantfrag++;
		goto bad;
	}

	/*
	 * Too large for interface; fragment if possible. If successful,
	 * on return, m will point to a list of packets to be sent.
	 */
	error = ip_fragment(ip, &m, mtu, ifp->if_hwassist, sw_csum);
	if (error)
		goto bad;
	/*
	 * Send the fragments in order, unlinking each from the m_nextpkt
	 * list first.  Once one transmission fails, the remaining
	 * fragments are freed instead of sent.
	 */
	for (; m; m = m0) {
		m0 = m->m_nextpkt;
		m->m_nextpkt = 0;
		if (error == 0) {
			/* Record statistics for this interface address. */
			if (ia != NULL) {
				ia->ia_ifa.if_opackets++;
				ia->ia_ifa.if_obytes += m->m_pkthdr.len;
			}
			/*
			 * Reset layer specific mbuf flags
			 * to avoid confusing lower layers.
			 */
			m->m_flags &= ~(M_PROTOFLAGS);

			error = (*ifp->if_output)(ifp, m,
			    (struct sockaddr *)dst, ro->ro_rt);
		} else
			m_freem(m);
	}

	if (error == 0)
		V_ipstat.ips_fragmented++;

done:
	/*
	 * Release the route only when it is the local one set up at the
	 * top; a route passed in by the caller is left in place.
	 */
	if (ro == &iproute && ro->ro_rt) {
		RTFREE(ro->ro_rt);
	}
	return (error);
bad:
	/* Error exit: drop the packet, then take the common exit path. */
	m_freem(m);
	goto done;
}
647
648/*
649 * Create a chain of fragments which fit the given mtu. m_frag points to the
650 * mbuf to be fragmented; on return it points to the chain with the fragments.
651 * Return 0 if no error. If error, m_frag may contain a partially built
652 * chain of fragments that should be freed by the caller.
653 *
654 * if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist)
655 * sw_csum contains the delayed checksums flags (e.g., CSUM_DELAY_IP).
656 */
657int
658ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu,
659    u_long if_hwassist_flags, int sw_csum)
660{
661	INIT_VNET_INET(curvnet);
662	int error = 0;
663	int hlen = ip->ip_hl << 2;
664	int len = (mtu - hlen) & ~7;	/* size of payload in each fragment */
665	int off;
666	struct mbuf *m0 = *m_frag;	/* the original packet		*/
667	int firstlen;
668	struct mbuf **mnext;
669	int nfrags;
670
671	if (ip->ip_off & IP_DF) {	/* Fragmentation not allowed */
672		V_ipstat.ips_cantfrag++;
673		return EMSGSIZE;
674	}
675
676	/*
677	 * Must be able to put at least 8 bytes per fragment.
678	 */
679	if (len < 8)
680		return EMSGSIZE;
681
682	/*
683	 * If the interface will not calculate checksums on
684	 * fragmented packets, then do it here.
685	 */
686	if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA &&
687	    (if_hwassist_flags & CSUM_IP_FRAGS) == 0) {
688		in_delayed_cksum(m0);
689		m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
690	}
691#ifdef SCTP
692	if (m0->m_pkthdr.csum_flags & CSUM_SCTP &&
693	    (if_hwassist_flags & CSUM_IP_FRAGS) == 0) {
694		sctp_delayed_cksum(m0);
695		m0->m_pkthdr.csum_flags &= ~CSUM_SCTP;
696	}
697#endif
698	if (len > PAGE_SIZE) {
699		/*
700		 * Fragment large datagrams such that each segment
701		 * contains a multiple of PAGE_SIZE amount of data,
702		 * plus headers. This enables a receiver to perform
703		 * page-flipping zero-copy optimizations.
704		 *
705		 * XXX When does this help given that sender and receiver
706		 * could have different page sizes, and also mtu could
707		 * be less than the receiver's page size ?
708		 */
709		int newlen;
710		struct mbuf *m;
711
712		for (m = m0, off = 0; m && (off+m->m_len) <= mtu; m = m->m_next)
713			off += m->m_len;
714
715		/*
716		 * firstlen (off - hlen) must be aligned on an
717		 * 8-byte boundary
718		 */
719		if (off < hlen)
720			goto smart_frag_failure;
721		off = ((off - hlen) & ~7) + hlen;
722		newlen = (~PAGE_MASK) & mtu;
723		if ((newlen + sizeof (struct ip)) > mtu) {
724			/* we failed, go back the default */
725smart_frag_failure:
726			newlen = len;
727			off = hlen + len;
728		}
729		len = newlen;
730
731	} else {
732		off = hlen + len;
733	}
734
735	firstlen = off - hlen;
736	mnext = &m0->m_nextpkt;		/* pointer to next packet */
737
738	/*
739	 * Loop through length of segment after first fragment,
740	 * make new header and copy data of each part and link onto chain.
741	 * Here, m0 is the original packet, m is the fragment being created.
742	 * The fragments are linked off the m_nextpkt of the original
743	 * packet, which after processing serves as the first fragment.
744	 */
745	for (nfrags = 1; off < ip->ip_len; off += len, nfrags++) {
746		struct ip *mhip;	/* ip header on the fragment */
747		struct mbuf *m;
748		int mhlen = sizeof (struct ip);
749
750		MGETHDR(m, M_DONTWAIT, MT_DATA);
751		if (m == NULL) {
752			error = ENOBUFS;
753			V_ipstat.ips_odropped++;
754			goto done;
755		}
756		m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG;
757		/*
758		 * In the first mbuf, leave room for the link header, then
759		 * copy the original IP header including options. The payload
760		 * goes into an additional mbuf chain returned by m_copy().
761		 */
762		m->m_data += max_linkhdr;
763		mhip = mtod(m, struct ip *);
764		*mhip = *ip;
765		if (hlen > sizeof (struct ip)) {
766			mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip);
767			mhip->ip_v = IPVERSION;
768			mhip->ip_hl = mhlen >> 2;
769		}
770		m->m_len = mhlen;
771		/* XXX do we need to add ip->ip_off below ? */
772		mhip->ip_off = ((off - hlen) >> 3) + ip->ip_off;
773		if (off + len >= ip->ip_len) {	/* last fragment */
774			len = ip->ip_len - off;
775			m->m_flags |= M_LASTFRAG;
776		} else
777			mhip->ip_off |= IP_MF;
778		mhip->ip_len = htons((u_short)(len + mhlen));
779		m->m_next = m_copy(m0, off, len);
780		if (m->m_next == NULL) {	/* copy failed */
781			m_free(m);
782			error = ENOBUFS;	/* ??? */
783			V_ipstat.ips_odropped++;
784			goto done;
785		}
786		m->m_pkthdr.len = mhlen + len;
787		m->m_pkthdr.rcvif = NULL;
788#ifdef MAC
789		mac_netinet_fragment(m0, m);
790#endif
791		m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags;
792		mhip->ip_off = htons(mhip->ip_off);
793		mhip->ip_sum = 0;
794		if (sw_csum & CSUM_DELAY_IP)
795			mhip->ip_sum = in_cksum(m, mhlen);
796		*mnext = m;
797		mnext = &m->m_nextpkt;
798	}
799	V_ipstat.ips_ofragments += nfrags;
800
801	/* set first marker for fragment chain */
802	m0->m_flags |= M_FIRSTFRAG | M_FRAG;
803	m0->m_pkthdr.csum_data = nfrags;
804
805	/*
806	 * Update first fragment by trimming what's been copied out
807	 * and updating header.
808	 */
809	m_adj(m0, hlen + firstlen - ip->ip_len);
810	m0->m_pkthdr.len = hlen + firstlen;
811	ip->ip_len = htons((u_short)m0->m_pkthdr.len);
812	ip->ip_off |= IP_MF;
813	ip->ip_off = htons(ip->ip_off);
814	ip->ip_sum = 0;
815	if (sw_csum & CSUM_DELAY_IP)
816		ip->ip_sum = in_cksum(m0, hlen);
817
818done:
819	*m_frag = m0;
820	return error;
821}
822
823void
824in_delayed_cksum(struct mbuf *m)
825{
826	struct ip *ip;
827	u_short csum, offset;
828
829	ip = mtod(m, struct ip *);
830	offset = ip->ip_hl << 2 ;
831	csum = in_cksum_skip(m, ip->ip_len, offset);
832	if (m->m_pkthdr.csum_flags & CSUM_UDP && csum == 0)
833		csum = 0xffff;
834	offset += m->m_pkthdr.csum_data;	/* checksum offset */
835
836	if (offset + sizeof(u_short) > m->m_len) {
837		printf("delayed m_pullup, m->len: %d  off: %d  p: %d\n",
838		    m->m_len, offset, ip->ip_p);
839		/*
840		 * XXX
841		 * this shouldn't happen, but if it does, the
842		 * correct behavior may be to insert the checksum
843		 * in the appropriate next mbuf in the chain.
844		 */
845		return;
846	}
847	*(u_short *)(m->m_data + offset) = csum;
848}
849
850/*
851 * IP socket option processing.
852 */
853int
854ip_ctloutput(struct socket *so, struct sockopt *sopt)
855{
856	struct	inpcb *inp = sotoinpcb(so);
857	int	error, optval;
858
859	error = optval = 0;
860	if (sopt->sopt_level != IPPROTO_IP) {
861		if ((sopt->sopt_level == SOL_SOCKET) &&
862		    (sopt->sopt_name == SO_SETFIB)) {
863			inp->inp_inc.inc_fibnum = so->so_fibnum;
864			return (0);
865		}
866		return (EINVAL);
867	}
868
869	switch (sopt->sopt_dir) {
870	case SOPT_SET:
871		switch (sopt->sopt_name) {
872		case IP_OPTIONS:
873#ifdef notyet
874		case IP_RETOPTS:
875#endif
876		{
877			struct mbuf *m;
878			if (sopt->sopt_valsize > MLEN) {
879				error = EMSGSIZE;
880				break;
881			}
882			MGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT, MT_DATA);
883			if (m == NULL) {
884				error = ENOBUFS;
885				break;
886			}
887			m->m_len = sopt->sopt_valsize;
888			error = sooptcopyin(sopt, mtod(m, char *), m->m_len,
889					    m->m_len);
890			if (error) {
891				m_free(m);
892				break;
893			}
894			INP_WLOCK(inp);
895			error = ip_pcbopts(inp, sopt->sopt_name, m);
896			INP_WUNLOCK(inp);
897			return (error);
898		}
899
900#if defined(IP_NONLOCALBIND)
901		case IP_NONLOCALOK:
902			if (! ip_nonlocalok) {
903				error = ENOPROTOOPT;
904				break;
905			}
906			/* FALLTHROUGH */
907#endif
908		case IP_TOS:
909		case IP_TTL:
910		case IP_MINTTL:
911		case IP_RECVOPTS:
912		case IP_RECVRETOPTS:
913		case IP_RECVDSTADDR:
914		case IP_RECVTTL:
915		case IP_RECVIF:
916		case IP_FAITH:
917		case IP_ONESBCAST:
918		case IP_DONTFRAG:
919			error = sooptcopyin(sopt, &optval, sizeof optval,
920					    sizeof optval);
921			if (error)
922				break;
923
924			switch (sopt->sopt_name) {
925			case IP_TOS:
926				inp->inp_ip_tos = optval;
927				break;
928
929			case IP_TTL:
930				inp->inp_ip_ttl = optval;
931				break;
932
933			case IP_MINTTL:
934				if (optval >= 0 && optval <= MAXTTL)
935					inp->inp_ip_minttl = optval;
936				else
937					error = EINVAL;
938				break;
939
940#define	OPTSET(bit) do {						\
941	INP_WLOCK(inp);							\
942	if (optval)							\
943		inp->inp_flags |= bit;					\
944	else								\
945		inp->inp_flags &= ~bit;					\
946	INP_WUNLOCK(inp);						\
947} while (0)
948
949			case IP_RECVOPTS:
950				OPTSET(INP_RECVOPTS);
951				break;
952
953			case IP_RECVRETOPTS:
954				OPTSET(INP_RECVRETOPTS);
955				break;
956
957			case IP_RECVDSTADDR:
958				OPTSET(INP_RECVDSTADDR);
959				break;
960
961			case IP_RECVTTL:
962				OPTSET(INP_RECVTTL);
963				break;
964
965			case IP_RECVIF:
966				OPTSET(INP_RECVIF);
967				break;
968
969			case IP_FAITH:
970				OPTSET(INP_FAITH);
971				break;
972
973			case IP_ONESBCAST:
974				OPTSET(INP_ONESBCAST);
975				break;
976			case IP_DONTFRAG:
977				OPTSET(INP_DONTFRAG);
978				break;
979#if defined(IP_NONLOCALBIND)
980			case IP_NONLOCALOK:
981				OPTSET(INP_NONLOCALOK);
982				break;
983#endif
984			}
985			break;
986#undef OPTSET
987
988		/*
989		 * Multicast socket options are processed by the in_mcast
990		 * module.
991		 */
992		case IP_MULTICAST_IF:
993		case IP_MULTICAST_VIF:
994		case IP_MULTICAST_TTL:
995		case IP_MULTICAST_LOOP:
996		case IP_ADD_MEMBERSHIP:
997		case IP_DROP_MEMBERSHIP:
998		case IP_ADD_SOURCE_MEMBERSHIP:
999		case IP_DROP_SOURCE_MEMBERSHIP:
1000		case IP_BLOCK_SOURCE:
1001		case IP_UNBLOCK_SOURCE:
1002		case IP_MSFILTER:
1003		case MCAST_JOIN_GROUP:
1004		case MCAST_LEAVE_GROUP:
1005		case MCAST_JOIN_SOURCE_GROUP:
1006		case MCAST_LEAVE_SOURCE_GROUP:
1007		case MCAST_BLOCK_SOURCE:
1008		case MCAST_UNBLOCK_SOURCE:
1009			error = inp_setmoptions(inp, sopt);
1010			break;
1011
1012		case IP_PORTRANGE:
1013			error = sooptcopyin(sopt, &optval, sizeof optval,
1014					    sizeof optval);
1015			if (error)
1016				break;
1017
1018			INP_WLOCK(inp);
1019			switch (optval) {
1020			case IP_PORTRANGE_DEFAULT:
1021				inp->inp_flags &= ~(INP_LOWPORT);
1022				inp->inp_flags &= ~(INP_HIGHPORT);
1023				break;
1024
1025			case IP_PORTRANGE_HIGH:
1026				inp->inp_flags &= ~(INP_LOWPORT);
1027				inp->inp_flags |= INP_HIGHPORT;
1028				break;
1029
1030			case IP_PORTRANGE_LOW:
1031				inp->inp_flags &= ~(INP_HIGHPORT);
1032				inp->inp_flags |= INP_LOWPORT;
1033				break;
1034
1035			default:
1036				error = EINVAL;
1037				break;
1038			}
1039			INP_WUNLOCK(inp);
1040			break;
1041
1042#ifdef IPSEC
1043		case IP_IPSEC_POLICY:
1044		{
1045			caddr_t req;
1046			struct mbuf *m;
1047
1048			if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */
1049				break;
1050			if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */
1051				break;
1052			req = mtod(m, caddr_t);
1053			error = ipsec_set_policy(inp, sopt->sopt_name, req,
1054			    m->m_len, (sopt->sopt_td != NULL) ?
1055			    sopt->sopt_td->td_ucred : NULL);
1056			m_freem(m);
1057			break;
1058		}
1059#endif /* IPSEC */
1060
1061		default:
1062			error = ENOPROTOOPT;
1063			break;
1064		}
1065		break;
1066
1067	case SOPT_GET:
1068		switch (sopt->sopt_name) {
1069		case IP_OPTIONS:
1070		case IP_RETOPTS:
1071			if (inp->inp_options)
1072				error = sooptcopyout(sopt,
1073						     mtod(inp->inp_options,
1074							  char *),
1075						     inp->inp_options->m_len);
1076			else
1077				sopt->sopt_valsize = 0;
1078			break;
1079
1080		case IP_TOS:
1081		case IP_TTL:
1082		case IP_MINTTL:
1083		case IP_RECVOPTS:
1084		case IP_RECVRETOPTS:
1085		case IP_RECVDSTADDR:
1086		case IP_RECVTTL:
1087		case IP_RECVIF:
1088		case IP_PORTRANGE:
1089		case IP_FAITH:
1090		case IP_ONESBCAST:
1091		case IP_DONTFRAG:
1092			switch (sopt->sopt_name) {
1093
1094			case IP_TOS:
1095				optval = inp->inp_ip_tos;
1096				break;
1097
1098			case IP_TTL:
1099				optval = inp->inp_ip_ttl;
1100				break;
1101
1102			case IP_MINTTL:
1103				optval = inp->inp_ip_minttl;
1104				break;
1105
1106#define	OPTBIT(bit)	(inp->inp_flags & bit ? 1 : 0)
1107
1108			case IP_RECVOPTS:
1109				optval = OPTBIT(INP_RECVOPTS);
1110				break;
1111
1112			case IP_RECVRETOPTS:
1113				optval = OPTBIT(INP_RECVRETOPTS);
1114				break;
1115
1116			case IP_RECVDSTADDR:
1117				optval = OPTBIT(INP_RECVDSTADDR);
1118				break;
1119
1120			case IP_RECVTTL:
1121				optval = OPTBIT(INP_RECVTTL);
1122				break;
1123
1124			case IP_RECVIF:
1125				optval = OPTBIT(INP_RECVIF);
1126				break;
1127
1128			case IP_PORTRANGE:
1129				if (inp->inp_flags & INP_HIGHPORT)
1130					optval = IP_PORTRANGE_HIGH;
1131				else if (inp->inp_flags & INP_LOWPORT)
1132					optval = IP_PORTRANGE_LOW;
1133				else
1134					optval = 0;
1135				break;
1136
1137			case IP_FAITH:
1138				optval = OPTBIT(INP_FAITH);
1139				break;
1140
1141			case IP_ONESBCAST:
1142				optval = OPTBIT(INP_ONESBCAST);
1143				break;
1144			case IP_DONTFRAG:
1145				optval = OPTBIT(INP_DONTFRAG);
1146				break;
1147			}
1148			error = sooptcopyout(sopt, &optval, sizeof optval);
1149			break;
1150
1151		/*
1152		 * Multicast socket options are processed by the in_mcast
1153		 * module.
1154		 */
1155		case IP_MULTICAST_IF:
1156		case IP_MULTICAST_VIF:
1157		case IP_MULTICAST_TTL:
1158		case IP_MULTICAST_LOOP:
1159		case IP_MSFILTER:
1160			error = inp_getmoptions(inp, sopt);
1161			break;
1162
1163#ifdef IPSEC
1164		case IP_IPSEC_POLICY:
1165		{
1166			struct mbuf *m = NULL;
1167			caddr_t req = NULL;
1168			size_t len = 0;
1169
1170			if (m != 0) {
1171				req = mtod(m, caddr_t);
1172				len = m->m_len;
1173			}
1174			error = ipsec_get_policy(sotoinpcb(so), req, len, &m);
1175			if (error == 0)
1176				error = soopt_mcopyout(sopt, m); /* XXX */
1177			if (error == 0)
1178				m_freem(m);
1179			break;
1180		}
1181#endif /* IPSEC */
1182
1183		default:
1184			error = ENOPROTOOPT;
1185			break;
1186		}
1187		break;
1188	}
1189	return (error);
1190}
1191
1192/*
1193 * Routine called from ip_output() to loop back a copy of an IP multicast
1194 * packet to the input queue of a specified interface.  Note that this
1195 * calls the output routine of the loopback "driver", but with an interface
1196 * pointer that might NOT be a loopback interface -- evil, but easier than
1197 * replicating that code here.
1198 */
1199static void
1200ip_mloopback(struct ifnet *ifp, struct mbuf *m, struct sockaddr_in *dst,
1201    int hlen)
1202{
1203	register struct ip *ip;
1204	struct mbuf *copym;
1205
1206	/*
1207	 * Make a deep copy of the packet because we're going to
1208	 * modify the pack in order to generate checksums.
1209	 */
1210	copym = m_dup(m, M_DONTWAIT);
1211	if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < hlen))
1212		copym = m_pullup(copym, hlen);
1213	if (copym != NULL) {
1214		/* If needed, compute the checksum and mark it as valid. */
1215		if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
1216			in_delayed_cksum(copym);
1217			copym->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
1218			copym->m_pkthdr.csum_flags |=
1219			    CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
1220			copym->m_pkthdr.csum_data = 0xffff;
1221		}
1222		/*
1223		 * We don't bother to fragment if the IP length is greater
1224		 * than the interface's MTU.  Can this possibly matter?
1225		 */
1226		ip = mtod(copym, struct ip *);
1227		ip->ip_len = htons(ip->ip_len);
1228		ip->ip_off = htons(ip->ip_off);
1229		ip->ip_sum = 0;
1230		ip->ip_sum = in_cksum(copym, hlen);
1231#if 1 /* XXX */
1232		if (dst->sin_family != AF_INET) {
1233			printf("ip_mloopback: bad address family %d\n",
1234						dst->sin_family);
1235			dst->sin_family = AF_INET;
1236		}
1237#endif
1238		if_simloop(ifp, copym, dst->sin_family, 0);
1239	}
1240}
1241