ip_output.c revision 171133
1/*-
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 4. Neither the name of the University nor the names of its contributors
14 *    may be used to endorse or promote products derived from this software
15 *    without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 *
29 *	@(#)ip_output.c	8.3 (Berkeley) 1/21/94
30 * $FreeBSD: head/sys/netinet/ip_output.c 171133 2007-07-01 11:41:27Z gnn $
31 */
32
33#include "opt_ipfw.h"
34#include "opt_ipsec.h"
35#include "opt_mac.h"
36#include "opt_mbuf_stress_test.h"
37
38#include <sys/param.h>
39#include <sys/systm.h>
40#include <sys/kernel.h>
41#include <sys/malloc.h>
42#include <sys/mbuf.h>
43#include <sys/priv.h>
44#include <sys/protosw.h>
45#include <sys/socket.h>
46#include <sys/socketvar.h>
47#include <sys/sysctl.h>
48
49#include <net/if.h>
50#include <net/netisr.h>
51#include <net/pfil.h>
52#include <net/route.h>
53
54#include <netinet/in.h>
55#include <netinet/in_systm.h>
56#include <netinet/ip.h>
57#include <netinet/in_pcb.h>
58#include <netinet/in_var.h>
59#include <netinet/ip_var.h>
60#include <netinet/ip_options.h>
61
62#ifdef FAST_IPSEC
63#include <netinet/ip_ipsec.h>
64#include <netipsec/ipsec.h>
65#endif /* FAST_IPSEC*/
66
67#include <machine/in_cksum.h>
68
69#include <security/mac/mac_framework.h>
70
/*
 * Debug helper: print the IPv4 address held in struct in_addr "a" (network
 * byte order) as a dotted quad, preceded by the string "x" and followed by
 * the string "y".
 *
 * Every macro argument is parenthesized so that expressions such as "*p"
 * may be passed for "a".  The expansion already ends in a semicolon (kept
 * for compatibility with existing call sites), so it must not be used as
 * the unbraced body of an if/else.
 */
#define print_ip(x, a, y)	 printf("%s %d.%d.%d.%d%s",\
				(x), (ntohl((a).s_addr)>>24)&0xFF,\
				  (ntohl((a).s_addr)>>16)&0xFF,\
				  (ntohl((a).s_addr)>>8)&0xFF,\
				  (ntohl((a).s_addr))&0xFF, (y));
76
77u_short ip_id;
78
79#ifdef MBUF_STRESS_TEST
80int mbuf_frag_size = 0;
81SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW,
82	&mbuf_frag_size, 0, "Fragment outgoing mbufs to this size");
83#endif
84
85static void	ip_mloopback
86	(struct ifnet *, struct mbuf *, struct sockaddr_in *, int);
87
88
89extern	struct protosw inetsw[];
90
91/*
92 * IP output.  The packet in mbuf chain m contains a skeletal IP
93 * header (with len, off, ttl, proto, tos, src, dst).
94 * The mbuf chain containing the packet will be freed.
95 * The mbuf opt, if present, will not be freed.
96 * In the IP forwarding case, the packet will arrive with options already
97 * inserted, so must have a NULL opt pointer.
98 */
99int
100ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags,
101    struct ip_moptions *imo, struct inpcb *inp)
102{
103	struct ip *ip;
104	struct ifnet *ifp = NULL;	/* keep compiler happy */
105	struct mbuf *m0;
106	int hlen = sizeof (struct ip);
107	int mtu;
108	int len, error = 0;
109	struct sockaddr_in *dst = NULL;	/* keep compiler happy */
110	struct in_ifaddr *ia = NULL;
111	int isbroadcast, sw_csum;
112	struct route iproute;
113	struct in_addr odst;
114#ifdef IPFIREWALL_FORWARD
115	struct m_tag *fwd_tag = NULL;
116#endif
117	M_ASSERTPKTHDR(m);
118
119	if (ro == NULL) {
120		ro = &iproute;
121		bzero(ro, sizeof (*ro));
122	}
123
124	if (inp != NULL)
125		INP_LOCK_ASSERT(inp);
126
127	if (opt) {
128		len = 0;
129		m = ip_insertoptions(m, opt, &len);
130		if (len != 0)
131			hlen = len;
132	}
133	ip = mtod(m, struct ip *);
134
135	/*
136	 * Fill in IP header.  If we are not allowing fragmentation,
137	 * then the ip_id field is meaningless, but we don't set it
138	 * to zero.  Doing so causes various problems when devices along
139	 * the path (routers, load balancers, firewalls, etc.) illegally
140	 * disable DF on our packet.  Note that a 16-bit counter
141	 * will wrap around in less than 10 seconds at 100 Mbit/s on a
142	 * medium with MTU 1500.  See Steven M. Bellovin, "A Technique
143	 * for Counting NATted Hosts", Proc. IMW'02, available at
144	 * <http://www.cs.columbia.edu/~smb/papers/fnat.pdf>.
145	 */
146	if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
147		ip->ip_v = IPVERSION;
148		ip->ip_hl = hlen >> 2;
149		ip->ip_id = ip_newid();
150		ipstat.ips_localout++;
151	} else {
152		hlen = ip->ip_hl << 2;
153	}
154
155	dst = (struct sockaddr_in *)&ro->ro_dst;
156again:
157	/*
158	 * If there is a cached route,
159	 * check that it is to the same destination
160	 * and is still up.  If not, free it and try again.
161	 * The address family should also be checked in case of sharing the
162	 * cache with IPv6.
163	 */
164	if (ro->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 ||
165			  dst->sin_family != AF_INET ||
166			  dst->sin_addr.s_addr != ip->ip_dst.s_addr)) {
167		RTFREE(ro->ro_rt);
168		ro->ro_rt = (struct rtentry *)NULL;
169	}
170#ifdef IPFIREWALL_FORWARD
171	if (ro->ro_rt == NULL && fwd_tag == NULL) {
172#else
173	if (ro->ro_rt == NULL) {
174#endif
175		bzero(dst, sizeof(*dst));
176		dst->sin_family = AF_INET;
177		dst->sin_len = sizeof(*dst);
178		dst->sin_addr = ip->ip_dst;
179	}
180	/*
181	 * If routing to interface only, short circuit routing lookup.
182	 * The use of an all-ones broadcast address implies this; an
183	 * interface is specified by the broadcast address of an interface,
184	 * or the destination address of a ptp interface.
185	 */
186	if (flags & IP_SENDONES) {
187		if ((ia = ifatoia(ifa_ifwithbroadaddr(sintosa(dst)))) == NULL &&
188		    (ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL) {
189			ipstat.ips_noroute++;
190			error = ENETUNREACH;
191			goto bad;
192		}
193		ip->ip_dst.s_addr = INADDR_BROADCAST;
194		dst->sin_addr = ip->ip_dst;
195		ifp = ia->ia_ifp;
196		ip->ip_ttl = 1;
197		isbroadcast = 1;
198	} else if (flags & IP_ROUTETOIF) {
199		if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL &&
200		    (ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == NULL) {
201			ipstat.ips_noroute++;
202			error = ENETUNREACH;
203			goto bad;
204		}
205		ifp = ia->ia_ifp;
206		ip->ip_ttl = 1;
207		isbroadcast = in_broadcast(dst->sin_addr, ifp);
208	} else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) &&
209	    imo != NULL && imo->imo_multicast_ifp != NULL) {
210		/*
211		 * Bypass the normal routing lookup for multicast
212		 * packets if the interface is specified.
213		 */
214		ifp = imo->imo_multicast_ifp;
215		IFP_TO_IA(ifp, ia);
216		isbroadcast = 0;	/* fool gcc */
217	} else {
218		/*
219		 * We want to do any cloning requested by the link layer,
220		 * as this is probably required in all cases for correct
221		 * operation (as it is for ARP).
222		 */
223		if (ro->ro_rt == NULL)
224			rtalloc_ign(ro, 0);
225		if (ro->ro_rt == NULL) {
226			ipstat.ips_noroute++;
227			error = EHOSTUNREACH;
228			goto bad;
229		}
230		ia = ifatoia(ro->ro_rt->rt_ifa);
231		ifp = ro->ro_rt->rt_ifp;
232		ro->ro_rt->rt_rmx.rmx_pksent++;
233		if (ro->ro_rt->rt_flags & RTF_GATEWAY)
234			dst = (struct sockaddr_in *)ro->ro_rt->rt_gateway;
235		if (ro->ro_rt->rt_flags & RTF_HOST)
236			isbroadcast = (ro->ro_rt->rt_flags & RTF_BROADCAST);
237		else
238			isbroadcast = in_broadcast(dst->sin_addr, ifp);
239	}
240	/*
241	 * Calculate MTU.  If we have a route that is up, use that,
242	 * otherwise use the interface's MTU.
243	 */
244	if (ro->ro_rt != NULL && (ro->ro_rt->rt_flags & (RTF_UP|RTF_HOST))) {
245		/*
246		 * This case can happen if the user changed the MTU
247		 * of an interface after enabling IP on it.  Because
248		 * most netifs don't keep track of routes pointing to
249		 * them, there is no way for one to update all its
250		 * routes when the MTU is changed.
251		 */
252		if (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)
253			ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu;
254		mtu = ro->ro_rt->rt_rmx.rmx_mtu;
255	} else {
256		mtu = ifp->if_mtu;
257	}
258	if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
259		struct in_multi *inm;
260
261		m->m_flags |= M_MCAST;
262		/*
263		 * IP destination address is multicast.  Make sure "dst"
264		 * still points to the address in "ro".  (It may have been
265		 * changed to point to a gateway address, above.)
266		 */
267		dst = (struct sockaddr_in *)&ro->ro_dst;
268		/*
269		 * See if the caller provided any multicast options
270		 */
271		if (imo != NULL) {
272			ip->ip_ttl = imo->imo_multicast_ttl;
273			if (imo->imo_multicast_vif != -1)
274				ip->ip_src.s_addr =
275				    ip_mcast_src ?
276				    ip_mcast_src(imo->imo_multicast_vif) :
277				    INADDR_ANY;
278		} else
279			ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL;
280		/*
281		 * Confirm that the outgoing interface supports multicast.
282		 */
283		if ((imo == NULL) || (imo->imo_multicast_vif == -1)) {
284			if ((ifp->if_flags & IFF_MULTICAST) == 0) {
285				ipstat.ips_noroute++;
286				error = ENETUNREACH;
287				goto bad;
288			}
289		}
290		/*
291		 * If source address not specified yet, use address
292		 * of outgoing interface.
293		 */
294		if (ip->ip_src.s_addr == INADDR_ANY) {
295			/* Interface may have no addresses. */
296			if (ia != NULL)
297				ip->ip_src = IA_SIN(ia)->sin_addr;
298		}
299
300		IN_MULTI_LOCK();
301		IN_LOOKUP_MULTI(ip->ip_dst, ifp, inm);
302		if (inm != NULL &&
303		   (imo == NULL || imo->imo_multicast_loop)) {
304			IN_MULTI_UNLOCK();
305			/*
306			 * If we belong to the destination multicast group
307			 * on the outgoing interface, and the caller did not
308			 * forbid loopback, loop back a copy.
309			 */
310			ip_mloopback(ifp, m, dst, hlen);
311		}
312		else {
313			IN_MULTI_UNLOCK();
314			/*
315			 * If we are acting as a multicast router, perform
316			 * multicast forwarding as if the packet had just
317			 * arrived on the interface to which we are about
318			 * to send.  The multicast forwarding function
319			 * recursively calls this function, using the
320			 * IP_FORWARDING flag to prevent infinite recursion.
321			 *
322			 * Multicasts that are looped back by ip_mloopback(),
323			 * above, will be forwarded by the ip_input() routine,
324			 * if necessary.
325			 */
326			if (ip_mrouter && (flags & IP_FORWARDING) == 0) {
327				/*
328				 * If rsvp daemon is not running, do not
329				 * set ip_moptions. This ensures that the packet
330				 * is multicast and not just sent down one link
331				 * as prescribed by rsvpd.
332				 */
333				if (!rsvp_on)
334					imo = NULL;
335				if (ip_mforward &&
336				    ip_mforward(ip, ifp, m, imo) != 0) {
337					m_freem(m);
338					goto done;
339				}
340			}
341		}
342
343		/*
344		 * Multicasts with a time-to-live of zero may be looped-
345		 * back, above, but must not be transmitted on a network.
346		 * Also, multicasts addressed to the loopback interface
347		 * are not sent -- the above call to ip_mloopback() will
348		 * loop back a copy if this host actually belongs to the
349		 * destination group on the loopback interface.
350		 */
351		if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) {
352			m_freem(m);
353			goto done;
354		}
355
356		goto sendit;
357	}
358
359	/*
360	 * If the source address is not specified yet, use the address
361	 * of the outoing interface.
362	 */
363	if (ip->ip_src.s_addr == INADDR_ANY) {
364		/* Interface may have no addresses. */
365		if (ia != NULL) {
366			ip->ip_src = IA_SIN(ia)->sin_addr;
367		}
368	}
369
370	/*
371	 * Verify that we have any chance at all of being able to queue the
372	 * packet or packet fragments, unless ALTQ is enabled on the given
373	 * interface in which case packetdrop should be done by queueing.
374	 */
375#ifdef ALTQ
376	if ((!ALTQ_IS_ENABLED(&ifp->if_snd)) &&
377	    ((ifp->if_snd.ifq_len + ip->ip_len / mtu + 1) >=
378	    ifp->if_snd.ifq_maxlen))
379#else
380	if ((ifp->if_snd.ifq_len + ip->ip_len / mtu + 1) >=
381	    ifp->if_snd.ifq_maxlen)
382#endif /* ALTQ */
383	{
384		error = ENOBUFS;
385		ipstat.ips_odropped++;
386		ifp->if_snd.ifq_drops += (ip->ip_len / ifp->if_mtu + 1);
387		goto bad;
388	}
389
390	/*
391	 * Look for broadcast address and
392	 * verify user is allowed to send
393	 * such a packet.
394	 */
395	if (isbroadcast) {
396		if ((ifp->if_flags & IFF_BROADCAST) == 0) {
397			error = EADDRNOTAVAIL;
398			goto bad;
399		}
400		if ((flags & IP_ALLOWBROADCAST) == 0) {
401			error = EACCES;
402			goto bad;
403		}
404		/* don't allow broadcast messages to be fragmented */
405		if (ip->ip_len > mtu) {
406			error = EMSGSIZE;
407			goto bad;
408		}
409		m->m_flags |= M_BCAST;
410	} else {
411		m->m_flags &= ~M_BCAST;
412	}
413
414sendit:
415#ifdef FAST_IPSEC
416	switch(ip_ipsec_output(&m, inp, &flags, &error, &ro, &iproute, &dst, &ia, &ifp)) {
417	case 1:
418		goto bad;
419	case -1:
420		goto done;
421	case 0:
422	default:
423		break;	/* Continue with packet processing. */
424	}
425	/* Update variables that are affected by ipsec4_output(). */
426	ip = mtod(m, struct ip *);
427	hlen = ip->ip_hl << 2;
428#endif /* FAST_IPSEC */
429
430	/* Jump over all PFIL processing if hooks are not active. */
431	if (!PFIL_HOOKED(&inet_pfil_hook))
432		goto passout;
433
434	/* Run through list of hooks for output packets. */
435	odst.s_addr = ip->ip_dst.s_addr;
436	error = pfil_run_hooks(&inet_pfil_hook, &m, ifp, PFIL_OUT, inp);
437	if (error != 0 || m == NULL)
438		goto done;
439
440	ip = mtod(m, struct ip *);
441
442	/* See if destination IP address was changed by packet filter. */
443	if (odst.s_addr != ip->ip_dst.s_addr) {
444		m->m_flags |= M_SKIP_FIREWALL;
445		/* If destination is now ourself drop to ip_input(). */
446		if (in_localip(ip->ip_dst)) {
447			m->m_flags |= M_FASTFWD_OURS;
448			if (m->m_pkthdr.rcvif == NULL)
449				m->m_pkthdr.rcvif = loif;
450			if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
451				m->m_pkthdr.csum_flags |=
452				    CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
453				m->m_pkthdr.csum_data = 0xffff;
454			}
455			m->m_pkthdr.csum_flags |=
456			    CSUM_IP_CHECKED | CSUM_IP_VALID;
457
458			error = netisr_queue(NETISR_IP, m);
459			goto done;
460		} else
461			goto again;	/* Redo the routing table lookup. */
462	}
463
464#ifdef IPFIREWALL_FORWARD
465	/* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */
466	if (m->m_flags & M_FASTFWD_OURS) {
467		if (m->m_pkthdr.rcvif == NULL)
468			m->m_pkthdr.rcvif = loif;
469		if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
470			m->m_pkthdr.csum_flags |=
471			    CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
472			m->m_pkthdr.csum_data = 0xffff;
473		}
474		m->m_pkthdr.csum_flags |=
475			    CSUM_IP_CHECKED | CSUM_IP_VALID;
476
477		error = netisr_queue(NETISR_IP, m);
478		goto done;
479	}
480	/* Or forward to some other address? */
481	fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL);
482	if (fwd_tag) {
483		dst = (struct sockaddr_in *)&ro->ro_dst;
484		bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in));
485		m->m_flags |= M_SKIP_FIREWALL;
486		m_tag_delete(m, fwd_tag);
487		goto again;
488	}
489#endif /* IPFIREWALL_FORWARD */
490
491passout:
492	/* 127/8 must not appear on wire - RFC1122. */
493	if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
494	    (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
495		if ((ifp->if_flags & IFF_LOOPBACK) == 0) {
496			ipstat.ips_badaddr++;
497			error = EADDRNOTAVAIL;
498			goto bad;
499		}
500	}
501
502	m->m_pkthdr.csum_flags |= CSUM_IP;
503	sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_hwassist;
504	if (sw_csum & CSUM_DELAY_DATA) {
505		in_delayed_cksum(m);
506		sw_csum &= ~CSUM_DELAY_DATA;
507	}
508	m->m_pkthdr.csum_flags &= ifp->if_hwassist;
509
510	/*
511	 * If small enough for interface, or the interface will take
512	 * care of the fragmentation for us, we can just send directly.
513	 */
514	if (ip->ip_len <= mtu ||
515	    (m->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0 ||
516	    ((ip->ip_off & IP_DF) == 0 && (ifp->if_hwassist & CSUM_FRAGMENT))) {
517		ip->ip_len = htons(ip->ip_len);
518		ip->ip_off = htons(ip->ip_off);
519		ip->ip_sum = 0;
520		if (sw_csum & CSUM_DELAY_IP)
521			ip->ip_sum = in_cksum(m, hlen);
522
523		/*
524		 * Record statistics for this interface address.
525		 * With CSUM_TSO the byte/packet count will be slightly
526		 * incorrect because we count the IP+TCP headers only
527		 * once instead of for every generated packet.
528		 */
529		if (!(flags & IP_FORWARDING) && ia) {
530			if (m->m_pkthdr.csum_flags & CSUM_TSO)
531				ia->ia_ifa.if_opackets +=
532				    m->m_pkthdr.len / m->m_pkthdr.tso_segsz;
533			else
534				ia->ia_ifa.if_opackets++;
535			ia->ia_ifa.if_obytes += m->m_pkthdr.len;
536		}
537#ifdef MBUF_STRESS_TEST
538		if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size)
539			m = m_fragment(m, M_DONTWAIT, mbuf_frag_size);
540#endif
541		/*
542		 * Reset layer specific mbuf flags
543		 * to avoid confusing lower layers.
544		 */
545		m->m_flags &= ~(M_PROTOFLAGS);
546
547		error = (*ifp->if_output)(ifp, m,
548				(struct sockaddr *)dst, ro->ro_rt);
549		goto done;
550	}
551
552	/* Balk when DF bit is set or the interface didn't support TSO. */
553	if ((ip->ip_off & IP_DF) || (m->m_pkthdr.csum_flags & CSUM_TSO)) {
554		error = EMSGSIZE;
555		ipstat.ips_cantfrag++;
556		goto bad;
557	}
558
559	/*
560	 * Too large for interface; fragment if possible. If successful,
561	 * on return, m will point to a list of packets to be sent.
562	 */
563	error = ip_fragment(ip, &m, mtu, ifp->if_hwassist, sw_csum);
564	if (error)
565		goto bad;
566	for (; m; m = m0) {
567		m0 = m->m_nextpkt;
568		m->m_nextpkt = 0;
569		if (error == 0) {
570			/* Record statistics for this interface address. */
571			if (ia != NULL) {
572				ia->ia_ifa.if_opackets++;
573				ia->ia_ifa.if_obytes += m->m_pkthdr.len;
574			}
575			/*
576			 * Reset layer specific mbuf flags
577			 * to avoid confusing upper layers.
578			 */
579			m->m_flags &= ~(M_PROTOFLAGS);
580
581			error = (*ifp->if_output)(ifp, m,
582			    (struct sockaddr *)dst, ro->ro_rt);
583		} else
584			m_freem(m);
585	}
586
587	if (error == 0)
588		ipstat.ips_fragmented++;
589
590done:
591	if (ro == &iproute && ro->ro_rt) {
592		RTFREE(ro->ro_rt);
593	}
594	return (error);
595bad:
596	m_freem(m);
597	goto done;
598}
599
600/*
601 * Create a chain of fragments which fit the given mtu. m_frag points to the
602 * mbuf to be fragmented; on return it points to the chain with the fragments.
603 * Return 0 if no error. If error, m_frag may contain a partially built
604 * chain of fragments that should be freed by the caller.
605 *
606 * if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist)
607 * sw_csum contains the delayed checksums flags (e.g., CSUM_DELAY_IP).
608 */
609int
610ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu,
611    u_long if_hwassist_flags, int sw_csum)
612{
613	int error = 0;
614	int hlen = ip->ip_hl << 2;
615	int len = (mtu - hlen) & ~7;	/* size of payload in each fragment */
616	int off;
617	struct mbuf *m0 = *m_frag;	/* the original packet		*/
618	int firstlen;
619	struct mbuf **mnext;
620	int nfrags;
621
622	if (ip->ip_off & IP_DF) {	/* Fragmentation not allowed */
623		ipstat.ips_cantfrag++;
624		return EMSGSIZE;
625	}
626
627	/*
628	 * Must be able to put at least 8 bytes per fragment.
629	 */
630	if (len < 8)
631		return EMSGSIZE;
632
633	/*
634	 * If the interface will not calculate checksums on
635	 * fragmented packets, then do it here.
636	 */
637	if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA &&
638	    (if_hwassist_flags & CSUM_IP_FRAGS) == 0) {
639		in_delayed_cksum(m0);
640		m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
641	}
642
643	if (len > PAGE_SIZE) {
644		/*
645		 * Fragment large datagrams such that each segment
646		 * contains a multiple of PAGE_SIZE amount of data,
647		 * plus headers. This enables a receiver to perform
648		 * page-flipping zero-copy optimizations.
649		 *
650		 * XXX When does this help given that sender and receiver
651		 * could have different page sizes, and also mtu could
652		 * be less than the receiver's page size ?
653		 */
654		int newlen;
655		struct mbuf *m;
656
657		for (m = m0, off = 0; m && (off+m->m_len) <= mtu; m = m->m_next)
658			off += m->m_len;
659
660		/*
661		 * firstlen (off - hlen) must be aligned on an
662		 * 8-byte boundary
663		 */
664		if (off < hlen)
665			goto smart_frag_failure;
666		off = ((off - hlen) & ~7) + hlen;
667		newlen = (~PAGE_MASK) & mtu;
668		if ((newlen + sizeof (struct ip)) > mtu) {
669			/* we failed, go back the default */
670smart_frag_failure:
671			newlen = len;
672			off = hlen + len;
673		}
674		len = newlen;
675
676	} else {
677		off = hlen + len;
678	}
679
680	firstlen = off - hlen;
681	mnext = &m0->m_nextpkt;		/* pointer to next packet */
682
683	/*
684	 * Loop through length of segment after first fragment,
685	 * make new header and copy data of each part and link onto chain.
686	 * Here, m0 is the original packet, m is the fragment being created.
687	 * The fragments are linked off the m_nextpkt of the original
688	 * packet, which after processing serves as the first fragment.
689	 */
690	for (nfrags = 1; off < ip->ip_len; off += len, nfrags++) {
691		struct ip *mhip;	/* ip header on the fragment */
692		struct mbuf *m;
693		int mhlen = sizeof (struct ip);
694
695		MGETHDR(m, M_DONTWAIT, MT_DATA);
696		if (m == NULL) {
697			error = ENOBUFS;
698			ipstat.ips_odropped++;
699			goto done;
700		}
701		m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG;
702		/*
703		 * In the first mbuf, leave room for the link header, then
704		 * copy the original IP header including options. The payload
705		 * goes into an additional mbuf chain returned by m_copy().
706		 */
707		m->m_data += max_linkhdr;
708		mhip = mtod(m, struct ip *);
709		*mhip = *ip;
710		if (hlen > sizeof (struct ip)) {
711			mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip);
712			mhip->ip_v = IPVERSION;
713			mhip->ip_hl = mhlen >> 2;
714		}
715		m->m_len = mhlen;
716		/* XXX do we need to add ip->ip_off below ? */
717		mhip->ip_off = ((off - hlen) >> 3) + ip->ip_off;
718		if (off + len >= ip->ip_len) {	/* last fragment */
719			len = ip->ip_len - off;
720			m->m_flags |= M_LASTFRAG;
721		} else
722			mhip->ip_off |= IP_MF;
723		mhip->ip_len = htons((u_short)(len + mhlen));
724		m->m_next = m_copy(m0, off, len);
725		if (m->m_next == NULL) {	/* copy failed */
726			m_free(m);
727			error = ENOBUFS;	/* ??? */
728			ipstat.ips_odropped++;
729			goto done;
730		}
731		m->m_pkthdr.len = mhlen + len;
732		m->m_pkthdr.rcvif = NULL;
733#ifdef MAC
734		mac_create_fragment(m0, m);
735#endif
736		m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags;
737		mhip->ip_off = htons(mhip->ip_off);
738		mhip->ip_sum = 0;
739		if (sw_csum & CSUM_DELAY_IP)
740			mhip->ip_sum = in_cksum(m, mhlen);
741		*mnext = m;
742		mnext = &m->m_nextpkt;
743	}
744	ipstat.ips_ofragments += nfrags;
745
746	/* set first marker for fragment chain */
747	m0->m_flags |= M_FIRSTFRAG | M_FRAG;
748	m0->m_pkthdr.csum_data = nfrags;
749
750	/*
751	 * Update first fragment by trimming what's been copied out
752	 * and updating header.
753	 */
754	m_adj(m0, hlen + firstlen - ip->ip_len);
755	m0->m_pkthdr.len = hlen + firstlen;
756	ip->ip_len = htons((u_short)m0->m_pkthdr.len);
757	ip->ip_off |= IP_MF;
758	ip->ip_off = htons(ip->ip_off);
759	ip->ip_sum = 0;
760	if (sw_csum & CSUM_DELAY_IP)
761		ip->ip_sum = in_cksum(m0, hlen);
762
763done:
764	*m_frag = m0;
765	return error;
766}
767
768void
769in_delayed_cksum(struct mbuf *m)
770{
771	struct ip *ip;
772	u_short csum, offset;
773
774	ip = mtod(m, struct ip *);
775	offset = ip->ip_hl << 2 ;
776	csum = in_cksum_skip(m, ip->ip_len, offset);
777	if (m->m_pkthdr.csum_flags & CSUM_UDP && csum == 0)
778		csum = 0xffff;
779	offset += m->m_pkthdr.csum_data;	/* checksum offset */
780
781	if (offset + sizeof(u_short) > m->m_len) {
782		printf("delayed m_pullup, m->len: %d  off: %d  p: %d\n",
783		    m->m_len, offset, ip->ip_p);
784		/*
785		 * XXX
786		 * this shouldn't happen, but if it does, the
787		 * correct behavior may be to insert the checksum
788		 * in the appropriate next mbuf in the chain.
789		 */
790		return;
791	}
792	*(u_short *)(m->m_data + offset) = csum;
793}
794
795/*
796 * IP socket option processing.
797 */
798int
799ip_ctloutput(struct socket *so, struct sockopt *sopt)
800{
801	struct	inpcb *inp = sotoinpcb(so);
802	int	error, optval;
803
804	error = optval = 0;
805	if (sopt->sopt_level != IPPROTO_IP) {
806		return (EINVAL);
807	}
808
809	switch (sopt->sopt_dir) {
810	case SOPT_SET:
811		switch (sopt->sopt_name) {
812		case IP_OPTIONS:
813#ifdef notyet
814		case IP_RETOPTS:
815#endif
816		{
817			struct mbuf *m;
818			if (sopt->sopt_valsize > MLEN) {
819				error = EMSGSIZE;
820				break;
821			}
822			MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
823			if (m == NULL) {
824				error = ENOBUFS;
825				break;
826			}
827			m->m_len = sopt->sopt_valsize;
828			error = sooptcopyin(sopt, mtod(m, char *), m->m_len,
829					    m->m_len);
830			if (error) {
831				m_free(m);
832				break;
833			}
834			INP_LOCK(inp);
835			error = ip_pcbopts(inp, sopt->sopt_name, m);
836			INP_UNLOCK(inp);
837			return (error);
838		}
839
840		case IP_TOS:
841		case IP_TTL:
842		case IP_MINTTL:
843		case IP_RECVOPTS:
844		case IP_RECVRETOPTS:
845		case IP_RECVDSTADDR:
846		case IP_RECVTTL:
847		case IP_RECVIF:
848		case IP_FAITH:
849		case IP_ONESBCAST:
850		case IP_DONTFRAG:
851			error = sooptcopyin(sopt, &optval, sizeof optval,
852					    sizeof optval);
853			if (error)
854				break;
855
856			switch (sopt->sopt_name) {
857			case IP_TOS:
858				inp->inp_ip_tos = optval;
859				break;
860
861			case IP_TTL:
862				inp->inp_ip_ttl = optval;
863				break;
864
865			case IP_MINTTL:
866				if (optval > 0 && optval <= MAXTTL)
867					inp->inp_ip_minttl = optval;
868				else
869					error = EINVAL;
870				break;
871
872#define	OPTSET(bit) do {						\
873	INP_LOCK(inp);							\
874	if (optval)							\
875		inp->inp_flags |= bit;					\
876	else								\
877		inp->inp_flags &= ~bit;					\
878	INP_UNLOCK(inp);						\
879} while (0)
880
881			case IP_RECVOPTS:
882				OPTSET(INP_RECVOPTS);
883				break;
884
885			case IP_RECVRETOPTS:
886				OPTSET(INP_RECVRETOPTS);
887				break;
888
889			case IP_RECVDSTADDR:
890				OPTSET(INP_RECVDSTADDR);
891				break;
892
893			case IP_RECVTTL:
894				OPTSET(INP_RECVTTL);
895				break;
896
897			case IP_RECVIF:
898				OPTSET(INP_RECVIF);
899				break;
900
901			case IP_FAITH:
902				OPTSET(INP_FAITH);
903				break;
904
905			case IP_ONESBCAST:
906				OPTSET(INP_ONESBCAST);
907				break;
908			case IP_DONTFRAG:
909				OPTSET(INP_DONTFRAG);
910				break;
911			}
912			break;
913#undef OPTSET
914
915		/*
916		 * Multicast socket options are processed by the in_mcast
917		 * module.
918		 */
919		case IP_MULTICAST_IF:
920		case IP_MULTICAST_VIF:
921		case IP_MULTICAST_TTL:
922		case IP_MULTICAST_LOOP:
923		case IP_ADD_MEMBERSHIP:
924		case IP_DROP_MEMBERSHIP:
925		case IP_ADD_SOURCE_MEMBERSHIP:
926		case IP_DROP_SOURCE_MEMBERSHIP:
927		case IP_BLOCK_SOURCE:
928		case IP_UNBLOCK_SOURCE:
929		case IP_MSFILTER:
930		case MCAST_JOIN_GROUP:
931		case MCAST_LEAVE_GROUP:
932		case MCAST_JOIN_SOURCE_GROUP:
933		case MCAST_LEAVE_SOURCE_GROUP:
934		case MCAST_BLOCK_SOURCE:
935		case MCAST_UNBLOCK_SOURCE:
936			error = inp_setmoptions(inp, sopt);
937			break;
938
939		case IP_PORTRANGE:
940			error = sooptcopyin(sopt, &optval, sizeof optval,
941					    sizeof optval);
942			if (error)
943				break;
944
945			INP_LOCK(inp);
946			switch (optval) {
947			case IP_PORTRANGE_DEFAULT:
948				inp->inp_flags &= ~(INP_LOWPORT);
949				inp->inp_flags &= ~(INP_HIGHPORT);
950				break;
951
952			case IP_PORTRANGE_HIGH:
953				inp->inp_flags &= ~(INP_LOWPORT);
954				inp->inp_flags |= INP_HIGHPORT;
955				break;
956
957			case IP_PORTRANGE_LOW:
958				inp->inp_flags &= ~(INP_HIGHPORT);
959				inp->inp_flags |= INP_LOWPORT;
960				break;
961
962			default:
963				error = EINVAL;
964				break;
965			}
966			INP_UNLOCK(inp);
967			break;
968
969#ifdef FAST_IPSEC
970		case IP_IPSEC_POLICY:
971		{
972			caddr_t req;
973			size_t len = 0;
974			int priv;
975			struct mbuf *m;
976			int optname;
977
978			if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */
979				break;
980			if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */
981				break;
982			if (sopt->sopt_td != NULL) {
983				/*
984				 * XXXRW: Would be more desirable to do this
985				 * one layer down so that we only exercise
986				 * privilege if it is needed.
987				 */
988				error = priv_check(sopt->sopt_td,
989				    PRIV_NETINET_IPSEC);
990				if (error)
991					priv = 0;
992				else
993					priv = 1;
994			} else
995				priv = 1;
996			req = mtod(m, caddr_t);
997			len = m->m_len;
998			optname = sopt->sopt_name;
999			error = ipsec4_set_policy(inp, optname, req, len, priv);
1000			m_freem(m);
1001			break;
1002		}
1003#endif /* FAST_IPSEC */
1004
1005		default:
1006			error = ENOPROTOOPT;
1007			break;
1008		}
1009		break;
1010
1011	case SOPT_GET:
1012		switch (sopt->sopt_name) {
1013		case IP_OPTIONS:
1014		case IP_RETOPTS:
1015			if (inp->inp_options)
1016				error = sooptcopyout(sopt,
1017						     mtod(inp->inp_options,
1018							  char *),
1019						     inp->inp_options->m_len);
1020			else
1021				sopt->sopt_valsize = 0;
1022			break;
1023
1024		case IP_TOS:
1025		case IP_TTL:
1026		case IP_MINTTL:
1027		case IP_RECVOPTS:
1028		case IP_RECVRETOPTS:
1029		case IP_RECVDSTADDR:
1030		case IP_RECVTTL:
1031		case IP_RECVIF:
1032		case IP_PORTRANGE:
1033		case IP_FAITH:
1034		case IP_ONESBCAST:
1035		case IP_DONTFRAG:
1036			switch (sopt->sopt_name) {
1037
1038			case IP_TOS:
1039				optval = inp->inp_ip_tos;
1040				break;
1041
1042			case IP_TTL:
1043				optval = inp->inp_ip_ttl;
1044				break;
1045
1046			case IP_MINTTL:
1047				optval = inp->inp_ip_minttl;
1048				break;
1049
1050#define	OPTBIT(bit)	(inp->inp_flags & bit ? 1 : 0)
1051
1052			case IP_RECVOPTS:
1053				optval = OPTBIT(INP_RECVOPTS);
1054				break;
1055
1056			case IP_RECVRETOPTS:
1057				optval = OPTBIT(INP_RECVRETOPTS);
1058				break;
1059
1060			case IP_RECVDSTADDR:
1061				optval = OPTBIT(INP_RECVDSTADDR);
1062				break;
1063
1064			case IP_RECVTTL:
1065				optval = OPTBIT(INP_RECVTTL);
1066				break;
1067
1068			case IP_RECVIF:
1069				optval = OPTBIT(INP_RECVIF);
1070				break;
1071
1072			case IP_PORTRANGE:
1073				if (inp->inp_flags & INP_HIGHPORT)
1074					optval = IP_PORTRANGE_HIGH;
1075				else if (inp->inp_flags & INP_LOWPORT)
1076					optval = IP_PORTRANGE_LOW;
1077				else
1078					optval = 0;
1079				break;
1080
1081			case IP_FAITH:
1082				optval = OPTBIT(INP_FAITH);
1083				break;
1084
1085			case IP_ONESBCAST:
1086				optval = OPTBIT(INP_ONESBCAST);
1087				break;
1088			case IP_DONTFRAG:
1089				optval = OPTBIT(INP_DONTFRAG);
1090				break;
1091			}
1092			error = sooptcopyout(sopt, &optval, sizeof optval);
1093			break;
1094
1095		/*
1096		 * Multicast socket options are processed by the in_mcast
1097		 * module.
1098		 */
1099		case IP_MULTICAST_IF:
1100		case IP_MULTICAST_VIF:
1101		case IP_MULTICAST_TTL:
1102		case IP_MULTICAST_LOOP:
1103		case IP_MSFILTER:
1104			error = inp_getmoptions(inp, sopt);
1105			break;
1106
1107#ifdef FAST_IPSEC
1108		case IP_IPSEC_POLICY:
1109		{
1110			struct mbuf *m = NULL;
1111			caddr_t req = NULL;
1112			size_t len = 0;
1113
1114			if (m != 0) {
1115				req = mtod(m, caddr_t);
1116				len = m->m_len;
1117			}
1118			error = ipsec4_get_policy(sotoinpcb(so), req, len, &m);
1119			if (error == 0)
1120				error = soopt_mcopyout(sopt, m); /* XXX */
1121			if (error == 0)
1122				m_freem(m);
1123			break;
1124		}
1125#endif /* FAST_IPSEC */
1126
1127		default:
1128			error = ENOPROTOOPT;
1129			break;
1130		}
1131		break;
1132	}
1133	return (error);
1134}
1135
1136/*
1137 * Routine called from ip_output() to loop back a copy of an IP multicast
1138 * packet to the input queue of a specified interface.  Note that this
1139 * calls the output routine of the loopback "driver", but with an interface
1140 * pointer that might NOT be a loopback interface -- evil, but easier than
1141 * replicating that code here.
1142 */
1143static void
1144ip_mloopback(struct ifnet *ifp, struct mbuf *m, struct sockaddr_in *dst,
1145    int hlen)
1146{
1147	register struct ip *ip;
1148	struct mbuf *copym;
1149
1150	copym = m_copy(m, 0, M_COPYALL);
1151	if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < hlen))
1152		copym = m_pullup(copym, hlen);
1153	if (copym != NULL) {
1154		/* If needed, compute the checksum and mark it as valid. */
1155		if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
1156			in_delayed_cksum(copym);
1157			copym->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
1158			copym->m_pkthdr.csum_flags |=
1159			    CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
1160			copym->m_pkthdr.csum_data = 0xffff;
1161		}
1162		/*
1163		 * We don't bother to fragment if the IP length is greater
1164		 * than the interface's MTU.  Can this possibly matter?
1165		 */
1166		ip = mtod(copym, struct ip *);
1167		ip->ip_len = htons(ip->ip_len);
1168		ip->ip_off = htons(ip->ip_off);
1169		ip->ip_sum = 0;
1170		ip->ip_sum = in_cksum(copym, hlen);
1171		/*
1172		 * NB:
1173		 * It's not clear whether there are any lingering
1174		 * reentrancy problems in other areas which might
1175		 * be exposed by using ip_input directly (in
1176		 * particular, everything which modifies the packet
1177		 * in-place).  Yet another option is using the
1178		 * protosw directly to deliver the looped back
1179		 * packet.  For the moment, we'll err on the side
1180		 * of safety by using if_simloop().
1181		 */
1182#if 1 /* XXX */
1183		if (dst->sin_family != AF_INET) {
1184			printf("ip_mloopback: bad address family %d\n",
1185						dst->sin_family);
1186			dst->sin_family = AF_INET;
1187		}
1188#endif
1189
1190#ifdef notdef
1191		copym->m_pkthdr.rcvif = ifp;
1192		ip_input(copym);
1193#else
1194		if_simloop(ifp, copym, dst->sin_family, 0);
1195#endif
1196	}
1197}
1198