ip_output.c revision 162798
1/*-
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 4. Neither the name of the University nor the names of its contributors
14 *    may be used to endorse or promote products derived from this software
15 *    without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 *
29 *	@(#)ip_output.c	8.3 (Berkeley) 1/21/94
30 * $FreeBSD: head/sys/netinet/ip_output.c 162798 2006-09-29 16:44:45Z andre $
31 */
32
33#include "opt_ipfw.h"
34#include "opt_ipsec.h"
35#include "opt_mac.h"
36#include "opt_mbuf_stress_test.h"
37
38#include <sys/param.h>
39#include <sys/systm.h>
40#include <sys/kernel.h>
41#include <sys/mac.h>
42#include <sys/malloc.h>
43#include <sys/mbuf.h>
44#include <sys/protosw.h>
45#include <sys/socket.h>
46#include <sys/socketvar.h>
47#include <sys/sysctl.h>
48
49#include <net/if.h>
50#include <net/netisr.h>
51#include <net/pfil.h>
52#include <net/route.h>
53
54#include <netinet/in.h>
55#include <netinet/in_systm.h>
56#include <netinet/ip.h>
57#include <netinet/in_pcb.h>
58#include <netinet/in_var.h>
59#include <netinet/ip_var.h>
60#include <netinet/ip_options.h>
61
62#if defined(IPSEC) || defined(FAST_IPSEC)
63#include <netinet/ip_ipsec.h>
64#ifdef IPSEC
65#include <netinet6/ipsec.h>
66#endif
67#ifdef FAST_IPSEC
68#include <netipsec/ipsec.h>
69#endif
70#endif /*IPSEC*/
71
72#include <machine/in_cksum.h>
73
/*
 * malloc(9) type used in this file for multicast option allocations
 * (see the M_IPMOPTS allocations in ip_findmoptions()).
 */
74static MALLOC_DEFINE(M_IPMOPTS, "ip_moptions", "internet multicast options");
75
/*
 * Debug helper: print the prefix string x, the IPv4 address a (a
 * struct in_addr holding a network byte order value) in dotted-quad
 * form, and then the suffix string y.
 *
 * The address argument is parenthesized so that expressions such as
 * "*ptr" expand correctly.  The expansion deliberately keeps its
 * historical trailing semicolon so existing call sites are unaffected.
 */
#define print_ip(x, a, y)	 printf("%s %d.%d.%d.%d%s",\
				(x), (ntohl((a).s_addr)>>24)&0xFF,\
				  (ntohl((a).s_addr)>>16)&0xFF,\
				  (ntohl((a).s_addr)>>8)&0xFF,\
				  (ntohl((a).s_addr))&0xFF, (y));
81
/*
 * NOTE(review): presumably the global IP identification counter; it is
 * not referenced directly in the visible part of this file (ip_output()
 * calls ip_newid()) -- confirm against its users.
 */
82u_short ip_id;
83
/*
 * Stress-test knob: when non-zero, ip_output() splits any outgoing
 * packet larger than this many bytes with m_fragment() before handing
 * it to the interface.
 */
84#ifdef MBUF_STRESS_TEST
85int mbuf_frag_size = 0;
86SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW,
87	&mbuf_frag_size, 0, "Fragment outgoing mbufs to this size");
88#endif
89
/* Forward declarations of file-local helpers. */
90static struct ifnet *ip_multicast_if(struct in_addr *, int *);
91static void	ip_mloopback
92	(struct ifnet *, struct mbuf *, struct sockaddr_in *, int);
93static int	ip_getmoptions(struct inpcb *, struct sockopt *);
94static int	ip_setmoptions(struct inpcb *, struct sockopt *);
95
96
/* Protocol switch table, defined outside this file. */
97extern	struct protosw inetsw[];
98
99/*
100 * IP output.  The packet in mbuf chain m contains a skeletal IP
101 * header (with len, off, ttl, proto, tos, src, dst).
102 * The mbuf chain containing the packet will be freed.
103 * The mbuf opt, if present, will not be freed.
104 * In the IP forwarding case, the packet will arrive with options already
105 * inserted, so must have a NULL opt pointer.
 *
 * Parameters:
 *	m	packet to send; must carry a packet header (asserted).
 *	opt	optional mbuf of IP options, merged in by ip_insertoptions().
 *	ro	route cache to use and update; when NULL a zeroed on-stack
 *		route is used and its rtentry is released before returning.
 *	flags	IP_FORWARDING, IP_RAWOUTPUT, IP_ROUTETOIF, IP_SENDONES and
 *		IP_ALLOWBROADCAST are examined here.
 *	imo	multicast options (interface, ttl, vif, loop), or NULL.
 *	inp	owning inpcb, or NULL; asserted locked when non-NULL.
 *
 * Returns 0 on success or an errno value.  Values set directly here are
 * ENETUNREACH, EHOSTUNREACH, ENOBUFS, EADDRNOTAVAIL, EACCES and EMSGSIZE;
 * errors from pfil_run_hooks(), netisr_queue(), ip_fragment() and the
 * interface's if_output routine are passed through.
 *
 * ip_len and ip_off are used arithmetically in host byte order here and
 * are converted with htons() only just before transmission.
106 */
107int
108ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro,
109	int flags, struct ip_moptions *imo, struct inpcb *inp)
110{
111	struct ip *ip;
112	struct ifnet *ifp = NULL;	/* keep compiler happy */
113	struct mbuf *m0;
114	int hlen = sizeof (struct ip);
115	int mtu;
116	int len, error = 0;
117	struct sockaddr_in *dst = NULL;	/* keep compiler happy */
118	struct in_ifaddr *ia = NULL;
119	struct in_ifaddr *sia = NULL;
120	int isbroadcast, sw_csum;
121	struct route iproute;
122	struct in_addr odst;
123#ifdef IPFIREWALL_FORWARD
124	struct m_tag *fwd_tag = NULL;
125#endif
126	M_ASSERTPKTHDR(m);
127
	/*
	 * No route cache supplied by the caller: fall back to the local
	 * one.  The "done" label releases whatever it ends up holding.
	 */
128	if (ro == NULL) {
129		ro = &iproute;
130		bzero(ro, sizeof (*ro));
131	}
132
133	if (inp != NULL)
134		INP_LOCK_ASSERT(inp);
135
	/*
	 * Merge caller-supplied options into the packet.  A non-zero len
	 * reported by ip_insertoptions() becomes the new header length.
	 */
136	if (opt) {
137		len = 0;
138		m = ip_insertoptions(m, opt, &len);
139		if (len != 0)
140			hlen = len;
141	}
142	ip = mtod(m, struct ip *);
143
144	/*
145	 * Fill in IP header.  If we are not allowing fragmentation,
146	 * then the ip_id field is meaningless, but we don't set it
147	 * to zero.  Doing so causes various problems when devices along
148	 * the path (routers, load balancers, firewalls, etc.) illegally
149	 * disable DF on our packet.  Note that a 16-bit counter
150	 * will wrap around in less than 10 seconds at 100 Mbit/s on a
151	 * medium with MTU 1500.  See Steven M. Bellovin, "A Technique
152	 * for Counting NATted Hosts", Proc. IMW'02, available at
153	 * <http://www.cs.columbia.edu/~smb/papers/fnat.pdf>.
154	 */
	/*
	 * Version, header length and id are only filled in for packets that
	 * are neither forwarded nor raw; otherwise the header length is
	 * taken from the header already present in the packet.
	 */
155	if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
156		ip->ip_v = IPVERSION;
157		ip->ip_hl = hlen >> 2;
158		ip->ip_id = ip_newid();
159		ipstat.ips_localout++;
160	} else {
161		hlen = ip->ip_hl << 2;
162	}
163
164	dst = (struct sockaddr_in *)&ro->ro_dst;
	/*
	 * Re-entry point: the packet filter code further down jumps back
	 * here when the destination has been changed and the route lookup
	 * must be redone.
	 */
165again:
166	/*
167	 * If there is a cached route,
168	 * check that it is to the same destination
169	 * and is still up.  If not, free it and try again.
170	 * The address family should also be checked in case of sharing the
171	 * cache with IPv6.
172	 */
173	if (ro->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 ||
174			  dst->sin_family != AF_INET ||
175			  dst->sin_addr.s_addr != ip->ip_dst.s_addr)) {
176		RTFREE(ro->ro_rt);
177		ro->ro_rt = (struct rtentry *)NULL;
178	}
	/*
	 * Without a usable cached route, (re)initialize the destination
	 * sockaddr from the packet.  With IPFIREWALL_FORWARD this is skipped
	 * while a forward tag is pending, because the tag's address was
	 * already copied into *dst before jumping back to "again".
	 */
179#ifdef IPFIREWALL_FORWARD
180	if (ro->ro_rt == NULL && fwd_tag == NULL) {
181#else
182	if (ro->ro_rt == NULL) {
183#endif
184		bzero(dst, sizeof(*dst));
185		dst->sin_family = AF_INET;
186		dst->sin_len = sizeof(*dst);
187		dst->sin_addr = ip->ip_dst;
188	}
189	/*
190	 * If routing to interface only,
191	 * short circuit routing lookup.
192	 */
193	if (flags & IP_ROUTETOIF) {
194		if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL &&
195		    (ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == NULL) {
196			ipstat.ips_noroute++;
197			error = ENETUNREACH;
198			goto bad;
199		}
200		ifp = ia->ia_ifp;
201		ip->ip_ttl = 1;
202		isbroadcast = in_broadcast(dst->sin_addr, ifp);
203	} else if (flags & IP_SENDONES) {
		/*
		 * The interface is selected by its broadcast address, and
		 * the packet destination is then rewritten to
		 * INADDR_BROADCAST.
		 */
204		if ((ia = ifatoia(ifa_ifwithbroadaddr(sintosa(dst)))) == NULL) {
205			ipstat.ips_noroute++;
206			error = ENETUNREACH;
207			goto bad;
208		}
209		ifp = ia->ia_ifp;
210		ip->ip_dst.s_addr = INADDR_BROADCAST;
211		dst->sin_addr = ip->ip_dst;
212		ip->ip_ttl = 1;
213		isbroadcast = 1;
214	} else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) &&
215	    imo != NULL && imo->imo_multicast_ifp != NULL) {
216		/*
217		 * Bypass the normal routing lookup for multicast
218		 * packets if the interface is specified.
219		 */
220		ifp = imo->imo_multicast_ifp;
221		IFP_TO_IA(ifp, ia);
222		isbroadcast = 0;	/* fool gcc */
223	} else {
224		/*
225		 * We want to do any cloning requested by the link layer,
226		 * as this is probably required in all cases for correct
227		 * operation (as it is for ARP).
228		 */
229		if (ro->ro_rt == NULL)
230			rtalloc_ign(ro, 0);
231		if (ro->ro_rt == NULL) {
232			ipstat.ips_noroute++;
233			error = EHOSTUNREACH;
234			goto bad;
235		}
236		ia = ifatoia(ro->ro_rt->rt_ifa);
237		ifp = ro->ro_rt->rt_ifp;
238		ro->ro_rt->rt_rmx.rmx_pksent++;
		/* For gateway routes the link-level destination is the gateway. */
239		if (ro->ro_rt->rt_flags & RTF_GATEWAY)
240			dst = (struct sockaddr_in *)ro->ro_rt->rt_gateway;
241		if (ro->ro_rt->rt_flags & RTF_HOST)
242			isbroadcast = (ro->ro_rt->rt_flags & RTF_BROADCAST);
243		else
244			isbroadcast = in_broadcast(dst->sin_addr, ifp);
245	}
246	/*
247	 * Calculate MTU.  If we have a route that is up, use that,
248	 * otherwise use the interface's MTU.
249	 */
	/*
	 * NOTE(review): the mask test below is true when either RTF_UP or
	 * RTF_HOST is set, not only when the route is up -- confirm that
	 * this matches the intent stated above.
	 */
250	if (ro->ro_rt != NULL && (ro->ro_rt->rt_flags & (RTF_UP|RTF_HOST))) {
251		/*
252		 * This case can happen if the user changed the MTU
253		 * of an interface after enabling IP on it.  Because
254		 * most netifs don't keep track of routes pointing to
255		 * them, there is no way for one to update all its
256		 * routes when the MTU is changed.
257		 */
258		if (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)
259			ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu;
260		mtu = ro->ro_rt->rt_rmx.rmx_mtu;
261	} else {
262		mtu = ifp->if_mtu;
263	}
	/*
	 * Multicast destinations are handled entirely inside this block,
	 * which leaves via "sendit", "done" or "bad" and therefore skips
	 * the queue-space and broadcast checks that follow it.
	 */
264	if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
265		struct in_multi *inm;
266
267		m->m_flags |= M_MCAST;
268		/*
269		 * IP destination address is multicast.  Make sure "dst"
270		 * still points to the address in "ro".  (It may have been
271		 * changed to point to a gateway address, above.)
272		 */
273		dst = (struct sockaddr_in *)&ro->ro_dst;
274		/*
275		 * See if the caller provided any multicast options
276		 */
277		if (imo != NULL) {
278			ip->ip_ttl = imo->imo_multicast_ttl;
279			if (imo->imo_multicast_vif != -1)
280				ip->ip_src.s_addr =
281				    ip_mcast_src ?
282				    ip_mcast_src(imo->imo_multicast_vif) :
283				    INADDR_ANY;
284		} else
285			ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL;
286		/*
287		 * Confirm that the outgoing interface supports multicast.
288		 */
289		if ((imo == NULL) || (imo->imo_multicast_vif == -1)) {
290			if ((ifp->if_flags & IFF_MULTICAST) == 0) {
291				ipstat.ips_noroute++;
292				error = ENETUNREACH;
293				goto bad;
294			}
295		}
296		/*
297		 * If source address not specified yet, use address
298		 * of outgoing interface.
299		 */
300		if (ip->ip_src.s_addr == INADDR_ANY) {
301			/* Interface may have no addresses. */
302			if (ia != NULL)
303				ip->ip_src = IA_SIN(ia)->sin_addr;
304		}
305
		/*
		 * The multicast lock covers only the membership lookup; it
		 * is dropped on both branches before any further work.
		 */
306		IN_MULTI_LOCK();
307		IN_LOOKUP_MULTI(ip->ip_dst, ifp, inm);
308		if (inm != NULL &&
309		   (imo == NULL || imo->imo_multicast_loop)) {
310			IN_MULTI_UNLOCK();
311			/*
312			 * If we belong to the destination multicast group
313			 * on the outgoing interface, and the caller did not
314			 * forbid loopback, loop back a copy.
315			 */
316			ip_mloopback(ifp, m, dst, hlen);
317		}
318		else {
319			IN_MULTI_UNLOCK();
320			/*
321			 * If we are acting as a multicast router, perform
322			 * multicast forwarding as if the packet had just
323			 * arrived on the interface to which we are about
324			 * to send.  The multicast forwarding function
325			 * recursively calls this function, using the
326			 * IP_FORWARDING flag to prevent infinite recursion.
327			 *
328			 * Multicasts that are looped back by ip_mloopback(),
329			 * above, will be forwarded by the ip_input() routine,
330			 * if necessary.
331			 */
332			if (ip_mrouter && (flags & IP_FORWARDING) == 0) {
333				/*
334				 * If rsvp daemon is not running, do not
335				 * set ip_moptions. This ensures that the packet
336				 * is multicast and not just sent down one link
337				 * as prescribed by rsvpd.
338				 */
339				if (!rsvp_on)
340					imo = NULL;
				/*
				 * A non-zero return from ip_mforward() drops
				 * the packet; error is left unchanged, so
				 * the caller still sees success.
				 */
341				if (ip_mforward &&
342				    ip_mforward(ip, ifp, m, imo) != 0) {
343					m_freem(m);
344					goto done;
345				}
346			}
347		}
348
349		/*
350		 * Multicasts with a time-to-live of zero may be looped-
351		 * back, above, but must not be transmitted on a network.
352		 * Also, multicasts addressed to the loopback interface
353		 * are not sent -- the above call to ip_mloopback() will
354		 * loop back a copy if this host actually belongs to the
355		 * destination group on the loopback interface.
356		 */
357		if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) {
358			m_freem(m);
359			goto done;
360		}
361
362		goto sendit;
363	}
364
365	/*
366	 * If the source address is not specified yet, use the address
367	 * of the outgoing interface.
368	 */
369	if (ip->ip_src.s_addr == INADDR_ANY) {
370		/* Interface may have no addresses. */
371		if (ia != NULL) {
372			ip->ip_src = IA_SIN(ia)->sin_addr;
373		}
374	}
375
376	/*
377	 * Verify that we have any chance at all of being able to queue the
378	 * packet or packet fragments, unless ALTQ is enabled on the given
379	 * interface in which case packetdrop should be done by queueing.
380	 */
381#ifdef ALTQ
382	if ((!ALTQ_IS_ENABLED(&ifp->if_snd)) &&
383	    ((ifp->if_snd.ifq_len + ip->ip_len / mtu + 1) >=
384	    ifp->if_snd.ifq_maxlen))
385#else
386	if ((ifp->if_snd.ifq_len + ip->ip_len / mtu + 1) >=
387	    ifp->if_snd.ifq_maxlen)
388#endif /* ALTQ */
389	{
390		error = ENOBUFS;
391		ipstat.ips_odropped++;
		/*
		 * NOTE(review): the drop count is estimated with
		 * ifp->if_mtu, while the check above divides by mtu
		 * (which may be the smaller route MTU) -- confirm.
		 */
392		ifp->if_snd.ifq_drops += (ip->ip_len / ifp->if_mtu + 1);
393		goto bad;
394	}
395
396	/*
397	 * Look for broadcast address and
398	 * verify user is allowed to send
399	 * such a packet.
400	 */
401	if (isbroadcast) {
402		if ((ifp->if_flags & IFF_BROADCAST) == 0) {
403			error = EADDRNOTAVAIL;
404			goto bad;
405		}
406		if ((flags & IP_ALLOWBROADCAST) == 0) {
407			error = EACCES;
408			goto bad;
409		}
410		/* don't allow broadcast messages to be fragmented */
411		if (ip->ip_len > mtu) {
412			error = EMSGSIZE;
413			goto bad;
414		}
415		m->m_flags |= M_BCAST;
416	} else {
417		m->m_flags &= ~M_BCAST;
418	}
419
420sendit:
421#if defined(IPSEC) || defined(FAST_IPSEC)
	/*
	 * Return value of ip_ipsec_output() as handled below: 1 means
	 * free the packet and fail, -1 means finish without freeing it
	 * here, anything else means carry on with normal output.
	 */
422	switch(ip_ipsec_output(&m, inp, &flags, &error, &ro, &iproute, &dst, &ia, &ifp)) {
423	case 1:
424		goto bad;
425	case -1:
426		goto done;
427	case 0:
428	default:
429		break;	/* Continue with packet processing. */
430	}
431	/* Update variables that may have been changed by ip_ipsec_output(). */
432	ip = mtod(m, struct ip *);
433	hlen = ip->ip_hl << 2;
434#endif /* IPSEC */
435
436	/* Jump over all PFIL processing if hooks are not active. */
437	if (!PFIL_HOOKED(&inet_pfil_hook))
438		goto passout;
439
440	/* Run through list of hooks for output packets. */
441	odst.s_addr = ip->ip_dst.s_addr;
442	error = pfil_run_hooks(&inet_pfil_hook, &m, ifp, PFIL_OUT, inp);
	/*
	 * On a hook error or a NULL mbuf the packet is not freed here;
	 * presumably the hooks have already disposed of it -- confirm
	 * against pfil(9).
	 */
443	if (error != 0 || m == NULL)
444		goto done;
445
	/* The hooks may have replaced the mbuf, so reload the header pointer. */
446	ip = mtod(m, struct ip *);
447
448	/* See if destination IP address was changed by packet filter. */
449	if (odst.s_addr != ip->ip_dst.s_addr) {
450		m->m_flags |= M_SKIP_FIREWALL;
451		/* If destination is now ourself drop to ip_input(). */
452		if (in_localip(ip->ip_dst)) {
453			m->m_flags |= M_FASTFWD_OURS;
454			if (m->m_pkthdr.rcvif == NULL)
455				m->m_pkthdr.rcvif = loif;
			/*
			 * The packet never reaches hardware, so mark any
			 * deferred data checksum and the IP header checksum
			 * as already verified for the input path.
			 */
456			if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
457				m->m_pkthdr.csum_flags |=
458				    CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
459				m->m_pkthdr.csum_data = 0xffff;
460			}
461			m->m_pkthdr.csum_flags |=
462			    CSUM_IP_CHECKED | CSUM_IP_VALID;
463
464			error = netisr_queue(NETISR_IP, m);
465			goto done;
466		} else
467			goto again;	/* Redo the routing table lookup. */
468	}
469
470#ifdef IPFIREWALL_FORWARD
471	/* See if local, if yes, send it to netisr with M_FASTFWD_OURS. */
472	if (m->m_flags & M_FASTFWD_OURS) {
473		if (m->m_pkthdr.rcvif == NULL)
474			m->m_pkthdr.rcvif = loif;
475		if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
476			m->m_pkthdr.csum_flags |=
477			    CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
478			m->m_pkthdr.csum_data = 0xffff;
479		}
480		m->m_pkthdr.csum_flags |=
481			    CSUM_IP_CHECKED | CSUM_IP_VALID;
482
483		error = netisr_queue(NETISR_IP, m);
484		goto done;
485	}
486	/* Or forward to some other address? */
487	fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL);
488	if (fwd_tag) {
		/*
		 * The next-hop sockaddr_in is stored directly after the tag
		 * header.  Copy it into the route destination and redo the
		 * lookup; fwd_tag stays non-NULL so the code at "again"
		 * does not overwrite *dst.
		 */
489		dst = (struct sockaddr_in *)&ro->ro_dst;
490		bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in));
491		m->m_flags |= M_SKIP_FIREWALL;
492		m_tag_delete(m, fwd_tag);
493		goto again;
494	}
495#endif /* IPFIREWALL_FORWARD */
496
497passout:
498	/* 127/8 must not appear on wire - RFC1122. */
499	if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
500	    (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
501		if ((ifp->if_flags & IFF_LOOPBACK) == 0) {
502			ipstat.ips_badaddr++;
503			error = EADDRNOTAVAIL;
504			goto bad;
505		}
506	}
507
	/*
	 * sw_csum collects the checksum work the interface cannot do
	 * (requested flags minus if_hwassist).  A deferred data checksum
	 * is computed right away; the flags left on the mbuf are only
	 * those the hardware will handle.
	 */
508	m->m_pkthdr.csum_flags |= CSUM_IP;
509	sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_hwassist;
510	if (sw_csum & CSUM_DELAY_DATA) {
511		in_delayed_cksum(m);
512		sw_csum &= ~CSUM_DELAY_DATA;
513	}
514	m->m_pkthdr.csum_flags &= ifp->if_hwassist;
515
516	/*
517	 * If small enough for interface, or the interface will take
518	 * care of the fragmentation for us, we can just send directly.
519	 */
520	if (ip->ip_len <= mtu ||
521	    (m->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0 ||
522	    ((ip->ip_off & IP_DF) == 0 && (ifp->if_hwassist & CSUM_FRAGMENT))) {
		/* Convert length and offset to network byte order for the wire. */
523		ip->ip_len = htons(ip->ip_len);
524		ip->ip_off = htons(ip->ip_off);
525		ip->ip_sum = 0;
526		if (sw_csum & CSUM_DELAY_IP)
527			ip->ip_sum = in_cksum(m, hlen);
528
529		/*
530		 * Record statistics for this interface address.
531		 * With CSUM_TSO the byte/packet count will be slightly
532		 * incorrect because we count the IP+TCP headers only
533		 * once instead of for every generated packet.
534		 */
535		if (!(flags & IP_FORWARDING) && ia) {
			/*
			 * Prefer the interface address matching the source
			 * address; fall back to the address found earlier.
			 */
536			INADDR_TO_IFADDR(ip->ip_src, sia);
537			if (sia == NULL)
538				sia = ia;
539			if (m->m_pkthdr.csum_flags & CSUM_TSO)
540				sia->ia_ifa.if_opackets +=
541				    m->m_pkthdr.len / m->m_pkthdr.tso_segsz;
542			else
543				sia->ia_ifa.if_opackets++;
544			sia->ia_ifa.if_obytes += m->m_pkthdr.len;
545		}
546#ifdef IPSEC
547		/* clean ipsec history once it goes out of the node */
548		ipsec_delaux(m);
549#endif
550#ifdef MBUF_STRESS_TEST
551		if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size)
552			m = m_fragment(m, M_DONTWAIT, mbuf_frag_size);
553#endif
554		/*
555		 * Reset layer specific mbuf flags
556		 * to avoid confusing lower layers.
557		 */
558		m->m_flags &= ~(M_PROTOFLAGS);
559
560		error = (*ifp->if_output)(ifp, m,
561				(struct sockaddr *)dst, ro->ro_rt);
562		goto done;
563	}
564
565	/* Balk when DF bit is set or the interface didn't support TSO. */
566	if ((ip->ip_off & IP_DF) || (m->m_pkthdr.csum_flags & CSUM_TSO)) {
567		error = EMSGSIZE;
568		ipstat.ips_cantfrag++;
569		goto bad;
570	}
571
572	/*
573	 * Too large for interface; fragment if possible. If successful,
574	 * on return, m will point to a list of packets to be sent.
575	 */
576	error = ip_fragment(ip, &m, mtu, ifp->if_hwassist, sw_csum);
577	if (error)
578		goto bad;
	/*
	 * Send the fragments in order.  Once if_output fails, the remaining
	 * fragments are freed instead of sent, and the error of the failed
	 * transmission is what the caller sees.
	 */
579	for (; m; m = m0) {
580		m0 = m->m_nextpkt;
581		m->m_nextpkt = 0;
582#ifdef IPSEC
583		/* clean ipsec history once it goes out of the node */
584		ipsec_delaux(m);
585#endif
586		if (error == 0) {
587			/* Record statistics for this interface address. */
588			if (ia != NULL) {
589				INADDR_TO_IFADDR(ip->ip_src, sia);
590				if (sia == NULL)
591					sia = ia;
592				sia->ia_ifa.if_opackets++;
593				sia->ia_ifa.if_obytes += m->m_pkthdr.len;
594			}
595			/*
596			 * Reset layer specific mbuf flags
597			 * to avoid confusing lower layers.
598			 */
599			m->m_flags &= ~(M_PROTOFLAGS);
600
601			error = (*ifp->if_output)(ifp, m,
602			    (struct sockaddr *)dst, ro->ro_rt);
603		} else
604			m_freem(m);
605	}
606
607	if (error == 0)
608		ipstat.ips_fragmented++;
609
	/*
	 * Common exit: release the route only when it lives in the local
	 * iproute; a caller-supplied cache keeps its reference.
	 */
610done:
611	if (ro == &iproute && ro->ro_rt) {
612		RTFREE(ro->ro_rt);
613	}
614	return (error);
	/* Error exit: free the packet, then share the cleanup at "done". */
615bad:
616	m_freem(m);
617	goto done;
618}
619
620/*
621 * Create a chain of fragments which fit the given mtu. m_frag points to the
622 * mbuf to be fragmented; on return it points to the chain with the fragments.
623 * Return 0 if no error. If error, m_frag may contain a partially built
624 * chain of fragments that should be freed by the caller.
625 *
626 * if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist)
627 * sw_csum contains the delayed checksums flags (e.g., CSUM_DELAY_IP).
 *
 * ip is the IP header of the packet; its ip_len and ip_off are read in
 * host byte order and rewritten in network byte order for the first
 * fragment.  The original mbuf is trimmed and reused as the first
 * fragment, with the others linked through m_nextpkt.
 *
 * Errors returned here: EMSGSIZE (DF set, or fewer than 8 payload bytes
 * fit) and ENOBUFS (mbuf allocation or copy failed).
628 */
629int
630ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu,
631	    u_long if_hwassist_flags, int sw_csum)
632{
633	int error = 0;
634	int hlen = ip->ip_hl << 2;
	/* Rounded down to a multiple of 8: offsets are stored as off >> 3. */
635	int len = (mtu - hlen) & ~7;	/* size of payload in each fragment */
636	int off;
637	struct mbuf *m0 = *m_frag;	/* the original packet		*/
638	int firstlen;
639	struct mbuf **mnext;
640	int nfrags;
641
642	if (ip->ip_off & IP_DF) {	/* Fragmentation not allowed */
643		ipstat.ips_cantfrag++;
644		return EMSGSIZE;
645	}
646
647	/*
648	 * Must be able to put at least 8 bytes per fragment.
649	 */
	/* NOTE(review): unlike the DF case, ips_cantfrag is not bumped here. */
650	if (len < 8)
651		return EMSGSIZE;
652
653	/*
654	 * If the interface will not calculate checksums on
655	 * fragmented packets, then do it here.
656	 */
657	if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA &&
658	    (if_hwassist_flags & CSUM_IP_FRAGS) == 0) {
659		in_delayed_cksum(m0);
660		m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
661	}
662
663	if (len > PAGE_SIZE) {
664		/*
665		 * Fragment large datagrams such that each segment
666		 * contains a multiple of PAGE_SIZE amount of data,
667		 * plus headers. This enables a receiver to perform
668		 * page-flipping zero-copy optimizations.
669		 *
670		 * XXX When does this help given that sender and receiver
671		 * could have different page sizes, and also mtu could
672		 * be less than the receiver's page size ?
673		 */
674		int newlen;
675		struct mbuf *m;
676
		/* Add up the leading mbufs of the chain that fit within the mtu. */
677		for (m = m0, off = 0; m && (off+m->m_len) <= mtu; m = m->m_next)
678			off += m->m_len;
679
680		/*
681		 * firstlen (off - hlen) must be aligned on an
682		 * 8-byte boundary
683		 */
684		if (off < hlen)
685			goto smart_frag_failure;
686		off = ((off - hlen) & ~7) + hlen;
		/*
		 * Later fragments carry the mtu rounded down with
		 * ~PAGE_MASK (presumably to a PAGE_SIZE multiple --
		 * confirm PAGE_MASK's definition).
		 */
687		newlen = (~PAGE_MASK) & mtu;
688		if ((newlen + sizeof (struct ip)) > mtu) {
689			/* we failed, go back the default */
690smart_frag_failure:
691			newlen = len;
692			off = hlen + len;
693		}
694		len = newlen;
695
696	} else {
697		off = hlen + len;
698	}
699
	/* off is now the byte offset at which the second fragment starts. */
700	firstlen = off - hlen;
701	mnext = &m0->m_nextpkt;		/* pointer to next packet */
702
703	/*
704	 * Loop through length of segment after first fragment,
705	 * make new header and copy data of each part and link onto chain.
706	 * Here, m0 is the original packet, m is the fragment being created.
707	 * The fragments are linked off the m_nextpkt of the original
708	 * packet, which after processing serves as the first fragment.
709	 */
	/* nfrags starts at 1 to count the original packet as a fragment. */
710	for (nfrags = 1; off < ip->ip_len; off += len, nfrags++) {
711		struct ip *mhip;	/* ip header on the fragment */
712		struct mbuf *m;
713		int mhlen = sizeof (struct ip);
714
715		MGETHDR(m, M_DONTWAIT, MT_DATA);
716		if (m == NULL) {
717			error = ENOBUFS;
718			ipstat.ips_odropped++;
719			goto done;
720		}
721		m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG;
722		/*
723		 * In the first mbuf, leave room for the link header, then
724		 * copy the original IP header including options. The payload
725		 * goes into an additional mbuf chain returned by m_copy().
726		 */
727		m->m_data += max_linkhdr;
728		mhip = mtod(m, struct ip *);
729		*mhip = *ip;
		/*
		 * When the original header has options, ip_optcopy() decides
		 * what goes into this fragment and the header length is
		 * rebuilt from its return value.
		 */
730		if (hlen > sizeof (struct ip)) {
731			mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip);
732			mhip->ip_v = IPVERSION;
733			mhip->ip_hl = mhlen >> 2;
734		}
735		m->m_len = mhlen;
736		/* XXX do we need to add ip->ip_off below ? */
737		mhip->ip_off = ((off - hlen) >> 3) + ip->ip_off;
738		if (off + len >= ip->ip_len) {	/* last fragment */
739			len = ip->ip_len - off;
740			m->m_flags |= M_LASTFRAG;
741		} else
742			mhip->ip_off |= IP_MF;
743		mhip->ip_len = htons((u_short)(len + mhlen));
744		m->m_next = m_copy(m0, off, len);
745		if (m->m_next == NULL) {	/* copy failed */
746			m_free(m);
747			error = ENOBUFS;	/* ??? */
748			ipstat.ips_odropped++;
749			goto done;
750		}
751		m->m_pkthdr.len = mhlen + len;
752		m->m_pkthdr.rcvif = NULL;
753#ifdef MAC
754		mac_create_fragment(m0, m);
755#endif
756		m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags;
757		mhip->ip_off = htons(mhip->ip_off);
758		mhip->ip_sum = 0;
759		if (sw_csum & CSUM_DELAY_IP)
760			mhip->ip_sum = in_cksum(m, mhlen);
		/* Append the finished fragment to the m_nextpkt chain. */
761		*mnext = m;
762		mnext = &m->m_nextpkt;
763	}
764	ipstat.ips_ofragments += nfrags;
765
766	/* set first marker for fragment chain */
767	m0->m_flags |= M_FIRSTFRAG | M_FRAG;
	/* csum_data of the first fragment is reused for the fragment count. */
768	m0->m_pkthdr.csum_data = nfrags;
769
770	/*
771	 * Update first fragment by trimming what's been copied out
772	 * and updating header.
773	 */
	/*
	 * The count passed to m_adj() is first-fragment size minus total
	 * length; presumably negative here, which asks m_adj() to trim
	 * from the tail -- confirm against mbuf(9).
	 */
774	m_adj(m0, hlen + firstlen - ip->ip_len);
775	m0->m_pkthdr.len = hlen + firstlen;
776	ip->ip_len = htons((u_short)m0->m_pkthdr.len);
777	ip->ip_off |= IP_MF;
778	ip->ip_off = htons(ip->ip_off);
779	ip->ip_sum = 0;
780	if (sw_csum & CSUM_DELAY_IP)
781		ip->ip_sum = in_cksum(m0, hlen);
782
	/*
	 * Reached on success and on allocation failure alike: the caller
	 * always gets the head of whatever chain has been built.
	 */
783done:
784	*m_frag = m0;
785	return error;
786}
787
788void
789in_delayed_cksum(struct mbuf *m)
790{
791	struct ip *ip;
792	u_short csum, offset;
793
794	ip = mtod(m, struct ip *);
795	offset = ip->ip_hl << 2 ;
796	csum = in_cksum_skip(m, ip->ip_len, offset);
797	if (m->m_pkthdr.csum_flags & CSUM_UDP && csum == 0)
798		csum = 0xffff;
799	offset += m->m_pkthdr.csum_data;	/* checksum offset */
800
801	if (offset + sizeof(u_short) > m->m_len) {
802		printf("delayed m_pullup, m->len: %d  off: %d  p: %d\n",
803		    m->m_len, offset, ip->ip_p);
804		/*
805		 * XXX
806		 * this shouldn't happen, but if it does, the
807		 * correct behavior may be to insert the checksum
808		 * in the appropriate next mbuf in the chain.
809		 */
810		return;
811	}
812	*(u_short *)(m->m_data + offset) = csum;
813}
814
815/*
816 * IP socket option processing.
 *
 * Handles get (SOPT_GET) and set (SOPT_SET) requests for IPPROTO_IP
 * level options on socket so; the request itself is described by sopt.
 *
 * Returns 0 on success or an errno value: EINVAL for a level other than
 * IPPROTO_IP or an out-of-range value, ENOPROTOOPT for an option name
 * not handled here, EMSGSIZE/ENOBUFS while staging IP_OPTIONS, or
 * whatever the copy-in/copy-out helpers and the option-specific routines
 * return.  A sopt_dir other than SOPT_SET/SOPT_GET falls through the
 * outer switch and returns 0.
817 */
818int
819ip_ctloutput(so, sopt)
820	struct socket *so;
821	struct sockopt *sopt;
822{
823	struct	inpcb *inp = sotoinpcb(so);
824	int	error, optval;
825
826	error = optval = 0;
827	if (sopt->sopt_level != IPPROTO_IP) {
828		return (EINVAL);
829	}
830
831	switch (sopt->sopt_dir) {
832	case SOPT_SET:
833		switch (sopt->sopt_name) {
834		case IP_OPTIONS:
835#ifdef notyet
836		case IP_RETOPTS:
837#endif
838		{
			/*
			 * Stage the user-supplied options in a single mbuf
			 * (hence the MLEN limit) and hand it to ip_pcbopts()
			 * under the inpcb lock.  The mbuf is not freed here
			 * after that call; presumably ip_pcbopts() takes
			 * ownership -- confirm.
			 */
839			struct mbuf *m;
840			if (sopt->sopt_valsize > MLEN) {
841				error = EMSGSIZE;
842				break;
843			}
844			MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
845			if (m == NULL) {
846				error = ENOBUFS;
847				break;
848			}
849			m->m_len = sopt->sopt_valsize;
850			error = sooptcopyin(sopt, mtod(m, char *), m->m_len,
851					    m->m_len);
852			if (error) {
853				m_free(m);
854				break;
855			}
856			INP_LOCK(inp);
857			error = ip_pcbopts(inp, sopt->sopt_name, m);
858			INP_UNLOCK(inp);
859			return (error);
860		}
861
		/* Options whose value is a single int. */
862		case IP_TOS:
863		case IP_TTL:
864		case IP_MINTTL:
865		case IP_RECVOPTS:
866		case IP_RECVRETOPTS:
867		case IP_RECVDSTADDR:
868		case IP_RECVTTL:
869		case IP_RECVIF:
870		case IP_FAITH:
871		case IP_ONESBCAST:
872		case IP_DONTFRAG:
873			error = sooptcopyin(sopt, &optval, sizeof optval,
874					    sizeof optval);
875			if (error)
876				break;
877
			/*
			 * NOTE(review): IP_TOS, IP_TTL and IP_MINTTL are
			 * stored without taking INP_LOCK, unlike the flag
			 * options that go through OPTSET -- confirm this is
			 * intended.
			 */
878			switch (sopt->sopt_name) {
879			case IP_TOS:
880				inp->inp_ip_tos = optval;
881				break;
882
883			case IP_TTL:
884				inp->inp_ip_ttl = optval;
885				break;
886
887			case IP_MINTTL:
888				if (optval > 0 && optval <= MAXTTL)
889					inp->inp_ip_minttl = optval;
890				else
891					error = EINVAL;
892				break;
893
/*
 * Set or clear "bit" in inp->inp_flags according to optval, holding the
 * inpcb lock for the update.
 */
894#define	OPTSET(bit) do {						\
895	INP_LOCK(inp);							\
896	if (optval)							\
897		inp->inp_flags |= bit;					\
898	else								\
899		inp->inp_flags &= ~bit;					\
900	INP_UNLOCK(inp);						\
901} while (0)
902
903			case IP_RECVOPTS:
904				OPTSET(INP_RECVOPTS);
905				break;
906
907			case IP_RECVRETOPTS:
908				OPTSET(INP_RECVRETOPTS);
909				break;
910
911			case IP_RECVDSTADDR:
912				OPTSET(INP_RECVDSTADDR);
913				break;
914
915			case IP_RECVTTL:
916				OPTSET(INP_RECVTTL);
917				break;
918
919			case IP_RECVIF:
920				OPTSET(INP_RECVIF);
921				break;
922
923			case IP_FAITH:
924				OPTSET(INP_FAITH);
925				break;
926
927			case IP_ONESBCAST:
928				OPTSET(INP_ONESBCAST);
929				break;
930			case IP_DONTFRAG:
931				OPTSET(INP_DONTFRAG);
932				break;
933			}
934			break;
935#undef OPTSET
936
		/* Multicast options are delegated to ip_setmoptions(). */
937		case IP_MULTICAST_IF:
938		case IP_MULTICAST_VIF:
939		case IP_MULTICAST_TTL:
940		case IP_MULTICAST_LOOP:
941		case IP_ADD_MEMBERSHIP:
942		case IP_DROP_MEMBERSHIP:
943			error = ip_setmoptions(inp, sopt);
944			break;
945
946		case IP_PORTRANGE:
947			error = sooptcopyin(sopt, &optval, sizeof optval,
948					    sizeof optval);
949			if (error)
950				break;
951
			/*
			 * INP_LOWPORT and INP_HIGHPORT are mutually
			 * exclusive: each case clears the flag it does not
			 * select before setting its own.
			 */
952			INP_LOCK(inp);
953			switch (optval) {
954			case IP_PORTRANGE_DEFAULT:
955				inp->inp_flags &= ~(INP_LOWPORT);
956				inp->inp_flags &= ~(INP_HIGHPORT);
957				break;
958
959			case IP_PORTRANGE_HIGH:
960				inp->inp_flags &= ~(INP_LOWPORT);
961				inp->inp_flags |= INP_HIGHPORT;
962				break;
963
964			case IP_PORTRANGE_LOW:
965				inp->inp_flags &= ~(INP_HIGHPORT);
966				inp->inp_flags |= INP_LOWPORT;
967				break;
968
969			default:
970				error = EINVAL;
971				break;
972			}
973			INP_UNLOCK(inp);
974			break;
975
976#if defined(IPSEC) || defined(FAST_IPSEC)
977		case IP_IPSEC_POLICY:
978		{
979			caddr_t req;
980			size_t len = 0;
981			int priv;
982			struct mbuf *m;
983			int optname;
984
			/*
			 * On failure of either helper m is not freed here;
			 * presumably the helpers clean up after themselves
			 * -- confirm.
			 */
985			if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */
986				break;
987			if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */
988				break;
			/*
			 * priv is 0 only when a requesting thread exists and
			 * suser() returns non-zero for it; requests without
			 * a thread are treated as privileged.
			 */
989			priv = (sopt->sopt_td != NULL &&
990				suser(sopt->sopt_td) != 0) ? 0 : 1;
991			req = mtod(m, caddr_t);
992			len = m->m_len;
993			optname = sopt->sopt_name;
994			error = ipsec4_set_policy(inp, optname, req, len, priv);
995			m_freem(m);
996			break;
997		}
998#endif /*IPSEC*/
999
1000		default:
1001			error = ENOPROTOOPT;
1002			break;
1003		}
1004		break;
1005
1006	case SOPT_GET:
1007		switch (sopt->sopt_name) {
1008		case IP_OPTIONS:
1009		case IP_RETOPTS:
			/*
			 * Copy out the stored options, or report a zero
			 * length when none are set.
			 */
1010			if (inp->inp_options)
1011				error = sooptcopyout(sopt,
1012						     mtod(inp->inp_options,
1013							  char *),
1014						     inp->inp_options->m_len);
1015			else
1016				sopt->sopt_valsize = 0;
1017			break;
1018
		/* Options reported to the caller as a single int. */
1019		case IP_TOS:
1020		case IP_TTL:
1021		case IP_MINTTL:
1022		case IP_RECVOPTS:
1023		case IP_RECVRETOPTS:
1024		case IP_RECVDSTADDR:
1025		case IP_RECVTTL:
1026		case IP_RECVIF:
1027		case IP_PORTRANGE:
1028		case IP_FAITH:
1029		case IP_ONESBCAST:
1030		case IP_DONTFRAG:
1031			switch (sopt->sopt_name) {
1032
1033			case IP_TOS:
1034				optval = inp->inp_ip_tos;
1035				break;
1036
1037			case IP_TTL:
1038				optval = inp->inp_ip_ttl;
1039				break;
1040
1041			case IP_MINTTL:
1042				optval = inp->inp_ip_minttl;
1043				break;
1044
/* Report a flag in inp->inp_flags as 0 or 1. */
1045#define	OPTBIT(bit)	(inp->inp_flags & bit ? 1 : 0)
1046
1047			case IP_RECVOPTS:
1048				optval = OPTBIT(INP_RECVOPTS);
1049				break;
1050
1051			case IP_RECVRETOPTS:
1052				optval = OPTBIT(INP_RECVRETOPTS);
1053				break;
1054
1055			case IP_RECVDSTADDR:
1056				optval = OPTBIT(INP_RECVDSTADDR);
1057				break;
1058
1059			case IP_RECVTTL:
1060				optval = OPTBIT(INP_RECVTTL);
1061				break;
1062
1063			case IP_RECVIF:
1064				optval = OPTBIT(INP_RECVIF);
1065				break;
1066
1067			case IP_PORTRANGE:
				/* Neither flag set is reported as the literal 0. */
1068				if (inp->inp_flags & INP_HIGHPORT)
1069					optval = IP_PORTRANGE_HIGH;
1070				else if (inp->inp_flags & INP_LOWPORT)
1071					optval = IP_PORTRANGE_LOW;
1072				else
1073					optval = 0;
1074				break;
1075
1076			case IP_FAITH:
1077				optval = OPTBIT(INP_FAITH);
1078				break;
1079
1080			case IP_ONESBCAST:
1081				optval = OPTBIT(INP_ONESBCAST);
1082				break;
1083			case IP_DONTFRAG:
1084				optval = OPTBIT(INP_DONTFRAG);
1085				break;
1086			}
1087			error = sooptcopyout(sopt, &optval, sizeof optval);
1088			break;
1089
		/* Multicast options are delegated to ip_getmoptions(). */
1090		case IP_MULTICAST_IF:
1091		case IP_MULTICAST_VIF:
1092		case IP_MULTICAST_TTL:
1093		case IP_MULTICAST_LOOP:
1094		case IP_ADD_MEMBERSHIP:
1095		case IP_DROP_MEMBERSHIP:
1096			error = ip_getmoptions(inp, sopt);
1097			break;
1098
1099#if defined(IPSEC) || defined(FAST_IPSEC)
1100		case IP_IPSEC_POLICY:
1101		{
1102			struct mbuf *m = NULL;
1103			caddr_t req = NULL;
1104			size_t len = 0;
1105
			/*
			 * NOTE(review): m was just initialized to NULL, so
			 * this branch is never taken and ipsec4_get_policy()
			 * is always called with req == NULL and len == 0.
			 */
1106			if (m != 0) {
1107				req = mtod(m, caddr_t);
1108				len = m->m_len;
1109			}
1110			error = ipsec4_get_policy(sotoinpcb(so), req, len, &m);
1111			if (error == 0)
1112				error = soopt_mcopyout(sopt, m); /* XXX */
			/*
			 * m is freed here only on success; presumably
			 * soopt_mcopyout() frees it when it fails -- confirm.
			 */
1113			if (error == 0)
1114				m_freem(m);
1115			break;
1116		}
1117#endif /*IPSEC*/
1118
1119		default:
1120			error = ENOPROTOOPT;
1121			break;
1122		}
1123		break;
1124	}
1125	return (error);
1126}
1127
1128/*
1129 * XXX
1130 * The whole multicast option thing needs to be re-thought.
1131 * Several of these options are equally applicable to non-multicast
1132 * transmission, and one (IP_MULTICAST_TTL) totally duplicates a
1133 * standard option (IP_TTL).
1134 */
1135
1136/*
1137 * following RFC1724 section 3.3, 0.0.0.0/8 is interpreted as interface index.
1138 */
1139static struct ifnet *
1140ip_multicast_if(a, ifindexp)
1141	struct in_addr *a;
1142	int *ifindexp;
1143{
1144	int ifindex;
1145	struct ifnet *ifp;
1146
1147	if (ifindexp)
1148		*ifindexp = 0;
1149	if (ntohl(a->s_addr) >> 24 == 0) {
1150		ifindex = ntohl(a->s_addr) & 0xffffff;
1151		if (ifindex < 0 || if_index < ifindex)
1152			return NULL;
1153		ifp = ifnet_byindex(ifindex);
1154		if (ifindexp)
1155			*ifindexp = ifindex;
1156	} else {
1157		INADDR_TO_IFP(*a, ifp);
1158	}
1159	return ifp;
1160}
1161
1162/*
1163 * Given an inpcb, return its multicast options structure pointer.  Accepts
1164 * an unlocked inpcb pointer, but will return it locked.  May sleep.
1165 */
1166static struct ip_moptions *
1167ip_findmoptions(struct inpcb *inp)
1168{
1169	struct ip_moptions *imo;
1170	struct in_multi **immp;
1171
1172	INP_LOCK(inp);
1173	if (inp->inp_moptions != NULL)
1174		return (inp->inp_moptions);
1175
1176	INP_UNLOCK(inp);
1177
1178	imo = (struct ip_moptions*)malloc(sizeof(*imo), M_IPMOPTS, M_WAITOK);
1179	immp = (struct in_multi **)malloc((sizeof(*immp) * IP_MIN_MEMBERSHIPS),
1180					  M_IPMOPTS, M_WAITOK);
1181
1182	imo->imo_multicast_ifp = NULL;
1183	imo->imo_multicast_addr.s_addr = INADDR_ANY;
1184	imo->imo_multicast_vif = -1;
1185	imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
1186	imo->imo_multicast_loop = IP_DEFAULT_MULTICAST_LOOP;
1187	imo->imo_num_memberships = 0;
1188	imo->imo_max_memberships = IP_MIN_MEMBERSHIPS;
1189	imo->imo_membership = immp;
1190
1191	INP_LOCK(inp);
1192	if (inp->inp_moptions != NULL) {
1193		free(immp, M_IPMOPTS);
1194		free(imo, M_IPMOPTS);
1195		return (inp->inp_moptions);
1196	}
1197	inp->inp_moptions = imo;
1198	return (imo);
1199}
1200
1201/*
1202 * Set the IP multicast options in response to user setsockopt().
1203 */
1204static int
1205ip_setmoptions(struct inpcb *inp, struct sockopt *sopt)
1206{
1207	int error = 0;
1208	int i;
1209	struct in_addr addr;
1210	struct ip_mreq mreq;
1211	struct ifnet *ifp;
1212	struct ip_moptions *imo;
1213	struct route ro;
1214	struct sockaddr_in *dst;
1215	int ifindex;
1216	int s;
1217
1218	switch (sopt->sopt_name) {
1219	/* store an index number for the vif you wanna use in the send */
1220	case IP_MULTICAST_VIF:
1221		if (legal_vif_num == 0) {
1222			error = EOPNOTSUPP;
1223			break;
1224		}
1225		error = sooptcopyin(sopt, &i, sizeof i, sizeof i);
1226		if (error)
1227			break;
1228		if (!legal_vif_num(i) && (i != -1)) {
1229			error = EINVAL;
1230			break;
1231		}
1232		imo = ip_findmoptions(inp);
1233		imo->imo_multicast_vif = i;
1234		INP_UNLOCK(inp);
1235		break;
1236
1237	case IP_MULTICAST_IF:
1238		/*
1239		 * Select the interface for outgoing multicast packets.
1240		 */
1241		error = sooptcopyin(sopt, &addr, sizeof addr, sizeof addr);
1242		if (error)
1243			break;
1244		/*
1245		 * INADDR_ANY is used to remove a previous selection.
1246		 * When no interface is selected, a default one is
1247		 * chosen every time a multicast packet is sent.
1248		 */
1249		imo = ip_findmoptions(inp);
1250		if (addr.s_addr == INADDR_ANY) {
1251			imo->imo_multicast_ifp = NULL;
1252			INP_UNLOCK(inp);
1253			break;
1254		}
1255		/*
1256		 * The selected interface is identified by its local
1257		 * IP address.  Find the interface and confirm that
1258		 * it supports multicasting.
1259		 */
1260		s = splimp();
1261		ifp = ip_multicast_if(&addr, &ifindex);
1262		if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) {
1263			INP_UNLOCK(inp);
1264			splx(s);
1265			error = EADDRNOTAVAIL;
1266			break;
1267		}
1268		imo->imo_multicast_ifp = ifp;
1269		if (ifindex)
1270			imo->imo_multicast_addr = addr;
1271		else
1272			imo->imo_multicast_addr.s_addr = INADDR_ANY;
1273		INP_UNLOCK(inp);
1274		splx(s);
1275		break;
1276
1277	case IP_MULTICAST_TTL:
1278		/*
1279		 * Set the IP time-to-live for outgoing multicast packets.
1280		 * The original multicast API required a char argument,
1281		 * which is inconsistent with the rest of the socket API.
1282		 * We allow either a char or an int.
1283		 */
1284		if (sopt->sopt_valsize == 1) {
1285			u_char ttl;
1286			error = sooptcopyin(sopt, &ttl, 1, 1);
1287			if (error)
1288				break;
1289			imo = ip_findmoptions(inp);
1290			imo->imo_multicast_ttl = ttl;
1291			INP_UNLOCK(inp);
1292		} else {
1293			u_int ttl;
1294			error = sooptcopyin(sopt, &ttl, sizeof ttl,
1295					    sizeof ttl);
1296			if (error)
1297				break;
1298			if (ttl > 255)
1299				error = EINVAL;
1300			else {
1301				imo = ip_findmoptions(inp);
1302				imo->imo_multicast_ttl = ttl;
1303				INP_UNLOCK(inp);
1304			}
1305		}
1306		break;
1307
1308	case IP_MULTICAST_LOOP:
1309		/*
1310		 * Set the loopback flag for outgoing multicast packets.
1311		 * Must be zero or one.  The original multicast API required a
1312		 * char argument, which is inconsistent with the rest
1313		 * of the socket API.  We allow either a char or an int.
1314		 */
1315		if (sopt->sopt_valsize == 1) {
1316			u_char loop;
1317			error = sooptcopyin(sopt, &loop, 1, 1);
1318			if (error)
1319				break;
1320			imo = ip_findmoptions(inp);
1321			imo->imo_multicast_loop = !!loop;
1322			INP_UNLOCK(inp);
1323		} else {
1324			u_int loop;
1325			error = sooptcopyin(sopt, &loop, sizeof loop,
1326					    sizeof loop);
1327			if (error)
1328				break;
1329			imo = ip_findmoptions(inp);
1330			imo->imo_multicast_loop = !!loop;
1331			INP_UNLOCK(inp);
1332		}
1333		break;
1334
1335	case IP_ADD_MEMBERSHIP:
1336		/*
1337		 * Add a multicast group membership.
1338		 * Group must be a valid IP multicast address.
1339		 */
1340		error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq);
1341		if (error)
1342			break;
1343
1344		if (!IN_MULTICAST(ntohl(mreq.imr_multiaddr.s_addr))) {
1345			error = EINVAL;
1346			break;
1347		}
1348		s = splimp();
1349		/*
1350		 * If no interface address was provided, use the interface of
1351		 * the route to the given multicast address.
1352		 */
1353		if (mreq.imr_interface.s_addr == INADDR_ANY) {
1354			bzero((caddr_t)&ro, sizeof(ro));
1355			dst = (struct sockaddr_in *)&ro.ro_dst;
1356			dst->sin_len = sizeof(*dst);
1357			dst->sin_family = AF_INET;
1358			dst->sin_addr = mreq.imr_multiaddr;
1359			rtalloc_ign(&ro, RTF_CLONING);
1360			if (ro.ro_rt == NULL) {
1361				error = EADDRNOTAVAIL;
1362				splx(s);
1363				break;
1364			}
1365			ifp = ro.ro_rt->rt_ifp;
1366			RTFREE(ro.ro_rt);
1367		}
1368		else {
1369			ifp = ip_multicast_if(&mreq.imr_interface, NULL);
1370		}
1371
1372		/*
1373		 * See if we found an interface, and confirm that it
1374		 * supports multicast.
1375		 */
1376		if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) {
1377			error = EADDRNOTAVAIL;
1378			splx(s);
1379			break;
1380		}
1381		/*
1382		 * See if the membership already exists or if all the
1383		 * membership slots are full.
1384		 */
1385		imo = ip_findmoptions(inp);
1386		for (i = 0; i < imo->imo_num_memberships; ++i) {
1387			if (imo->imo_membership[i]->inm_ifp == ifp &&
1388			    imo->imo_membership[i]->inm_addr.s_addr
1389						== mreq.imr_multiaddr.s_addr)
1390				break;
1391		}
1392		if (i < imo->imo_num_memberships) {
1393			INP_UNLOCK(inp);
1394			error = EADDRINUSE;
1395			splx(s);
1396			break;
1397		}
1398		if (imo->imo_num_memberships == imo->imo_max_memberships) {
1399		    struct in_multi **nmships, **omships;
1400		    size_t newmax;
1401		    /*
1402		     * Resize the vector to next power-of-two minus 1. If the
1403		     * size would exceed the maximum then we know we've really
1404		     * run out of entries. Otherwise, we realloc() the vector
1405		     * with the INP lock held to avoid introducing a race.
1406		     */
1407		    nmships = NULL;
1408		    omships = imo->imo_membership;
1409		    newmax = ((imo->imo_max_memberships + 1) * 2) - 1;
1410		    if (newmax <= IP_MAX_MEMBERSHIPS) {
1411			nmships = (struct in_multi **)realloc(omships,
1412sizeof(*nmships) * newmax, M_IPMOPTS, M_NOWAIT);
1413			if (nmships != NULL) {
1414			    imo->imo_membership = nmships;
1415			    imo->imo_max_memberships = newmax;
1416			}
1417		    }
1418		    if (nmships == NULL) {
1419			INP_UNLOCK(inp);
1420			error = ETOOMANYREFS;
1421			splx(s);
1422			break;
1423		    }
1424		}
1425		/*
1426		 * Everything looks good; add a new record to the multicast
1427		 * address list for the given interface.
1428		 */
1429		if ((imo->imo_membership[i] =
1430		    in_addmulti(&mreq.imr_multiaddr, ifp)) == NULL) {
1431			INP_UNLOCK(inp);
1432			error = ENOBUFS;
1433			splx(s);
1434			break;
1435		}
1436		++imo->imo_num_memberships;
1437		INP_UNLOCK(inp);
1438		splx(s);
1439		break;
1440
1441	case IP_DROP_MEMBERSHIP:
1442		/*
1443		 * Drop a multicast group membership.
1444		 * Group must be a valid IP multicast address.
1445		 */
1446		error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq);
1447		if (error)
1448			break;
1449
1450		if (!IN_MULTICAST(ntohl(mreq.imr_multiaddr.s_addr))) {
1451			error = EINVAL;
1452			break;
1453		}
1454
1455		s = splimp();
1456		/*
1457		 * If an interface address was specified, get a pointer
1458		 * to its ifnet structure.
1459		 */
1460		if (mreq.imr_interface.s_addr == INADDR_ANY)
1461			ifp = NULL;
1462		else {
1463			ifp = ip_multicast_if(&mreq.imr_interface, NULL);
1464			if (ifp == NULL) {
1465				error = EADDRNOTAVAIL;
1466				splx(s);
1467				break;
1468			}
1469		}
1470		/*
1471		 * Find the membership in the membership array.
1472		 */
1473		imo = ip_findmoptions(inp);
1474		for (i = 0; i < imo->imo_num_memberships; ++i) {
1475			if ((ifp == NULL ||
1476			     imo->imo_membership[i]->inm_ifp == ifp) &&
1477			     imo->imo_membership[i]->inm_addr.s_addr ==
1478			     mreq.imr_multiaddr.s_addr)
1479				break;
1480		}
1481		if (i == imo->imo_num_memberships) {
1482			INP_UNLOCK(inp);
1483			error = EADDRNOTAVAIL;
1484			splx(s);
1485			break;
1486		}
1487		/*
1488		 * Give up the multicast address record to which the
1489		 * membership points.
1490		 */
1491		in_delmulti(imo->imo_membership[i]);
1492		/*
1493		 * Remove the gap in the membership array.
1494		 */
1495		for (++i; i < imo->imo_num_memberships; ++i)
1496			imo->imo_membership[i-1] = imo->imo_membership[i];
1497		--imo->imo_num_memberships;
1498		INP_UNLOCK(inp);
1499		splx(s);
1500		break;
1501
1502	default:
1503		error = EOPNOTSUPP;
1504		break;
1505	}
1506
1507	return (error);
1508}
1509
1510/*
1511 * Return the IP multicast options in response to user getsockopt().
1512 */
1513static int
1514ip_getmoptions(struct inpcb *inp, struct sockopt *sopt)
1515{
1516	struct ip_moptions *imo;
1517	struct in_addr addr;
1518	struct in_ifaddr *ia;
1519	int error, optval;
1520	u_char coptval;
1521
1522	INP_LOCK(inp);
1523	imo = inp->inp_moptions;
1524
1525	error = 0;
1526	switch (sopt->sopt_name) {
1527	case IP_MULTICAST_VIF:
1528		if (imo != NULL)
1529			optval = imo->imo_multicast_vif;
1530		else
1531			optval = -1;
1532		INP_UNLOCK(inp);
1533		error = sooptcopyout(sopt, &optval, sizeof optval);
1534		break;
1535
1536	case IP_MULTICAST_IF:
1537		if (imo == NULL || imo->imo_multicast_ifp == NULL)
1538			addr.s_addr = INADDR_ANY;
1539		else if (imo->imo_multicast_addr.s_addr) {
1540			/* return the value user has set */
1541			addr = imo->imo_multicast_addr;
1542		} else {
1543			IFP_TO_IA(imo->imo_multicast_ifp, ia);
1544			addr.s_addr = (ia == NULL) ? INADDR_ANY
1545				: IA_SIN(ia)->sin_addr.s_addr;
1546		}
1547		INP_UNLOCK(inp);
1548		error = sooptcopyout(sopt, &addr, sizeof addr);
1549		break;
1550
1551	case IP_MULTICAST_TTL:
1552		if (imo == 0)
1553			optval = coptval = IP_DEFAULT_MULTICAST_TTL;
1554		else
1555			optval = coptval = imo->imo_multicast_ttl;
1556		INP_UNLOCK(inp);
1557		if (sopt->sopt_valsize == 1)
1558			error = sooptcopyout(sopt, &coptval, 1);
1559		else
1560			error = sooptcopyout(sopt, &optval, sizeof optval);
1561		break;
1562
1563	case IP_MULTICAST_LOOP:
1564		if (imo == 0)
1565			optval = coptval = IP_DEFAULT_MULTICAST_LOOP;
1566		else
1567			optval = coptval = imo->imo_multicast_loop;
1568		INP_UNLOCK(inp);
1569		if (sopt->sopt_valsize == 1)
1570			error = sooptcopyout(sopt, &coptval, 1);
1571		else
1572			error = sooptcopyout(sopt, &optval, sizeof optval);
1573		break;
1574
1575	default:
1576		INP_UNLOCK(inp);
1577		error = ENOPROTOOPT;
1578		break;
1579	}
1580	INP_UNLOCK_ASSERT(inp);
1581
1582	return (error);
1583}
1584
1585/*
1586 * Discard the IP multicast options.
1587 */
1588void
1589ip_freemoptions(imo)
1590	register struct ip_moptions *imo;
1591{
1592	register int i;
1593
1594	if (imo != NULL) {
1595		for (i = 0; i < imo->imo_num_memberships; ++i)
1596			in_delmulti(imo->imo_membership[i]);
1597		free(imo->imo_membership, M_IPMOPTS);
1598		free(imo, M_IPMOPTS);
1599	}
1600}
1601
1602/*
1603 * Routine called from ip_output() to loop back a copy of an IP multicast
1604 * packet to the input queue of a specified interface.  Note that this
1605 * calls the output routine of the loopback "driver", but with an interface
1606 * pointer that might NOT be a loopback interface -- evil, but easier than
1607 * replicating that code here.
1608 */
1609static void
1610ip_mloopback(ifp, m, dst, hlen)
1611	struct ifnet *ifp;
1612	register struct mbuf *m;
1613	register struct sockaddr_in *dst;
1614	int hlen;
1615{
1616	register struct ip *ip;
1617	struct mbuf *copym;
1618
1619	copym = m_copy(m, 0, M_COPYALL);
1620	if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < hlen))
1621		copym = m_pullup(copym, hlen);
1622	if (copym != NULL) {
1623		/* If needed, compute the checksum and mark it as valid. */
1624		if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
1625			in_delayed_cksum(copym);
1626			copym->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
1627			copym->m_pkthdr.csum_flags |=
1628			    CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
1629			copym->m_pkthdr.csum_data = 0xffff;
1630		}
1631		/*
1632		 * We don't bother to fragment if the IP length is greater
1633		 * than the interface's MTU.  Can this possibly matter?
1634		 */
1635		ip = mtod(copym, struct ip *);
1636		ip->ip_len = htons(ip->ip_len);
1637		ip->ip_off = htons(ip->ip_off);
1638		ip->ip_sum = 0;
1639		ip->ip_sum = in_cksum(copym, hlen);
1640		/*
1641		 * NB:
1642		 * It's not clear whether there are any lingering
1643		 * reentrancy problems in other areas which might
1644		 * be exposed by using ip_input directly (in
1645		 * particular, everything which modifies the packet
1646		 * in-place).  Yet another option is using the
1647		 * protosw directly to deliver the looped back
1648		 * packet.  For the moment, we'll err on the side
1649		 * of safety by using if_simloop().
1650		 */
1651#if 1 /* XXX */
1652		if (dst->sin_family != AF_INET) {
1653			printf("ip_mloopback: bad address family %d\n",
1654						dst->sin_family);
1655			dst->sin_family = AF_INET;
1656		}
1657#endif
1658
1659#ifdef notdef
1660		copym->m_pkthdr.rcvif = ifp;
1661		ip_input(copym);
1662#else
1663		if_simloop(ifp, copym, dst->sin_family, 0);
1664#endif
1665	}
1666}
1667