ip_output.c revision 155201
1/*-
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 4. Neither the name of the University nor the names of its contributors
14 *    may be used to endorse or promote products derived from this software
15 *    without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 *
29 *	@(#)ip_output.c	8.3 (Berkeley) 1/21/94
30 * $FreeBSD: head/sys/netinet/ip_output.c 155201 2006-02-02 03:13:16Z csjp $
31 */
32
33#include "opt_ipfw.h"
34#include "opt_ipsec.h"
35#include "opt_mac.h"
36#include "opt_mbuf_stress_test.h"
37
38#include <sys/param.h>
39#include <sys/systm.h>
40#include <sys/kernel.h>
41#include <sys/mac.h>
42#include <sys/malloc.h>
43#include <sys/mbuf.h>
44#include <sys/protosw.h>
45#include <sys/socket.h>
46#include <sys/socketvar.h>
47#include <sys/sysctl.h>
48
49#include <net/if.h>
50#include <net/netisr.h>
51#include <net/pfil.h>
52#include <net/route.h>
53
54#include <netinet/in.h>
55#include <netinet/in_systm.h>
56#include <netinet/ip.h>
57#include <netinet/in_pcb.h>
58#include <netinet/in_var.h>
59#include <netinet/ip_var.h>
60#include <netinet/ip_options.h>
61
62#if defined(IPSEC) || defined(FAST_IPSEC)
63#include <netinet/ip_ipsec.h>
64#ifdef IPSEC
65#include <netinet6/ipsec.h>
66#endif
67#ifdef FAST_IPSEC
68#include <netipsec/ipsec.h>
69#endif
70#endif /*IPSEC*/
71
72#include <machine/in_cksum.h>
73
/* Malloc type used for the per-socket multicast option structures. */
static MALLOC_DEFINE(M_IPMOPTS, "ip_moptions", "internet multicast options");

/*
 * Debug helper: print string x, then the in_addr a in dotted-quad form,
 * then string y.  Note that the expansion already ends in a semicolon and
 * that the argument "a" is used unparenthesized, so pass a plain lvalue.
 */
#define print_ip(x, a, y)	 printf("%s %d.%d.%d.%d%s",\
				x, (ntohl(a.s_addr)>>24)&0xFF,\
				  (ntohl(a.s_addr)>>16)&0xFF,\
				  (ntohl(a.s_addr)>>8)&0xFF,\
				  (ntohl(a.s_addr))&0xFF, y);

/*
 * NOTE(review): presumably the IP identification counter; it is not
 * referenced in the visible part of this file -- confirm against ip_newid().
 */
u_short ip_id;

#ifdef MBUF_STRESS_TEST
/*
 * When non-zero, ip_output() re-fragments outgoing mbuf chains longer than
 * this many bytes with m_fragment() before handing them to the interface.
 */
int mbuf_frag_size = 0;
SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW,
	&mbuf_frag_size, 0, "Fragment outgoing mbufs to this size");
#endif

/* Forward declarations of the file-local multicast helpers. */
static struct ifnet *ip_multicast_if(struct in_addr *, int *);
static void	ip_mloopback
	(struct ifnet *, struct mbuf *, struct sockaddr_in *, int);
static int	ip_getmoptions(struct inpcb *, struct sockopt *);
static int	ip_setmoptions(struct inpcb *, struct sockopt *);


extern	struct protosw inetsw[];
98
99/*
100 * IP output.  The packet in mbuf chain m contains a skeletal IP
101 * header (with len, off, ttl, proto, tos, src, dst).
102 * The mbuf chain containing the packet will be freed.
103 * The mbuf opt, if present, will not be freed.
104 * In the IP forwarding case, the packet will arrive with options already
105 * inserted, so must have a NULL opt pointer.
106 */
107int
108ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro,
109	int flags, struct ip_moptions *imo, struct inpcb *inp)
110{
111	struct ip *ip;
112	struct ifnet *ifp = NULL;	/* keep compiler happy */
113	struct mbuf *m0;
114	int hlen = sizeof (struct ip);
115	int len, error = 0;
116	struct sockaddr_in *dst = NULL;	/* keep compiler happy */
117	struct in_ifaddr *ia = NULL;
118	int isbroadcast, sw_csum;
119	struct route iproute;
120	struct in_addr odst;
121#ifdef IPFIREWALL_FORWARD
122	struct m_tag *fwd_tag = NULL;
123#endif
124	M_ASSERTPKTHDR(m);
125
126	if (ro == NULL) {
127		ro = &iproute;
128		bzero(ro, sizeof (*ro));
129	}
130
131	if (inp != NULL)
132		INP_LOCK_ASSERT(inp);
133
134	if (opt) {
135		len = 0;
136		m = ip_insertoptions(m, opt, &len);
137		if (len != 0)
138			hlen = len;
139	}
140	ip = mtod(m, struct ip *);
141
142	/*
143	 * Fill in IP header.  If we are not allowing fragmentation,
144	 * then the ip_id field is meaningless, but we don't set it
145	 * to zero.  Doing so causes various problems when devices along
146	 * the path (routers, load balancers, firewalls, etc.) illegally
147	 * disable DF on our packet.  Note that a 16-bit counter
148	 * will wrap around in less than 10 seconds at 100 Mbit/s on a
149	 * medium with MTU 1500.  See Steven M. Bellovin, "A Technique
150	 * for Counting NATted Hosts", Proc. IMW'02, available at
151	 * <http://www.research.att.com/~smb/papers/fnat.pdf>.
152	 */
153	if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
154		ip->ip_v = IPVERSION;
155		ip->ip_hl = hlen >> 2;
156		ip->ip_id = ip_newid();
157		ipstat.ips_localout++;
158	} else {
159		hlen = ip->ip_hl << 2;
160	}
161
162	dst = (struct sockaddr_in *)&ro->ro_dst;
163again:
164	/*
165	 * If there is a cached route,
166	 * check that it is to the same destination
167	 * and is still up.  If not, free it and try again.
168	 * The address family should also be checked in case of sharing the
169	 * cache with IPv6.
170	 */
171	if (ro->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 ||
172			  dst->sin_family != AF_INET ||
173			  dst->sin_addr.s_addr != ip->ip_dst.s_addr)) {
174		RTFREE(ro->ro_rt);
175		ro->ro_rt = (struct rtentry *)0;
176	}
177#ifdef IPFIREWALL_FORWARD
178	if (ro->ro_rt == NULL && fwd_tag == NULL) {
179#else
180	if (ro->ro_rt == NULL) {
181#endif
182		bzero(dst, sizeof(*dst));
183		dst->sin_family = AF_INET;
184		dst->sin_len = sizeof(*dst);
185		dst->sin_addr = ip->ip_dst;
186	}
187	/*
188	 * If routing to interface only,
189	 * short circuit routing lookup.
190	 */
191	if (flags & IP_ROUTETOIF) {
192		if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL &&
193		    (ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == NULL) {
194			ipstat.ips_noroute++;
195			error = ENETUNREACH;
196			goto bad;
197		}
198		ifp = ia->ia_ifp;
199		ip->ip_ttl = 1;
200		isbroadcast = in_broadcast(dst->sin_addr, ifp);
201	} else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) &&
202	    imo != NULL && imo->imo_multicast_ifp != NULL) {
203		/*
204		 * Bypass the normal routing lookup for multicast
205		 * packets if the interface is specified.
206		 */
207		ifp = imo->imo_multicast_ifp;
208		IFP_TO_IA(ifp, ia);
209		isbroadcast = 0;	/* fool gcc */
210	} else {
211		/*
212		 * We want to do any cloning requested by the link layer,
213		 * as this is probably required in all cases for correct
214		 * operation (as it is for ARP).
215		 */
216		if (ro->ro_rt == NULL)
217			rtalloc_ign(ro, 0);
218		if (ro->ro_rt == NULL) {
219			ipstat.ips_noroute++;
220			error = EHOSTUNREACH;
221			goto bad;
222		}
223		ia = ifatoia(ro->ro_rt->rt_ifa);
224		ifp = ro->ro_rt->rt_ifp;
225		ro->ro_rt->rt_rmx.rmx_pksent++;
226		if (ro->ro_rt->rt_flags & RTF_GATEWAY)
227			dst = (struct sockaddr_in *)ro->ro_rt->rt_gateway;
228		if (ro->ro_rt->rt_flags & RTF_HOST)
229			isbroadcast = (ro->ro_rt->rt_flags & RTF_BROADCAST);
230		else
231			isbroadcast = in_broadcast(dst->sin_addr, ifp);
232	}
233	if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
234		struct in_multi *inm;
235
236		m->m_flags |= M_MCAST;
237		/*
238		 * IP destination address is multicast.  Make sure "dst"
239		 * still points to the address in "ro".  (It may have been
240		 * changed to point to a gateway address, above.)
241		 */
242		dst = (struct sockaddr_in *)&ro->ro_dst;
243		/*
244		 * See if the caller provided any multicast options
245		 */
246		if (imo != NULL) {
247			ip->ip_ttl = imo->imo_multicast_ttl;
248			if (imo->imo_multicast_vif != -1)
249				ip->ip_src.s_addr =
250				    ip_mcast_src ?
251				    ip_mcast_src(imo->imo_multicast_vif) :
252				    INADDR_ANY;
253		} else
254			ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL;
255		/*
256		 * Confirm that the outgoing interface supports multicast.
257		 */
258		if ((imo == NULL) || (imo->imo_multicast_vif == -1)) {
259			if ((ifp->if_flags & IFF_MULTICAST) == 0) {
260				ipstat.ips_noroute++;
261				error = ENETUNREACH;
262				goto bad;
263			}
264		}
265		/*
266		 * If source address not specified yet, use address
267		 * of outgoing interface.
268		 */
269		if (ip->ip_src.s_addr == INADDR_ANY) {
270			/* Interface may have no addresses. */
271			if (ia != NULL)
272				ip->ip_src = IA_SIN(ia)->sin_addr;
273		}
274
275		IN_MULTI_LOCK();
276		IN_LOOKUP_MULTI(ip->ip_dst, ifp, inm);
277		if (inm != NULL &&
278		   (imo == NULL || imo->imo_multicast_loop)) {
279			IN_MULTI_UNLOCK();
280			/*
281			 * If we belong to the destination multicast group
282			 * on the outgoing interface, and the caller did not
283			 * forbid loopback, loop back a copy.
284			 */
285			ip_mloopback(ifp, m, dst, hlen);
286		}
287		else {
288			IN_MULTI_UNLOCK();
289			/*
290			 * If we are acting as a multicast router, perform
291			 * multicast forwarding as if the packet had just
292			 * arrived on the interface to which we are about
293			 * to send.  The multicast forwarding function
294			 * recursively calls this function, using the
295			 * IP_FORWARDING flag to prevent infinite recursion.
296			 *
297			 * Multicasts that are looped back by ip_mloopback(),
298			 * above, will be forwarded by the ip_input() routine,
299			 * if necessary.
300			 */
301			if (ip_mrouter && (flags & IP_FORWARDING) == 0) {
302				/*
303				 * If rsvp daemon is not running, do not
304				 * set ip_moptions. This ensures that the packet
305				 * is multicast and not just sent down one link
306				 * as prescribed by rsvpd.
307				 */
308				if (!rsvp_on)
309					imo = NULL;
310				if (ip_mforward &&
311				    ip_mforward(ip, ifp, m, imo) != 0) {
312					m_freem(m);
313					goto done;
314				}
315			}
316		}
317
318		/*
319		 * Multicasts with a time-to-live of zero may be looped-
320		 * back, above, but must not be transmitted on a network.
321		 * Also, multicasts addressed to the loopback interface
322		 * are not sent -- the above call to ip_mloopback() will
323		 * loop back a copy if this host actually belongs to the
324		 * destination group on the loopback interface.
325		 */
326		if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) {
327			m_freem(m);
328			goto done;
329		}
330
331		goto sendit;
332	}
333#ifndef notdef
334	/*
335	 * If the source address is not specified yet, use the address
336	 * of the outoing interface.
337	 */
338	if (ip->ip_src.s_addr == INADDR_ANY) {
339		/* Interface may have no addresses. */
340		if (ia != NULL) {
341			ip->ip_src = IA_SIN(ia)->sin_addr;
342		}
343	}
344#endif /* notdef */
345	/*
346	 * Verify that we have any chance at all of being able to queue the
347	 * packet or packet fragments, unless ALTQ is enabled on the given
348	 * interface in which case packetdrop should be done by queueing.
349	 */
350#ifdef ALTQ
351	if ((!ALTQ_IS_ENABLED(&ifp->if_snd)) &&
352	    ((ifp->if_snd.ifq_len + ip->ip_len / ifp->if_mtu + 1) >=
353	    ifp->if_snd.ifq_maxlen))
354#else
355	if ((ifp->if_snd.ifq_len + ip->ip_len / ifp->if_mtu + 1) >=
356	    ifp->if_snd.ifq_maxlen)
357#endif /* ALTQ */
358	{
359		error = ENOBUFS;
360		ipstat.ips_odropped++;
361		ifp->if_snd.ifq_drops += (ip->ip_len / ifp->if_mtu + 1);
362		goto bad;
363	}
364
365	/*
366	 * Look for broadcast address and
367	 * verify user is allowed to send
368	 * such a packet.
369	 */
370	if (isbroadcast) {
371		if ((ifp->if_flags & IFF_BROADCAST) == 0) {
372			error = EADDRNOTAVAIL;
373			goto bad;
374		}
375		if ((flags & IP_ALLOWBROADCAST) == 0) {
376			error = EACCES;
377			goto bad;
378		}
379		/* don't allow broadcast messages to be fragmented */
380		if (ip->ip_len > ifp->if_mtu) {
381			error = EMSGSIZE;
382			goto bad;
383		}
384		if (flags & IP_SENDONES)
385			ip->ip_dst.s_addr = INADDR_BROADCAST;
386		m->m_flags |= M_BCAST;
387	} else {
388		m->m_flags &= ~M_BCAST;
389	}
390
391sendit:
392#if defined(IPSEC) || defined(FAST_IPSEC)
393	switch(ip_ipsec_output(&m, inp, &flags, &error, &ro, &iproute, &dst, &ia, &ifp)) {
394	case 1:
395		goto bad;
396	case -1:
397		goto done;
398	case 0:
399	default:
400		break;	/* Continue with packet processing. */
401	}
402	/* Update variables that are affected by ipsec4_output(). */
403	ip = mtod(m, struct ip *);
404	hlen = ip->ip_hl << 2;
405#endif /* IPSEC */
406
407	/* Jump over all PFIL processing if hooks are not active. */
408	if (!PFIL_HOOKED(&inet_pfil_hook))
409		goto passout;
410
411	/* Run through list of hooks for output packets. */
412	odst.s_addr = ip->ip_dst.s_addr;
413	error = pfil_run_hooks(&inet_pfil_hook, &m, ifp, PFIL_OUT, inp);
414	if (error != 0 || m == NULL)
415		goto done;
416
417	ip = mtod(m, struct ip *);
418
419	/* See if destination IP address was changed by packet filter. */
420	if (odst.s_addr != ip->ip_dst.s_addr) {
421		m->m_flags |= M_SKIP_FIREWALL;
422		/* If destination is now ourself drop to ip_input(). */
423		if (in_localip(ip->ip_dst)) {
424			m->m_flags |= M_FASTFWD_OURS;
425			if (m->m_pkthdr.rcvif == NULL)
426				m->m_pkthdr.rcvif = loif;
427			if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
428				m->m_pkthdr.csum_flags |=
429				    CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
430				m->m_pkthdr.csum_data = 0xffff;
431			}
432			m->m_pkthdr.csum_flags |=
433			    CSUM_IP_CHECKED | CSUM_IP_VALID;
434
435			error = netisr_queue(NETISR_IP, m);
436			goto done;
437		} else
438			goto again;	/* Redo the routing table lookup. */
439	}
440
441#ifdef IPFIREWALL_FORWARD
442	/* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */
443	if (m->m_flags & M_FASTFWD_OURS) {
444		if (m->m_pkthdr.rcvif == NULL)
445			m->m_pkthdr.rcvif = loif;
446		if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
447			m->m_pkthdr.csum_flags |=
448			    CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
449			m->m_pkthdr.csum_data = 0xffff;
450		}
451		m->m_pkthdr.csum_flags |=
452			    CSUM_IP_CHECKED | CSUM_IP_VALID;
453
454		error = netisr_queue(NETISR_IP, m);
455		goto done;
456	}
457	/* Or forward to some other address? */
458	fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL);
459	if (fwd_tag) {
460#ifndef IPFIREWALL_FORWARD_EXTENDED
461		if (!in_localip(ip->ip_src) && !in_localaddr(ip->ip_dst)) {
462#endif
463			dst = (struct sockaddr_in *)&ro->ro_dst;
464			bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in));
465			m->m_flags |= M_SKIP_FIREWALL;
466			m_tag_delete(m, fwd_tag);
467			goto again;
468#ifndef IPFIREWALL_FORWARD_EXTENDED
469		} else {
470			m_tag_delete(m, fwd_tag);
471			/* Continue. */
472		}
473#endif
474	}
475#endif /* IPFIREWALL_FORWARD */
476
477passout:
478	/* 127/8 must not appear on wire - RFC1122. */
479	if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
480	    (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
481		if ((ifp->if_flags & IFF_LOOPBACK) == 0) {
482			ipstat.ips_badaddr++;
483			error = EADDRNOTAVAIL;
484			goto bad;
485		}
486	}
487
488	m->m_pkthdr.csum_flags |= CSUM_IP;
489	sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_hwassist;
490	if (sw_csum & CSUM_DELAY_DATA) {
491		in_delayed_cksum(m);
492		sw_csum &= ~CSUM_DELAY_DATA;
493	}
494	m->m_pkthdr.csum_flags &= ifp->if_hwassist;
495
496	/*
497	 * If small enough for interface, or the interface will take
498	 * care of the fragmentation for us, can just send directly.
499	 */
500	if (ip->ip_len <= ifp->if_mtu || (ifp->if_hwassist & CSUM_FRAGMENT &&
501	    ((ip->ip_off & IP_DF) == 0))) {
502		ip->ip_len = htons(ip->ip_len);
503		ip->ip_off = htons(ip->ip_off);
504		ip->ip_sum = 0;
505		if (sw_csum & CSUM_DELAY_IP)
506			ip->ip_sum = in_cksum(m, hlen);
507
508		/* Record statistics for this interface address. */
509		if (!(flags & IP_FORWARDING) && ia) {
510			ia->ia_ifa.if_opackets++;
511			ia->ia_ifa.if_obytes += m->m_pkthdr.len;
512		}
513#ifdef IPSEC
514		/* clean ipsec history once it goes out of the node */
515		ipsec_delaux(m);
516#endif
517#ifdef MBUF_STRESS_TEST
518		if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size)
519			m = m_fragment(m, M_DONTWAIT, mbuf_frag_size);
520#endif
521		/*
522		 * Reset layer specific mbuf flags
523		 * to avoid confusing lower layers.
524		 */
525		m->m_flags &= ~(M_PROTOFLAGS);
526
527		error = (*ifp->if_output)(ifp, m,
528				(struct sockaddr *)dst, ro->ro_rt);
529		goto done;
530	}
531
532	if (ip->ip_off & IP_DF) {
533		error = EMSGSIZE;
534		/*
535		 * This case can happen if the user changed the MTU
536		 * of an interface after enabling IP on it.  Because
537		 * most netifs don't keep track of routes pointing to
538		 * them, there is no way for one to update all its
539		 * routes when the MTU is changed.
540		 */
541		if (ro != NULL &&
542		    (ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST)) &&
543		    (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)) {
544			ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu;
545		}
546		ipstat.ips_cantfrag++;
547		goto bad;
548	}
549
550	/*
551	 * Too large for interface; fragment if possible. If successful,
552	 * on return, m will point to a list of packets to be sent.
553	 */
554	error = ip_fragment(ip, &m, ifp->if_mtu, ifp->if_hwassist, sw_csum);
555	if (error)
556		goto bad;
557	for (; m; m = m0) {
558		m0 = m->m_nextpkt;
559		m->m_nextpkt = 0;
560#ifdef IPSEC
561		/* clean ipsec history once it goes out of the node */
562		ipsec_delaux(m);
563#endif
564		if (error == 0) {
565			/* Record statistics for this interface address. */
566			if (ia != NULL) {
567				ia->ia_ifa.if_opackets++;
568				ia->ia_ifa.if_obytes += m->m_pkthdr.len;
569			}
570			/*
571			 * Reset layer specific mbuf flags
572			 * to avoid confusing upper layers.
573			 */
574			m->m_flags &= ~(M_PROTOFLAGS);
575
576			error = (*ifp->if_output)(ifp, m,
577			    (struct sockaddr *)dst, ro->ro_rt);
578		} else
579			m_freem(m);
580	}
581
582	if (error == 0)
583		ipstat.ips_fragmented++;
584
585done:
586	if (ro == &iproute && ro->ro_rt) {
587		RTFREE(ro->ro_rt);
588	}
589	return (error);
590bad:
591	m_freem(m);
592	goto done;
593}
594
595/*
596 * Create a chain of fragments which fit the given mtu. m_frag points to the
597 * mbuf to be fragmented; on return it points to the chain with the fragments.
598 * Return 0 if no error. If error, m_frag may contain a partially built
599 * chain of fragments that should be freed by the caller.
600 *
601 * if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist)
602 * sw_csum contains the delayed checksums flags (e.g., CSUM_DELAY_IP).
603 */
604int
605ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu,
606	    u_long if_hwassist_flags, int sw_csum)
607{
608	int error = 0;
609	int hlen = ip->ip_hl << 2;
610	int len = (mtu - hlen) & ~7;	/* size of payload in each fragment */
611	int off;
612	struct mbuf *m0 = *m_frag;	/* the original packet		*/
613	int firstlen;
614	struct mbuf **mnext;
615	int nfrags;
616
617	if (ip->ip_off & IP_DF) {	/* Fragmentation not allowed */
618		ipstat.ips_cantfrag++;
619		return EMSGSIZE;
620	}
621
622	/*
623	 * Must be able to put at least 8 bytes per fragment.
624	 */
625	if (len < 8)
626		return EMSGSIZE;
627
628	/*
629	 * If the interface will not calculate checksums on
630	 * fragmented packets, then do it here.
631	 */
632	if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA &&
633	    (if_hwassist_flags & CSUM_IP_FRAGS) == 0) {
634		in_delayed_cksum(m0);
635		m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
636	}
637
638	if (len > PAGE_SIZE) {
639		/*
640		 * Fragment large datagrams such that each segment
641		 * contains a multiple of PAGE_SIZE amount of data,
642		 * plus headers. This enables a receiver to perform
643		 * page-flipping zero-copy optimizations.
644		 *
645		 * XXX When does this help given that sender and receiver
646		 * could have different page sizes, and also mtu could
647		 * be less than the receiver's page size ?
648		 */
649		int newlen;
650		struct mbuf *m;
651
652		for (m = m0, off = 0; m && (off+m->m_len) <= mtu; m = m->m_next)
653			off += m->m_len;
654
655		/*
656		 * firstlen (off - hlen) must be aligned on an
657		 * 8-byte boundary
658		 */
659		if (off < hlen)
660			goto smart_frag_failure;
661		off = ((off - hlen) & ~7) + hlen;
662		newlen = (~PAGE_MASK) & mtu;
663		if ((newlen + sizeof (struct ip)) > mtu) {
664			/* we failed, go back the default */
665smart_frag_failure:
666			newlen = len;
667			off = hlen + len;
668		}
669		len = newlen;
670
671	} else {
672		off = hlen + len;
673	}
674
675	firstlen = off - hlen;
676	mnext = &m0->m_nextpkt;		/* pointer to next packet */
677
678	/*
679	 * Loop through length of segment after first fragment,
680	 * make new header and copy data of each part and link onto chain.
681	 * Here, m0 is the original packet, m is the fragment being created.
682	 * The fragments are linked off the m_nextpkt of the original
683	 * packet, which after processing serves as the first fragment.
684	 */
685	for (nfrags = 1; off < ip->ip_len; off += len, nfrags++) {
686		struct ip *mhip;	/* ip header on the fragment */
687		struct mbuf *m;
688		int mhlen = sizeof (struct ip);
689
690		MGETHDR(m, M_DONTWAIT, MT_DATA);
691		if (m == NULL) {
692			error = ENOBUFS;
693			ipstat.ips_odropped++;
694			goto done;
695		}
696		m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG;
697		/*
698		 * In the first mbuf, leave room for the link header, then
699		 * copy the original IP header including options. The payload
700		 * goes into an additional mbuf chain returned by m_copy().
701		 */
702		m->m_data += max_linkhdr;
703		mhip = mtod(m, struct ip *);
704		*mhip = *ip;
705		if (hlen > sizeof (struct ip)) {
706			mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip);
707			mhip->ip_v = IPVERSION;
708			mhip->ip_hl = mhlen >> 2;
709		}
710		m->m_len = mhlen;
711		/* XXX do we need to add ip->ip_off below ? */
712		mhip->ip_off = ((off - hlen) >> 3) + ip->ip_off;
713		if (off + len >= ip->ip_len) {	/* last fragment */
714			len = ip->ip_len - off;
715			m->m_flags |= M_LASTFRAG;
716		} else
717			mhip->ip_off |= IP_MF;
718		mhip->ip_len = htons((u_short)(len + mhlen));
719		m->m_next = m_copy(m0, off, len);
720		if (m->m_next == NULL) {	/* copy failed */
721			m_free(m);
722			error = ENOBUFS;	/* ??? */
723			ipstat.ips_odropped++;
724			goto done;
725		}
726		m->m_pkthdr.len = mhlen + len;
727		m->m_pkthdr.rcvif = NULL;
728#ifdef MAC
729		mac_create_fragment(m0, m);
730#endif
731		m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags;
732		mhip->ip_off = htons(mhip->ip_off);
733		mhip->ip_sum = 0;
734		if (sw_csum & CSUM_DELAY_IP)
735			mhip->ip_sum = in_cksum(m, mhlen);
736		*mnext = m;
737		mnext = &m->m_nextpkt;
738	}
739	ipstat.ips_ofragments += nfrags;
740
741	/* set first marker for fragment chain */
742	m0->m_flags |= M_FIRSTFRAG | M_FRAG;
743	m0->m_pkthdr.csum_data = nfrags;
744
745	/*
746	 * Update first fragment by trimming what's been copied out
747	 * and updating header.
748	 */
749	m_adj(m0, hlen + firstlen - ip->ip_len);
750	m0->m_pkthdr.len = hlen + firstlen;
751	ip->ip_len = htons((u_short)m0->m_pkthdr.len);
752	ip->ip_off |= IP_MF;
753	ip->ip_off = htons(ip->ip_off);
754	ip->ip_sum = 0;
755	if (sw_csum & CSUM_DELAY_IP)
756		ip->ip_sum = in_cksum(m0, hlen);
757
758done:
759	*m_frag = m0;
760	return error;
761}
762
763void
764in_delayed_cksum(struct mbuf *m)
765{
766	struct ip *ip;
767	u_short csum, offset;
768
769	ip = mtod(m, struct ip *);
770	offset = ip->ip_hl << 2 ;
771	csum = in_cksum_skip(m, ip->ip_len, offset);
772	if (m->m_pkthdr.csum_flags & CSUM_UDP && csum == 0)
773		csum = 0xffff;
774	offset += m->m_pkthdr.csum_data;	/* checksum offset */
775
776	if (offset + sizeof(u_short) > m->m_len) {
777		printf("delayed m_pullup, m->len: %d  off: %d  p: %d\n",
778		    m->m_len, offset, ip->ip_p);
779		/*
780		 * XXX
781		 * this shouldn't happen, but if it does, the
782		 * correct behavior may be to insert the checksum
783		 * in the appropriate next mbuf in the chain.
784		 */
785		return;
786	}
787	*(u_short *)(m->m_data + offset) = csum;
788}
789
790/*
791 * IP socket option processing.
792 */
793int
794ip_ctloutput(so, sopt)
795	struct socket *so;
796	struct sockopt *sopt;
797{
798	struct	inpcb *inp = sotoinpcb(so);
799	int	error, optval;
800
801	error = optval = 0;
802	if (sopt->sopt_level != IPPROTO_IP) {
803		return (EINVAL);
804	}
805
806	switch (sopt->sopt_dir) {
807	case SOPT_SET:
808		switch (sopt->sopt_name) {
809		case IP_OPTIONS:
810#ifdef notyet
811		case IP_RETOPTS:
812#endif
813		{
814			struct mbuf *m;
815			if (sopt->sopt_valsize > MLEN) {
816				error = EMSGSIZE;
817				break;
818			}
819			MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
820			if (m == NULL) {
821				error = ENOBUFS;
822				break;
823			}
824			m->m_len = sopt->sopt_valsize;
825			error = sooptcopyin(sopt, mtod(m, char *), m->m_len,
826					    m->m_len);
827			INP_LOCK(inp);
828			error = ip_pcbopts(inp, sopt->sopt_name, m);
829			INP_UNLOCK(inp);
830			return (error);
831		}
832
833		case IP_TOS:
834		case IP_TTL:
835		case IP_MINTTL:
836		case IP_RECVOPTS:
837		case IP_RECVRETOPTS:
838		case IP_RECVDSTADDR:
839		case IP_RECVTTL:
840		case IP_RECVIF:
841		case IP_FAITH:
842		case IP_ONESBCAST:
843		case IP_DONTFRAG:
844			error = sooptcopyin(sopt, &optval, sizeof optval,
845					    sizeof optval);
846			if (error)
847				break;
848
849			switch (sopt->sopt_name) {
850			case IP_TOS:
851				inp->inp_ip_tos = optval;
852				break;
853
854			case IP_TTL:
855				inp->inp_ip_ttl = optval;
856				break;
857
858			case IP_MINTTL:
859				if (optval > 0 && optval <= MAXTTL)
860					inp->inp_ip_minttl = optval;
861				else
862					error = EINVAL;
863				break;
864
865#define	OPTSET(bit) do {						\
866	INP_LOCK(inp);							\
867	if (optval)							\
868		inp->inp_flags |= bit;					\
869	else								\
870		inp->inp_flags &= ~bit;					\
871	INP_UNLOCK(inp);						\
872} while (0)
873
874			case IP_RECVOPTS:
875				OPTSET(INP_RECVOPTS);
876				break;
877
878			case IP_RECVRETOPTS:
879				OPTSET(INP_RECVRETOPTS);
880				break;
881
882			case IP_RECVDSTADDR:
883				OPTSET(INP_RECVDSTADDR);
884				break;
885
886			case IP_RECVTTL:
887				OPTSET(INP_RECVTTL);
888				break;
889
890			case IP_RECVIF:
891				OPTSET(INP_RECVIF);
892				break;
893
894			case IP_FAITH:
895				OPTSET(INP_FAITH);
896				break;
897
898			case IP_ONESBCAST:
899				OPTSET(INP_ONESBCAST);
900				break;
901			case IP_DONTFRAG:
902				OPTSET(INP_DONTFRAG);
903				break;
904			}
905			break;
906#undef OPTSET
907
908		case IP_MULTICAST_IF:
909		case IP_MULTICAST_VIF:
910		case IP_MULTICAST_TTL:
911		case IP_MULTICAST_LOOP:
912		case IP_ADD_MEMBERSHIP:
913		case IP_DROP_MEMBERSHIP:
914			error = ip_setmoptions(inp, sopt);
915			break;
916
917		case IP_PORTRANGE:
918			error = sooptcopyin(sopt, &optval, sizeof optval,
919					    sizeof optval);
920			if (error)
921				break;
922
923			INP_LOCK(inp);
924			switch (optval) {
925			case IP_PORTRANGE_DEFAULT:
926				inp->inp_flags &= ~(INP_LOWPORT);
927				inp->inp_flags &= ~(INP_HIGHPORT);
928				break;
929
930			case IP_PORTRANGE_HIGH:
931				inp->inp_flags &= ~(INP_LOWPORT);
932				inp->inp_flags |= INP_HIGHPORT;
933				break;
934
935			case IP_PORTRANGE_LOW:
936				inp->inp_flags &= ~(INP_HIGHPORT);
937				inp->inp_flags |= INP_LOWPORT;
938				break;
939
940			default:
941				error = EINVAL;
942				break;
943			}
944			INP_UNLOCK(inp);
945			break;
946
947#if defined(IPSEC) || defined(FAST_IPSEC)
948		case IP_IPSEC_POLICY:
949		{
950			caddr_t req;
951			size_t len = 0;
952			int priv;
953			struct mbuf *m;
954			int optname;
955
956			if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */
957				break;
958			if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */
959				break;
960			priv = (sopt->sopt_td != NULL &&
961				suser(sopt->sopt_td) != 0) ? 0 : 1;
962			req = mtod(m, caddr_t);
963			len = m->m_len;
964			optname = sopt->sopt_name;
965			error = ipsec4_set_policy(inp, optname, req, len, priv);
966			m_freem(m);
967			break;
968		}
969#endif /*IPSEC*/
970
971		default:
972			error = ENOPROTOOPT;
973			break;
974		}
975		break;
976
977	case SOPT_GET:
978		switch (sopt->sopt_name) {
979		case IP_OPTIONS:
980		case IP_RETOPTS:
981			if (inp->inp_options)
982				error = sooptcopyout(sopt,
983						     mtod(inp->inp_options,
984							  char *),
985						     inp->inp_options->m_len);
986			else
987				sopt->sopt_valsize = 0;
988			break;
989
990		case IP_TOS:
991		case IP_TTL:
992		case IP_MINTTL:
993		case IP_RECVOPTS:
994		case IP_RECVRETOPTS:
995		case IP_RECVDSTADDR:
996		case IP_RECVTTL:
997		case IP_RECVIF:
998		case IP_PORTRANGE:
999		case IP_FAITH:
1000		case IP_ONESBCAST:
1001		case IP_DONTFRAG:
1002			switch (sopt->sopt_name) {
1003
1004			case IP_TOS:
1005				optval = inp->inp_ip_tos;
1006				break;
1007
1008			case IP_TTL:
1009				optval = inp->inp_ip_ttl;
1010				break;
1011
1012			case IP_MINTTL:
1013				optval = inp->inp_ip_minttl;
1014				break;
1015
1016#define	OPTBIT(bit)	(inp->inp_flags & bit ? 1 : 0)
1017
1018			case IP_RECVOPTS:
1019				optval = OPTBIT(INP_RECVOPTS);
1020				break;
1021
1022			case IP_RECVRETOPTS:
1023				optval = OPTBIT(INP_RECVRETOPTS);
1024				break;
1025
1026			case IP_RECVDSTADDR:
1027				optval = OPTBIT(INP_RECVDSTADDR);
1028				break;
1029
1030			case IP_RECVTTL:
1031				optval = OPTBIT(INP_RECVTTL);
1032				break;
1033
1034			case IP_RECVIF:
1035				optval = OPTBIT(INP_RECVIF);
1036				break;
1037
1038			case IP_PORTRANGE:
1039				if (inp->inp_flags & INP_HIGHPORT)
1040					optval = IP_PORTRANGE_HIGH;
1041				else if (inp->inp_flags & INP_LOWPORT)
1042					optval = IP_PORTRANGE_LOW;
1043				else
1044					optval = 0;
1045				break;
1046
1047			case IP_FAITH:
1048				optval = OPTBIT(INP_FAITH);
1049				break;
1050
1051			case IP_ONESBCAST:
1052				optval = OPTBIT(INP_ONESBCAST);
1053				break;
1054			case IP_DONTFRAG:
1055				optval = OPTBIT(INP_DONTFRAG);
1056				break;
1057			}
1058			error = sooptcopyout(sopt, &optval, sizeof optval);
1059			break;
1060
1061		case IP_MULTICAST_IF:
1062		case IP_MULTICAST_VIF:
1063		case IP_MULTICAST_TTL:
1064		case IP_MULTICAST_LOOP:
1065		case IP_ADD_MEMBERSHIP:
1066		case IP_DROP_MEMBERSHIP:
1067			error = ip_getmoptions(inp, sopt);
1068			break;
1069
1070#if defined(IPSEC) || defined(FAST_IPSEC)
1071		case IP_IPSEC_POLICY:
1072		{
1073			struct mbuf *m = NULL;
1074			caddr_t req = NULL;
1075			size_t len = 0;
1076
1077			if (m != 0) {
1078				req = mtod(m, caddr_t);
1079				len = m->m_len;
1080			}
1081			error = ipsec4_get_policy(sotoinpcb(so), req, len, &m);
1082			if (error == 0)
1083				error = soopt_mcopyout(sopt, m); /* XXX */
1084			if (error == 0)
1085				m_freem(m);
1086			break;
1087		}
1088#endif /*IPSEC*/
1089
1090		default:
1091			error = ENOPROTOOPT;
1092			break;
1093		}
1094		break;
1095	}
1096	return (error);
1097}
1098
1099/*
1100 * XXX
1101 * The whole multicast option thing needs to be re-thought.
1102 * Several of these options are equally applicable to non-multicast
1103 * transmission, and one (IP_MULTICAST_TTL) totally duplicates a
1104 * standard option (IP_TTL).
1105 */
1106
1107/*
1108 * following RFC1724 section 3.3, 0.0.0.0/8 is interpreted as interface index.
1109 */
1110static struct ifnet *
1111ip_multicast_if(a, ifindexp)
1112	struct in_addr *a;
1113	int *ifindexp;
1114{
1115	int ifindex;
1116	struct ifnet *ifp;
1117
1118	if (ifindexp)
1119		*ifindexp = 0;
1120	if (ntohl(a->s_addr) >> 24 == 0) {
1121		ifindex = ntohl(a->s_addr) & 0xffffff;
1122		if (ifindex < 0 || if_index < ifindex)
1123			return NULL;
1124		ifp = ifnet_byindex(ifindex);
1125		if (ifindexp)
1126			*ifindexp = ifindex;
1127	} else {
1128		INADDR_TO_IFP(*a, ifp);
1129	}
1130	return ifp;
1131}
1132
1133/*
1134 * Given an inpcb, return its multicast options structure pointer.  Accepts
1135 * an unlocked inpcb pointer, but will return it locked.  May sleep.
1136 */
1137static struct ip_moptions *
1138ip_findmoptions(struct inpcb *inp)
1139{
1140	struct ip_moptions *imo;
1141
1142	INP_LOCK(inp);
1143	if (inp->inp_moptions != NULL)
1144		return (inp->inp_moptions);
1145
1146	INP_UNLOCK(inp);
1147
1148	imo = (struct ip_moptions*)malloc(sizeof(*imo), M_IPMOPTS, M_WAITOK);
1149
1150	imo->imo_multicast_ifp = NULL;
1151	imo->imo_multicast_addr.s_addr = INADDR_ANY;
1152	imo->imo_multicast_vif = -1;
1153	imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
1154	imo->imo_multicast_loop = IP_DEFAULT_MULTICAST_LOOP;
1155	imo->imo_num_memberships = 0;
1156
1157	INP_LOCK(inp);
1158	if (inp->inp_moptions != NULL) {
1159		free(imo, M_IPMOPTS);
1160		return (inp->inp_moptions);
1161	}
1162	inp->inp_moptions = imo;
1163	return (imo);
1164}
1165
1166/*
1167 * Set the IP multicast options in response to user setsockopt().
1168 */
1169static int
1170ip_setmoptions(struct inpcb *inp, struct sockopt *sopt)
1171{
1172	int error = 0;
1173	int i;
1174	struct in_addr addr;
1175	struct ip_mreq mreq;
1176	struct ifnet *ifp;
1177	struct ip_moptions *imo;
1178	struct route ro;
1179	struct sockaddr_in *dst;
1180	int ifindex;
1181	int s;
1182
1183	switch (sopt->sopt_name) {
1184	/* store an index number for the vif you wanna use in the send */
1185	case IP_MULTICAST_VIF:
1186		if (legal_vif_num == 0) {
1187			error = EOPNOTSUPP;
1188			break;
1189		}
1190		error = sooptcopyin(sopt, &i, sizeof i, sizeof i);
1191		if (error)
1192			break;
1193		if (!legal_vif_num(i) && (i != -1)) {
1194			error = EINVAL;
1195			break;
1196		}
1197		imo = ip_findmoptions(inp);
1198		imo->imo_multicast_vif = i;
1199		INP_UNLOCK(inp);
1200		break;
1201
1202	case IP_MULTICAST_IF:
1203		/*
1204		 * Select the interface for outgoing multicast packets.
1205		 */
1206		error = sooptcopyin(sopt, &addr, sizeof addr, sizeof addr);
1207		if (error)
1208			break;
1209		/*
1210		 * INADDR_ANY is used to remove a previous selection.
1211		 * When no interface is selected, a default one is
1212		 * chosen every time a multicast packet is sent.
1213		 */
1214		imo = ip_findmoptions(inp);
1215		if (addr.s_addr == INADDR_ANY) {
1216			imo->imo_multicast_ifp = NULL;
1217			INP_UNLOCK(inp);
1218			break;
1219		}
1220		/*
1221		 * The selected interface is identified by its local
1222		 * IP address.  Find the interface and confirm that
1223		 * it supports multicasting.
1224		 */
1225		s = splimp();
1226		ifp = ip_multicast_if(&addr, &ifindex);
1227		if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) {
1228			INP_UNLOCK(inp);
1229			splx(s);
1230			error = EADDRNOTAVAIL;
1231			break;
1232		}
1233		imo->imo_multicast_ifp = ifp;
1234		if (ifindex)
1235			imo->imo_multicast_addr = addr;
1236		else
1237			imo->imo_multicast_addr.s_addr = INADDR_ANY;
1238		INP_UNLOCK(inp);
1239		splx(s);
1240		break;
1241
1242	case IP_MULTICAST_TTL:
1243		/*
1244		 * Set the IP time-to-live for outgoing multicast packets.
1245		 * The original multicast API required a char argument,
1246		 * which is inconsistent with the rest of the socket API.
1247		 * We allow either a char or an int.
1248		 */
1249		if (sopt->sopt_valsize == 1) {
1250			u_char ttl;
1251			error = sooptcopyin(sopt, &ttl, 1, 1);
1252			if (error)
1253				break;
1254			imo = ip_findmoptions(inp);
1255			imo->imo_multicast_ttl = ttl;
1256			INP_UNLOCK(inp);
1257		} else {
1258			u_int ttl;
1259			error = sooptcopyin(sopt, &ttl, sizeof ttl,
1260					    sizeof ttl);
1261			if (error)
1262				break;
1263			if (ttl > 255)
1264				error = EINVAL;
1265			else {
1266				imo = ip_findmoptions(inp);
1267				imo->imo_multicast_ttl = ttl;
1268				INP_UNLOCK(inp);
1269			}
1270		}
1271		break;
1272
1273	case IP_MULTICAST_LOOP:
1274		/*
1275		 * Set the loopback flag for outgoing multicast packets.
1276		 * Must be zero or one.  The original multicast API required a
1277		 * char argument, which is inconsistent with the rest
1278		 * of the socket API.  We allow either a char or an int.
1279		 */
1280		if (sopt->sopt_valsize == 1) {
1281			u_char loop;
1282			error = sooptcopyin(sopt, &loop, 1, 1);
1283			if (error)
1284				break;
1285			imo = ip_findmoptions(inp);
1286			imo->imo_multicast_loop = !!loop;
1287			INP_UNLOCK(inp);
1288		} else {
1289			u_int loop;
1290			error = sooptcopyin(sopt, &loop, sizeof loop,
1291					    sizeof loop);
1292			if (error)
1293				break;
1294			imo = ip_findmoptions(inp);
1295			imo->imo_multicast_loop = !!loop;
1296			INP_UNLOCK(inp);
1297		}
1298		break;
1299
1300	case IP_ADD_MEMBERSHIP:
1301		/*
1302		 * Add a multicast group membership.
1303		 * Group must be a valid IP multicast address.
1304		 */
1305		error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq);
1306		if (error)
1307			break;
1308
1309		if (!IN_MULTICAST(ntohl(mreq.imr_multiaddr.s_addr))) {
1310			error = EINVAL;
1311			break;
1312		}
1313		s = splimp();
1314		/*
1315		 * If no interface address was provided, use the interface of
1316		 * the route to the given multicast address.
1317		 */
1318		if (mreq.imr_interface.s_addr == INADDR_ANY) {
1319			bzero((caddr_t)&ro, sizeof(ro));
1320			dst = (struct sockaddr_in *)&ro.ro_dst;
1321			dst->sin_len = sizeof(*dst);
1322			dst->sin_family = AF_INET;
1323			dst->sin_addr = mreq.imr_multiaddr;
1324			rtalloc_ign(&ro, RTF_CLONING);
1325			if (ro.ro_rt == NULL) {
1326				error = EADDRNOTAVAIL;
1327				splx(s);
1328				break;
1329			}
1330			ifp = ro.ro_rt->rt_ifp;
1331			RTFREE(ro.ro_rt);
1332		}
1333		else {
1334			ifp = ip_multicast_if(&mreq.imr_interface, NULL);
1335		}
1336
1337		/*
1338		 * See if we found an interface, and confirm that it
1339		 * supports multicast.
1340		 */
1341		if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) {
1342			error = EADDRNOTAVAIL;
1343			splx(s);
1344			break;
1345		}
1346		/*
1347		 * See if the membership already exists or if all the
1348		 * membership slots are full.
1349		 */
1350		imo = ip_findmoptions(inp);
1351		for (i = 0; i < imo->imo_num_memberships; ++i) {
1352			if (imo->imo_membership[i]->inm_ifp == ifp &&
1353			    imo->imo_membership[i]->inm_addr.s_addr
1354						== mreq.imr_multiaddr.s_addr)
1355				break;
1356		}
1357		if (i < imo->imo_num_memberships) {
1358			INP_UNLOCK(inp);
1359			error = EADDRINUSE;
1360			splx(s);
1361			break;
1362		}
1363		if (i == IP_MAX_MEMBERSHIPS) {
1364			INP_UNLOCK(inp);
1365			error = ETOOMANYREFS;
1366			splx(s);
1367			break;
1368		}
1369		/*
1370		 * Everything looks good; add a new record to the multicast
1371		 * address list for the given interface.
1372		 */
1373		if ((imo->imo_membership[i] =
1374		    in_addmulti(&mreq.imr_multiaddr, ifp)) == NULL) {
1375			INP_UNLOCK(inp);
1376			error = ENOBUFS;
1377			splx(s);
1378			break;
1379		}
1380		++imo->imo_num_memberships;
1381		INP_UNLOCK(inp);
1382		splx(s);
1383		break;
1384
1385	case IP_DROP_MEMBERSHIP:
1386		/*
1387		 * Drop a multicast group membership.
1388		 * Group must be a valid IP multicast address.
1389		 */
1390		error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq);
1391		if (error)
1392			break;
1393
1394		if (!IN_MULTICAST(ntohl(mreq.imr_multiaddr.s_addr))) {
1395			error = EINVAL;
1396			break;
1397		}
1398
1399		s = splimp();
1400		/*
1401		 * If an interface address was specified, get a pointer
1402		 * to its ifnet structure.
1403		 */
1404		if (mreq.imr_interface.s_addr == INADDR_ANY)
1405			ifp = NULL;
1406		else {
1407			ifp = ip_multicast_if(&mreq.imr_interface, NULL);
1408			if (ifp == NULL) {
1409				error = EADDRNOTAVAIL;
1410				splx(s);
1411				break;
1412			}
1413		}
1414		/*
1415		 * Find the membership in the membership array.
1416		 */
1417		imo = ip_findmoptions(inp);
1418		for (i = 0; i < imo->imo_num_memberships; ++i) {
1419			if ((ifp == NULL ||
1420			     imo->imo_membership[i]->inm_ifp == ifp) &&
1421			     imo->imo_membership[i]->inm_addr.s_addr ==
1422			     mreq.imr_multiaddr.s_addr)
1423				break;
1424		}
1425		if (i == imo->imo_num_memberships) {
1426			INP_UNLOCK(inp);
1427			error = EADDRNOTAVAIL;
1428			splx(s);
1429			break;
1430		}
1431		/*
1432		 * Give up the multicast address record to which the
1433		 * membership points.
1434		 */
1435		in_delmulti(imo->imo_membership[i]);
1436		/*
1437		 * Remove the gap in the membership array.
1438		 */
1439		for (++i; i < imo->imo_num_memberships; ++i)
1440			imo->imo_membership[i-1] = imo->imo_membership[i];
1441		--imo->imo_num_memberships;
1442		INP_UNLOCK(inp);
1443		splx(s);
1444		break;
1445
1446	default:
1447		error = EOPNOTSUPP;
1448		break;
1449	}
1450
1451	return (error);
1452}
1453
1454/*
1455 * Return the IP multicast options in response to user getsockopt().
1456 */
1457static int
1458ip_getmoptions(struct inpcb *inp, struct sockopt *sopt)
1459{
1460	struct ip_moptions *imo;
1461	struct in_addr addr;
1462	struct in_ifaddr *ia;
1463	int error, optval;
1464	u_char coptval;
1465
1466	INP_LOCK(inp);
1467	imo = inp->inp_moptions;
1468
1469	error = 0;
1470	switch (sopt->sopt_name) {
1471	case IP_MULTICAST_VIF:
1472		if (imo != NULL)
1473			optval = imo->imo_multicast_vif;
1474		else
1475			optval = -1;
1476		INP_UNLOCK(inp);
1477		error = sooptcopyout(sopt, &optval, sizeof optval);
1478		break;
1479
1480	case IP_MULTICAST_IF:
1481		if (imo == NULL || imo->imo_multicast_ifp == NULL)
1482			addr.s_addr = INADDR_ANY;
1483		else if (imo->imo_multicast_addr.s_addr) {
1484			/* return the value user has set */
1485			addr = imo->imo_multicast_addr;
1486		} else {
1487			IFP_TO_IA(imo->imo_multicast_ifp, ia);
1488			addr.s_addr = (ia == NULL) ? INADDR_ANY
1489				: IA_SIN(ia)->sin_addr.s_addr;
1490		}
1491		INP_UNLOCK(inp);
1492		error = sooptcopyout(sopt, &addr, sizeof addr);
1493		break;
1494
1495	case IP_MULTICAST_TTL:
1496		if (imo == 0)
1497			optval = coptval = IP_DEFAULT_MULTICAST_TTL;
1498		else
1499			optval = coptval = imo->imo_multicast_ttl;
1500		INP_UNLOCK(inp);
1501		if (sopt->sopt_valsize == 1)
1502			error = sooptcopyout(sopt, &coptval, 1);
1503		else
1504			error = sooptcopyout(sopt, &optval, sizeof optval);
1505		break;
1506
1507	case IP_MULTICAST_LOOP:
1508		if (imo == 0)
1509			optval = coptval = IP_DEFAULT_MULTICAST_LOOP;
1510		else
1511			optval = coptval = imo->imo_multicast_loop;
1512		INP_UNLOCK(inp);
1513		if (sopt->sopt_valsize == 1)
1514			error = sooptcopyout(sopt, &coptval, 1);
1515		else
1516			error = sooptcopyout(sopt, &optval, sizeof optval);
1517		break;
1518
1519	default:
1520		INP_UNLOCK(inp);
1521		error = ENOPROTOOPT;
1522		break;
1523	}
1524	INP_UNLOCK_ASSERT(inp);
1525
1526	return (error);
1527}
1528
1529/*
1530 * Discard the IP multicast options.
1531 */
1532void
1533ip_freemoptions(imo)
1534	register struct ip_moptions *imo;
1535{
1536	register int i;
1537
1538	if (imo != NULL) {
1539		for (i = 0; i < imo->imo_num_memberships; ++i)
1540			in_delmulti(imo->imo_membership[i]);
1541		free(imo, M_IPMOPTS);
1542	}
1543}
1544
1545/*
1546 * Routine called from ip_output() to loop back a copy of an IP multicast
1547 * packet to the input queue of a specified interface.  Note that this
1548 * calls the output routine of the loopback "driver", but with an interface
1549 * pointer that might NOT be a loopback interface -- evil, but easier than
1550 * replicating that code here.
1551 */
1552static void
1553ip_mloopback(ifp, m, dst, hlen)
1554	struct ifnet *ifp;
1555	register struct mbuf *m;
1556	register struct sockaddr_in *dst;
1557	int hlen;
1558{
1559	register struct ip *ip;
1560	struct mbuf *copym;
1561
1562	copym = m_copy(m, 0, M_COPYALL);
1563	if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < hlen))
1564		copym = m_pullup(copym, hlen);
1565	if (copym != NULL) {
1566		/* If needed, compute the checksum and mark it as valid. */
1567		if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
1568			in_delayed_cksum(copym);
1569			copym->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
1570			copym->m_pkthdr.csum_flags |=
1571			    CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
1572			copym->m_pkthdr.csum_data = 0xffff;
1573		}
1574		/*
1575		 * We don't bother to fragment if the IP length is greater
1576		 * than the interface's MTU.  Can this possibly matter?
1577		 */
1578		ip = mtod(copym, struct ip *);
1579		ip->ip_len = htons(ip->ip_len);
1580		ip->ip_off = htons(ip->ip_off);
1581		ip->ip_sum = 0;
1582		ip->ip_sum = in_cksum(copym, hlen);
1583		/*
1584		 * NB:
1585		 * It's not clear whether there are any lingering
1586		 * reentrancy problems in other areas which might
1587		 * be exposed by using ip_input directly (in
1588		 * particular, everything which modifies the packet
1589		 * in-place).  Yet another option is using the
1590		 * protosw directly to deliver the looped back
1591		 * packet.  For the moment, we'll err on the side
1592		 * of safety by using if_simloop().
1593		 */
1594#if 1 /* XXX */
1595		if (dst->sin_family != AF_INET) {
1596			printf("ip_mloopback: bad address family %d\n",
1597						dst->sin_family);
1598			dst->sin_family = AF_INET;
1599		}
1600#endif
1601
1602#ifdef notdef
1603		copym->m_pkthdr.rcvif = ifp;
1604		ip_input(copym);
1605#else
1606		if_simloop(ifp, copym, dst->sin_family, 0);
1607#endif
1608	}
1609}
1610