ip_output.c revision 161380
1/*-
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 4. Neither the name of the University nor the names of its contributors
14 *    may be used to endorse or promote products derived from this software
15 *    without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 *
29 *	@(#)ip_output.c	8.3 (Berkeley) 1/21/94
30 * $FreeBSD: head/sys/netinet/ip_output.c 161380 2006-08-17 00:37:03Z julian $
31 */
32
33#include "opt_ipfw.h"
34#include "opt_ipsec.h"
35#include "opt_mac.h"
36#include "opt_mbuf_stress_test.h"
37
38#include <sys/param.h>
39#include <sys/systm.h>
40#include <sys/kernel.h>
41#include <sys/mac.h>
42#include <sys/malloc.h>
43#include <sys/mbuf.h>
44#include <sys/protosw.h>
45#include <sys/socket.h>
46#include <sys/socketvar.h>
47#include <sys/sysctl.h>
48
49#include <net/if.h>
50#include <net/netisr.h>
51#include <net/pfil.h>
52#include <net/route.h>
53
54#include <netinet/in.h>
55#include <netinet/in_systm.h>
56#include <netinet/ip.h>
57#include <netinet/in_pcb.h>
58#include <netinet/in_var.h>
59#include <netinet/ip_var.h>
60#include <netinet/ip_options.h>
61
62#if defined(IPSEC) || defined(FAST_IPSEC)
63#include <netinet/ip_ipsec.h>
64#ifdef IPSEC
65#include <netinet6/ipsec.h>
66#endif
67#ifdef FAST_IPSEC
68#include <netipsec/ipsec.h>
69#endif
70#endif /*IPSEC*/
71
72#include <machine/in_cksum.h>
73
74static MALLOC_DEFINE(M_IPMOPTS, "ip_moptions", "internet multicast options");
75
76#define print_ip(x, a, y)	 printf("%s %d.%d.%d.%d%s",\
77				x, (ntohl(a.s_addr)>>24)&0xFF,\
78				  (ntohl(a.s_addr)>>16)&0xFF,\
79				  (ntohl(a.s_addr)>>8)&0xFF,\
80				  (ntohl(a.s_addr))&0xFF, y);
81
82u_short ip_id;
83
84#ifdef MBUF_STRESS_TEST
85int mbuf_frag_size = 0;
86SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW,
87	&mbuf_frag_size, 0, "Fragment outgoing mbufs to this size");
88#endif
89
90static struct ifnet *ip_multicast_if(struct in_addr *, int *);
91static void	ip_mloopback
92	(struct ifnet *, struct mbuf *, struct sockaddr_in *, int);
93static int	ip_getmoptions(struct inpcb *, struct sockopt *);
94static int	ip_setmoptions(struct inpcb *, struct sockopt *);
95
96
97extern	struct protosw inetsw[];
98
99/*
100 * IP output.  The packet in mbuf chain m contains a skeletal IP
101 * header (with len, off, ttl, proto, tos, src, dst).
102 * The mbuf chain containing the packet will be freed.
103 * The mbuf opt, if present, will not be freed.
104 * In the IP forwarding case, the packet will arrive with options already
105 * inserted, so must have a NULL opt pointer.
106 */
107int
108ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro,
109	int flags, struct ip_moptions *imo, struct inpcb *inp)
110{
111	struct ip *ip;
112	struct ifnet *ifp = NULL;	/* keep compiler happy */
113	struct mbuf *m0;
114	int hlen = sizeof (struct ip);
115	int len, error = 0;
116	struct sockaddr_in *dst = NULL;	/* keep compiler happy */
117	struct in_ifaddr *ia = NULL;
118	int isbroadcast, sw_csum;
119	struct route iproute;
120	struct in_addr odst;
121#ifdef IPFIREWALL_FORWARD
122	struct m_tag *fwd_tag = NULL;
123#endif
124	M_ASSERTPKTHDR(m);
125
126	if (ro == NULL) {
127		ro = &iproute;
128		bzero(ro, sizeof (*ro));
129	}
130
131	if (inp != NULL)
132		INP_LOCK_ASSERT(inp);
133
134	if (opt) {
135		len = 0;
136		m = ip_insertoptions(m, opt, &len);
137		if (len != 0)
138			hlen = len;
139	}
140	ip = mtod(m, struct ip *);
141
142	/*
143	 * Fill in IP header.  If we are not allowing fragmentation,
144	 * then the ip_id field is meaningless, but we don't set it
145	 * to zero.  Doing so causes various problems when devices along
146	 * the path (routers, load balancers, firewalls, etc.) illegally
147	 * disable DF on our packet.  Note that a 16-bit counter
148	 * will wrap around in less than 10 seconds at 100 Mbit/s on a
149	 * medium with MTU 1500.  See Steven M. Bellovin, "A Technique
150	 * for Counting NATted Hosts", Proc. IMW'02, available at
151	 * <http://www.cs.columbia.edu/~smb/papers/fnat.pdf>.
152	 */
153	if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
154		ip->ip_v = IPVERSION;
155		ip->ip_hl = hlen >> 2;
156		ip->ip_id = ip_newid();
157		ipstat.ips_localout++;
158	} else {
159		hlen = ip->ip_hl << 2;
160	}
161
162	dst = (struct sockaddr_in *)&ro->ro_dst;
163again:
164	/*
165	 * If there is a cached route,
166	 * check that it is to the same destination
167	 * and is still up.  If not, free it and try again.
168	 * The address family should also be checked in case of sharing the
169	 * cache with IPv6.
170	 */
171	if (ro->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 ||
172			  dst->sin_family != AF_INET ||
173			  dst->sin_addr.s_addr != ip->ip_dst.s_addr)) {
174		RTFREE(ro->ro_rt);
175		ro->ro_rt = (struct rtentry *)0;
176	}
177#ifdef IPFIREWALL_FORWARD
178	if (ro->ro_rt == NULL && fwd_tag == NULL) {
179#else
180	if (ro->ro_rt == NULL) {
181#endif
182		bzero(dst, sizeof(*dst));
183		dst->sin_family = AF_INET;
184		dst->sin_len = sizeof(*dst);
185		dst->sin_addr = ip->ip_dst;
186	}
187	/*
188	 * If routing to interface only,
189	 * short circuit routing lookup.
190	 */
191	if (flags & IP_ROUTETOIF) {
192		if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL &&
193		    (ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == NULL) {
194			ipstat.ips_noroute++;
195			error = ENETUNREACH;
196			goto bad;
197		}
198		ifp = ia->ia_ifp;
199		ip->ip_ttl = 1;
200		isbroadcast = in_broadcast(dst->sin_addr, ifp);
201	} else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) &&
202	    imo != NULL && imo->imo_multicast_ifp != NULL) {
203		/*
204		 * Bypass the normal routing lookup for multicast
205		 * packets if the interface is specified.
206		 */
207		ifp = imo->imo_multicast_ifp;
208		IFP_TO_IA(ifp, ia);
209		isbroadcast = 0;	/* fool gcc */
210	} else {
211		/*
212		 * We want to do any cloning requested by the link layer,
213		 * as this is probably required in all cases for correct
214		 * operation (as it is for ARP).
215		 */
216		if (ro->ro_rt == NULL)
217			rtalloc_ign(ro, 0);
218		if (ro->ro_rt == NULL) {
219			ipstat.ips_noroute++;
220			error = EHOSTUNREACH;
221			goto bad;
222		}
223		ia = ifatoia(ro->ro_rt->rt_ifa);
224		ifp = ro->ro_rt->rt_ifp;
225		ro->ro_rt->rt_rmx.rmx_pksent++;
226		if (ro->ro_rt->rt_flags & RTF_GATEWAY)
227			dst = (struct sockaddr_in *)ro->ro_rt->rt_gateway;
228		if (ro->ro_rt->rt_flags & RTF_HOST)
229			isbroadcast = (ro->ro_rt->rt_flags & RTF_BROADCAST);
230		else
231			isbroadcast = in_broadcast(dst->sin_addr, ifp);
232	}
233	if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
234		struct in_multi *inm;
235
236		m->m_flags |= M_MCAST;
237		/*
238		 * IP destination address is multicast.  Make sure "dst"
239		 * still points to the address in "ro".  (It may have been
240		 * changed to point to a gateway address, above.)
241		 */
242		dst = (struct sockaddr_in *)&ro->ro_dst;
243		/*
244		 * See if the caller provided any multicast options
245		 */
246		if (imo != NULL) {
247			ip->ip_ttl = imo->imo_multicast_ttl;
248			if (imo->imo_multicast_vif != -1)
249				ip->ip_src.s_addr =
250				    ip_mcast_src ?
251				    ip_mcast_src(imo->imo_multicast_vif) :
252				    INADDR_ANY;
253		} else
254			ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL;
255		/*
256		 * Confirm that the outgoing interface supports multicast.
257		 */
258		if ((imo == NULL) || (imo->imo_multicast_vif == -1)) {
259			if ((ifp->if_flags & IFF_MULTICAST) == 0) {
260				ipstat.ips_noroute++;
261				error = ENETUNREACH;
262				goto bad;
263			}
264		}
265		/*
266		 * If source address not specified yet, use address
267		 * of outgoing interface.
268		 */
269		if (ip->ip_src.s_addr == INADDR_ANY) {
270			/* Interface may have no addresses. */
271			if (ia != NULL)
272				ip->ip_src = IA_SIN(ia)->sin_addr;
273		}
274
275		IN_MULTI_LOCK();
276		IN_LOOKUP_MULTI(ip->ip_dst, ifp, inm);
277		if (inm != NULL &&
278		   (imo == NULL || imo->imo_multicast_loop)) {
279			IN_MULTI_UNLOCK();
280			/*
281			 * If we belong to the destination multicast group
282			 * on the outgoing interface, and the caller did not
283			 * forbid loopback, loop back a copy.
284			 */
285			ip_mloopback(ifp, m, dst, hlen);
286		}
287		else {
288			IN_MULTI_UNLOCK();
289			/*
290			 * If we are acting as a multicast router, perform
291			 * multicast forwarding as if the packet had just
292			 * arrived on the interface to which we are about
293			 * to send.  The multicast forwarding function
294			 * recursively calls this function, using the
295			 * IP_FORWARDING flag to prevent infinite recursion.
296			 *
297			 * Multicasts that are looped back by ip_mloopback(),
298			 * above, will be forwarded by the ip_input() routine,
299			 * if necessary.
300			 */
301			if (ip_mrouter && (flags & IP_FORWARDING) == 0) {
302				/*
303				 * If rsvp daemon is not running, do not
304				 * set ip_moptions. This ensures that the packet
305				 * is multicast and not just sent down one link
306				 * as prescribed by rsvpd.
307				 */
308				if (!rsvp_on)
309					imo = NULL;
310				if (ip_mforward &&
311				    ip_mforward(ip, ifp, m, imo) != 0) {
312					m_freem(m);
313					goto done;
314				}
315			}
316		}
317
318		/*
319		 * Multicasts with a time-to-live of zero may be looped-
320		 * back, above, but must not be transmitted on a network.
321		 * Also, multicasts addressed to the loopback interface
322		 * are not sent -- the above call to ip_mloopback() will
323		 * loop back a copy if this host actually belongs to the
324		 * destination group on the loopback interface.
325		 */
326		if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) {
327			m_freem(m);
328			goto done;
329		}
330
331		goto sendit;
332	}
333#ifndef notdef
334	/*
335	 * If the source address is not specified yet, use the address
336	 * of the outoing interface.
337	 */
338	if (ip->ip_src.s_addr == INADDR_ANY) {
339		/* Interface may have no addresses. */
340		if (ia != NULL) {
341			ip->ip_src = IA_SIN(ia)->sin_addr;
342		}
343	}
344#endif /* notdef */
345	/*
346	 * Verify that we have any chance at all of being able to queue the
347	 * packet or packet fragments, unless ALTQ is enabled on the given
348	 * interface in which case packetdrop should be done by queueing.
349	 */
350#ifdef ALTQ
351	if ((!ALTQ_IS_ENABLED(&ifp->if_snd)) &&
352	    ((ifp->if_snd.ifq_len + ip->ip_len / ifp->if_mtu + 1) >=
353	    ifp->if_snd.ifq_maxlen))
354#else
355	if ((ifp->if_snd.ifq_len + ip->ip_len / ifp->if_mtu + 1) >=
356	    ifp->if_snd.ifq_maxlen)
357#endif /* ALTQ */
358	{
359		error = ENOBUFS;
360		ipstat.ips_odropped++;
361		ifp->if_snd.ifq_drops += (ip->ip_len / ifp->if_mtu + 1);
362		goto bad;
363	}
364
365	/*
366	 * Look for broadcast address and
367	 * verify user is allowed to send
368	 * such a packet.
369	 */
370	if (isbroadcast) {
371		if ((ifp->if_flags & IFF_BROADCAST) == 0) {
372			error = EADDRNOTAVAIL;
373			goto bad;
374		}
375		if ((flags & IP_ALLOWBROADCAST) == 0) {
376			error = EACCES;
377			goto bad;
378		}
379		/* don't allow broadcast messages to be fragmented */
380		if (ip->ip_len > ifp->if_mtu) {
381			error = EMSGSIZE;
382			goto bad;
383		}
384		if (flags & IP_SENDONES)
385			ip->ip_dst.s_addr = INADDR_BROADCAST;
386		m->m_flags |= M_BCAST;
387	} else {
388		m->m_flags &= ~M_BCAST;
389	}
390
391sendit:
392#if defined(IPSEC) || defined(FAST_IPSEC)
393	switch(ip_ipsec_output(&m, inp, &flags, &error, &ro, &iproute, &dst, &ia, &ifp)) {
394	case 1:
395		goto bad;
396	case -1:
397		goto done;
398	case 0:
399	default:
400		break;	/* Continue with packet processing. */
401	}
402	/* Update variables that are affected by ipsec4_output(). */
403	ip = mtod(m, struct ip *);
404	hlen = ip->ip_hl << 2;
405#endif /* IPSEC */
406
407	/* Jump over all PFIL processing if hooks are not active. */
408	if (!PFIL_HOOKED(&inet_pfil_hook))
409		goto passout;
410
411	/* Run through list of hooks for output packets. */
412	odst.s_addr = ip->ip_dst.s_addr;
413	error = pfil_run_hooks(&inet_pfil_hook, &m, ifp, PFIL_OUT, inp);
414	if (error != 0 || m == NULL)
415		goto done;
416
417	ip = mtod(m, struct ip *);
418
419	/* See if destination IP address was changed by packet filter. */
420	if (odst.s_addr != ip->ip_dst.s_addr) {
421		m->m_flags |= M_SKIP_FIREWALL;
422		/* If destination is now ourself drop to ip_input(). */
423		if (in_localip(ip->ip_dst)) {
424			m->m_flags |= M_FASTFWD_OURS;
425			if (m->m_pkthdr.rcvif == NULL)
426				m->m_pkthdr.rcvif = loif;
427			if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
428				m->m_pkthdr.csum_flags |=
429				    CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
430				m->m_pkthdr.csum_data = 0xffff;
431			}
432			m->m_pkthdr.csum_flags |=
433			    CSUM_IP_CHECKED | CSUM_IP_VALID;
434
435			error = netisr_queue(NETISR_IP, m);
436			goto done;
437		} else
438			goto again;	/* Redo the routing table lookup. */
439	}
440
441#ifdef IPFIREWALL_FORWARD
442	/* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */
443	if (m->m_flags & M_FASTFWD_OURS) {
444		if (m->m_pkthdr.rcvif == NULL)
445			m->m_pkthdr.rcvif = loif;
446		if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
447			m->m_pkthdr.csum_flags |=
448			    CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
449			m->m_pkthdr.csum_data = 0xffff;
450		}
451		m->m_pkthdr.csum_flags |=
452			    CSUM_IP_CHECKED | CSUM_IP_VALID;
453
454		error = netisr_queue(NETISR_IP, m);
455		goto done;
456	}
457	/* Or forward to some other address? */
458	fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL);
459	if (fwd_tag) {
460		dst = (struct sockaddr_in *)&ro->ro_dst;
461		bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in));
462		m->m_flags |= M_SKIP_FIREWALL;
463		m_tag_delete(m, fwd_tag);
464		goto again;
465	}
466#endif /* IPFIREWALL_FORWARD */
467
468passout:
469	/* 127/8 must not appear on wire - RFC1122. */
470	if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
471	    (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
472		if ((ifp->if_flags & IFF_LOOPBACK) == 0) {
473			ipstat.ips_badaddr++;
474			error = EADDRNOTAVAIL;
475			goto bad;
476		}
477	}
478
479	m->m_pkthdr.csum_flags |= CSUM_IP;
480	sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_hwassist;
481	if (sw_csum & CSUM_DELAY_DATA) {
482		in_delayed_cksum(m);
483		sw_csum &= ~CSUM_DELAY_DATA;
484	}
485	m->m_pkthdr.csum_flags &= ifp->if_hwassist;
486
487	/*
488	 * If small enough for interface, or the interface will take
489	 * care of the fragmentation for us, can just send directly.
490	 */
491	if (ip->ip_len <= ifp->if_mtu || (ifp->if_hwassist & CSUM_FRAGMENT &&
492	    ((ip->ip_off & IP_DF) == 0))) {
493		ip->ip_len = htons(ip->ip_len);
494		ip->ip_off = htons(ip->ip_off);
495		ip->ip_sum = 0;
496		if (sw_csum & CSUM_DELAY_IP)
497			ip->ip_sum = in_cksum(m, hlen);
498
499		/* Record statistics for this interface address. */
500		if (!(flags & IP_FORWARDING) && ia) {
501			ia->ia_ifa.if_opackets++;
502			ia->ia_ifa.if_obytes += m->m_pkthdr.len;
503		}
504#ifdef IPSEC
505		/* clean ipsec history once it goes out of the node */
506		ipsec_delaux(m);
507#endif
508#ifdef MBUF_STRESS_TEST
509		if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size)
510			m = m_fragment(m, M_DONTWAIT, mbuf_frag_size);
511#endif
512		/*
513		 * Reset layer specific mbuf flags
514		 * to avoid confusing lower layers.
515		 */
516		m->m_flags &= ~(M_PROTOFLAGS);
517
518		error = (*ifp->if_output)(ifp, m,
519				(struct sockaddr *)dst, ro->ro_rt);
520		goto done;
521	}
522
523	if (ip->ip_off & IP_DF) {
524		error = EMSGSIZE;
525		/*
526		 * This case can happen if the user changed the MTU
527		 * of an interface after enabling IP on it.  Because
528		 * most netifs don't keep track of routes pointing to
529		 * them, there is no way for one to update all its
530		 * routes when the MTU is changed.
531		 */
532		if (ro != NULL &&
533		    (ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST)) &&
534		    (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)) {
535			ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu;
536		}
537		ipstat.ips_cantfrag++;
538		goto bad;
539	}
540
541	/*
542	 * Too large for interface; fragment if possible. If successful,
543	 * on return, m will point to a list of packets to be sent.
544	 */
545	error = ip_fragment(ip, &m, ifp->if_mtu, ifp->if_hwassist, sw_csum);
546	if (error)
547		goto bad;
548	for (; m; m = m0) {
549		m0 = m->m_nextpkt;
550		m->m_nextpkt = 0;
551#ifdef IPSEC
552		/* clean ipsec history once it goes out of the node */
553		ipsec_delaux(m);
554#endif
555		if (error == 0) {
556			/* Record statistics for this interface address. */
557			if (ia != NULL) {
558				ia->ia_ifa.if_opackets++;
559				ia->ia_ifa.if_obytes += m->m_pkthdr.len;
560			}
561			/*
562			 * Reset layer specific mbuf flags
563			 * to avoid confusing upper layers.
564			 */
565			m->m_flags &= ~(M_PROTOFLAGS);
566
567			error = (*ifp->if_output)(ifp, m,
568			    (struct sockaddr *)dst, ro->ro_rt);
569		} else
570			m_freem(m);
571	}
572
573	if (error == 0)
574		ipstat.ips_fragmented++;
575
576done:
577	if (ro == &iproute && ro->ro_rt) {
578		RTFREE(ro->ro_rt);
579	}
580	return (error);
581bad:
582	m_freem(m);
583	goto done;
584}
585
586/*
587 * Create a chain of fragments which fit the given mtu. m_frag points to the
588 * mbuf to be fragmented; on return it points to the chain with the fragments.
589 * Return 0 if no error. If error, m_frag may contain a partially built
590 * chain of fragments that should be freed by the caller.
591 *
592 * if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist)
593 * sw_csum contains the delayed checksums flags (e.g., CSUM_DELAY_IP).
594 */
595int
596ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu,
597	    u_long if_hwassist_flags, int sw_csum)
598{
599	int error = 0;
600	int hlen = ip->ip_hl << 2;
601	int len = (mtu - hlen) & ~7;	/* size of payload in each fragment */
602	int off;
603	struct mbuf *m0 = *m_frag;	/* the original packet		*/
604	int firstlen;
605	struct mbuf **mnext;
606	int nfrags;
607
608	if (ip->ip_off & IP_DF) {	/* Fragmentation not allowed */
609		ipstat.ips_cantfrag++;
610		return EMSGSIZE;
611	}
612
613	/*
614	 * Must be able to put at least 8 bytes per fragment.
615	 */
616	if (len < 8)
617		return EMSGSIZE;
618
619	/*
620	 * If the interface will not calculate checksums on
621	 * fragmented packets, then do it here.
622	 */
623	if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA &&
624	    (if_hwassist_flags & CSUM_IP_FRAGS) == 0) {
625		in_delayed_cksum(m0);
626		m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
627	}
628
629	if (len > PAGE_SIZE) {
630		/*
631		 * Fragment large datagrams such that each segment
632		 * contains a multiple of PAGE_SIZE amount of data,
633		 * plus headers. This enables a receiver to perform
634		 * page-flipping zero-copy optimizations.
635		 *
636		 * XXX When does this help given that sender and receiver
637		 * could have different page sizes, and also mtu could
638		 * be less than the receiver's page size ?
639		 */
640		int newlen;
641		struct mbuf *m;
642
643		for (m = m0, off = 0; m && (off+m->m_len) <= mtu; m = m->m_next)
644			off += m->m_len;
645
646		/*
647		 * firstlen (off - hlen) must be aligned on an
648		 * 8-byte boundary
649		 */
650		if (off < hlen)
651			goto smart_frag_failure;
652		off = ((off - hlen) & ~7) + hlen;
653		newlen = (~PAGE_MASK) & mtu;
654		if ((newlen + sizeof (struct ip)) > mtu) {
655			/* we failed, go back the default */
656smart_frag_failure:
657			newlen = len;
658			off = hlen + len;
659		}
660		len = newlen;
661
662	} else {
663		off = hlen + len;
664	}
665
666	firstlen = off - hlen;
667	mnext = &m0->m_nextpkt;		/* pointer to next packet */
668
669	/*
670	 * Loop through length of segment after first fragment,
671	 * make new header and copy data of each part and link onto chain.
672	 * Here, m0 is the original packet, m is the fragment being created.
673	 * The fragments are linked off the m_nextpkt of the original
674	 * packet, which after processing serves as the first fragment.
675	 */
676	for (nfrags = 1; off < ip->ip_len; off += len, nfrags++) {
677		struct ip *mhip;	/* ip header on the fragment */
678		struct mbuf *m;
679		int mhlen = sizeof (struct ip);
680
681		MGETHDR(m, M_DONTWAIT, MT_DATA);
682		if (m == NULL) {
683			error = ENOBUFS;
684			ipstat.ips_odropped++;
685			goto done;
686		}
687		m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG;
688		/*
689		 * In the first mbuf, leave room for the link header, then
690		 * copy the original IP header including options. The payload
691		 * goes into an additional mbuf chain returned by m_copy().
692		 */
693		m->m_data += max_linkhdr;
694		mhip = mtod(m, struct ip *);
695		*mhip = *ip;
696		if (hlen > sizeof (struct ip)) {
697			mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip);
698			mhip->ip_v = IPVERSION;
699			mhip->ip_hl = mhlen >> 2;
700		}
701		m->m_len = mhlen;
702		/* XXX do we need to add ip->ip_off below ? */
703		mhip->ip_off = ((off - hlen) >> 3) + ip->ip_off;
704		if (off + len >= ip->ip_len) {	/* last fragment */
705			len = ip->ip_len - off;
706			m->m_flags |= M_LASTFRAG;
707		} else
708			mhip->ip_off |= IP_MF;
709		mhip->ip_len = htons((u_short)(len + mhlen));
710		m->m_next = m_copy(m0, off, len);
711		if (m->m_next == NULL) {	/* copy failed */
712			m_free(m);
713			error = ENOBUFS;	/* ??? */
714			ipstat.ips_odropped++;
715			goto done;
716		}
717		m->m_pkthdr.len = mhlen + len;
718		m->m_pkthdr.rcvif = NULL;
719#ifdef MAC
720		mac_create_fragment(m0, m);
721#endif
722		m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags;
723		mhip->ip_off = htons(mhip->ip_off);
724		mhip->ip_sum = 0;
725		if (sw_csum & CSUM_DELAY_IP)
726			mhip->ip_sum = in_cksum(m, mhlen);
727		*mnext = m;
728		mnext = &m->m_nextpkt;
729	}
730	ipstat.ips_ofragments += nfrags;
731
732	/* set first marker for fragment chain */
733	m0->m_flags |= M_FIRSTFRAG | M_FRAG;
734	m0->m_pkthdr.csum_data = nfrags;
735
736	/*
737	 * Update first fragment by trimming what's been copied out
738	 * and updating header.
739	 */
740	m_adj(m0, hlen + firstlen - ip->ip_len);
741	m0->m_pkthdr.len = hlen + firstlen;
742	ip->ip_len = htons((u_short)m0->m_pkthdr.len);
743	ip->ip_off |= IP_MF;
744	ip->ip_off = htons(ip->ip_off);
745	ip->ip_sum = 0;
746	if (sw_csum & CSUM_DELAY_IP)
747		ip->ip_sum = in_cksum(m0, hlen);
748
749done:
750	*m_frag = m0;
751	return error;
752}
753
754void
755in_delayed_cksum(struct mbuf *m)
756{
757	struct ip *ip;
758	u_short csum, offset;
759
760	ip = mtod(m, struct ip *);
761	offset = ip->ip_hl << 2 ;
762	csum = in_cksum_skip(m, ip->ip_len, offset);
763	if (m->m_pkthdr.csum_flags & CSUM_UDP && csum == 0)
764		csum = 0xffff;
765	offset += m->m_pkthdr.csum_data;	/* checksum offset */
766
767	if (offset + sizeof(u_short) > m->m_len) {
768		printf("delayed m_pullup, m->len: %d  off: %d  p: %d\n",
769		    m->m_len, offset, ip->ip_p);
770		/*
771		 * XXX
772		 * this shouldn't happen, but if it does, the
773		 * correct behavior may be to insert the checksum
774		 * in the appropriate next mbuf in the chain.
775		 */
776		return;
777	}
778	*(u_short *)(m->m_data + offset) = csum;
779}
780
781/*
782 * IP socket option processing.
783 */
784int
785ip_ctloutput(so, sopt)
786	struct socket *so;
787	struct sockopt *sopt;
788{
789	struct	inpcb *inp = sotoinpcb(so);
790	int	error, optval;
791
792	error = optval = 0;
793	if (sopt->sopt_level != IPPROTO_IP) {
794		return (EINVAL);
795	}
796
797	switch (sopt->sopt_dir) {
798	case SOPT_SET:
799		switch (sopt->sopt_name) {
800		case IP_OPTIONS:
801#ifdef notyet
802		case IP_RETOPTS:
803#endif
804		{
805			struct mbuf *m;
806			if (sopt->sopt_valsize > MLEN) {
807				error = EMSGSIZE;
808				break;
809			}
810			MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
811			if (m == NULL) {
812				error = ENOBUFS;
813				break;
814			}
815			m->m_len = sopt->sopt_valsize;
816			error = sooptcopyin(sopt, mtod(m, char *), m->m_len,
817					    m->m_len);
818			if (error) {
819				m_free(m);
820				break;
821			}
822			INP_LOCK(inp);
823			error = ip_pcbopts(inp, sopt->sopt_name, m);
824			INP_UNLOCK(inp);
825			return (error);
826		}
827
828		case IP_TOS:
829		case IP_TTL:
830		case IP_MINTTL:
831		case IP_RECVOPTS:
832		case IP_RECVRETOPTS:
833		case IP_RECVDSTADDR:
834		case IP_RECVTTL:
835		case IP_RECVIF:
836		case IP_FAITH:
837		case IP_ONESBCAST:
838		case IP_DONTFRAG:
839			error = sooptcopyin(sopt, &optval, sizeof optval,
840					    sizeof optval);
841			if (error)
842				break;
843
844			switch (sopt->sopt_name) {
845			case IP_TOS:
846				inp->inp_ip_tos = optval;
847				break;
848
849			case IP_TTL:
850				inp->inp_ip_ttl = optval;
851				break;
852
853			case IP_MINTTL:
854				if (optval > 0 && optval <= MAXTTL)
855					inp->inp_ip_minttl = optval;
856				else
857					error = EINVAL;
858				break;
859
860#define	OPTSET(bit) do {						\
861	INP_LOCK(inp);							\
862	if (optval)							\
863		inp->inp_flags |= bit;					\
864	else								\
865		inp->inp_flags &= ~bit;					\
866	INP_UNLOCK(inp);						\
867} while (0)
868
869			case IP_RECVOPTS:
870				OPTSET(INP_RECVOPTS);
871				break;
872
873			case IP_RECVRETOPTS:
874				OPTSET(INP_RECVRETOPTS);
875				break;
876
877			case IP_RECVDSTADDR:
878				OPTSET(INP_RECVDSTADDR);
879				break;
880
881			case IP_RECVTTL:
882				OPTSET(INP_RECVTTL);
883				break;
884
885			case IP_RECVIF:
886				OPTSET(INP_RECVIF);
887				break;
888
889			case IP_FAITH:
890				OPTSET(INP_FAITH);
891				break;
892
893			case IP_ONESBCAST:
894				OPTSET(INP_ONESBCAST);
895				break;
896			case IP_DONTFRAG:
897				OPTSET(INP_DONTFRAG);
898				break;
899			}
900			break;
901#undef OPTSET
902
903		case IP_MULTICAST_IF:
904		case IP_MULTICAST_VIF:
905		case IP_MULTICAST_TTL:
906		case IP_MULTICAST_LOOP:
907		case IP_ADD_MEMBERSHIP:
908		case IP_DROP_MEMBERSHIP:
909			error = ip_setmoptions(inp, sopt);
910			break;
911
912		case IP_PORTRANGE:
913			error = sooptcopyin(sopt, &optval, sizeof optval,
914					    sizeof optval);
915			if (error)
916				break;
917
918			INP_LOCK(inp);
919			switch (optval) {
920			case IP_PORTRANGE_DEFAULT:
921				inp->inp_flags &= ~(INP_LOWPORT);
922				inp->inp_flags &= ~(INP_HIGHPORT);
923				break;
924
925			case IP_PORTRANGE_HIGH:
926				inp->inp_flags &= ~(INP_LOWPORT);
927				inp->inp_flags |= INP_HIGHPORT;
928				break;
929
930			case IP_PORTRANGE_LOW:
931				inp->inp_flags &= ~(INP_HIGHPORT);
932				inp->inp_flags |= INP_LOWPORT;
933				break;
934
935			default:
936				error = EINVAL;
937				break;
938			}
939			INP_UNLOCK(inp);
940			break;
941
942#if defined(IPSEC) || defined(FAST_IPSEC)
943		case IP_IPSEC_POLICY:
944		{
945			caddr_t req;
946			size_t len = 0;
947			int priv;
948			struct mbuf *m;
949			int optname;
950
951			if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */
952				break;
953			if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */
954				break;
955			priv = (sopt->sopt_td != NULL &&
956				suser(sopt->sopt_td) != 0) ? 0 : 1;
957			req = mtod(m, caddr_t);
958			len = m->m_len;
959			optname = sopt->sopt_name;
960			error = ipsec4_set_policy(inp, optname, req, len, priv);
961			m_freem(m);
962			break;
963		}
964#endif /*IPSEC*/
965
966		default:
967			error = ENOPROTOOPT;
968			break;
969		}
970		break;
971
972	case SOPT_GET:
973		switch (sopt->sopt_name) {
974		case IP_OPTIONS:
975		case IP_RETOPTS:
976			if (inp->inp_options)
977				error = sooptcopyout(sopt,
978						     mtod(inp->inp_options,
979							  char *),
980						     inp->inp_options->m_len);
981			else
982				sopt->sopt_valsize = 0;
983			break;
984
985		case IP_TOS:
986		case IP_TTL:
987		case IP_MINTTL:
988		case IP_RECVOPTS:
989		case IP_RECVRETOPTS:
990		case IP_RECVDSTADDR:
991		case IP_RECVTTL:
992		case IP_RECVIF:
993		case IP_PORTRANGE:
994		case IP_FAITH:
995		case IP_ONESBCAST:
996		case IP_DONTFRAG:
997			switch (sopt->sopt_name) {
998
999			case IP_TOS:
1000				optval = inp->inp_ip_tos;
1001				break;
1002
1003			case IP_TTL:
1004				optval = inp->inp_ip_ttl;
1005				break;
1006
1007			case IP_MINTTL:
1008				optval = inp->inp_ip_minttl;
1009				break;
1010
1011#define	OPTBIT(bit)	(inp->inp_flags & bit ? 1 : 0)
1012
1013			case IP_RECVOPTS:
1014				optval = OPTBIT(INP_RECVOPTS);
1015				break;
1016
1017			case IP_RECVRETOPTS:
1018				optval = OPTBIT(INP_RECVRETOPTS);
1019				break;
1020
1021			case IP_RECVDSTADDR:
1022				optval = OPTBIT(INP_RECVDSTADDR);
1023				break;
1024
1025			case IP_RECVTTL:
1026				optval = OPTBIT(INP_RECVTTL);
1027				break;
1028
1029			case IP_RECVIF:
1030				optval = OPTBIT(INP_RECVIF);
1031				break;
1032
1033			case IP_PORTRANGE:
1034				if (inp->inp_flags & INP_HIGHPORT)
1035					optval = IP_PORTRANGE_HIGH;
1036				else if (inp->inp_flags & INP_LOWPORT)
1037					optval = IP_PORTRANGE_LOW;
1038				else
1039					optval = 0;
1040				break;
1041
1042			case IP_FAITH:
1043				optval = OPTBIT(INP_FAITH);
1044				break;
1045
1046			case IP_ONESBCAST:
1047				optval = OPTBIT(INP_ONESBCAST);
1048				break;
1049			case IP_DONTFRAG:
1050				optval = OPTBIT(INP_DONTFRAG);
1051				break;
1052			}
1053			error = sooptcopyout(sopt, &optval, sizeof optval);
1054			break;
1055
1056		case IP_MULTICAST_IF:
1057		case IP_MULTICAST_VIF:
1058		case IP_MULTICAST_TTL:
1059		case IP_MULTICAST_LOOP:
1060		case IP_ADD_MEMBERSHIP:
1061		case IP_DROP_MEMBERSHIP:
1062			error = ip_getmoptions(inp, sopt);
1063			break;
1064
1065#if defined(IPSEC) || defined(FAST_IPSEC)
1066		case IP_IPSEC_POLICY:
1067		{
1068			struct mbuf *m = NULL;
1069			caddr_t req = NULL;
1070			size_t len = 0;
1071
1072			if (m != 0) {
1073				req = mtod(m, caddr_t);
1074				len = m->m_len;
1075			}
1076			error = ipsec4_get_policy(sotoinpcb(so), req, len, &m);
1077			if (error == 0)
1078				error = soopt_mcopyout(sopt, m); /* XXX */
1079			if (error == 0)
1080				m_freem(m);
1081			break;
1082		}
1083#endif /*IPSEC*/
1084
1085		default:
1086			error = ENOPROTOOPT;
1087			break;
1088		}
1089		break;
1090	}
1091	return (error);
1092}
1093
1094/*
1095 * XXX
1096 * The whole multicast option thing needs to be re-thought.
1097 * Several of these options are equally applicable to non-multicast
1098 * transmission, and one (IP_MULTICAST_TTL) totally duplicates a
1099 * standard option (IP_TTL).
1100 */
1101
1102/*
1103 * following RFC1724 section 3.3, 0.0.0.0/8 is interpreted as interface index.
1104 */
1105static struct ifnet *
1106ip_multicast_if(a, ifindexp)
1107	struct in_addr *a;
1108	int *ifindexp;
1109{
1110	int ifindex;
1111	struct ifnet *ifp;
1112
1113	if (ifindexp)
1114		*ifindexp = 0;
1115	if (ntohl(a->s_addr) >> 24 == 0) {
1116		ifindex = ntohl(a->s_addr) & 0xffffff;
1117		if (ifindex < 0 || if_index < ifindex)
1118			return NULL;
1119		ifp = ifnet_byindex(ifindex);
1120		if (ifindexp)
1121			*ifindexp = ifindex;
1122	} else {
1123		INADDR_TO_IFP(*a, ifp);
1124	}
1125	return ifp;
1126}
1127
1128/*
1129 * Given an inpcb, return its multicast options structure pointer.  Accepts
1130 * an unlocked inpcb pointer, but will return it locked.  May sleep.
1131 */
1132static struct ip_moptions *
1133ip_findmoptions(struct inpcb *inp)
1134{
1135	struct ip_moptions *imo;
1136	struct in_multi **immp;
1137
1138	INP_LOCK(inp);
1139	if (inp->inp_moptions != NULL)
1140		return (inp->inp_moptions);
1141
1142	INP_UNLOCK(inp);
1143
1144	imo = (struct ip_moptions*)malloc(sizeof(*imo), M_IPMOPTS, M_WAITOK);
1145	immp = (struct in_multi **)malloc((sizeof(*immp) * IP_MIN_MEMBERSHIPS),
1146					  M_IPMOPTS, M_WAITOK);
1147
1148	imo->imo_multicast_ifp = NULL;
1149	imo->imo_multicast_addr.s_addr = INADDR_ANY;
1150	imo->imo_multicast_vif = -1;
1151	imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
1152	imo->imo_multicast_loop = IP_DEFAULT_MULTICAST_LOOP;
1153	imo->imo_num_memberships = 0;
1154	imo->imo_max_memberships = IP_MIN_MEMBERSHIPS;
1155	imo->imo_membership = immp;
1156
1157	INP_LOCK(inp);
1158	if (inp->inp_moptions != NULL) {
1159		free(immp, M_IPMOPTS);
1160		free(imo, M_IPMOPTS);
1161		return (inp->inp_moptions);
1162	}
1163	inp->inp_moptions = imo;
1164	return (imo);
1165}
1166
1167/*
1168 * Set the IP multicast options in response to user setsockopt().
1169 */
1170static int
1171ip_setmoptions(struct inpcb *inp, struct sockopt *sopt)
1172{
1173	int error = 0;
1174	int i;
1175	struct in_addr addr;
1176	struct ip_mreq mreq;
1177	struct ifnet *ifp;
1178	struct ip_moptions *imo;
1179	struct route ro;
1180	struct sockaddr_in *dst;
1181	int ifindex;
1182	int s;
1183
1184	switch (sopt->sopt_name) {
1185	/* store an index number for the vif you wanna use in the send */
1186	case IP_MULTICAST_VIF:
1187		if (legal_vif_num == 0) {
1188			error = EOPNOTSUPP;
1189			break;
1190		}
1191		error = sooptcopyin(sopt, &i, sizeof i, sizeof i);
1192		if (error)
1193			break;
1194		if (!legal_vif_num(i) && (i != -1)) {
1195			error = EINVAL;
1196			break;
1197		}
1198		imo = ip_findmoptions(inp);
1199		imo->imo_multicast_vif = i;
1200		INP_UNLOCK(inp);
1201		break;
1202
1203	case IP_MULTICAST_IF:
1204		/*
1205		 * Select the interface for outgoing multicast packets.
1206		 */
1207		error = sooptcopyin(sopt, &addr, sizeof addr, sizeof addr);
1208		if (error)
1209			break;
1210		/*
1211		 * INADDR_ANY is used to remove a previous selection.
1212		 * When no interface is selected, a default one is
1213		 * chosen every time a multicast packet is sent.
1214		 */
1215		imo = ip_findmoptions(inp);
1216		if (addr.s_addr == INADDR_ANY) {
1217			imo->imo_multicast_ifp = NULL;
1218			INP_UNLOCK(inp);
1219			break;
1220		}
1221		/*
1222		 * The selected interface is identified by its local
1223		 * IP address.  Find the interface and confirm that
1224		 * it supports multicasting.
1225		 */
1226		s = splimp();
1227		ifp = ip_multicast_if(&addr, &ifindex);
1228		if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) {
1229			INP_UNLOCK(inp);
1230			splx(s);
1231			error = EADDRNOTAVAIL;
1232			break;
1233		}
1234		imo->imo_multicast_ifp = ifp;
1235		if (ifindex)
1236			imo->imo_multicast_addr = addr;
1237		else
1238			imo->imo_multicast_addr.s_addr = INADDR_ANY;
1239		INP_UNLOCK(inp);
1240		splx(s);
1241		break;
1242
1243	case IP_MULTICAST_TTL:
1244		/*
1245		 * Set the IP time-to-live for outgoing multicast packets.
1246		 * The original multicast API required a char argument,
1247		 * which is inconsistent with the rest of the socket API.
1248		 * We allow either a char or an int.
1249		 */
1250		if (sopt->sopt_valsize == 1) {
1251			u_char ttl;
1252			error = sooptcopyin(sopt, &ttl, 1, 1);
1253			if (error)
1254				break;
1255			imo = ip_findmoptions(inp);
1256			imo->imo_multicast_ttl = ttl;
1257			INP_UNLOCK(inp);
1258		} else {
1259			u_int ttl;
1260			error = sooptcopyin(sopt, &ttl, sizeof ttl,
1261					    sizeof ttl);
1262			if (error)
1263				break;
1264			if (ttl > 255)
1265				error = EINVAL;
1266			else {
1267				imo = ip_findmoptions(inp);
1268				imo->imo_multicast_ttl = ttl;
1269				INP_UNLOCK(inp);
1270			}
1271		}
1272		break;
1273
1274	case IP_MULTICAST_LOOP:
1275		/*
1276		 * Set the loopback flag for outgoing multicast packets.
1277		 * Must be zero or one.  The original multicast API required a
1278		 * char argument, which is inconsistent with the rest
1279		 * of the socket API.  We allow either a char or an int.
1280		 */
1281		if (sopt->sopt_valsize == 1) {
1282			u_char loop;
1283			error = sooptcopyin(sopt, &loop, 1, 1);
1284			if (error)
1285				break;
1286			imo = ip_findmoptions(inp);
1287			imo->imo_multicast_loop = !!loop;
1288			INP_UNLOCK(inp);
1289		} else {
1290			u_int loop;
1291			error = sooptcopyin(sopt, &loop, sizeof loop,
1292					    sizeof loop);
1293			if (error)
1294				break;
1295			imo = ip_findmoptions(inp);
1296			imo->imo_multicast_loop = !!loop;
1297			INP_UNLOCK(inp);
1298		}
1299		break;
1300
1301	case IP_ADD_MEMBERSHIP:
1302		/*
1303		 * Add a multicast group membership.
1304		 * Group must be a valid IP multicast address.
1305		 */
1306		error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq);
1307		if (error)
1308			break;
1309
1310		if (!IN_MULTICAST(ntohl(mreq.imr_multiaddr.s_addr))) {
1311			error = EINVAL;
1312			break;
1313		}
1314		s = splimp();
1315		/*
1316		 * If no interface address was provided, use the interface of
1317		 * the route to the given multicast address.
1318		 */
1319		if (mreq.imr_interface.s_addr == INADDR_ANY) {
1320			bzero((caddr_t)&ro, sizeof(ro));
1321			dst = (struct sockaddr_in *)&ro.ro_dst;
1322			dst->sin_len = sizeof(*dst);
1323			dst->sin_family = AF_INET;
1324			dst->sin_addr = mreq.imr_multiaddr;
1325			rtalloc_ign(&ro, RTF_CLONING);
1326			if (ro.ro_rt == NULL) {
1327				error = EADDRNOTAVAIL;
1328				splx(s);
1329				break;
1330			}
1331			ifp = ro.ro_rt->rt_ifp;
1332			RTFREE(ro.ro_rt);
1333		}
1334		else {
1335			ifp = ip_multicast_if(&mreq.imr_interface, NULL);
1336		}
1337
1338		/*
1339		 * See if we found an interface, and confirm that it
1340		 * supports multicast.
1341		 */
1342		if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) {
1343			error = EADDRNOTAVAIL;
1344			splx(s);
1345			break;
1346		}
1347		/*
1348		 * See if the membership already exists or if all the
1349		 * membership slots are full.
1350		 */
1351		imo = ip_findmoptions(inp);
1352		for (i = 0; i < imo->imo_num_memberships; ++i) {
1353			if (imo->imo_membership[i]->inm_ifp == ifp &&
1354			    imo->imo_membership[i]->inm_addr.s_addr
1355						== mreq.imr_multiaddr.s_addr)
1356				break;
1357		}
1358		if (i < imo->imo_num_memberships) {
1359			INP_UNLOCK(inp);
1360			error = EADDRINUSE;
1361			splx(s);
1362			break;
1363		}
1364		if (imo->imo_num_memberships == imo->imo_max_memberships) {
1365		    struct in_multi **nmships, **omships;
1366		    size_t newmax;
1367		    /*
1368		     * Resize the vector to next power-of-two minus 1. If the
1369		     * size would exceed the maximum then we know we've really
1370		     * run out of entries. Otherwise, we realloc() the vector
1371		     * with the INP lock held to avoid introducing a race.
1372		     */
1373		    nmships = NULL;
1374		    omships = imo->imo_membership;
1375		    newmax = ((imo->imo_max_memberships + 1) * 2) - 1;
1376		    if (newmax <= IP_MAX_MEMBERSHIPS) {
1377			nmships = (struct in_multi **)realloc(omships,
1378sizeof(*nmships) * newmax, M_IPMOPTS, M_NOWAIT);
1379			if (nmships != NULL) {
1380			    imo->imo_membership = nmships;
1381			    imo->imo_max_memberships = newmax;
1382			}
1383		    }
1384		    if (nmships == NULL) {
1385			INP_UNLOCK(inp);
1386			error = ETOOMANYREFS;
1387			splx(s);
1388			break;
1389		    }
1390		}
1391		/*
1392		 * Everything looks good; add a new record to the multicast
1393		 * address list for the given interface.
1394		 */
1395		if ((imo->imo_membership[i] =
1396		    in_addmulti(&mreq.imr_multiaddr, ifp)) == NULL) {
1397			INP_UNLOCK(inp);
1398			error = ENOBUFS;
1399			splx(s);
1400			break;
1401		}
1402		++imo->imo_num_memberships;
1403		INP_UNLOCK(inp);
1404		splx(s);
1405		break;
1406
1407	case IP_DROP_MEMBERSHIP:
1408		/*
1409		 * Drop a multicast group membership.
1410		 * Group must be a valid IP multicast address.
1411		 */
1412		error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq);
1413		if (error)
1414			break;
1415
1416		if (!IN_MULTICAST(ntohl(mreq.imr_multiaddr.s_addr))) {
1417			error = EINVAL;
1418			break;
1419		}
1420
1421		s = splimp();
1422		/*
1423		 * If an interface address was specified, get a pointer
1424		 * to its ifnet structure.
1425		 */
1426		if (mreq.imr_interface.s_addr == INADDR_ANY)
1427			ifp = NULL;
1428		else {
1429			ifp = ip_multicast_if(&mreq.imr_interface, NULL);
1430			if (ifp == NULL) {
1431				error = EADDRNOTAVAIL;
1432				splx(s);
1433				break;
1434			}
1435		}
1436		/*
1437		 * Find the membership in the membership array.
1438		 */
1439		imo = ip_findmoptions(inp);
1440		for (i = 0; i < imo->imo_num_memberships; ++i) {
1441			if ((ifp == NULL ||
1442			     imo->imo_membership[i]->inm_ifp == ifp) &&
1443			     imo->imo_membership[i]->inm_addr.s_addr ==
1444			     mreq.imr_multiaddr.s_addr)
1445				break;
1446		}
1447		if (i == imo->imo_num_memberships) {
1448			INP_UNLOCK(inp);
1449			error = EADDRNOTAVAIL;
1450			splx(s);
1451			break;
1452		}
1453		/*
1454		 * Give up the multicast address record to which the
1455		 * membership points.
1456		 */
1457		in_delmulti(imo->imo_membership[i]);
1458		/*
1459		 * Remove the gap in the membership array.
1460		 */
1461		for (++i; i < imo->imo_num_memberships; ++i)
1462			imo->imo_membership[i-1] = imo->imo_membership[i];
1463		--imo->imo_num_memberships;
1464		INP_UNLOCK(inp);
1465		splx(s);
1466		break;
1467
1468	default:
1469		error = EOPNOTSUPP;
1470		break;
1471	}
1472
1473	return (error);
1474}
1475
1476/*
1477 * Return the IP multicast options in response to user getsockopt().
1478 */
1479static int
1480ip_getmoptions(struct inpcb *inp, struct sockopt *sopt)
1481{
1482	struct ip_moptions *imo;
1483	struct in_addr addr;
1484	struct in_ifaddr *ia;
1485	int error, optval;
1486	u_char coptval;
1487
1488	INP_LOCK(inp);
1489	imo = inp->inp_moptions;
1490
1491	error = 0;
1492	switch (sopt->sopt_name) {
1493	case IP_MULTICAST_VIF:
1494		if (imo != NULL)
1495			optval = imo->imo_multicast_vif;
1496		else
1497			optval = -1;
1498		INP_UNLOCK(inp);
1499		error = sooptcopyout(sopt, &optval, sizeof optval);
1500		break;
1501
1502	case IP_MULTICAST_IF:
1503		if (imo == NULL || imo->imo_multicast_ifp == NULL)
1504			addr.s_addr = INADDR_ANY;
1505		else if (imo->imo_multicast_addr.s_addr) {
1506			/* return the value user has set */
1507			addr = imo->imo_multicast_addr;
1508		} else {
1509			IFP_TO_IA(imo->imo_multicast_ifp, ia);
1510			addr.s_addr = (ia == NULL) ? INADDR_ANY
1511				: IA_SIN(ia)->sin_addr.s_addr;
1512		}
1513		INP_UNLOCK(inp);
1514		error = sooptcopyout(sopt, &addr, sizeof addr);
1515		break;
1516
1517	case IP_MULTICAST_TTL:
1518		if (imo == 0)
1519			optval = coptval = IP_DEFAULT_MULTICAST_TTL;
1520		else
1521			optval = coptval = imo->imo_multicast_ttl;
1522		INP_UNLOCK(inp);
1523		if (sopt->sopt_valsize == 1)
1524			error = sooptcopyout(sopt, &coptval, 1);
1525		else
1526			error = sooptcopyout(sopt, &optval, sizeof optval);
1527		break;
1528
1529	case IP_MULTICAST_LOOP:
1530		if (imo == 0)
1531			optval = coptval = IP_DEFAULT_MULTICAST_LOOP;
1532		else
1533			optval = coptval = imo->imo_multicast_loop;
1534		INP_UNLOCK(inp);
1535		if (sopt->sopt_valsize == 1)
1536			error = sooptcopyout(sopt, &coptval, 1);
1537		else
1538			error = sooptcopyout(sopt, &optval, sizeof optval);
1539		break;
1540
1541	default:
1542		INP_UNLOCK(inp);
1543		error = ENOPROTOOPT;
1544		break;
1545	}
1546	INP_UNLOCK_ASSERT(inp);
1547
1548	return (error);
1549}
1550
1551/*
1552 * Discard the IP multicast options.
1553 */
1554void
1555ip_freemoptions(imo)
1556	register struct ip_moptions *imo;
1557{
1558	register int i;
1559
1560	if (imo != NULL) {
1561		for (i = 0; i < imo->imo_num_memberships; ++i)
1562			in_delmulti(imo->imo_membership[i]);
1563		free(imo->imo_membership, M_IPMOPTS);
1564		free(imo, M_IPMOPTS);
1565	}
1566}
1567
1568/*
1569 * Routine called from ip_output() to loop back a copy of an IP multicast
1570 * packet to the input queue of a specified interface.  Note that this
1571 * calls the output routine of the loopback "driver", but with an interface
1572 * pointer that might NOT be a loopback interface -- evil, but easier than
1573 * replicating that code here.
1574 */
1575static void
1576ip_mloopback(ifp, m, dst, hlen)
1577	struct ifnet *ifp;
1578	register struct mbuf *m;
1579	register struct sockaddr_in *dst;
1580	int hlen;
1581{
1582	register struct ip *ip;
1583	struct mbuf *copym;
1584
1585	copym = m_copy(m, 0, M_COPYALL);
1586	if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < hlen))
1587		copym = m_pullup(copym, hlen);
1588	if (copym != NULL) {
1589		/* If needed, compute the checksum and mark it as valid. */
1590		if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
1591			in_delayed_cksum(copym);
1592			copym->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
1593			copym->m_pkthdr.csum_flags |=
1594			    CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
1595			copym->m_pkthdr.csum_data = 0xffff;
1596		}
1597		/*
1598		 * We don't bother to fragment if the IP length is greater
1599		 * than the interface's MTU.  Can this possibly matter?
1600		 */
1601		ip = mtod(copym, struct ip *);
1602		ip->ip_len = htons(ip->ip_len);
1603		ip->ip_off = htons(ip->ip_off);
1604		ip->ip_sum = 0;
1605		ip->ip_sum = in_cksum(copym, hlen);
1606		/*
1607		 * NB:
1608		 * It's not clear whether there are any lingering
1609		 * reentrancy problems in other areas which might
1610		 * be exposed by using ip_input directly (in
1611		 * particular, everything which modifies the packet
1612		 * in-place).  Yet another option is using the
1613		 * protosw directly to deliver the looped back
1614		 * packet.  For the moment, we'll err on the side
1615		 * of safety by using if_simloop().
1616		 */
1617#if 1 /* XXX */
1618		if (dst->sin_family != AF_INET) {
1619			printf("ip_mloopback: bad address family %d\n",
1620						dst->sin_family);
1621			dst->sin_family = AF_INET;
1622		}
1623#endif
1624
1625#ifdef notdef
1626		copym->m_pkthdr.rcvif = ifp;
1627		ip_input(copym);
1628#else
1629		if_simloop(ifp, copym, dst->sin_family, 0);
1630#endif
1631	}
1632}
1633