ip_output.c revision 124247
1/*
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 3. All advertising materials mentioning features or use of this software
14 *    must display the following acknowledgement:
15 *	This product includes software developed by the University of
16 *	California, Berkeley and its contributors.
17 * 4. Neither the name of the University nor the names of its contributors
18 *    may be used to endorse or promote products derived from this software
19 *    without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 *
33 *	@(#)ip_output.c	8.3 (Berkeley) 1/21/94
34 * $FreeBSD: head/sys/netinet/ip_output.c 124247 2004-01-08 11:13:40Z andre $
35 */
36
37#include "opt_ipfw.h"
38#include "opt_ipdn.h"
39#include "opt_ipdivert.h"
40#include "opt_ipfilter.h"
41#include "opt_ipsec.h"
42#include "opt_mac.h"
43#include "opt_pfil_hooks.h"
44#include "opt_random_ip_id.h"
45#include "opt_mbuf_stress_test.h"
46
47#include <sys/param.h>
48#include <sys/systm.h>
49#include <sys/kernel.h>
50#include <sys/mac.h>
51#include <sys/malloc.h>
52#include <sys/mbuf.h>
53#include <sys/protosw.h>
54#include <sys/socket.h>
55#include <sys/socketvar.h>
56#include <sys/sysctl.h>
57
58#include <net/if.h>
59#include <net/route.h>
60
61#include <netinet/in.h>
62#include <netinet/in_systm.h>
63#include <netinet/ip.h>
64#include <netinet/in_pcb.h>
65#include <netinet/in_var.h>
66#include <netinet/ip_var.h>
67
68#ifdef PFIL_HOOKS
69#include <net/pfil.h>
70#endif
71
72#include <machine/in_cksum.h>
73
74static MALLOC_DEFINE(M_IPMOPTS, "ip_moptions", "internet multicast options");
75
76#ifdef IPSEC
77#include <netinet6/ipsec.h>
78#include <netkey/key.h>
79#ifdef IPSEC_DEBUG
80#include <netkey/key_debug.h>
81#else
82#define	KEYDEBUG(lev,arg)
83#endif
84#endif /*IPSEC*/
85
86#ifdef FAST_IPSEC
87#include <netipsec/ipsec.h>
88#include <netipsec/xform.h>
89#include <netipsec/key.h>
90#endif /*FAST_IPSEC*/
91
92#include <netinet/ip_fw.h>
93#include <netinet/ip_dummynet.h>
94
/*
 * Debug helper: print prefix string x, the IPv4 address a (a struct
 * in_addr whose s_addr is in network byte order) as a dotted quad, and
 * suffix string y.
 *
 * The expansion is a single expression with no trailing semicolon, so it
 * can be used wherever a function call could (including unbraced
 * if/else bodies).  Every argument is parenthesized; note that a is
 * evaluated four times, so it must not have side effects.
 */
#define	print_ip(x, a, y)						\
	printf("%s %d.%d.%d.%d%s", (x),					\
	    (int)((ntohl((a).s_addr) >> 24) & 0xFF),			\
	    (int)((ntohl((a).s_addr) >> 16) & 0xFF),			\
	    (int)((ntohl((a).s_addr) >> 8) & 0xFF),			\
	    (int)(ntohl((a).s_addr) & 0xFF), (y))
100
101u_short ip_id;
102
103#ifdef MBUF_STRESS_TEST
104int mbuf_frag_size = 0;
105SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW,
106	&mbuf_frag_size, 0, "Fragment outgoing mbufs to this size");
107#endif
108
109static struct mbuf *ip_insertoptions(struct mbuf *, struct mbuf *, int *);
110static struct ifnet *ip_multicast_if(struct in_addr *, int *);
111static void	ip_mloopback
112	(struct ifnet *, struct mbuf *, struct sockaddr_in *, int);
113static int	ip_getmoptions
114	(struct sockopt *, struct ip_moptions *);
115static int	ip_pcbopts(int, struct mbuf **, struct mbuf *);
116static int	ip_setmoptions
117	(struct sockopt *, struct ip_moptions **);
118
119int	ip_optcopy(struct ip *, struct ip *);
120
121
122extern	struct protosw inetsw[];
123
124/*
125 * IP output.  The packet in mbuf chain m contains a skeletal IP
126 * header (with len, off, ttl, proto, tos, src, dst).
127 * The mbuf chain containing the packet will be freed.
128 * The mbuf opt, if present, will not be freed.
129 * In the IP forwarding case, the packet will arrive with options already
130 * inserted, so must have a NULL opt pointer.
131 */
132int
133ip_output(struct mbuf *m0, struct mbuf *opt, struct route *ro,
134	int flags, struct ip_moptions *imo, struct inpcb *inp)
135{
136	struct ip *ip;
137	struct ifnet *ifp = NULL;	/* keep compiler happy */
138	struct mbuf *m;
139	int hlen = sizeof (struct ip);
140	int len, off, error = 0;
141	struct sockaddr_in *dst = NULL;	/* keep compiler happy */
142	struct in_ifaddr *ia = NULL;
143	int isbroadcast, sw_csum;
144	struct in_addr pkt_dst;
145	struct route iproute;
146#ifdef IPSEC
147	struct socket *so;
148	struct secpolicy *sp = NULL;
149#endif
150#ifdef FAST_IPSEC
151	struct m_tag *mtag;
152	struct secpolicy *sp = NULL;
153	struct tdb_ident *tdbi;
154	int s;
155#endif /* FAST_IPSEC */
156	struct ip_fw_args args;
157	int src_was_INADDR_ANY = 0;	/* as the name says... */
158
159	args.eh = NULL;
160	args.rule = NULL;
161	args.next_hop = NULL;
162	args.divert_rule = 0;			/* divert cookie */
163
164	/* Grab info from MT_TAG mbufs prepended to the chain. */
165	for (; m0 && m0->m_type == MT_TAG; m0 = m0->m_next) {
166		switch(m0->_m_tag_id) {
167		default:
168			printf("ip_output: unrecognised MT_TAG tag %d\n",
169			    m0->_m_tag_id);
170			break;
171
172		case PACKET_TAG_DUMMYNET:
173			/*
174			 * the packet was already tagged, so part of the
175			 * processing was already done, and we need to go down.
176			 * Get parameters from the header.
177			 */
178			args.rule = ((struct dn_pkt *)m0)->rule;
179			opt = NULL ;
180			ro = & ( ((struct dn_pkt *)m0)->ro ) ;
181			imo = NULL ;
182			dst = ((struct dn_pkt *)m0)->dn_dst ;
183			ifp = ((struct dn_pkt *)m0)->ifp ;
184			flags = ((struct dn_pkt *)m0)->flags ;
185			break;
186
187		case PACKET_TAG_DIVERT:
188			args.divert_rule = (intptr_t)m0->m_data & 0xffff;
189			break;
190
191		case PACKET_TAG_IPFORWARD:
192			args.next_hop = (struct sockaddr_in *)m0->m_data;
193			break;
194		}
195	}
196	m = m0;
197
198#ifdef IPSEC
199	so = ipsec_getsocket(m);
200	(void)ipsec_setsocket(m, NULL);
201#endif /*IPSEC*/
202
203	M_ASSERTPKTHDR(m);
204
205	if (ro == NULL) {
206		ro = &iproute;
207		bzero(ro, sizeof (*ro));
208	}
209
210	if (inp != NULL)
211		INP_LOCK_ASSERT(inp);
212
213	if (args.rule != NULL) {	/* dummynet already saw us */
214		ip = mtod(m, struct ip *);
215		hlen = ip->ip_hl << 2 ;
216		if (ro->ro_rt)
217			ia = ifatoia(ro->ro_rt->rt_ifa);
218		goto sendit;
219	}
220
221	if (opt) {
222		len = 0;
223		m = ip_insertoptions(m, opt, &len);
224		if (len != 0)
225			hlen = len;
226	}
227	ip = mtod(m, struct ip *);
228	pkt_dst = args.next_hop ? args.next_hop->sin_addr : ip->ip_dst;
229
230	/*
231	 * Fill in IP header.  If we are not allowing fragmentation,
232	 * then the ip_id field is meaningless, but we don't set it
233	 * to zero.  Doing so causes various problems when devices along
234	 * the path (routers, load balancers, firewalls, etc.) illegally
235	 * disable DF on our packet.  Note that a 16-bit counter
236	 * will wrap around in less than 10 seconds at 100 Mbit/s on a
237	 * medium with MTU 1500.  See Steven M. Bellovin, "A Technique
238	 * for Counting NATted Hosts", Proc. IMW'02, available at
239	 * <http://www.research.att.com/~smb/papers/fnat.pdf>.
240	 */
241	if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
242		ip->ip_v = IPVERSION;
243		ip->ip_hl = hlen >> 2;
244#ifdef RANDOM_IP_ID
245		ip->ip_id = ip_randomid();
246#else
247		ip->ip_id = htons(ip_id++);
248#endif
249		ipstat.ips_localout++;
250	} else {
251		hlen = ip->ip_hl << 2;
252	}
253
254	dst = (struct sockaddr_in *)&ro->ro_dst;
255	/*
256	 * If there is a cached route,
257	 * check that it is to the same destination
258	 * and is still up.  If not, free it and try again.
259	 * The address family should also be checked in case of sharing the
260	 * cache with IPv6.
261	 */
262	if (ro->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 ||
263			  dst->sin_family != AF_INET ||
264			  dst->sin_addr.s_addr != pkt_dst.s_addr)) {
265		RTFREE(ro->ro_rt);
266		ro->ro_rt = (struct rtentry *)0;
267	}
268	if (ro->ro_rt == 0) {
269		bzero(dst, sizeof(*dst));
270		dst->sin_family = AF_INET;
271		dst->sin_len = sizeof(*dst);
272		dst->sin_addr = pkt_dst;
273	}
274	/*
275	 * If routing to interface only,
276	 * short circuit routing lookup.
277	 */
278	if (flags & IP_ROUTETOIF) {
279		if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == 0 &&
280		    (ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == 0) {
281			ipstat.ips_noroute++;
282			error = ENETUNREACH;
283			goto bad;
284		}
285		ifp = ia->ia_ifp;
286		ip->ip_ttl = 1;
287		isbroadcast = in_broadcast(dst->sin_addr, ifp);
288	} else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) &&
289	    imo != NULL && imo->imo_multicast_ifp != NULL) {
290		/*
291		 * Bypass the normal routing lookup for multicast
292		 * packets if the interface is specified.
293		 */
294		ifp = imo->imo_multicast_ifp;
295		IFP_TO_IA(ifp, ia);
296		isbroadcast = 0;	/* fool gcc */
297	} else {
298		/*
299		 * We want to do any cloning requested by the link layer,
300		 * as this is probably required in all cases for correct
301		 * operation (as it is for ARP).
302		 */
303		if (ro->ro_rt == 0)
304			rtalloc(ro);
305		if (ro->ro_rt == 0) {
306			ipstat.ips_noroute++;
307			error = EHOSTUNREACH;
308			goto bad;
309		}
310		ia = ifatoia(ro->ro_rt->rt_ifa);
311		ifp = ro->ro_rt->rt_ifp;
312		ro->ro_rt->rt_rmx.rmx_pksent++;
313		if (ro->ro_rt->rt_flags & RTF_GATEWAY)
314			dst = (struct sockaddr_in *)ro->ro_rt->rt_gateway;
315		if (ro->ro_rt->rt_flags & RTF_HOST)
316			isbroadcast = (ro->ro_rt->rt_flags & RTF_BROADCAST);
317		else
318			isbroadcast = in_broadcast(dst->sin_addr, ifp);
319	}
320	if (IN_MULTICAST(ntohl(pkt_dst.s_addr))) {
321		struct in_multi *inm;
322
323		m->m_flags |= M_MCAST;
324		/*
325		 * IP destination address is multicast.  Make sure "dst"
326		 * still points to the address in "ro".  (It may have been
327		 * changed to point to a gateway address, above.)
328		 */
329		dst = (struct sockaddr_in *)&ro->ro_dst;
330		/*
331		 * See if the caller provided any multicast options
332		 */
333		if (imo != NULL) {
334			ip->ip_ttl = imo->imo_multicast_ttl;
335			if (imo->imo_multicast_vif != -1)
336				ip->ip_src.s_addr =
337				    ip_mcast_src ?
338				    ip_mcast_src(imo->imo_multicast_vif) :
339				    INADDR_ANY;
340		} else
341			ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL;
342		/*
343		 * Confirm that the outgoing interface supports multicast.
344		 */
345		if ((imo == NULL) || (imo->imo_multicast_vif == -1)) {
346			if ((ifp->if_flags & IFF_MULTICAST) == 0) {
347				ipstat.ips_noroute++;
348				error = ENETUNREACH;
349				goto bad;
350			}
351		}
352		/*
353		 * If source address not specified yet, use address
354		 * of outgoing interface.
355		 */
356		if (ip->ip_src.s_addr == INADDR_ANY) {
357			/* Interface may have no addresses. */
358			if (ia != NULL)
359				ip->ip_src = IA_SIN(ia)->sin_addr;
360		}
361
362		if (ip_mrouter && (flags & IP_FORWARDING) == 0) {
363			/*
364			 * XXX
365			 * delayed checksums are not currently
366			 * compatible with IP multicast routing
367			 */
368			if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
369				in_delayed_cksum(m);
370				m->m_pkthdr.csum_flags &=
371					~CSUM_DELAY_DATA;
372			}
373		}
374		IN_LOOKUP_MULTI(pkt_dst, ifp, inm);
375		if (inm != NULL &&
376		   (imo == NULL || imo->imo_multicast_loop)) {
377			/*
378			 * If we belong to the destination multicast group
379			 * on the outgoing interface, and the caller did not
380			 * forbid loopback, loop back a copy.
381			 */
382			ip_mloopback(ifp, m, dst, hlen);
383		}
384		else {
385			/*
386			 * If we are acting as a multicast router, perform
387			 * multicast forwarding as if the packet had just
388			 * arrived on the interface to which we are about
389			 * to send.  The multicast forwarding function
390			 * recursively calls this function, using the
391			 * IP_FORWARDING flag to prevent infinite recursion.
392			 *
393			 * Multicasts that are looped back by ip_mloopback(),
394			 * above, will be forwarded by the ip_input() routine,
395			 * if necessary.
396			 */
397			if (ip_mrouter && (flags & IP_FORWARDING) == 0) {
398				/*
399				 * If rsvp daemon is not running, do not
400				 * set ip_moptions. This ensures that the packet
401				 * is multicast and not just sent down one link
402				 * as prescribed by rsvpd.
403				 */
404				if (!rsvp_on)
405					imo = NULL;
406				if (ip_mforward &&
407				    ip_mforward(ip, ifp, m, imo) != 0) {
408					m_freem(m);
409					goto done;
410				}
411			}
412		}
413
414		/*
415		 * Multicasts with a time-to-live of zero may be looped-
416		 * back, above, but must not be transmitted on a network.
417		 * Also, multicasts addressed to the loopback interface
418		 * are not sent -- the above call to ip_mloopback() will
419		 * loop back a copy if this host actually belongs to the
420		 * destination group on the loopback interface.
421		 */
422		if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) {
423			m_freem(m);
424			goto done;
425		}
426
427		goto sendit;
428	}
429#ifndef notdef
430	/*
431	 * If the source address is not specified yet, use the address
432	 * of the outoing interface. In case, keep note we did that, so
433	 * if the the firewall changes the next-hop causing the output
434	 * interface to change, we can fix that.
435	 */
436	if (ip->ip_src.s_addr == INADDR_ANY) {
437		/* Interface may have no addresses. */
438		if (ia != NULL) {
439			ip->ip_src = IA_SIN(ia)->sin_addr;
440			src_was_INADDR_ANY = 1;
441		}
442	}
443#endif /* notdef */
444	/*
445	 * Verify that we have any chance at all of being able to queue
446	 *      the packet or packet fragments
447	 */
448	if ((ifp->if_snd.ifq_len + ip->ip_len / ifp->if_mtu + 1) >=
449		ifp->if_snd.ifq_maxlen) {
450			error = ENOBUFS;
451			ipstat.ips_odropped++;
452			goto bad;
453	}
454
455	/*
456	 * Look for broadcast address and
457	 * verify user is allowed to send
458	 * such a packet.
459	 */
460	if (isbroadcast) {
461		if ((ifp->if_flags & IFF_BROADCAST) == 0) {
462			error = EADDRNOTAVAIL;
463			goto bad;
464		}
465		if ((flags & IP_ALLOWBROADCAST) == 0) {
466			error = EACCES;
467			goto bad;
468		}
469		/* don't allow broadcast messages to be fragmented */
470		if (ip->ip_len > ifp->if_mtu) {
471			error = EMSGSIZE;
472			goto bad;
473		}
474		if (flags & IP_SENDONES)
475			ip->ip_dst.s_addr = INADDR_BROADCAST;
476		m->m_flags |= M_BCAST;
477	} else {
478		m->m_flags &= ~M_BCAST;
479	}
480
481sendit:
482#ifdef IPSEC
483	/* get SP for this packet */
484	if (so == NULL)
485		sp = ipsec4_getpolicybyaddr(m, IPSEC_DIR_OUTBOUND,
486		    flags, &error);
487	else
488		sp = ipsec4_getpolicybysock(m, IPSEC_DIR_OUTBOUND, so, &error);
489
490	if (sp == NULL) {
491		ipsecstat.out_inval++;
492		goto bad;
493	}
494
495	error = 0;
496
497	/* check policy */
498	switch (sp->policy) {
499	case IPSEC_POLICY_DISCARD:
500		/*
501		 * This packet is just discarded.
502		 */
503		ipsecstat.out_polvio++;
504		goto bad;
505
506	case IPSEC_POLICY_BYPASS:
507	case IPSEC_POLICY_NONE:
508		/* no need to do IPsec. */
509		goto skip_ipsec;
510
511	case IPSEC_POLICY_IPSEC:
512		if (sp->req == NULL) {
513			/* acquire a policy */
514			error = key_spdacquire(sp);
515			goto bad;
516		}
517		break;
518
519	case IPSEC_POLICY_ENTRUST:
520	default:
521		printf("ip_output: Invalid policy found. %d\n", sp->policy);
522	}
523    {
524	struct ipsec_output_state state;
525	bzero(&state, sizeof(state));
526	state.m = m;
527	if (flags & IP_ROUTETOIF) {
528		state.ro = &iproute;
529		bzero(&iproute, sizeof(iproute));
530	} else
531		state.ro = ro;
532	state.dst = (struct sockaddr *)dst;
533
534	ip->ip_sum = 0;
535
536	/*
537	 * XXX
538	 * delayed checksums are not currently compatible with IPsec
539	 */
540	if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
541		in_delayed_cksum(m);
542		m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
543	}
544
545	ip->ip_len = htons(ip->ip_len);
546	ip->ip_off = htons(ip->ip_off);
547
548	error = ipsec4_output(&state, sp, flags);
549
550	m = state.m;
551	if (flags & IP_ROUTETOIF) {
552		/*
553		 * if we have tunnel mode SA, we may need to ignore
554		 * IP_ROUTETOIF.
555		 */
556		if (state.ro != &iproute || state.ro->ro_rt != NULL) {
557			flags &= ~IP_ROUTETOIF;
558			ro = state.ro;
559		}
560	} else
561		ro = state.ro;
562	dst = (struct sockaddr_in *)state.dst;
563	if (error) {
564		/* mbuf is already reclaimed in ipsec4_output. */
565		m0 = NULL;
566		switch (error) {
567		case EHOSTUNREACH:
568		case ENETUNREACH:
569		case EMSGSIZE:
570		case ENOBUFS:
571		case ENOMEM:
572			break;
573		default:
574			printf("ip4_output (ipsec): error code %d\n", error);
575			/*fall through*/
576		case ENOENT:
577			/* don't show these error codes to the user */
578			error = 0;
579			break;
580		}
581		goto bad;
582	}
583    }
584
585	/* be sure to update variables that are affected by ipsec4_output() */
586	ip = mtod(m, struct ip *);
587	hlen = ip->ip_hl << 2;
588	if (ro->ro_rt == NULL) {
589		if ((flags & IP_ROUTETOIF) == 0) {
590			printf("ip_output: "
591				"can't update route after IPsec processing\n");
592			error = EHOSTUNREACH;	/*XXX*/
593			goto bad;
594		}
595	} else {
596		ia = ifatoia(ro->ro_rt->rt_ifa);
597		ifp = ro->ro_rt->rt_ifp;
598	}
599
600	/* make it flipped, again. */
601	ip->ip_len = ntohs(ip->ip_len);
602	ip->ip_off = ntohs(ip->ip_off);
603skip_ipsec:
604#endif /*IPSEC*/
605#ifdef FAST_IPSEC
606	/*
607	 * Check the security policy (SP) for the packet and, if
608	 * required, do IPsec-related processing.  There are two
609	 * cases here; the first time a packet is sent through
610	 * it will be untagged and handled by ipsec4_checkpolicy.
611	 * If the packet is resubmitted to ip_output (e.g. after
612	 * AH, ESP, etc. processing), there will be a tag to bypass
613	 * the lookup and related policy checking.
614	 */
615	mtag = m_tag_find(m, PACKET_TAG_IPSEC_PENDING_TDB, NULL);
616	s = splnet();
617	if (mtag != NULL) {
618		tdbi = (struct tdb_ident *)(mtag + 1);
619		sp = ipsec_getpolicy(tdbi, IPSEC_DIR_OUTBOUND);
620		if (sp == NULL)
621			error = -EINVAL;	/* force silent drop */
622		m_tag_delete(m, mtag);
623	} else {
624		sp = ipsec4_checkpolicy(m, IPSEC_DIR_OUTBOUND, flags,
625					&error, inp);
626	}
627	/*
628	 * There are four return cases:
629	 *    sp != NULL	 	    apply IPsec policy
630	 *    sp == NULL, error == 0	    no IPsec handling needed
631	 *    sp == NULL, error == -EINVAL  discard packet w/o error
632	 *    sp == NULL, error != 0	    discard packet, report error
633	 */
634	if (sp != NULL) {
635		/* Loop detection, check if ipsec processing already done */
636		KASSERT(sp->req != NULL, ("ip_output: no ipsec request"));
637		for (mtag = m_tag_first(m); mtag != NULL;
638		     mtag = m_tag_next(m, mtag)) {
639			if (mtag->m_tag_cookie != MTAG_ABI_COMPAT)
640				continue;
641			if (mtag->m_tag_id != PACKET_TAG_IPSEC_OUT_DONE &&
642			    mtag->m_tag_id != PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED)
643				continue;
644			/*
645			 * Check if policy has an SA associated with it.
646			 * This can happen when an SP has yet to acquire
647			 * an SA; e.g. on first reference.  If it occurs,
648			 * then we let ipsec4_process_packet do its thing.
649			 */
650			if (sp->req->sav == NULL)
651				break;
652			tdbi = (struct tdb_ident *)(mtag + 1);
653			if (tdbi->spi == sp->req->sav->spi &&
654			    tdbi->proto == sp->req->sav->sah->saidx.proto &&
655			    bcmp(&tdbi->dst, &sp->req->sav->sah->saidx.dst,
656				 sizeof (union sockaddr_union)) == 0) {
657				/*
658				 * No IPsec processing is needed, free
659				 * reference to SP.
660				 *
661				 * NB: null pointer to avoid free at
662				 *     done: below.
663				 */
664				KEY_FREESP(&sp), sp = NULL;
665				splx(s);
666				goto spd_done;
667			}
668		}
669
670		/*
671		 * Do delayed checksums now because we send before
672		 * this is done in the normal processing path.
673		 */
674		if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
675			in_delayed_cksum(m);
676			m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
677		}
678
679		ip->ip_len = htons(ip->ip_len);
680		ip->ip_off = htons(ip->ip_off);
681
682		/* NB: callee frees mbuf */
683		error = ipsec4_process_packet(m, sp->req, flags, 0);
684		/*
685		 * Preserve KAME behaviour: ENOENT can be returned
686		 * when an SA acquire is in progress.  Don't propagate
687		 * this to user-level; it confuses applications.
688		 *
689		 * XXX this will go away when the SADB is redone.
690		 */
691		if (error == ENOENT)
692			error = 0;
693		splx(s);
694		goto done;
695	} else {
696		splx(s);
697
698		if (error != 0) {
699			/*
700			 * Hack: -EINVAL is used to signal that a packet
701			 * should be silently discarded.  This is typically
702			 * because we asked key management for an SA and
703			 * it was delayed (e.g. kicked up to IKE).
704			 */
705			if (error == -EINVAL)
706				error = 0;
707			goto bad;
708		} else {
709			/* No IPsec processing for this packet. */
710		}
711#ifdef notyet
712		/*
713		 * If deferred crypto processing is needed, check that
714		 * the interface supports it.
715		 */
716		mtag = m_tag_find(m, PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED, NULL);
717		if (mtag != NULL && (ifp->if_capenable & IFCAP_IPSEC) == 0) {
718			/* notify IPsec to do its own crypto */
719			ipsp_skipcrypto_unmark((struct tdb_ident *)(mtag + 1));
720			error = EHOSTUNREACH;
721			goto bad;
722		}
723#endif
724	}
725spd_done:
726#endif /* FAST_IPSEC */
727
728	/*
729	 * IpHack's section.
730	 * - Xlate: translate packet's addr/port (NAT).
731	 * - Firewall: deny/allow/etc.
732	 * - Wrap: fake packet's addr/port <unimpl.>
733	 * - Encapsulate: put it in another IP and send out. <unimp.>
734	 */
735#ifdef PFIL_HOOKS
736	/*
737	 * Run through list of hooks for output packets.
738	 */
739	error = pfil_run_hooks(&inet_pfil_hook, &m, ifp, PFIL_OUT);
740	if (error != 0 || m == NULL)
741		goto done;
742	ip = mtod(m, struct ip *);
743#endif /* PFIL_HOOKS */
744
745	/*
746	 * Check with the firewall...
747	 * but not if we are already being fwd'd from a firewall.
748	 */
749	if (fw_enable && IPFW_LOADED && !args.next_hop) {
750		struct sockaddr_in *old = dst;
751
752		args.m = m;
753		args.next_hop = dst;
754		args.oif = ifp;
755		off = ip_fw_chk_ptr(&args);
756		m = args.m;
757		dst = args.next_hop;
758
759                /*
760		 * On return we must do the following:
761		 * m == NULL	-> drop the pkt (old interface, deprecated)
762		 * (off & IP_FW_PORT_DENY_FLAG)	-> drop the pkt (new interface)
763		 * 1<=off<= 0xffff		-> DIVERT
764		 * (off & IP_FW_PORT_DYNT_FLAG)	-> send to a DUMMYNET pipe
765		 * (off & IP_FW_PORT_TEE_FLAG)	-> TEE the packet
766		 * dst != old			-> IPFIREWALL_FORWARD
767		 * off==0, dst==old		-> accept
768		 * If some of the above modules are not compiled in, then
769		 * we should't have to check the corresponding condition
770		 * (because the ipfw control socket should not accept
771		 * unsupported rules), but better play safe and drop
772		 * packets in case of doubt.
773		 */
774		if ( (off & IP_FW_PORT_DENY_FLAG) || m == NULL) {
775			if (m)
776				m_freem(m);
777			error = EACCES;
778			goto done;
779		}
780		ip = mtod(m, struct ip *);
781		if (off == 0 && dst == old)		/* common case */
782			goto pass;
783                if (DUMMYNET_LOADED && (off & IP_FW_PORT_DYNT_FLAG) != 0) {
784			/*
785			 * pass the pkt to dummynet. Need to include
786			 * pipe number, m, ifp, ro, dst because these are
787			 * not recomputed in the next pass.
788			 * All other parameters have been already used and
789			 * so they are not needed anymore.
790			 * XXX note: if the ifp or ro entry are deleted
791			 * while a pkt is in dummynet, we are in trouble!
792			 */
793			args.ro = ro;
794			args.dst = dst;
795			args.flags = flags;
796
797			error = ip_dn_io_ptr(m, off & 0xffff, DN_TO_IP_OUT,
798				&args);
799			goto done;
800		}
801#ifdef IPDIVERT
802		if (off != 0 && (off & IP_FW_PORT_DYNT_FLAG) == 0) {
803			struct mbuf *clone = NULL;
804
805			/* Clone packet if we're doing a 'tee' */
806			if ((off & IP_FW_PORT_TEE_FLAG) != 0)
807				clone = m_dup(m, M_DONTWAIT);
808
809			/*
810			 * XXX
811			 * delayed checksums are not currently compatible
812			 * with divert sockets.
813			 */
814			if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
815				in_delayed_cksum(m);
816				m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
817			}
818
819			/* Restore packet header fields to original values */
820			ip->ip_len = htons(ip->ip_len);
821			ip->ip_off = htons(ip->ip_off);
822
823			/* Deliver packet to divert input routine */
824			divert_packet(m, 0, off & 0xffff, args.divert_rule);
825
826			/* If 'tee', continue with original packet */
827			if (clone != NULL) {
828				m = clone;
829				ip = mtod(m, struct ip *);
830				goto pass;
831			}
832			goto done;
833		}
834#endif
835
836		/* IPFIREWALL_FORWARD */
837		/*
838		 * Check dst to make sure it is directly reachable on the
839		 * interface we previously thought it was.
840		 * If it isn't (which may be likely in some situations) we have
841		 * to re-route it (ie, find a route for the next-hop and the
842		 * associated interface) and set them here. This is nested
843		 * forwarding which in most cases is undesirable, except where
844		 * such control is nigh impossible. So we do it here.
845		 * And I'm babbling.
846		 */
847		if (off == 0 && old != dst) { /* FORWARD, dst has changed */
848#if 0
849			/*
850			 * XXX To improve readability, this block should be
851			 * changed into a function call as below:
852			 */
853			error = ip_ipforward(&m, &dst, &ifp);
854			if (error)
855				goto bad;
856			if (m == NULL) /* ip_input consumed the mbuf */
857				goto done;
858#else
859			struct in_ifaddr *ia;
860
861			/*
862			 * XXX sro_fwd below is static, and a pointer
863			 * to it gets passed to routines downstream.
864			 * This could have surprisingly bad results in
865			 * practice, because its content is overwritten
866			 * by subsequent packets.
867			 */
868			/* There must be a better way to do this next line... */
869			static struct route sro_fwd;
870			struct route *ro_fwd = &sro_fwd;
871
872#if 0
873			print_ip("IPFIREWALL_FORWARD: New dst ip: ",
874			    dst->sin_addr, "\n");
875#endif
876
877			/*
878			 * We need to figure out if we have been forwarded
879			 * to a local socket. If so, then we should somehow
880			 * "loop back" to ip_input, and get directed to the
881			 * PCB as if we had received this packet. This is
882			 * because it may be dificult to identify the packets
883			 * you want to forward until they are being output
884			 * and have selected an interface. (e.g. locally
885			 * initiated packets) If we used the loopback inteface,
886			 * we would not be able to control what happens
887			 * as the packet runs through ip_input() as
888			 * it is done through an ISR.
889			 */
890			LIST_FOREACH(ia,
891			    INADDR_HASH(dst->sin_addr.s_addr), ia_hash) {
892				/*
893				 * If the addr to forward to is one
894				 * of ours, we pretend to
895				 * be the destination for this packet.
896				 */
897				if (IA_SIN(ia)->sin_addr.s_addr ==
898						 dst->sin_addr.s_addr)
899					break;
900			}
901			if (ia) {	/* tell ip_input "dont filter" */
902				struct m_hdr tag;
903
904				tag.mh_type = MT_TAG;
905				tag.mh_flags = PACKET_TAG_IPFORWARD;
906				tag.mh_data = (caddr_t)args.next_hop;
907				tag.mh_next = m;
908				tag.mh_nextpkt = NULL;
909
910				if (m->m_pkthdr.rcvif == NULL)
911					m->m_pkthdr.rcvif = ifunit("lo0");
912				if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
913					m->m_pkthdr.csum_flags |=
914					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
915					m0->m_pkthdr.csum_data = 0xffff;
916				}
917				m->m_pkthdr.csum_flags |=
918				    CSUM_IP_CHECKED | CSUM_IP_VALID;
919				ip->ip_len = htons(ip->ip_len);
920				ip->ip_off = htons(ip->ip_off);
921				ip_input((struct mbuf *)&tag);
922				goto done;
923			}
924			/*
925			 * Some of the logic for this was
926			 * nicked from above.
927			 */
928			bcopy(dst, &ro_fwd->ro_dst, sizeof(*dst));
929
930			ro_fwd->ro_rt = 0;
931			rtalloc_ign(ro_fwd, RTF_CLONING);
932
933			if (ro_fwd->ro_rt == 0) {
934				ipstat.ips_noroute++;
935				error = EHOSTUNREACH;
936				goto bad;
937			}
938
939			ia = ifatoia(ro_fwd->ro_rt->rt_ifa);
940			ifp = ro_fwd->ro_rt->rt_ifp;
941			ro_fwd->ro_rt->rt_rmx.rmx_pksent++;
942			if (ro_fwd->ro_rt->rt_flags & RTF_GATEWAY)
943				dst = (struct sockaddr_in *)
944					ro_fwd->ro_rt->rt_gateway;
945			if (ro_fwd->ro_rt->rt_flags & RTF_HOST)
946				isbroadcast =
947				    (ro_fwd->ro_rt->rt_flags & RTF_BROADCAST);
948			else
949				isbroadcast = in_broadcast(dst->sin_addr, ifp);
950			if (ro->ro_rt)
951				RTFREE(ro->ro_rt);
952			ro->ro_rt = ro_fwd->ro_rt;
953			dst = (struct sockaddr_in *)&ro_fwd->ro_dst;
954
955#endif	/* ... block to be put into a function */
956			/*
957			 * If we added a default src ip earlier,
958			 * which would have been gotten from the-then
959			 * interface, do it again, from the new one.
960			 */
961			if (src_was_INADDR_ANY)
962				ip->ip_src = IA_SIN(ia)->sin_addr;
963			goto pass ;
964		}
965
966                /*
967                 * if we get here, none of the above matches, and
968                 * we have to drop the pkt
969                 */
970		m_freem(m);
971                error = EACCES; /* not sure this is the right error msg */
972                goto done;
973	}
974
975pass:
976	/* 127/8 must not appear on wire - RFC1122. */
977	if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
978	    (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
979		if ((ifp->if_flags & IFF_LOOPBACK) == 0) {
980			ipstat.ips_badaddr++;
981			error = EADDRNOTAVAIL;
982			goto bad;
983		}
984	}
985
986	m->m_pkthdr.csum_flags |= CSUM_IP;
987	sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_hwassist;
988	if (sw_csum & CSUM_DELAY_DATA) {
989		in_delayed_cksum(m);
990		sw_csum &= ~CSUM_DELAY_DATA;
991	}
992	m->m_pkthdr.csum_flags &= ifp->if_hwassist;
993
994	/*
995	 * If small enough for interface, or the interface will take
996	 * care of the fragmentation for us, can just send directly.
997	 */
998	if (ip->ip_len <= ifp->if_mtu || (ifp->if_hwassist & CSUM_FRAGMENT &&
999	    ((ip->ip_off & IP_DF) == 0))) {
1000		ip->ip_len = htons(ip->ip_len);
1001		ip->ip_off = htons(ip->ip_off);
1002		ip->ip_sum = 0;
1003		if (sw_csum & CSUM_DELAY_IP)
1004			ip->ip_sum = in_cksum(m, hlen);
1005
1006		/* Record statistics for this interface address. */
1007		if (!(flags & IP_FORWARDING) && ia) {
1008			ia->ia_ifa.if_opackets++;
1009			ia->ia_ifa.if_obytes += m->m_pkthdr.len;
1010		}
1011
1012#ifdef IPSEC
1013		/* clean ipsec history once it goes out of the node */
1014		ipsec_delaux(m);
1015#endif
1016
1017#ifdef MBUF_STRESS_TEST
1018		if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size)
1019			m = m_fragment(m, M_DONTWAIT, mbuf_frag_size);
1020#endif
1021		error = (*ifp->if_output)(ifp, m,
1022				(struct sockaddr *)dst, ro->ro_rt);
1023		goto done;
1024	}
1025
1026	if (ip->ip_off & IP_DF) {
1027		error = EMSGSIZE;
1028		/*
1029		 * This case can happen if the user changed the MTU
1030		 * of an interface after enabling IP on it.  Because
1031		 * most netifs don't keep track of routes pointing to
1032		 * them, there is no way for one to update all its
1033		 * routes when the MTU is changed.
1034		 */
1035		if ((ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST)) &&
1036		    (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)) {
1037			ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu;
1038		}
1039		ipstat.ips_cantfrag++;
1040		goto bad;
1041	}
1042
1043	/*
1044	 * Too large for interface; fragment if possible. If successful,
1045	 * on return, m will point to a list of packets to be sent.
1046	 */
1047	error = ip_fragment(ip, &m, ifp->if_mtu, ifp->if_hwassist, sw_csum);
1048	if (error)
1049		goto bad;
1050	for (; m; m = m0) {
1051		m0 = m->m_nextpkt;
1052		m->m_nextpkt = 0;
1053#ifdef IPSEC
1054		/* clean ipsec history once it goes out of the node */
1055		ipsec_delaux(m);
1056#endif
1057		if (error == 0) {
1058			/* Record statistics for this interface address. */
1059			if (ia != NULL) {
1060				ia->ia_ifa.if_opackets++;
1061				ia->ia_ifa.if_obytes += m->m_pkthdr.len;
1062			}
1063
1064			error = (*ifp->if_output)(ifp, m,
1065			    (struct sockaddr *)dst, ro->ro_rt);
1066		} else
1067			m_freem(m);
1068	}
1069
1070	if (error == 0)
1071		ipstat.ips_fragmented++;
1072
1073done:
1074	if (ro == &iproute && ro->ro_rt) {
1075		RTFREE(ro->ro_rt);
1076		ro->ro_rt = NULL;
1077	}
1078#ifdef IPSEC
1079	if (sp != NULL) {
1080		KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
1081			printf("DP ip_output call free SP:%p\n", sp));
1082		key_freesp(sp);
1083	}
1084#endif
1085#ifdef FAST_IPSEC
1086	if (sp != NULL)
1087		KEY_FREESP(&sp);
1088#endif
1089	return (error);
1090bad:
1091	m_freem(m);
1092	goto done;
1093}
1094
/*
 * Create a chain of fragments which fit the given mtu. m_frag points to the
 * mbuf to be fragmented; on return it points to the chain with the fragments.
 * Return 0 if no error. If error, m_frag may contain a partially built
 * chain of fragments that should be freed by the caller.
 *
 * if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist)
 * sw_csum contains the delayed checksums flags (e.g., CSUM_DELAY_IP).
 *
 * ip is used as the header of the packet in *m_frag (NOTE(review): the
 * function never derives it from the mbuf itself, so the caller must pass
 * the matching header).  ip_len and ip_off are read as host-order values;
 * each fragment header that is completed, and on success the header of the
 * original packet, is converted with htons() before returning.
 *
 * Errors: EMSGSIZE when IP_DF is set or fewer than 8 payload bytes fit in
 * one fragment; ENOBUFS when a header mbuf or the payload copy for a
 * fragment cannot be obtained.
 */
int
ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu,
	    u_long if_hwassist_flags, int sw_csum)
{
	int error = 0;
	int hlen = ip->ip_hl << 2;	/* header length in bytes */
	int len = (mtu - hlen) & ~7;	/* size of payload in each fragment */
	int off;			/* byte offset into the original packet */
	struct mbuf *m0 = *m_frag;	/* the original packet		*/
	int firstlen;			/* payload bytes kept in the first fragment */
	struct mbuf **mnext;		/* where to link the next fragment */
	int nfrags;

	if (ip->ip_off & IP_DF) {	/* Fragmentation not allowed */
		ipstat.ips_cantfrag++;
		return EMSGSIZE;
	}

	/*
	 * Must be able to put at least 8 bytes per fragment.
	 */
	if (len < 8)
		return EMSGSIZE;

	/*
	 * If the interface will not calculate checksums on
	 * fragmented packets, then do it here.
	 */
	if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA &&
	    (if_hwassist_flags & CSUM_IP_FRAGS) == 0) {
		in_delayed_cksum(m0);
		m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
	}

	if (len > PAGE_SIZE) {
		/*
		 * Fragment large datagrams such that each segment
		 * contains a multiple of PAGE_SIZE amount of data,
		 * plus headers. This enables a receiver to perform
		 * page-flipping zero-copy optimizations.
		 *
		 * XXX When does this help given that sender and receiver
		 * could have different page sizes, and also mtu could
		 * be less than the receiver's page size ?
		 */
		int newlen;
		struct mbuf *m;

		/*
		 * Sum the lengths of the leading mbufs of the chain for as
		 * long as the running total still fits in the mtu; off ends
		 * up as the candidate size of the first fragment.
		 */
		for (m = m0, off = 0; m && (off+m->m_len) <= mtu; m = m->m_next)
			off += m->m_len;

		/*
		 * firstlen (off - hlen) must be aligned on an
		 * 8-byte boundary
		 */
		if (off < hlen)
			goto smart_frag_failure;
		off = ((off - hlen) & ~7) + hlen;
		/* Payload size for the remaining fragments: mtu rounded down
		 * to a multiple of the page size. */
		newlen = (~PAGE_MASK) & mtu;
		if ((newlen + sizeof (struct ip)) > mtu) {
			/* we failed, go back the default */
smart_frag_failure:
			/* Also reached by the goto above when the first
			 * mbuf run is shorter than the header. */
			newlen = len;
			off = hlen + len;
		}
		len = newlen;

	} else {
		off = hlen + len;
	}

	firstlen = off - hlen;
	mnext = &m0->m_nextpkt;		/* pointer to next packet */

	/*
	 * Loop through length of segment after first fragment,
	 * make new header and copy data of each part and link onto chain.
	 * Here, m0 is the original packet, m is the fragment being created.
	 * The fragments are linked off the m_nextpkt of the original
	 * packet, which after processing serves as the first fragment.
	 *
	 * nfrags starts at 1 to account for that first fragment.
	 */
	for (nfrags = 1; off < ip->ip_len; off += len, nfrags++) {
		struct ip *mhip;	/* ip header on the fragment */
		struct mbuf *m;
		int mhlen = sizeof (struct ip);

		MGETHDR(m, M_DONTWAIT, MT_HEADER);
		if (m == 0) {
			/* Fragments linked so far stay on m0's chain. */
			error = ENOBUFS;
			ipstat.ips_odropped++;
			goto done;
		}
		m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG;
		/*
		 * In the first mbuf, leave room for the link header, then
		 * copy the original IP header including options. The payload
		 * goes into an additional mbuf chain returned by m_copy().
		 */
		m->m_data += max_linkhdr;
		mhip = mtod(m, struct ip *);
		*mhip = *ip;
		if (hlen > sizeof (struct ip)) {
			/*
			 * Options present: ip_optcopy() copies only the
			 * options that are to be repeated in fragments, so
			 * the fragment header length may differ from hlen.
			 */
			mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip);
			mhip->ip_v = IPVERSION;
			mhip->ip_hl = mhlen >> 2;
		}
		m->m_len = mhlen;
		/* XXX do we need to add ip->ip_off below ? */
		/* Fragment offset is expressed in 8-byte units. */
		mhip->ip_off = ((off - hlen) >> 3) + ip->ip_off;
		if (off + len >= ip->ip_len) {	/* last fragment */
			len = ip->ip_len - off;
			m->m_flags |= M_LASTFRAG;
		} else
			mhip->ip_off |= IP_MF;
		mhip->ip_len = htons((u_short)(len + mhlen));
		m->m_next = m_copy(m0, off, len);
		if (m->m_next == 0) {		/* copy failed */
			/* m was never linked onto the chain; release it here. */
			m_free(m);
			error = ENOBUFS;	/* ??? */
			ipstat.ips_odropped++;
			goto done;
		}
		m->m_pkthdr.len = mhlen + len;
		m->m_pkthdr.rcvif = (struct ifnet *)0;
#ifdef MAC
		mac_create_fragment(m0, m);
#endif
		m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags;
		mhip->ip_off = htons(mhip->ip_off);
		mhip->ip_sum = 0;
		if (sw_csum & CSUM_DELAY_IP)
			mhip->ip_sum = in_cksum(m, mhlen);
		/* Append the finished fragment to the m_nextpkt list. */
		*mnext = m;
		mnext = &m->m_nextpkt;
	}
	/* Only reached when every fragment was built; the error paths
	 * above jump past this counter update. */
	ipstat.ips_ofragments += nfrags;

	/* set first marker for fragment chain */
	m0->m_flags |= M_FIRSTFRAG | M_FRAG;
	m0->m_pkthdr.csum_data = nfrags;

	/*
	 * Update first fragment by trimming what's been copied out
	 * and updating header.
	 */
	m_adj(m0, hlen + firstlen - ip->ip_len);
	m0->m_pkthdr.len = hlen + firstlen;
	ip->ip_len = htons((u_short)m0->m_pkthdr.len);
	ip->ip_off |= IP_MF;
	ip->ip_off = htons(ip->ip_off);
	ip->ip_sum = 0;
	if (sw_csum & CSUM_DELAY_IP)
		ip->ip_sum = in_cksum(m0, hlen);

done:
	/* Hand the (possibly partial) chain back to the caller. */
	*m_frag = m0;
	return error;
}
1262
1263void
1264in_delayed_cksum(struct mbuf *m)
1265{
1266	struct ip *ip;
1267	u_short csum, offset;
1268
1269	ip = mtod(m, struct ip *);
1270	offset = ip->ip_hl << 2 ;
1271	csum = in_cksum_skip(m, ip->ip_len, offset);
1272	if (m->m_pkthdr.csum_flags & CSUM_UDP && csum == 0)
1273		csum = 0xffff;
1274	offset += m->m_pkthdr.csum_data;	/* checksum offset */
1275
1276	if (offset + sizeof(u_short) > m->m_len) {
1277		printf("delayed m_pullup, m->len: %d  off: %d  p: %d\n",
1278		    m->m_len, offset, ip->ip_p);
1279		/*
1280		 * XXX
1281		 * this shouldn't happen, but if it does, the
1282		 * correct behavior may be to insert the checksum
1283		 * in the existing chain instead of rearranging it.
1284		 */
1285		m = m_pullup(m, offset + sizeof(u_short));
1286	}
1287	*(u_short *)(m->m_data + offset) = csum;
1288}
1289
1290/*
1291 * Insert IP options into preformed packet.
1292 * Adjust IP destination as required for IP source routing,
1293 * as indicated by a non-zero in_addr at the start of the options.
1294 *
1295 * XXX This routine assumes that the packet has no options in place.
1296 */
1297static struct mbuf *
1298ip_insertoptions(m, opt, phlen)
1299	register struct mbuf *m;
1300	struct mbuf *opt;
1301	int *phlen;
1302{
1303	register struct ipoption *p = mtod(opt, struct ipoption *);
1304	struct mbuf *n;
1305	register struct ip *ip = mtod(m, struct ip *);
1306	unsigned optlen;
1307
1308	optlen = opt->m_len - sizeof(p->ipopt_dst);
1309	if (optlen + ip->ip_len > IP_MAXPACKET) {
1310		*phlen = 0;
1311		return (m);		/* XXX should fail */
1312	}
1313	if (p->ipopt_dst.s_addr)
1314		ip->ip_dst = p->ipopt_dst;
1315	if (m->m_flags & M_EXT || m->m_data - optlen < m->m_pktdat) {
1316		MGETHDR(n, M_DONTWAIT, MT_HEADER);
1317		if (n == 0) {
1318			*phlen = 0;
1319			return (m);
1320		}
1321		n->m_pkthdr.rcvif = (struct ifnet *)0;
1322#ifdef MAC
1323		mac_create_mbuf_from_mbuf(m, n);
1324#endif
1325		n->m_pkthdr.len = m->m_pkthdr.len + optlen;
1326		m->m_len -= sizeof(struct ip);
1327		m->m_data += sizeof(struct ip);
1328		n->m_next = m;
1329		m = n;
1330		m->m_len = optlen + sizeof(struct ip);
1331		m->m_data += max_linkhdr;
1332		bcopy(ip, mtod(m, void *), sizeof(struct ip));
1333	} else {
1334		m->m_data -= optlen;
1335		m->m_len += optlen;
1336		m->m_pkthdr.len += optlen;
1337		bcopy(ip, mtod(m, void *), sizeof(struct ip));
1338	}
1339	ip = mtod(m, struct ip *);
1340	bcopy(p->ipopt_list, ip + 1, optlen);
1341	*phlen = sizeof(struct ip) + optlen;
1342	ip->ip_v = IPVERSION;
1343	ip->ip_hl = *phlen >> 2;
1344	ip->ip_len += optlen;
1345	return (m);
1346}
1347
1348/*
1349 * Copy options from ip to jp,
1350 * omitting those not copied during fragmentation.
1351 */
1352int
1353ip_optcopy(ip, jp)
1354	struct ip *ip, *jp;
1355{
1356	register u_char *cp, *dp;
1357	int opt, optlen, cnt;
1358
1359	cp = (u_char *)(ip + 1);
1360	dp = (u_char *)(jp + 1);
1361	cnt = (ip->ip_hl << 2) - sizeof (struct ip);
1362	for (; cnt > 0; cnt -= optlen, cp += optlen) {
1363		opt = cp[0];
1364		if (opt == IPOPT_EOL)
1365			break;
1366		if (opt == IPOPT_NOP) {
1367			/* Preserve for IP mcast tunnel's LSRR alignment. */
1368			*dp++ = IPOPT_NOP;
1369			optlen = 1;
1370			continue;
1371		}
1372
1373		KASSERT(cnt >= IPOPT_OLEN + sizeof(*cp),
1374		    ("ip_optcopy: malformed ipv4 option"));
1375		optlen = cp[IPOPT_OLEN];
1376		KASSERT(optlen >= IPOPT_OLEN + sizeof(*cp) && optlen <= cnt,
1377		    ("ip_optcopy: malformed ipv4 option"));
1378
1379		/* bogus lengths should have been caught by ip_dooptions */
1380		if (optlen > cnt)
1381			optlen = cnt;
1382		if (IPOPT_COPIED(opt)) {
1383			bcopy(cp, dp, optlen);
1384			dp += optlen;
1385		}
1386	}
1387	for (optlen = dp - (u_char *)(jp+1); optlen & 0x3; optlen++)
1388		*dp++ = IPOPT_EOL;
1389	return (optlen);
1390}
1391
1392/*
1393 * IP socket option processing.
1394 */
1395int
1396ip_ctloutput(so, sopt)
1397	struct socket *so;
1398	struct sockopt *sopt;
1399{
1400	struct	inpcb *inp = sotoinpcb(so);
1401	int	error, optval;
1402
1403	error = optval = 0;
1404	if (sopt->sopt_level != IPPROTO_IP) {
1405		return (EINVAL);
1406	}
1407
1408	switch (sopt->sopt_dir) {
1409	case SOPT_SET:
1410		switch (sopt->sopt_name) {
1411		case IP_OPTIONS:
1412#ifdef notyet
1413		case IP_RETOPTS:
1414#endif
1415		{
1416			struct mbuf *m;
1417			if (sopt->sopt_valsize > MLEN) {
1418				error = EMSGSIZE;
1419				break;
1420			}
1421			MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_HEADER);
1422			if (m == 0) {
1423				error = ENOBUFS;
1424				break;
1425			}
1426			m->m_len = sopt->sopt_valsize;
1427			error = sooptcopyin(sopt, mtod(m, char *), m->m_len,
1428					    m->m_len);
1429
1430			return (ip_pcbopts(sopt->sopt_name, &inp->inp_options,
1431					   m));
1432		}
1433
1434		case IP_TOS:
1435		case IP_TTL:
1436		case IP_RECVOPTS:
1437		case IP_RECVRETOPTS:
1438		case IP_RECVDSTADDR:
1439		case IP_RECVTTL:
1440		case IP_RECVIF:
1441		case IP_FAITH:
1442		case IP_ONESBCAST:
1443			error = sooptcopyin(sopt, &optval, sizeof optval,
1444					    sizeof optval);
1445			if (error)
1446				break;
1447
1448			switch (sopt->sopt_name) {
1449			case IP_TOS:
1450				inp->inp_ip_tos = optval;
1451				break;
1452
1453			case IP_TTL:
1454				inp->inp_ip_ttl = optval;
1455				break;
1456#define	OPTSET(bit) \
1457	if (optval) \
1458		inp->inp_flags |= bit; \
1459	else \
1460		inp->inp_flags &= ~bit;
1461
1462			case IP_RECVOPTS:
1463				OPTSET(INP_RECVOPTS);
1464				break;
1465
1466			case IP_RECVRETOPTS:
1467				OPTSET(INP_RECVRETOPTS);
1468				break;
1469
1470			case IP_RECVDSTADDR:
1471				OPTSET(INP_RECVDSTADDR);
1472				break;
1473
1474			case IP_RECVTTL:
1475				OPTSET(INP_RECVTTL);
1476				break;
1477
1478			case IP_RECVIF:
1479				OPTSET(INP_RECVIF);
1480				break;
1481
1482			case IP_FAITH:
1483				OPTSET(INP_FAITH);
1484				break;
1485
1486			case IP_ONESBCAST:
1487				OPTSET(INP_ONESBCAST);
1488				break;
1489			}
1490			break;
1491#undef OPTSET
1492
1493		case IP_MULTICAST_IF:
1494		case IP_MULTICAST_VIF:
1495		case IP_MULTICAST_TTL:
1496		case IP_MULTICAST_LOOP:
1497		case IP_ADD_MEMBERSHIP:
1498		case IP_DROP_MEMBERSHIP:
1499			error = ip_setmoptions(sopt, &inp->inp_moptions);
1500			break;
1501
1502		case IP_PORTRANGE:
1503			error = sooptcopyin(sopt, &optval, sizeof optval,
1504					    sizeof optval);
1505			if (error)
1506				break;
1507
1508			switch (optval) {
1509			case IP_PORTRANGE_DEFAULT:
1510				inp->inp_flags &= ~(INP_LOWPORT);
1511				inp->inp_flags &= ~(INP_HIGHPORT);
1512				break;
1513
1514			case IP_PORTRANGE_HIGH:
1515				inp->inp_flags &= ~(INP_LOWPORT);
1516				inp->inp_flags |= INP_HIGHPORT;
1517				break;
1518
1519			case IP_PORTRANGE_LOW:
1520				inp->inp_flags &= ~(INP_HIGHPORT);
1521				inp->inp_flags |= INP_LOWPORT;
1522				break;
1523
1524			default:
1525				error = EINVAL;
1526				break;
1527			}
1528			break;
1529
1530#if defined(IPSEC) || defined(FAST_IPSEC)
1531		case IP_IPSEC_POLICY:
1532		{
1533			caddr_t req;
1534			size_t len = 0;
1535			int priv;
1536			struct mbuf *m;
1537			int optname;
1538
1539			if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */
1540				break;
1541			if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */
1542				break;
1543			priv = (sopt->sopt_td != NULL &&
1544				suser(sopt->sopt_td) != 0) ? 0 : 1;
1545			req = mtod(m, caddr_t);
1546			len = m->m_len;
1547			optname = sopt->sopt_name;
1548			error = ipsec4_set_policy(inp, optname, req, len, priv);
1549			m_freem(m);
1550			break;
1551		}
1552#endif /*IPSEC*/
1553
1554		default:
1555			error = ENOPROTOOPT;
1556			break;
1557		}
1558		break;
1559
1560	case SOPT_GET:
1561		switch (sopt->sopt_name) {
1562		case IP_OPTIONS:
1563		case IP_RETOPTS:
1564			if (inp->inp_options)
1565				error = sooptcopyout(sopt,
1566						     mtod(inp->inp_options,
1567							  char *),
1568						     inp->inp_options->m_len);
1569			else
1570				sopt->sopt_valsize = 0;
1571			break;
1572
1573		case IP_TOS:
1574		case IP_TTL:
1575		case IP_RECVOPTS:
1576		case IP_RECVRETOPTS:
1577		case IP_RECVDSTADDR:
1578		case IP_RECVTTL:
1579		case IP_RECVIF:
1580		case IP_PORTRANGE:
1581		case IP_FAITH:
1582		case IP_ONESBCAST:
1583			switch (sopt->sopt_name) {
1584
1585			case IP_TOS:
1586				optval = inp->inp_ip_tos;
1587				break;
1588
1589			case IP_TTL:
1590				optval = inp->inp_ip_ttl;
1591				break;
1592
1593#define	OPTBIT(bit)	(inp->inp_flags & bit ? 1 : 0)
1594
1595			case IP_RECVOPTS:
1596				optval = OPTBIT(INP_RECVOPTS);
1597				break;
1598
1599			case IP_RECVRETOPTS:
1600				optval = OPTBIT(INP_RECVRETOPTS);
1601				break;
1602
1603			case IP_RECVDSTADDR:
1604				optval = OPTBIT(INP_RECVDSTADDR);
1605				break;
1606
1607			case IP_RECVTTL:
1608				optval = OPTBIT(INP_RECVTTL);
1609				break;
1610
1611			case IP_RECVIF:
1612				optval = OPTBIT(INP_RECVIF);
1613				break;
1614
1615			case IP_PORTRANGE:
1616				if (inp->inp_flags & INP_HIGHPORT)
1617					optval = IP_PORTRANGE_HIGH;
1618				else if (inp->inp_flags & INP_LOWPORT)
1619					optval = IP_PORTRANGE_LOW;
1620				else
1621					optval = 0;
1622				break;
1623
1624			case IP_FAITH:
1625				optval = OPTBIT(INP_FAITH);
1626				break;
1627
1628			case IP_ONESBCAST:
1629				optval = OPTBIT(INP_ONESBCAST);
1630				break;
1631			}
1632			error = sooptcopyout(sopt, &optval, sizeof optval);
1633			break;
1634
1635		case IP_MULTICAST_IF:
1636		case IP_MULTICAST_VIF:
1637		case IP_MULTICAST_TTL:
1638		case IP_MULTICAST_LOOP:
1639		case IP_ADD_MEMBERSHIP:
1640		case IP_DROP_MEMBERSHIP:
1641			error = ip_getmoptions(sopt, inp->inp_moptions);
1642			break;
1643
1644#if defined(IPSEC) || defined(FAST_IPSEC)
1645		case IP_IPSEC_POLICY:
1646		{
1647			struct mbuf *m = NULL;
1648			caddr_t req = NULL;
1649			size_t len = 0;
1650
1651			if (m != 0) {
1652				req = mtod(m, caddr_t);
1653				len = m->m_len;
1654			}
1655			error = ipsec4_get_policy(sotoinpcb(so), req, len, &m);
1656			if (error == 0)
1657				error = soopt_mcopyout(sopt, m); /* XXX */
1658			if (error == 0)
1659				m_freem(m);
1660			break;
1661		}
1662#endif /*IPSEC*/
1663
1664		default:
1665			error = ENOPROTOOPT;
1666			break;
1667		}
1668		break;
1669	}
1670	return (error);
1671}
1672
1673/*
1674 * Set up IP options in pcb for insertion in output packets.
1675 * Store in mbuf with pointer in pcbopt, adding pseudo-option
1676 * with destination address if source routed.
1677 */
1678static int
1679ip_pcbopts(optname, pcbopt, m)
1680	int optname;
1681	struct mbuf **pcbopt;
1682	register struct mbuf *m;
1683{
1684	register int cnt, optlen;
1685	register u_char *cp;
1686	u_char opt;
1687
1688	/* turn off any old options */
1689	if (*pcbopt)
1690		(void)m_free(*pcbopt);
1691	*pcbopt = 0;
1692	if (m == (struct mbuf *)0 || m->m_len == 0) {
1693		/*
1694		 * Only turning off any previous options.
1695		 */
1696		if (m)
1697			(void)m_free(m);
1698		return (0);
1699	}
1700
1701	if (m->m_len % sizeof(int32_t))
1702		goto bad;
1703	/*
1704	 * IP first-hop destination address will be stored before
1705	 * actual options; move other options back
1706	 * and clear it when none present.
1707	 */
1708	if (m->m_data + m->m_len + sizeof(struct in_addr) >= &m->m_dat[MLEN])
1709		goto bad;
1710	cnt = m->m_len;
1711	m->m_len += sizeof(struct in_addr);
1712	cp = mtod(m, u_char *) + sizeof(struct in_addr);
1713	bcopy(mtod(m, void *), cp, (unsigned)cnt);
1714	bzero(mtod(m, void *), sizeof(struct in_addr));
1715
1716	for (; cnt > 0; cnt -= optlen, cp += optlen) {
1717		opt = cp[IPOPT_OPTVAL];
1718		if (opt == IPOPT_EOL)
1719			break;
1720		if (opt == IPOPT_NOP)
1721			optlen = 1;
1722		else {
1723			if (cnt < IPOPT_OLEN + sizeof(*cp))
1724				goto bad;
1725			optlen = cp[IPOPT_OLEN];
1726			if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt)
1727				goto bad;
1728		}
1729		switch (opt) {
1730
1731		default:
1732			break;
1733
1734		case IPOPT_LSRR:
1735		case IPOPT_SSRR:
1736			/*
1737			 * user process specifies route as:
1738			 *	->A->B->C->D
1739			 * D must be our final destination (but we can't
1740			 * check that since we may not have connected yet).
1741			 * A is first hop destination, which doesn't appear in
1742			 * actual IP option, but is stored before the options.
1743			 */
1744			if (optlen < IPOPT_MINOFF - 1 + sizeof(struct in_addr))
1745				goto bad;
1746			m->m_len -= sizeof(struct in_addr);
1747			cnt -= sizeof(struct in_addr);
1748			optlen -= sizeof(struct in_addr);
1749			cp[IPOPT_OLEN] = optlen;
1750			/*
1751			 * Move first hop before start of options.
1752			 */
1753			bcopy((caddr_t)&cp[IPOPT_OFFSET+1], mtod(m, caddr_t),
1754			    sizeof(struct in_addr));
1755			/*
1756			 * Then copy rest of options back
1757			 * to close up the deleted entry.
1758			 */
1759			bcopy((&cp[IPOPT_OFFSET+1] + sizeof(struct in_addr)),
1760			    &cp[IPOPT_OFFSET+1],
1761			    (unsigned)cnt + sizeof(struct in_addr));
1762			break;
1763		}
1764	}
1765	if (m->m_len > MAX_IPOPTLEN + sizeof(struct in_addr))
1766		goto bad;
1767	*pcbopt = m;
1768	return (0);
1769
1770bad:
1771	(void)m_free(m);
1772	return (EINVAL);
1773}
1774
1775/*
1776 * XXX
1777 * The whole multicast option thing needs to be re-thought.
1778 * Several of these options are equally applicable to non-multicast
1779 * transmission, and one (IP_MULTICAST_TTL) totally duplicates a
1780 * standard option (IP_TTL).
1781 */
1782
1783/*
1784 * following RFC1724 section 3.3, 0.0.0.0/8 is interpreted as interface index.
1785 */
1786static struct ifnet *
1787ip_multicast_if(a, ifindexp)
1788	struct in_addr *a;
1789	int *ifindexp;
1790{
1791	int ifindex;
1792	struct ifnet *ifp;
1793
1794	if (ifindexp)
1795		*ifindexp = 0;
1796	if (ntohl(a->s_addr) >> 24 == 0) {
1797		ifindex = ntohl(a->s_addr) & 0xffffff;
1798		if (ifindex < 0 || if_index < ifindex)
1799			return NULL;
1800		ifp = ifnet_byindex(ifindex);
1801		if (ifindexp)
1802			*ifindexp = ifindex;
1803	} else {
1804		INADDR_TO_IFP(*a, ifp);
1805	}
1806	return ifp;
1807}
1808
/*
 * Set the IP multicast options in response to user setsockopt().
 *
 * sopt describes the request (IP_MULTICAST_VIF, IP_MULTICAST_IF,
 * IP_MULTICAST_TTL, IP_MULTICAST_LOOP, IP_ADD_MEMBERSHIP or
 * IP_DROP_MEMBERSHIP); any other option name yields EOPNOTSUPP.
 * *imop is the per-pcb multicast option structure.  It is allocated here
 * on first use and freed again (with *imop reset to NULL) whenever, after
 * processing the request, every field holds its default value -- this
 * also covers a request that failed right after the allocation.
 *
 * Returns 0 on success or an errno value.
 */
static int
ip_setmoptions(sopt, imop)
	struct sockopt *sopt;
	struct ip_moptions **imop;
{
	int error = 0;
	int i;
	struct in_addr addr;
	struct ip_mreq mreq;
	struct ifnet *ifp;
	struct ip_moptions *imo = *imop;
	struct route ro;
	struct sockaddr_in *dst;
	int ifindex;
	int s;

	if (imo == NULL) {
		/*
		 * No multicast option buffer attached to the pcb;
		 * allocate one and initialize to default values.
		 */
		imo = (struct ip_moptions*)malloc(sizeof(*imo), M_IPMOPTS,
		    M_WAITOK);

		/*
		 * NOTE(review): the allocation uses M_WAITOK, so this check
		 * is presumably never true -- confirm against malloc(9).
		 */
		if (imo == NULL)
			return (ENOBUFS);
		*imop = imo;
		imo->imo_multicast_ifp = NULL;
		imo->imo_multicast_addr.s_addr = INADDR_ANY;
		imo->imo_multicast_vif = -1;
		imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
		imo->imo_multicast_loop = IP_DEFAULT_MULTICAST_LOOP;
		imo->imo_num_memberships = 0;
	}

	switch (sopt->sopt_name) {
	/* store an index number for the vif you wanna use in the send */
	case IP_MULTICAST_VIF:
		/* legal_vif_num is a function pointer; unset means the
		 * check cannot be performed. */
		if (legal_vif_num == 0) {
			error = EOPNOTSUPP;
			break;
		}
		error = sooptcopyin(sopt, &i, sizeof i, sizeof i);
		if (error)
			break;
		/* -1 is accepted as "no vif selected". */
		if (!legal_vif_num(i) && (i != -1)) {
			error = EINVAL;
			break;
		}
		imo->imo_multicast_vif = i;
		break;

	case IP_MULTICAST_IF:
		/*
		 * Select the interface for outgoing multicast packets.
		 */
		error = sooptcopyin(sopt, &addr, sizeof addr, sizeof addr);
		if (error)
			break;
		/*
		 * INADDR_ANY is used to remove a previous selection.
		 * When no interface is selected, a default one is
		 * chosen every time a multicast packet is sent.
		 */
		if (addr.s_addr == INADDR_ANY) {
			imo->imo_multicast_ifp = NULL;
			break;
		}
		/*
		 * The selected interface is identified by its local
		 * IP address.  Find the interface and confirm that
		 * it supports multicasting.
		 */
		s = splimp();
		ifp = ip_multicast_if(&addr, &ifindex);
		if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) {
			splx(s);
			error = EADDRNOTAVAIL;
			break;
		}
		imo->imo_multicast_ifp = ifp;
		/*
		 * Remember the user's value only when the interface was
		 * named by index, so that getsockopt can return it as given.
		 */
		if (ifindex)
			imo->imo_multicast_addr = addr;
		else
			imo->imo_multicast_addr.s_addr = INADDR_ANY;
		splx(s);
		break;

	case IP_MULTICAST_TTL:
		/*
		 * Set the IP time-to-live for outgoing multicast packets.
		 * The original multicast API required a char argument,
		 * which is inconsistent with the rest of the socket API.
		 * We allow either a char or an int.
		 */
		if (sopt->sopt_valsize == 1) {
			u_char ttl;
			error = sooptcopyin(sopt, &ttl, 1, 1);
			if (error)
				break;
			imo->imo_multicast_ttl = ttl;
		} else {
			u_int ttl;
			error = sooptcopyin(sopt, &ttl, sizeof ttl,
					    sizeof ttl);
			if (error)
				break;
			if (ttl > 255)
				error = EINVAL;
			else
				imo->imo_multicast_ttl = ttl;
		}
		break;

	case IP_MULTICAST_LOOP:
		/*
		 * Set the loopback flag for outgoing multicast packets.
		 * Must be zero or one.  The original multicast API required a
		 * char argument, which is inconsistent with the rest
		 * of the socket API.  We allow either a char or an int.
		 *
		 * Any non-zero value is normalized to one by the !! below.
		 */
		if (sopt->sopt_valsize == 1) {
			u_char loop;
			error = sooptcopyin(sopt, &loop, 1, 1);
			if (error)
				break;
			imo->imo_multicast_loop = !!loop;
		} else {
			u_int loop;
			error = sooptcopyin(sopt, &loop, sizeof loop,
					    sizeof loop);
			if (error)
				break;
			imo->imo_multicast_loop = !!loop;
		}
		break;

	case IP_ADD_MEMBERSHIP:
		/*
		 * Add a multicast group membership.
		 * Group must be a valid IP multicast address.
		 */
		error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq);
		if (error)
			break;

		if (!IN_MULTICAST(ntohl(mreq.imr_multiaddr.s_addr))) {
			error = EINVAL;
			break;
		}
		s = splimp();
		/*
		 * If no interface address was provided, use the interface of
		 * the route to the given multicast address.
		 */
		if (mreq.imr_interface.s_addr == INADDR_ANY) {
			bzero((caddr_t)&ro, sizeof(ro));
			dst = (struct sockaddr_in *)&ro.ro_dst;
			dst->sin_len = sizeof(*dst);
			dst->sin_family = AF_INET;
			dst->sin_addr = mreq.imr_multiaddr;
			rtalloc_ign(&ro, RTF_CLONING);
			if (ro.ro_rt == NULL) {
				error = EADDRNOTAVAIL;
				splx(s);
				break;
			}
			/* Only the interface is needed; drop the route. */
			ifp = ro.ro_rt->rt_ifp;
			RTFREE(ro.ro_rt);
		}
		else {
			ifp = ip_multicast_if(&mreq.imr_interface, NULL);
		}

		/*
		 * See if we found an interface, and confirm that it
		 * supports multicast.
		 */
		if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) {
			error = EADDRNOTAVAIL;
			splx(s);
			break;
		}
		/*
		 * See if the membership already exists or if all the
		 * membership slots are full.
		 */
		for (i = 0; i < imo->imo_num_memberships; ++i) {
			if (imo->imo_membership[i]->inm_ifp == ifp &&
			    imo->imo_membership[i]->inm_addr.s_addr
						== mreq.imr_multiaddr.s_addr)
				break;
		}
		if (i < imo->imo_num_memberships) {
			error = EADDRINUSE;
			splx(s);
			break;
		}
		/* Here i == imo_num_memberships, the first free slot. */
		if (i == IP_MAX_MEMBERSHIPS) {
			error = ETOOMANYREFS;
			splx(s);
			break;
		}
		/*
		 * Everything looks good; add a new record to the multicast
		 * address list for the given interface.
		 */
		if ((imo->imo_membership[i] =
		    in_addmulti(&mreq.imr_multiaddr, ifp)) == NULL) {
			error = ENOBUFS;
			splx(s);
			break;
		}
		++imo->imo_num_memberships;
		splx(s);
		break;

	case IP_DROP_MEMBERSHIP:
		/*
		 * Drop a multicast group membership.
		 * Group must be a valid IP multicast address.
		 */
		error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq);
		if (error)
			break;

		if (!IN_MULTICAST(ntohl(mreq.imr_multiaddr.s_addr))) {
			error = EINVAL;
			break;
		}

		s = splimp();
		/*
		 * If an interface address was specified, get a pointer
		 * to its ifnet structure.
		 */
		if (mreq.imr_interface.s_addr == INADDR_ANY)
			ifp = NULL;
		else {
			ifp = ip_multicast_if(&mreq.imr_interface, NULL);
			if (ifp == NULL) {
				error = EADDRNOTAVAIL;
				splx(s);
				break;
			}
		}
		/*
		 * Find the membership in the membership array.
		 * A NULL ifp matches the group on any interface.
		 */
		for (i = 0; i < imo->imo_num_memberships; ++i) {
			if ((ifp == NULL ||
			     imo->imo_membership[i]->inm_ifp == ifp) &&
			     imo->imo_membership[i]->inm_addr.s_addr ==
			     mreq.imr_multiaddr.s_addr)
				break;
		}
		if (i == imo->imo_num_memberships) {
			error = EADDRNOTAVAIL;
			splx(s);
			break;
		}
		/*
		 * Give up the multicast address record to which the
		 * membership points.
		 */
		in_delmulti(imo->imo_membership[i]);
		/*
		 * Remove the gap in the membership array.
		 */
		for (++i; i < imo->imo_num_memberships; ++i)
			imo->imo_membership[i-1] = imo->imo_membership[i];
		--imo->imo_num_memberships;
		splx(s);
		break;

	default:
		error = EOPNOTSUPP;
		break;
	}

	/*
	 * If all options have default values, no need to keep the
	 * option structure.
	 */
	if (imo->imo_multicast_ifp == NULL &&
	    imo->imo_multicast_vif == -1 &&
	    imo->imo_multicast_ttl == IP_DEFAULT_MULTICAST_TTL &&
	    imo->imo_multicast_loop == IP_DEFAULT_MULTICAST_LOOP &&
	    imo->imo_num_memberships == 0) {
		free(*imop, M_IPMOPTS);
		*imop = NULL;
	}

	return (error);
}
2106
2107/*
2108 * Return the IP multicast options in response to user getsockopt().
2109 */
2110static int
2111ip_getmoptions(sopt, imo)
2112	struct sockopt *sopt;
2113	register struct ip_moptions *imo;
2114{
2115	struct in_addr addr;
2116	struct in_ifaddr *ia;
2117	int error, optval;
2118	u_char coptval;
2119
2120	error = 0;
2121	switch (sopt->sopt_name) {
2122	case IP_MULTICAST_VIF:
2123		if (imo != NULL)
2124			optval = imo->imo_multicast_vif;
2125		else
2126			optval = -1;
2127		error = sooptcopyout(sopt, &optval, sizeof optval);
2128		break;
2129
2130	case IP_MULTICAST_IF:
2131		if (imo == NULL || imo->imo_multicast_ifp == NULL)
2132			addr.s_addr = INADDR_ANY;
2133		else if (imo->imo_multicast_addr.s_addr) {
2134			/* return the value user has set */
2135			addr = imo->imo_multicast_addr;
2136		} else {
2137			IFP_TO_IA(imo->imo_multicast_ifp, ia);
2138			addr.s_addr = (ia == NULL) ? INADDR_ANY
2139				: IA_SIN(ia)->sin_addr.s_addr;
2140		}
2141		error = sooptcopyout(sopt, &addr, sizeof addr);
2142		break;
2143
2144	case IP_MULTICAST_TTL:
2145		if (imo == 0)
2146			optval = coptval = IP_DEFAULT_MULTICAST_TTL;
2147		else
2148			optval = coptval = imo->imo_multicast_ttl;
2149		if (sopt->sopt_valsize == 1)
2150			error = sooptcopyout(sopt, &coptval, 1);
2151		else
2152			error = sooptcopyout(sopt, &optval, sizeof optval);
2153		break;
2154
2155	case IP_MULTICAST_LOOP:
2156		if (imo == 0)
2157			optval = coptval = IP_DEFAULT_MULTICAST_LOOP;
2158		else
2159			optval = coptval = imo->imo_multicast_loop;
2160		if (sopt->sopt_valsize == 1)
2161			error = sooptcopyout(sopt, &coptval, 1);
2162		else
2163			error = sooptcopyout(sopt, &optval, sizeof optval);
2164		break;
2165
2166	default:
2167		error = ENOPROTOOPT;
2168		break;
2169	}
2170	return (error);
2171}
2172
2173/*
2174 * Discard the IP multicast options.
2175 */
2176void
2177ip_freemoptions(imo)
2178	register struct ip_moptions *imo;
2179{
2180	register int i;
2181
2182	if (imo != NULL) {
2183		for (i = 0; i < imo->imo_num_memberships; ++i)
2184			in_delmulti(imo->imo_membership[i]);
2185		free(imo, M_IPMOPTS);
2186	}
2187}
2188
2189/*
2190 * Routine called from ip_output() to loop back a copy of an IP multicast
2191 * packet to the input queue of a specified interface.  Note that this
2192 * calls the output routine of the loopback "driver", but with an interface
2193 * pointer that might NOT be a loopback interface -- evil, but easier than
2194 * replicating that code here.
2195 */
2196static void
2197ip_mloopback(ifp, m, dst, hlen)
2198	struct ifnet *ifp;
2199	register struct mbuf *m;
2200	register struct sockaddr_in *dst;
2201	int hlen;
2202{
2203	register struct ip *ip;
2204	struct mbuf *copym;
2205
2206	copym = m_copy(m, 0, M_COPYALL);
2207	if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < hlen))
2208		copym = m_pullup(copym, hlen);
2209	if (copym != NULL) {
2210		/*
2211		 * We don't bother to fragment if the IP length is greater
2212		 * than the interface's MTU.  Can this possibly matter?
2213		 */
2214		ip = mtod(copym, struct ip *);
2215		ip->ip_len = htons(ip->ip_len);
2216		ip->ip_off = htons(ip->ip_off);
2217		ip->ip_sum = 0;
2218		ip->ip_sum = in_cksum(copym, hlen);
2219		/*
2220		 * NB:
2221		 * It's not clear whether there are any lingering
2222		 * reentrancy problems in other areas which might
2223		 * be exposed by using ip_input directly (in
2224		 * particular, everything which modifies the packet
2225		 * in-place).  Yet another option is using the
2226		 * protosw directly to deliver the looped back
2227		 * packet.  For the moment, we'll err on the side
2228		 * of safety by using if_simloop().
2229		 */
2230#if 1 /* XXX */
2231		if (dst->sin_family != AF_INET) {
2232			printf("ip_mloopback: bad address family %d\n",
2233						dst->sin_family);
2234			dst->sin_family = AF_INET;
2235		}
2236#endif
2237
2238#ifdef notdef
2239		copym->m_pkthdr.rcvif = ifp;
2240		ip_input(copym);
2241#else
2242		/* if the checksum hasn't been computed, mark it as valid */
2243		if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
2244			copym->m_pkthdr.csum_flags |=
2245			    CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
2246			copym->m_pkthdr.csum_data = 0xffff;
2247		}
2248		if_simloop(ifp, copym, dst->sin_family, 0);
2249#endif
2250	}
2251}
2252