ip_output.c revision 125784
1/*
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 3. All advertising materials mentioning features or use of this software
14 *    must display the following acknowledgement:
15 *	This product includes software developed by the University of
16 *	California, Berkeley and its contributors.
17 * 4. Neither the name of the University nor the names of its contributors
18 *    may be used to endorse or promote products derived from this software
19 *    without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 *
33 *	@(#)ip_output.c	8.3 (Berkeley) 1/21/94
34 * $FreeBSD: head/sys/netinet/ip_output.c 125784 2004-02-13 19:14:16Z mlaier $
35 */
36
37#include "opt_ipfw.h"
38#include "opt_ipdn.h"
39#include "opt_ipdivert.h"
40#include "opt_ipfilter.h"
41#include "opt_ipsec.h"
42#include "opt_mac.h"
43#include "opt_pfil_hooks.h"
44#include "opt_random_ip_id.h"
45#include "opt_mbuf_stress_test.h"
46
47#include <sys/param.h>
48#include <sys/systm.h>
49#include <sys/kernel.h>
50#include <sys/mac.h>
51#include <sys/malloc.h>
52#include <sys/mbuf.h>
53#include <sys/protosw.h>
54#include <sys/socket.h>
55#include <sys/socketvar.h>
56#include <sys/sysctl.h>
57
58#include <net/if.h>
59#include <net/route.h>
60
61#include <netinet/in.h>
62#include <netinet/in_systm.h>
63#include <netinet/ip.h>
64#include <netinet/in_pcb.h>
65#include <netinet/in_var.h>
66#include <netinet/ip_var.h>
67
68#ifdef PFIL_HOOKS
69#include <net/pfil.h>
70#endif
71
72#include <machine/in_cksum.h>
73
74static MALLOC_DEFINE(M_IPMOPTS, "ip_moptions", "internet multicast options");
75
76#ifdef IPSEC
77#include <netinet6/ipsec.h>
78#include <netkey/key.h>
79#ifdef IPSEC_DEBUG
80#include <netkey/key_debug.h>
81#else
82#define	KEYDEBUG(lev,arg)
83#endif
84#endif /*IPSEC*/
85
86#ifdef FAST_IPSEC
87#include <netipsec/ipsec.h>
88#include <netipsec/xform.h>
89#include <netipsec/key.h>
90#endif /*FAST_IPSEC*/
91
92#include <netinet/ip_fw.h>
93#include <netinet/ip_divert.h>
94#include <netinet/ip_dummynet.h>
95
/*
 * Debug helper: printf "<x> A.B.C.D<y>" for the struct in_addr `a`, whose
 * s_addr is converted with ntohl() before the octets are extracted.
 * Arguments are parenthesized so expressions such as `*p` may be passed,
 * and each octet is cast to int to match the %d conversions.
 * NB: the expansion itself ends in ';' (kept for existing users), so the
 * macro can only be used where a complete statement is allowed.
 */
#define print_ip(x, a, y)	 printf("%s %d.%d.%d.%d%s",\
				(x), (int)((ntohl((a).s_addr)>>24)&0xFF),\
				  (int)((ntohl((a).s_addr)>>16)&0xFF),\
				  (int)((ntohl((a).s_addr)>>8)&0xFF),\
				  (int)((ntohl((a).s_addr))&0xFF), (y));
101
102u_short ip_id;
103
104#ifdef MBUF_STRESS_TEST
105int mbuf_frag_size = 0;
106SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW,
107	&mbuf_frag_size, 0, "Fragment outgoing mbufs to this size");
108#endif
109
110static struct mbuf *ip_insertoptions(struct mbuf *, struct mbuf *, int *);
111static struct ifnet *ip_multicast_if(struct in_addr *, int *);
112static void	ip_mloopback
113	(struct ifnet *, struct mbuf *, struct sockaddr_in *, int);
114static int	ip_getmoptions
115	(struct sockopt *, struct ip_moptions *);
116static int	ip_pcbopts(int, struct mbuf **, struct mbuf *);
117static int	ip_setmoptions
118	(struct sockopt *, struct ip_moptions **);
119
120int	ip_optcopy(struct ip *, struct ip *);
121
122
123extern	struct protosw inetsw[];
124
125/*
126 * IP output.  The packet in mbuf chain m contains a skeletal IP
127 * header (with len, off, ttl, proto, tos, src, dst).
128 * The mbuf chain containing the packet will be freed.
129 * The mbuf opt, if present, will not be freed.
130 * In the IP forwarding case, the packet will arrive with options already
131 * inserted, so must have a NULL opt pointer.
132 */
133int
134ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro,
135	int flags, struct ip_moptions *imo, struct inpcb *inp)
136{
137	struct ip *ip;
138	struct ifnet *ifp = NULL;	/* keep compiler happy */
139	int hlen = sizeof (struct ip);
140	int len, off, error = 0;
141	struct sockaddr_in *dst = NULL;	/* keep compiler happy */
142	struct in_ifaddr *ia = NULL;
143	int isbroadcast, sw_csum;
144	struct in_addr pkt_dst;
145	struct route iproute;
146	struct m_tag *dummytag;		/* dummynet packet tag */
147	struct m_tag *mtag;
148	struct mbuf *m0;		/* XXX */
149#ifdef IPSEC
150	struct secpolicy *sp = NULL;
151#endif
152#ifdef FAST_IPSEC
153	struct secpolicy *sp = NULL;
154	struct tdb_ident *tdbi;
155	int s;
156#endif /* FAST_IPSEC */
157	struct ip_fw_args args;
158	int src_was_INADDR_ANY = 0;	/* as the name says... */
159
160	args.eh = NULL;
161	args.rule = NULL;
162	args.next_hop = ip_claim_next_hop(m);
163
164	M_ASSERTPKTHDR(m);
165
166	if (ro == NULL) {
167		ro = &iproute;
168		bzero(ro, sizeof (*ro));
169	}
170
171	if (inp != NULL)
172		INP_LOCK_ASSERT(inp);
173
174	/*
175	 * When packet comes from dummynet restore state from
176	 * previous processing instead of the header.  Yech!
177	 *
178	 * XXX add conditional compilation?
179	 */
180	dummytag = m_tag_find(m, PACKET_TAG_DUMMYNET, NULL);
181	if (dummytag != NULL) {
182		struct dn_pkt_tag *dt = (struct dn_pkt_tag *)(dummytag+1);
183
184		/*
185		 * NB: the route in the tag is known to have a
186		 * reference that must be free'd, but doing this
187		 * before the storage is reclaimed is painful due
188		 * to some of the contorted code in this routine.
189		 * So instead unlink the tag from the mbuf so it
190		 * doesn't get reclaimed and do the cleanup explicitly
191		 * below.  We should be able to do this automatically
192		 * using a uma dtor method when m_tag's can be
193		 * allocated from zones.
194		 */
195		m_tag_unlink(m, dummytag);
196
197		args.rule = dt->rule;
198		ro = &dt->ro;
199		dst = dt->dn_dst;
200		ifp = dt->ifp;
201
202		ip = mtod(m, struct ip *);
203		hlen = ip->ip_hl << 2 ;
204		if (ro->ro_rt)
205			ia = ifatoia(ro->ro_rt->rt_ifa);
206		goto sendit;
207	}
208
209	if (opt) {
210		len = 0;
211		m = ip_insertoptions(m, opt, &len);
212		if (len != 0)
213			hlen = len;
214	}
215	ip = mtod(m, struct ip *);
216	pkt_dst = args.next_hop ? args.next_hop->sin_addr : ip->ip_dst;
217
218	/*
219	 * Fill in IP header.  If we are not allowing fragmentation,
220	 * then the ip_id field is meaningless, but we don't set it
221	 * to zero.  Doing so causes various problems when devices along
222	 * the path (routers, load balancers, firewalls, etc.) illegally
223	 * disable DF on our packet.  Note that a 16-bit counter
224	 * will wrap around in less than 10 seconds at 100 Mbit/s on a
225	 * medium with MTU 1500.  See Steven M. Bellovin, "A Technique
226	 * for Counting NATted Hosts", Proc. IMW'02, available at
227	 * <http://www.research.att.com/~smb/papers/fnat.pdf>.
228	 */
229	if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
230		ip->ip_v = IPVERSION;
231		ip->ip_hl = hlen >> 2;
232#ifdef RANDOM_IP_ID
233		ip->ip_id = ip_randomid();
234#else
235		ip->ip_id = htons(ip_id++);
236#endif
237		ipstat.ips_localout++;
238	} else {
239		hlen = ip->ip_hl << 2;
240	}
241
242	dst = (struct sockaddr_in *)&ro->ro_dst;
243	/*
244	 * If there is a cached route,
245	 * check that it is to the same destination
246	 * and is still up.  If not, free it and try again.
247	 * The address family should also be checked in case of sharing the
248	 * cache with IPv6.
249	 */
250	if (ro->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 ||
251			  dst->sin_family != AF_INET ||
252			  dst->sin_addr.s_addr != pkt_dst.s_addr)) {
253		RTFREE(ro->ro_rt);
254		ro->ro_rt = (struct rtentry *)0;
255	}
256	if (ro->ro_rt == 0) {
257		bzero(dst, sizeof(*dst));
258		dst->sin_family = AF_INET;
259		dst->sin_len = sizeof(*dst);
260		dst->sin_addr = pkt_dst;
261	}
262	/*
263	 * If routing to interface only,
264	 * short circuit routing lookup.
265	 */
266	if (flags & IP_ROUTETOIF) {
267		if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == 0 &&
268		    (ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == 0) {
269			ipstat.ips_noroute++;
270			error = ENETUNREACH;
271			goto bad;
272		}
273		ifp = ia->ia_ifp;
274		ip->ip_ttl = 1;
275		isbroadcast = in_broadcast(dst->sin_addr, ifp);
276	} else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) &&
277	    imo != NULL && imo->imo_multicast_ifp != NULL) {
278		/*
279		 * Bypass the normal routing lookup for multicast
280		 * packets if the interface is specified.
281		 */
282		ifp = imo->imo_multicast_ifp;
283		IFP_TO_IA(ifp, ia);
284		isbroadcast = 0;	/* fool gcc */
285	} else {
286		/*
287		 * We want to do any cloning requested by the link layer,
288		 * as this is probably required in all cases for correct
289		 * operation (as it is for ARP).
290		 */
291		if (ro->ro_rt == 0)
292			rtalloc(ro);
293		if (ro->ro_rt == 0) {
294			ipstat.ips_noroute++;
295			error = EHOSTUNREACH;
296			goto bad;
297		}
298		ia = ifatoia(ro->ro_rt->rt_ifa);
299		ifp = ro->ro_rt->rt_ifp;
300		ro->ro_rt->rt_rmx.rmx_pksent++;
301		if (ro->ro_rt->rt_flags & RTF_GATEWAY)
302			dst = (struct sockaddr_in *)ro->ro_rt->rt_gateway;
303		if (ro->ro_rt->rt_flags & RTF_HOST)
304			isbroadcast = (ro->ro_rt->rt_flags & RTF_BROADCAST);
305		else
306			isbroadcast = in_broadcast(dst->sin_addr, ifp);
307	}
308	if (IN_MULTICAST(ntohl(pkt_dst.s_addr))) {
309		struct in_multi *inm;
310
311		m->m_flags |= M_MCAST;
312		/*
313		 * IP destination address is multicast.  Make sure "dst"
314		 * still points to the address in "ro".  (It may have been
315		 * changed to point to a gateway address, above.)
316		 */
317		dst = (struct sockaddr_in *)&ro->ro_dst;
318		/*
319		 * See if the caller provided any multicast options
320		 */
321		if (imo != NULL) {
322			ip->ip_ttl = imo->imo_multicast_ttl;
323			if (imo->imo_multicast_vif != -1)
324				ip->ip_src.s_addr =
325				    ip_mcast_src ?
326				    ip_mcast_src(imo->imo_multicast_vif) :
327				    INADDR_ANY;
328		} else
329			ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL;
330		/*
331		 * Confirm that the outgoing interface supports multicast.
332		 */
333		if ((imo == NULL) || (imo->imo_multicast_vif == -1)) {
334			if ((ifp->if_flags & IFF_MULTICAST) == 0) {
335				ipstat.ips_noroute++;
336				error = ENETUNREACH;
337				goto bad;
338			}
339		}
340		/*
341		 * If source address not specified yet, use address
342		 * of outgoing interface.
343		 */
344		if (ip->ip_src.s_addr == INADDR_ANY) {
345			/* Interface may have no addresses. */
346			if (ia != NULL)
347				ip->ip_src = IA_SIN(ia)->sin_addr;
348		}
349
350		if (ip_mrouter && (flags & IP_FORWARDING) == 0) {
351			/*
352			 * XXX
353			 * delayed checksums are not currently
354			 * compatible with IP multicast routing
355			 */
356			if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
357				in_delayed_cksum(m);
358				m->m_pkthdr.csum_flags &=
359					~CSUM_DELAY_DATA;
360			}
361		}
362		IN_LOOKUP_MULTI(pkt_dst, ifp, inm);
363		if (inm != NULL &&
364		   (imo == NULL || imo->imo_multicast_loop)) {
365			/*
366			 * If we belong to the destination multicast group
367			 * on the outgoing interface, and the caller did not
368			 * forbid loopback, loop back a copy.
369			 */
370			ip_mloopback(ifp, m, dst, hlen);
371		}
372		else {
373			/*
374			 * If we are acting as a multicast router, perform
375			 * multicast forwarding as if the packet had just
376			 * arrived on the interface to which we are about
377			 * to send.  The multicast forwarding function
378			 * recursively calls this function, using the
379			 * IP_FORWARDING flag to prevent infinite recursion.
380			 *
381			 * Multicasts that are looped back by ip_mloopback(),
382			 * above, will be forwarded by the ip_input() routine,
383			 * if necessary.
384			 */
385			if (ip_mrouter && (flags & IP_FORWARDING) == 0) {
386				/*
387				 * If rsvp daemon is not running, do not
388				 * set ip_moptions. This ensures that the packet
389				 * is multicast and not just sent down one link
390				 * as prescribed by rsvpd.
391				 */
392				if (!rsvp_on)
393					imo = NULL;
394				if (ip_mforward &&
395				    ip_mforward(ip, ifp, m, imo) != 0) {
396					m_freem(m);
397					goto done;
398				}
399			}
400		}
401
402		/*
403		 * Multicasts with a time-to-live of zero may be looped-
404		 * back, above, but must not be transmitted on a network.
405		 * Also, multicasts addressed to the loopback interface
406		 * are not sent -- the above call to ip_mloopback() will
407		 * loop back a copy if this host actually belongs to the
408		 * destination group on the loopback interface.
409		 */
410		if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) {
411			m_freem(m);
412			goto done;
413		}
414
415		goto sendit;
416	}
417#ifndef notdef
418	/*
419	 * If the source address is not specified yet, use the address
420	 * of the outoing interface. In case, keep note we did that, so
421	 * if the the firewall changes the next-hop causing the output
422	 * interface to change, we can fix that.
423	 */
424	if (ip->ip_src.s_addr == INADDR_ANY) {
425		/* Interface may have no addresses. */
426		if (ia != NULL) {
427			ip->ip_src = IA_SIN(ia)->sin_addr;
428			src_was_INADDR_ANY = 1;
429		}
430	}
431#endif /* notdef */
432	/*
433	 * Verify that we have any chance at all of being able to queue
434	 *      the packet or packet fragments
435	 */
436	if ((ifp->if_snd.ifq_len + ip->ip_len / ifp->if_mtu + 1) >=
437		ifp->if_snd.ifq_maxlen) {
438			error = ENOBUFS;
439			ipstat.ips_odropped++;
440			goto bad;
441	}
442
443	/*
444	 * Look for broadcast address and
445	 * verify user is allowed to send
446	 * such a packet.
447	 */
448	if (isbroadcast) {
449		if ((ifp->if_flags & IFF_BROADCAST) == 0) {
450			error = EADDRNOTAVAIL;
451			goto bad;
452		}
453		if ((flags & IP_ALLOWBROADCAST) == 0) {
454			error = EACCES;
455			goto bad;
456		}
457		/* don't allow broadcast messages to be fragmented */
458		if (ip->ip_len > ifp->if_mtu) {
459			error = EMSGSIZE;
460			goto bad;
461		}
462		if (flags & IP_SENDONES)
463			ip->ip_dst.s_addr = INADDR_BROADCAST;
464		m->m_flags |= M_BCAST;
465	} else {
466		m->m_flags &= ~M_BCAST;
467	}
468
469sendit:
470#ifdef IPSEC
471	/* get SP for this packet */
472	if (inp == NULL)
473		sp = ipsec4_getpolicybyaddr(m, IPSEC_DIR_OUTBOUND,
474		    flags, &error);
475	else
476		sp = ipsec4_getpolicybypcb(m, IPSEC_DIR_OUTBOUND, inp, &error);
477
478	if (sp == NULL) {
479		ipsecstat.out_inval++;
480		goto bad;
481	}
482
483	error = 0;
484
485	/* check policy */
486	switch (sp->policy) {
487	case IPSEC_POLICY_DISCARD:
488		/*
489		 * This packet is just discarded.
490		 */
491		ipsecstat.out_polvio++;
492		goto bad;
493
494	case IPSEC_POLICY_BYPASS:
495	case IPSEC_POLICY_NONE:
496	case IPSEC_POLICY_TCP:
497		/* no need to do IPsec. */
498		goto skip_ipsec;
499
500	case IPSEC_POLICY_IPSEC:
501		if (sp->req == NULL) {
502			/* acquire a policy */
503			error = key_spdacquire(sp);
504			goto bad;
505		}
506		break;
507
508	case IPSEC_POLICY_ENTRUST:
509	default:
510		printf("ip_output: Invalid policy found. %d\n", sp->policy);
511	}
512    {
513	struct ipsec_output_state state;
514	bzero(&state, sizeof(state));
515	state.m = m;
516	if (flags & IP_ROUTETOIF) {
517		state.ro = &iproute;
518		bzero(&iproute, sizeof(iproute));
519	} else
520		state.ro = ro;
521	state.dst = (struct sockaddr *)dst;
522
523	ip->ip_sum = 0;
524
525	/*
526	 * XXX
527	 * delayed checksums are not currently compatible with IPsec
528	 */
529	if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
530		in_delayed_cksum(m);
531		m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
532	}
533
534	ip->ip_len = htons(ip->ip_len);
535	ip->ip_off = htons(ip->ip_off);
536
537	error = ipsec4_output(&state, sp, flags);
538
539	m = state.m;
540	if (flags & IP_ROUTETOIF) {
541		/*
542		 * if we have tunnel mode SA, we may need to ignore
543		 * IP_ROUTETOIF.
544		 */
545		if (state.ro != &iproute || state.ro->ro_rt != NULL) {
546			flags &= ~IP_ROUTETOIF;
547			ro = state.ro;
548		}
549	} else
550		ro = state.ro;
551	dst = (struct sockaddr_in *)state.dst;
552	if (error) {
553		/* mbuf is already reclaimed in ipsec4_output. */
554		m = NULL;
555		switch (error) {
556		case EHOSTUNREACH:
557		case ENETUNREACH:
558		case EMSGSIZE:
559		case ENOBUFS:
560		case ENOMEM:
561			break;
562		default:
563			printf("ip4_output (ipsec): error code %d\n", error);
564			/*fall through*/
565		case ENOENT:
566			/* don't show these error codes to the user */
567			error = 0;
568			break;
569		}
570		goto bad;
571	}
572    }
573
574	/* be sure to update variables that are affected by ipsec4_output() */
575	ip = mtod(m, struct ip *);
576	hlen = ip->ip_hl << 2;
577	if (ro->ro_rt == NULL) {
578		if ((flags & IP_ROUTETOIF) == 0) {
579			printf("ip_output: "
580				"can't update route after IPsec processing\n");
581			error = EHOSTUNREACH;	/*XXX*/
582			goto bad;
583		}
584	} else {
585		ia = ifatoia(ro->ro_rt->rt_ifa);
586		ifp = ro->ro_rt->rt_ifp;
587	}
588
589	/* make it flipped, again. */
590	ip->ip_len = ntohs(ip->ip_len);
591	ip->ip_off = ntohs(ip->ip_off);
592skip_ipsec:
593#endif /*IPSEC*/
594#ifdef FAST_IPSEC
595	/*
596	 * Check the security policy (SP) for the packet and, if
597	 * required, do IPsec-related processing.  There are two
598	 * cases here; the first time a packet is sent through
599	 * it will be untagged and handled by ipsec4_checkpolicy.
600	 * If the packet is resubmitted to ip_output (e.g. after
601	 * AH, ESP, etc. processing), there will be a tag to bypass
602	 * the lookup and related policy checking.
603	 */
604	mtag = m_tag_find(m, PACKET_TAG_IPSEC_PENDING_TDB, NULL);
605	s = splnet();
606	if (mtag != NULL) {
607		tdbi = (struct tdb_ident *)(mtag + 1);
608		sp = ipsec_getpolicy(tdbi, IPSEC_DIR_OUTBOUND);
609		if (sp == NULL)
610			error = -EINVAL;	/* force silent drop */
611		m_tag_delete(m, mtag);
612	} else {
613		sp = ipsec4_checkpolicy(m, IPSEC_DIR_OUTBOUND, flags,
614					&error, inp);
615	}
616	/*
617	 * There are four return cases:
618	 *    sp != NULL	 	    apply IPsec policy
619	 *    sp == NULL, error == 0	    no IPsec handling needed
620	 *    sp == NULL, error == -EINVAL  discard packet w/o error
621	 *    sp == NULL, error != 0	    discard packet, report error
622	 */
623	if (sp != NULL) {
624		/* Loop detection, check if ipsec processing already done */
625		KASSERT(sp->req != NULL, ("ip_output: no ipsec request"));
626		for (mtag = m_tag_first(m); mtag != NULL;
627		     mtag = m_tag_next(m, mtag)) {
628			if (mtag->m_tag_cookie != MTAG_ABI_COMPAT)
629				continue;
630			if (mtag->m_tag_id != PACKET_TAG_IPSEC_OUT_DONE &&
631			    mtag->m_tag_id != PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED)
632				continue;
633			/*
634			 * Check if policy has an SA associated with it.
635			 * This can happen when an SP has yet to acquire
636			 * an SA; e.g. on first reference.  If it occurs,
637			 * then we let ipsec4_process_packet do its thing.
638			 */
639			if (sp->req->sav == NULL)
640				break;
641			tdbi = (struct tdb_ident *)(mtag + 1);
642			if (tdbi->spi == sp->req->sav->spi &&
643			    tdbi->proto == sp->req->sav->sah->saidx.proto &&
644			    bcmp(&tdbi->dst, &sp->req->sav->sah->saidx.dst,
645				 sizeof (union sockaddr_union)) == 0) {
646				/*
647				 * No IPsec processing is needed, free
648				 * reference to SP.
649				 *
650				 * NB: null pointer to avoid free at
651				 *     done: below.
652				 */
653				KEY_FREESP(&sp), sp = NULL;
654				splx(s);
655				goto spd_done;
656			}
657		}
658
659		/*
660		 * Do delayed checksums now because we send before
661		 * this is done in the normal processing path.
662		 */
663		if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
664			in_delayed_cksum(m);
665			m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
666		}
667
668		ip->ip_len = htons(ip->ip_len);
669		ip->ip_off = htons(ip->ip_off);
670
671		/* NB: callee frees mbuf */
672		error = ipsec4_process_packet(m, sp->req, flags, 0);
673		/*
674		 * Preserve KAME behaviour: ENOENT can be returned
675		 * when an SA acquire is in progress.  Don't propagate
676		 * this to user-level; it confuses applications.
677		 *
678		 * XXX this will go away when the SADB is redone.
679		 */
680		if (error == ENOENT)
681			error = 0;
682		splx(s);
683		goto done;
684	} else {
685		splx(s);
686
687		if (error != 0) {
688			/*
689			 * Hack: -EINVAL is used to signal that a packet
690			 * should be silently discarded.  This is typically
691			 * because we asked key management for an SA and
692			 * it was delayed (e.g. kicked up to IKE).
693			 */
694			if (error == -EINVAL)
695				error = 0;
696			goto bad;
697		} else {
698			/* No IPsec processing for this packet. */
699		}
700#ifdef notyet
701		/*
702		 * If deferred crypto processing is needed, check that
703		 * the interface supports it.
704		 */
705		mtag = m_tag_find(m, PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED, NULL);
706		if (mtag != NULL && (ifp->if_capenable & IFCAP_IPSEC) == 0) {
707			/* notify IPsec to do its own crypto */
708			ipsp_skipcrypto_unmark((struct tdb_ident *)(mtag + 1));
709			error = EHOSTUNREACH;
710			goto bad;
711		}
712#endif
713	}
714spd_done:
715#endif /* FAST_IPSEC */
716
717	/*
718	 * IpHack's section.
719	 * - Xlate: translate packet's addr/port (NAT).
720	 * - Firewall: deny/allow/etc.
721	 * - Wrap: fake packet's addr/port <unimpl.>
722	 * - Encapsulate: put it in another IP and send out. <unimp.>
723	 */
724#ifdef PFIL_HOOKS
725	/*
726	 * Run through list of hooks for output packets.
727	 */
728	error = pfil_run_hooks(&inet_pfil_hook, &m, ifp, PFIL_OUT);
729	if (error != 0 || m == NULL)
730		goto done;
731	ip = mtod(m, struct ip *);
732#endif /* PFIL_HOOKS */
733
734	/*
735	 * Check with the firewall...
736	 * but not if we are already being fwd'd from a firewall.
737	 */
738	if (fw_enable && IPFW_LOADED && !args.next_hop) {
739		struct sockaddr_in *old = dst;
740
741		args.m = m;
742		args.next_hop = dst;
743		args.oif = ifp;
744		off = ip_fw_chk_ptr(&args);
745		m = args.m;
746		dst = args.next_hop;
747
748                /*
749		 * On return we must do the following:
750		 * m == NULL	-> drop the pkt (old interface, deprecated)
751		 * (off & IP_FW_PORT_DENY_FLAG)	-> drop the pkt (new interface)
752		 * 1<=off<= 0xffff		-> DIVERT
753		 * (off & IP_FW_PORT_DYNT_FLAG)	-> send to a DUMMYNET pipe
754		 * (off & IP_FW_PORT_TEE_FLAG)	-> TEE the packet
755		 * dst != old			-> IPFIREWALL_FORWARD
756		 * off==0, dst==old		-> accept
757		 * If some of the above modules are not compiled in, then
758		 * we should't have to check the corresponding condition
759		 * (because the ipfw control socket should not accept
760		 * unsupported rules), but better play safe and drop
761		 * packets in case of doubt.
762		 */
763		if ( (off & IP_FW_PORT_DENY_FLAG) || m == NULL) {
764			if (m)
765				m_freem(m);
766			error = EACCES;
767			goto done;
768		}
769		ip = mtod(m, struct ip *);
770		if (off == 0 && dst == old)		/* common case */
771			goto pass;
772                if (DUMMYNET_LOADED && (off & IP_FW_PORT_DYNT_FLAG) != 0) {
773			/*
774			 * pass the pkt to dummynet. Need to include
775			 * pipe number, m, ifp, ro, dst because these are
776			 * not recomputed in the next pass.
777			 * All other parameters have been already used and
778			 * so they are not needed anymore.
779			 * XXX note: if the ifp or ro entry are deleted
780			 * while a pkt is in dummynet, we are in trouble!
781			 */
782			args.ro = ro;
783			args.dst = dst;
784			args.flags = flags;
785
786			error = ip_dn_io_ptr(m, off & 0xffff, DN_TO_IP_OUT,
787				&args);
788			goto done;
789		}
790#ifdef IPDIVERT
791		if (off != 0 && (off & IP_FW_PORT_DYNT_FLAG) == 0) {
792			struct mbuf *clone;
793
794			/* Clone packet if we're doing a 'tee' */
795			if ((off & IP_FW_PORT_TEE_FLAG) != 0)
796				clone = divert_clone(m);
797			else
798				clone = NULL;
799
800			/*
801			 * XXX
802			 * delayed checksums are not currently compatible
803			 * with divert sockets.
804			 */
805			if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
806				in_delayed_cksum(m);
807				m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
808			}
809
810			/* Restore packet header fields to original values */
811			ip->ip_len = htons(ip->ip_len);
812			ip->ip_off = htons(ip->ip_off);
813
814			/* Deliver packet to divert input routine */
815			divert_packet(m, 0);
816
817			/* If 'tee', continue with original packet */
818			if (clone != NULL) {
819				m = clone;
820				ip = mtod(m, struct ip *);
821				goto pass;
822			}
823			goto done;
824		}
825#endif
826
827		/* IPFIREWALL_FORWARD */
828		/*
829		 * Check dst to make sure it is directly reachable on the
830		 * interface we previously thought it was.
831		 * If it isn't (which may be likely in some situations) we have
832		 * to re-route it (ie, find a route for the next-hop and the
833		 * associated interface) and set them here. This is nested
834		 * forwarding which in most cases is undesirable, except where
835		 * such control is nigh impossible. So we do it here.
836		 * And I'm babbling.
837		 */
838		if (off == 0 && old != dst) { /* FORWARD, dst has changed */
839#if 0
840			/*
841			 * XXX To improve readability, this block should be
842			 * changed into a function call as below:
843			 */
844			error = ip_ipforward(&m, &dst, &ifp);
845			if (error)
846				goto bad;
847			if (m == NULL) /* ip_input consumed the mbuf */
848				goto done;
849#else
850			struct in_ifaddr *ia;
851
852			/*
853			 * XXX sro_fwd below is static, and a pointer
854			 * to it gets passed to routines downstream.
855			 * This could have surprisingly bad results in
856			 * practice, because its content is overwritten
857			 * by subsequent packets.
858			 */
859			/* There must be a better way to do this next line... */
860			static struct route sro_fwd;
861			struct route *ro_fwd = &sro_fwd;
862
863#if 0
864			print_ip("IPFIREWALL_FORWARD: New dst ip: ",
865			    dst->sin_addr, "\n");
866#endif
867
868			/*
869			 * We need to figure out if we have been forwarded
870			 * to a local socket. If so, then we should somehow
871			 * "loop back" to ip_input, and get directed to the
872			 * PCB as if we had received this packet. This is
873			 * because it may be dificult to identify the packets
874			 * you want to forward until they are being output
875			 * and have selected an interface. (e.g. locally
876			 * initiated packets) If we used the loopback inteface,
877			 * we would not be able to control what happens
878			 * as the packet runs through ip_input() as
879			 * it is done through an ISR.
880			 */
881			LIST_FOREACH(ia,
882			    INADDR_HASH(dst->sin_addr.s_addr), ia_hash) {
883				/*
884				 * If the addr to forward to is one
885				 * of ours, we pretend to
886				 * be the destination for this packet.
887				 */
888				if (IA_SIN(ia)->sin_addr.s_addr ==
889						 dst->sin_addr.s_addr)
890					break;
891			}
892			if (ia) {	/* tell ip_input "dont filter" */
893				mtag = m_tag_get(PACKET_TAG_IPFORWARD,
894						sizeof(struct sockaddr_in *),
895						M_NOWAIT);
896				if (mtag == NULL) {
897					/* XXX statistic */
898					error = ENOBUFS;	/* XXX */
899					goto bad;
900				}
901				*(struct sockaddr_in **)(mtag+1) =
902					args.next_hop;
903				m_tag_prepend(m, mtag);
904
905				if (m->m_pkthdr.rcvif == NULL)
906					m->m_pkthdr.rcvif = ifunit("lo0");
907				if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
908					m->m_pkthdr.csum_flags |=
909					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
910					m->m_pkthdr.csum_data = 0xffff;
911				}
912				m->m_pkthdr.csum_flags |=
913				    CSUM_IP_CHECKED | CSUM_IP_VALID;
914				ip->ip_len = htons(ip->ip_len);
915				ip->ip_off = htons(ip->ip_off);
916				/* XXX netisr_queue(NETISR_IP, m); */
917				ip_input(m);
918				goto done;
919			}
920			/*
921			 * Some of the logic for this was
922			 * nicked from above.
923			 */
924			bcopy(dst, &ro_fwd->ro_dst, sizeof(*dst));
925
926			ro_fwd->ro_rt = 0;
927			rtalloc_ign(ro_fwd, RTF_CLONING);
928
929			if (ro_fwd->ro_rt == 0) {
930				ipstat.ips_noroute++;
931				error = EHOSTUNREACH;
932				goto bad;
933			}
934
935			ia = ifatoia(ro_fwd->ro_rt->rt_ifa);
936			ifp = ro_fwd->ro_rt->rt_ifp;
937			ro_fwd->ro_rt->rt_rmx.rmx_pksent++;
938			if (ro_fwd->ro_rt->rt_flags & RTF_GATEWAY)
939				dst = (struct sockaddr_in *)
940					ro_fwd->ro_rt->rt_gateway;
941			if (ro_fwd->ro_rt->rt_flags & RTF_HOST)
942				isbroadcast =
943				    (ro_fwd->ro_rt->rt_flags & RTF_BROADCAST);
944			else
945				isbroadcast = in_broadcast(dst->sin_addr, ifp);
946			if (ro->ro_rt)
947				RTFREE(ro->ro_rt);
948			ro->ro_rt = ro_fwd->ro_rt;
949			dst = (struct sockaddr_in *)&ro_fwd->ro_dst;
950
951#endif	/* ... block to be put into a function */
952			/*
953			 * If we added a default src ip earlier,
954			 * which would have been gotten from the-then
955			 * interface, do it again, from the new one.
956			 */
957			if (src_was_INADDR_ANY)
958				ip->ip_src = IA_SIN(ia)->sin_addr;
959			goto pass ;
960		}
961
962                /*
963                 * if we get here, none of the above matches, and
964                 * we have to drop the pkt
965                 */
966		m_freem(m);
967                error = EACCES; /* not sure this is the right error msg */
968                goto done;
969	}
970
971pass:
972	/* 127/8 must not appear on wire - RFC1122. */
973	if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
974	    (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
975		if ((ifp->if_flags & IFF_LOOPBACK) == 0) {
976			ipstat.ips_badaddr++;
977			error = EADDRNOTAVAIL;
978			goto bad;
979		}
980	}
981
982	m->m_pkthdr.csum_flags |= CSUM_IP;
983	sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_hwassist;
984	if (sw_csum & CSUM_DELAY_DATA) {
985		in_delayed_cksum(m);
986		sw_csum &= ~CSUM_DELAY_DATA;
987	}
988	m->m_pkthdr.csum_flags &= ifp->if_hwassist;
989
990	/*
991	 * If small enough for interface, or the interface will take
992	 * care of the fragmentation for us, can just send directly.
993	 */
994	if (ip->ip_len <= ifp->if_mtu || (ifp->if_hwassist & CSUM_FRAGMENT &&
995	    ((ip->ip_off & IP_DF) == 0))) {
996		ip->ip_len = htons(ip->ip_len);
997		ip->ip_off = htons(ip->ip_off);
998		ip->ip_sum = 0;
999		if (sw_csum & CSUM_DELAY_IP)
1000			ip->ip_sum = in_cksum(m, hlen);
1001
1002		/* Record statistics for this interface address. */
1003		if (!(flags & IP_FORWARDING) && ia) {
1004			ia->ia_ifa.if_opackets++;
1005			ia->ia_ifa.if_obytes += m->m_pkthdr.len;
1006		}
1007
1008#ifdef IPSEC
1009		/* clean ipsec history once it goes out of the node */
1010		ipsec_delaux(m);
1011#endif
1012
1013#ifdef MBUF_STRESS_TEST
1014		if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size)
1015			m = m_fragment(m, M_DONTWAIT, mbuf_frag_size);
1016#endif
1017		error = (*ifp->if_output)(ifp, m,
1018				(struct sockaddr *)dst, ro->ro_rt);
1019		goto done;
1020	}
1021
1022	if (ip->ip_off & IP_DF) {
1023		error = EMSGSIZE;
1024		/*
1025		 * This case can happen if the user changed the MTU
1026		 * of an interface after enabling IP on it.  Because
1027		 * most netifs don't keep track of routes pointing to
1028		 * them, there is no way for one to update all its
1029		 * routes when the MTU is changed.
1030		 */
1031		if ((ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST)) &&
1032		    (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)) {
1033			ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu;
1034		}
1035		ipstat.ips_cantfrag++;
1036		goto bad;
1037	}
1038
1039	/*
1040	 * Too large for interface; fragment if possible. If successful,
1041	 * on return, m will point to a list of packets to be sent.
1042	 */
1043	error = ip_fragment(ip, &m, ifp->if_mtu, ifp->if_hwassist, sw_csum);
1044	if (error)
1045		goto bad;
1046	for (; m; m = m0) {
1047		m0 = m->m_nextpkt;
1048		m->m_nextpkt = 0;
1049#ifdef IPSEC
1050		/* clean ipsec history once it goes out of the node */
1051		ipsec_delaux(m);
1052#endif
1053		if (error == 0) {
1054			/* Record statistics for this interface address. */
1055			if (ia != NULL) {
1056				ia->ia_ifa.if_opackets++;
1057				ia->ia_ifa.if_obytes += m->m_pkthdr.len;
1058			}
1059
1060			error = (*ifp->if_output)(ifp, m,
1061			    (struct sockaddr *)dst, ro->ro_rt);
1062		} else
1063			m_freem(m);
1064	}
1065
1066	if (error == 0)
1067		ipstat.ips_fragmented++;
1068
1069done:
1070	if (ro == &iproute && ro->ro_rt) {
1071		RTFREE(ro->ro_rt);
1072		ro->ro_rt = NULL;
1073	}
1074	if (dummytag) {
1075		struct dn_pkt_tag *dt = (struct dn_pkt_tag *)(dummytag+1);
1076		if (dt->ro.ro_rt)
1077			RTFREE(dt->ro.ro_rt);
1078		m_tag_free(dummytag);
1079	}
1080#ifdef IPSEC
1081	if (sp != NULL) {
1082		KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
1083			printf("DP ip_output call free SP:%p\n", sp));
1084		key_freesp(sp);
1085	}
1086#endif
1087#ifdef FAST_IPSEC
1088	if (sp != NULL)
1089		KEY_FREESP(&sp);
1090#endif
1091	return (error);
1092bad:
1093	m_freem(m);
1094	goto done;
1095}
1096
1097/*
1098 * Create a chain of fragments which fit the given mtu. m_frag points to the
1099 * mbuf to be fragmented; on return it points to the chain with the fragments.
1100 * Return 0 if no error. If error, m_frag may contain a partially built
1101 * chain of fragments that should be freed by the caller.
1102 *
1103 * if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist)
1104 * sw_csum contains the delayed checksums flags (e.g., CSUM_DELAY_IP).
1105 */
1106int
1107ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu,
1108	    u_long if_hwassist_flags, int sw_csum)
1109{
1110	int error = 0;
1111	int hlen = ip->ip_hl << 2;
1112	int len = (mtu - hlen) & ~7;	/* size of payload in each fragment */
1113	int off;
1114	struct mbuf *m0 = *m_frag;	/* the original packet		*/
1115	int firstlen;
1116	struct mbuf **mnext;
1117	int nfrags;
1118
1119	if (ip->ip_off & IP_DF) {	/* Fragmentation not allowed */
1120		ipstat.ips_cantfrag++;
1121		return EMSGSIZE;
1122	}
1123
1124	/*
1125	 * Must be able to put at least 8 bytes per fragment.
1126	 */
1127	if (len < 8)
1128		return EMSGSIZE;
1129
1130	/*
1131	 * If the interface will not calculate checksums on
1132	 * fragmented packets, then do it here.
1133	 */
1134	if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA &&
1135	    (if_hwassist_flags & CSUM_IP_FRAGS) == 0) {
1136		in_delayed_cksum(m0);
1137		m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
1138	}
1139
1140	if (len > PAGE_SIZE) {
1141		/*
1142		 * Fragment large datagrams such that each segment
1143		 * contains a multiple of PAGE_SIZE amount of data,
1144		 * plus headers. This enables a receiver to perform
1145		 * page-flipping zero-copy optimizations.
1146		 *
1147		 * XXX When does this help given that sender and receiver
1148		 * could have different page sizes, and also mtu could
1149		 * be less than the receiver's page size ?
1150		 */
1151		int newlen;
1152		struct mbuf *m;
1153
1154		for (m = m0, off = 0; m && (off+m->m_len) <= mtu; m = m->m_next)
1155			off += m->m_len;
1156
1157		/*
1158		 * firstlen (off - hlen) must be aligned on an
1159		 * 8-byte boundary
1160		 */
1161		if (off < hlen)
1162			goto smart_frag_failure;
1163		off = ((off - hlen) & ~7) + hlen;
1164		newlen = (~PAGE_MASK) & mtu;
1165		if ((newlen + sizeof (struct ip)) > mtu) {
1166			/* we failed, go back the default */
1167smart_frag_failure:
1168			newlen = len;
1169			off = hlen + len;
1170		}
1171		len = newlen;
1172
1173	} else {
1174		off = hlen + len;
1175	}
1176
1177	firstlen = off - hlen;
1178	mnext = &m0->m_nextpkt;		/* pointer to next packet */
1179
1180	/*
1181	 * Loop through length of segment after first fragment,
1182	 * make new header and copy data of each part and link onto chain.
1183	 * Here, m0 is the original packet, m is the fragment being created.
1184	 * The fragments are linked off the m_nextpkt of the original
1185	 * packet, which after processing serves as the first fragment.
1186	 */
1187	for (nfrags = 1; off < ip->ip_len; off += len, nfrags++) {
1188		struct ip *mhip;	/* ip header on the fragment */
1189		struct mbuf *m;
1190		int mhlen = sizeof (struct ip);
1191
1192		MGETHDR(m, M_DONTWAIT, MT_HEADER);
1193		if (m == 0) {
1194			error = ENOBUFS;
1195			ipstat.ips_odropped++;
1196			goto done;
1197		}
1198		m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG;
1199		/*
1200		 * In the first mbuf, leave room for the link header, then
1201		 * copy the original IP header including options. The payload
1202		 * goes into an additional mbuf chain returned by m_copy().
1203		 */
1204		m->m_data += max_linkhdr;
1205		mhip = mtod(m, struct ip *);
1206		*mhip = *ip;
1207		if (hlen > sizeof (struct ip)) {
1208			mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip);
1209			mhip->ip_v = IPVERSION;
1210			mhip->ip_hl = mhlen >> 2;
1211		}
1212		m->m_len = mhlen;
1213		/* XXX do we need to add ip->ip_off below ? */
1214		mhip->ip_off = ((off - hlen) >> 3) + ip->ip_off;
1215		if (off + len >= ip->ip_len) {	/* last fragment */
1216			len = ip->ip_len - off;
1217			m->m_flags |= M_LASTFRAG;
1218		} else
1219			mhip->ip_off |= IP_MF;
1220		mhip->ip_len = htons((u_short)(len + mhlen));
1221		m->m_next = m_copy(m0, off, len);
1222		if (m->m_next == 0) {		/* copy failed */
1223			m_free(m);
1224			error = ENOBUFS;	/* ??? */
1225			ipstat.ips_odropped++;
1226			goto done;
1227		}
1228		m->m_pkthdr.len = mhlen + len;
1229		m->m_pkthdr.rcvif = (struct ifnet *)0;
1230#ifdef MAC
1231		mac_create_fragment(m0, m);
1232#endif
1233		m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags;
1234		mhip->ip_off = htons(mhip->ip_off);
1235		mhip->ip_sum = 0;
1236		if (sw_csum & CSUM_DELAY_IP)
1237			mhip->ip_sum = in_cksum(m, mhlen);
1238		*mnext = m;
1239		mnext = &m->m_nextpkt;
1240	}
1241	ipstat.ips_ofragments += nfrags;
1242
1243	/* set first marker for fragment chain */
1244	m0->m_flags |= M_FIRSTFRAG | M_FRAG;
1245	m0->m_pkthdr.csum_data = nfrags;
1246
1247	/*
1248	 * Update first fragment by trimming what's been copied out
1249	 * and updating header.
1250	 */
1251	m_adj(m0, hlen + firstlen - ip->ip_len);
1252	m0->m_pkthdr.len = hlen + firstlen;
1253	ip->ip_len = htons((u_short)m0->m_pkthdr.len);
1254	ip->ip_off |= IP_MF;
1255	ip->ip_off = htons(ip->ip_off);
1256	ip->ip_sum = 0;
1257	if (sw_csum & CSUM_DELAY_IP)
1258		ip->ip_sum = in_cksum(m0, hlen);
1259
1260done:
1261	*m_frag = m0;
1262	return error;
1263}
1264
1265void
1266in_delayed_cksum(struct mbuf *m)
1267{
1268	struct ip *ip;
1269	u_short csum, offset;
1270
1271	ip = mtod(m, struct ip *);
1272	offset = ip->ip_hl << 2 ;
1273	csum = in_cksum_skip(m, ip->ip_len, offset);
1274	if (m->m_pkthdr.csum_flags & CSUM_UDP && csum == 0)
1275		csum = 0xffff;
1276	offset += m->m_pkthdr.csum_data;	/* checksum offset */
1277
1278	if (offset + sizeof(u_short) > m->m_len) {
1279		printf("delayed m_pullup, m->len: %d  off: %d  p: %d\n",
1280		    m->m_len, offset, ip->ip_p);
1281		/*
1282		 * XXX
1283		 * this shouldn't happen, but if it does, the
1284		 * correct behavior may be to insert the checksum
1285		 * in the existing chain instead of rearranging it.
1286		 */
1287		m = m_pullup(m, offset + sizeof(u_short));
1288	}
1289	*(u_short *)(m->m_data + offset) = csum;
1290}
1291
1292/*
1293 * Insert IP options into preformed packet.
1294 * Adjust IP destination as required for IP source routing,
1295 * as indicated by a non-zero in_addr at the start of the options.
1296 *
1297 * XXX This routine assumes that the packet has no options in place.
1298 */
1299static struct mbuf *
1300ip_insertoptions(m, opt, phlen)
1301	register struct mbuf *m;
1302	struct mbuf *opt;
1303	int *phlen;
1304{
1305	register struct ipoption *p = mtod(opt, struct ipoption *);
1306	struct mbuf *n;
1307	register struct ip *ip = mtod(m, struct ip *);
1308	unsigned optlen;
1309
1310	optlen = opt->m_len - sizeof(p->ipopt_dst);
1311	if (optlen + ip->ip_len > IP_MAXPACKET) {
1312		*phlen = 0;
1313		return (m);		/* XXX should fail */
1314	}
1315	if (p->ipopt_dst.s_addr)
1316		ip->ip_dst = p->ipopt_dst;
1317	if (m->m_flags & M_EXT || m->m_data - optlen < m->m_pktdat) {
1318		MGETHDR(n, M_DONTWAIT, MT_HEADER);
1319		if (n == 0) {
1320			*phlen = 0;
1321			return (m);
1322		}
1323		n->m_pkthdr.rcvif = (struct ifnet *)0;
1324#ifdef MAC
1325		mac_create_mbuf_from_mbuf(m, n);
1326#endif
1327		n->m_pkthdr.len = m->m_pkthdr.len + optlen;
1328		m->m_len -= sizeof(struct ip);
1329		m->m_data += sizeof(struct ip);
1330		n->m_next = m;
1331		m = n;
1332		m->m_len = optlen + sizeof(struct ip);
1333		m->m_data += max_linkhdr;
1334		bcopy(ip, mtod(m, void *), sizeof(struct ip));
1335	} else {
1336		m->m_data -= optlen;
1337		m->m_len += optlen;
1338		m->m_pkthdr.len += optlen;
1339		bcopy(ip, mtod(m, void *), sizeof(struct ip));
1340	}
1341	ip = mtod(m, struct ip *);
1342	bcopy(p->ipopt_list, ip + 1, optlen);
1343	*phlen = sizeof(struct ip) + optlen;
1344	ip->ip_v = IPVERSION;
1345	ip->ip_hl = *phlen >> 2;
1346	ip->ip_len += optlen;
1347	return (m);
1348}
1349
1350/*
1351 * Copy options from ip to jp,
1352 * omitting those not copied during fragmentation.
1353 */
1354int
1355ip_optcopy(ip, jp)
1356	struct ip *ip, *jp;
1357{
1358	register u_char *cp, *dp;
1359	int opt, optlen, cnt;
1360
1361	cp = (u_char *)(ip + 1);
1362	dp = (u_char *)(jp + 1);
1363	cnt = (ip->ip_hl << 2) - sizeof (struct ip);
1364	for (; cnt > 0; cnt -= optlen, cp += optlen) {
1365		opt = cp[0];
1366		if (opt == IPOPT_EOL)
1367			break;
1368		if (opt == IPOPT_NOP) {
1369			/* Preserve for IP mcast tunnel's LSRR alignment. */
1370			*dp++ = IPOPT_NOP;
1371			optlen = 1;
1372			continue;
1373		}
1374
1375		KASSERT(cnt >= IPOPT_OLEN + sizeof(*cp),
1376		    ("ip_optcopy: malformed ipv4 option"));
1377		optlen = cp[IPOPT_OLEN];
1378		KASSERT(optlen >= IPOPT_OLEN + sizeof(*cp) && optlen <= cnt,
1379		    ("ip_optcopy: malformed ipv4 option"));
1380
1381		/* bogus lengths should have been caught by ip_dooptions */
1382		if (optlen > cnt)
1383			optlen = cnt;
1384		if (IPOPT_COPIED(opt)) {
1385			bcopy(cp, dp, optlen);
1386			dp += optlen;
1387		}
1388	}
1389	for (optlen = dp - (u_char *)(jp+1); optlen & 0x3; optlen++)
1390		*dp++ = IPOPT_EOL;
1391	return (optlen);
1392}
1393
1394/*
1395 * IP socket option processing.
1396 */
1397int
1398ip_ctloutput(so, sopt)
1399	struct socket *so;
1400	struct sockopt *sopt;
1401{
1402	struct	inpcb *inp = sotoinpcb(so);
1403	int	error, optval;
1404
1405	error = optval = 0;
1406	if (sopt->sopt_level != IPPROTO_IP) {
1407		return (EINVAL);
1408	}
1409
1410	switch (sopt->sopt_dir) {
1411	case SOPT_SET:
1412		switch (sopt->sopt_name) {
1413		case IP_OPTIONS:
1414#ifdef notyet
1415		case IP_RETOPTS:
1416#endif
1417		{
1418			struct mbuf *m;
1419			if (sopt->sopt_valsize > MLEN) {
1420				error = EMSGSIZE;
1421				break;
1422			}
1423			MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_HEADER);
1424			if (m == 0) {
1425				error = ENOBUFS;
1426				break;
1427			}
1428			m->m_len = sopt->sopt_valsize;
1429			error = sooptcopyin(sopt, mtod(m, char *), m->m_len,
1430					    m->m_len);
1431
1432			return (ip_pcbopts(sopt->sopt_name, &inp->inp_options,
1433					   m));
1434		}
1435
1436		case IP_TOS:
1437		case IP_TTL:
1438		case IP_RECVOPTS:
1439		case IP_RECVRETOPTS:
1440		case IP_RECVDSTADDR:
1441		case IP_RECVTTL:
1442		case IP_RECVIF:
1443		case IP_FAITH:
1444		case IP_ONESBCAST:
1445			error = sooptcopyin(sopt, &optval, sizeof optval,
1446					    sizeof optval);
1447			if (error)
1448				break;
1449
1450			switch (sopt->sopt_name) {
1451			case IP_TOS:
1452				inp->inp_ip_tos = optval;
1453				break;
1454
1455			case IP_TTL:
1456				inp->inp_ip_ttl = optval;
1457				break;
1458#define	OPTSET(bit) \
1459	if (optval) \
1460		inp->inp_flags |= bit; \
1461	else \
1462		inp->inp_flags &= ~bit;
1463
1464			case IP_RECVOPTS:
1465				OPTSET(INP_RECVOPTS);
1466				break;
1467
1468			case IP_RECVRETOPTS:
1469				OPTSET(INP_RECVRETOPTS);
1470				break;
1471
1472			case IP_RECVDSTADDR:
1473				OPTSET(INP_RECVDSTADDR);
1474				break;
1475
1476			case IP_RECVTTL:
1477				OPTSET(INP_RECVTTL);
1478				break;
1479
1480			case IP_RECVIF:
1481				OPTSET(INP_RECVIF);
1482				break;
1483
1484			case IP_FAITH:
1485				OPTSET(INP_FAITH);
1486				break;
1487
1488			case IP_ONESBCAST:
1489				OPTSET(INP_ONESBCAST);
1490				break;
1491			}
1492			break;
1493#undef OPTSET
1494
1495		case IP_MULTICAST_IF:
1496		case IP_MULTICAST_VIF:
1497		case IP_MULTICAST_TTL:
1498		case IP_MULTICAST_LOOP:
1499		case IP_ADD_MEMBERSHIP:
1500		case IP_DROP_MEMBERSHIP:
1501			error = ip_setmoptions(sopt, &inp->inp_moptions);
1502			break;
1503
1504		case IP_PORTRANGE:
1505			error = sooptcopyin(sopt, &optval, sizeof optval,
1506					    sizeof optval);
1507			if (error)
1508				break;
1509
1510			switch (optval) {
1511			case IP_PORTRANGE_DEFAULT:
1512				inp->inp_flags &= ~(INP_LOWPORT);
1513				inp->inp_flags &= ~(INP_HIGHPORT);
1514				break;
1515
1516			case IP_PORTRANGE_HIGH:
1517				inp->inp_flags &= ~(INP_LOWPORT);
1518				inp->inp_flags |= INP_HIGHPORT;
1519				break;
1520
1521			case IP_PORTRANGE_LOW:
1522				inp->inp_flags &= ~(INP_HIGHPORT);
1523				inp->inp_flags |= INP_LOWPORT;
1524				break;
1525
1526			default:
1527				error = EINVAL;
1528				break;
1529			}
1530			break;
1531
1532#if defined(IPSEC) || defined(FAST_IPSEC)
1533		case IP_IPSEC_POLICY:
1534		{
1535			caddr_t req;
1536			size_t len = 0;
1537			int priv;
1538			struct mbuf *m;
1539			int optname;
1540
1541			if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */
1542				break;
1543			if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */
1544				break;
1545			priv = (sopt->sopt_td != NULL &&
1546				suser(sopt->sopt_td) != 0) ? 0 : 1;
1547			req = mtod(m, caddr_t);
1548			len = m->m_len;
1549			optname = sopt->sopt_name;
1550			error = ipsec4_set_policy(inp, optname, req, len, priv);
1551			m_freem(m);
1552			break;
1553		}
1554#endif /*IPSEC*/
1555
1556		default:
1557			error = ENOPROTOOPT;
1558			break;
1559		}
1560		break;
1561
1562	case SOPT_GET:
1563		switch (sopt->sopt_name) {
1564		case IP_OPTIONS:
1565		case IP_RETOPTS:
1566			if (inp->inp_options)
1567				error = sooptcopyout(sopt,
1568						     mtod(inp->inp_options,
1569							  char *),
1570						     inp->inp_options->m_len);
1571			else
1572				sopt->sopt_valsize = 0;
1573			break;
1574
1575		case IP_TOS:
1576		case IP_TTL:
1577		case IP_RECVOPTS:
1578		case IP_RECVRETOPTS:
1579		case IP_RECVDSTADDR:
1580		case IP_RECVTTL:
1581		case IP_RECVIF:
1582		case IP_PORTRANGE:
1583		case IP_FAITH:
1584		case IP_ONESBCAST:
1585			switch (sopt->sopt_name) {
1586
1587			case IP_TOS:
1588				optval = inp->inp_ip_tos;
1589				break;
1590
1591			case IP_TTL:
1592				optval = inp->inp_ip_ttl;
1593				break;
1594
1595#define	OPTBIT(bit)	(inp->inp_flags & bit ? 1 : 0)
1596
1597			case IP_RECVOPTS:
1598				optval = OPTBIT(INP_RECVOPTS);
1599				break;
1600
1601			case IP_RECVRETOPTS:
1602				optval = OPTBIT(INP_RECVRETOPTS);
1603				break;
1604
1605			case IP_RECVDSTADDR:
1606				optval = OPTBIT(INP_RECVDSTADDR);
1607				break;
1608
1609			case IP_RECVTTL:
1610				optval = OPTBIT(INP_RECVTTL);
1611				break;
1612
1613			case IP_RECVIF:
1614				optval = OPTBIT(INP_RECVIF);
1615				break;
1616
1617			case IP_PORTRANGE:
1618				if (inp->inp_flags & INP_HIGHPORT)
1619					optval = IP_PORTRANGE_HIGH;
1620				else if (inp->inp_flags & INP_LOWPORT)
1621					optval = IP_PORTRANGE_LOW;
1622				else
1623					optval = 0;
1624				break;
1625
1626			case IP_FAITH:
1627				optval = OPTBIT(INP_FAITH);
1628				break;
1629
1630			case IP_ONESBCAST:
1631				optval = OPTBIT(INP_ONESBCAST);
1632				break;
1633			}
1634			error = sooptcopyout(sopt, &optval, sizeof optval);
1635			break;
1636
1637		case IP_MULTICAST_IF:
1638		case IP_MULTICAST_VIF:
1639		case IP_MULTICAST_TTL:
1640		case IP_MULTICAST_LOOP:
1641		case IP_ADD_MEMBERSHIP:
1642		case IP_DROP_MEMBERSHIP:
1643			error = ip_getmoptions(sopt, inp->inp_moptions);
1644			break;
1645
1646#if defined(IPSEC) || defined(FAST_IPSEC)
1647		case IP_IPSEC_POLICY:
1648		{
1649			struct mbuf *m = NULL;
1650			caddr_t req = NULL;
1651			size_t len = 0;
1652
1653			if (m != 0) {
1654				req = mtod(m, caddr_t);
1655				len = m->m_len;
1656			}
1657			error = ipsec4_get_policy(sotoinpcb(so), req, len, &m);
1658			if (error == 0)
1659				error = soopt_mcopyout(sopt, m); /* XXX */
1660			if (error == 0)
1661				m_freem(m);
1662			break;
1663		}
1664#endif /*IPSEC*/
1665
1666		default:
1667			error = ENOPROTOOPT;
1668			break;
1669		}
1670		break;
1671	}
1672	return (error);
1673}
1674
1675/*
1676 * Set up IP options in pcb for insertion in output packets.
1677 * Store in mbuf with pointer in pcbopt, adding pseudo-option
1678 * with destination address if source routed.
1679 */
1680static int
1681ip_pcbopts(optname, pcbopt, m)
1682	int optname;
1683	struct mbuf **pcbopt;
1684	register struct mbuf *m;
1685{
1686	register int cnt, optlen;
1687	register u_char *cp;
1688	u_char opt;
1689
1690	/* turn off any old options */
1691	if (*pcbopt)
1692		(void)m_free(*pcbopt);
1693	*pcbopt = 0;
1694	if (m == (struct mbuf *)0 || m->m_len == 0) {
1695		/*
1696		 * Only turning off any previous options.
1697		 */
1698		if (m)
1699			(void)m_free(m);
1700		return (0);
1701	}
1702
1703	if (m->m_len % sizeof(int32_t))
1704		goto bad;
1705	/*
1706	 * IP first-hop destination address will be stored before
1707	 * actual options; move other options back
1708	 * and clear it when none present.
1709	 */
1710	if (m->m_data + m->m_len + sizeof(struct in_addr) >= &m->m_dat[MLEN])
1711		goto bad;
1712	cnt = m->m_len;
1713	m->m_len += sizeof(struct in_addr);
1714	cp = mtod(m, u_char *) + sizeof(struct in_addr);
1715	bcopy(mtod(m, void *), cp, (unsigned)cnt);
1716	bzero(mtod(m, void *), sizeof(struct in_addr));
1717
1718	for (; cnt > 0; cnt -= optlen, cp += optlen) {
1719		opt = cp[IPOPT_OPTVAL];
1720		if (opt == IPOPT_EOL)
1721			break;
1722		if (opt == IPOPT_NOP)
1723			optlen = 1;
1724		else {
1725			if (cnt < IPOPT_OLEN + sizeof(*cp))
1726				goto bad;
1727			optlen = cp[IPOPT_OLEN];
1728			if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt)
1729				goto bad;
1730		}
1731		switch (opt) {
1732
1733		default:
1734			break;
1735
1736		case IPOPT_LSRR:
1737		case IPOPT_SSRR:
1738			/*
1739			 * user process specifies route as:
1740			 *	->A->B->C->D
1741			 * D must be our final destination (but we can't
1742			 * check that since we may not have connected yet).
1743			 * A is first hop destination, which doesn't appear in
1744			 * actual IP option, but is stored before the options.
1745			 */
1746			if (optlen < IPOPT_MINOFF - 1 + sizeof(struct in_addr))
1747				goto bad;
1748			m->m_len -= sizeof(struct in_addr);
1749			cnt -= sizeof(struct in_addr);
1750			optlen -= sizeof(struct in_addr);
1751			cp[IPOPT_OLEN] = optlen;
1752			/*
1753			 * Move first hop before start of options.
1754			 */
1755			bcopy((caddr_t)&cp[IPOPT_OFFSET+1], mtod(m, caddr_t),
1756			    sizeof(struct in_addr));
1757			/*
1758			 * Then copy rest of options back
1759			 * to close up the deleted entry.
1760			 */
1761			bcopy((&cp[IPOPT_OFFSET+1] + sizeof(struct in_addr)),
1762			    &cp[IPOPT_OFFSET+1],
1763			    (unsigned)cnt + sizeof(struct in_addr));
1764			break;
1765		}
1766	}
1767	if (m->m_len > MAX_IPOPTLEN + sizeof(struct in_addr))
1768		goto bad;
1769	*pcbopt = m;
1770	return (0);
1771
1772bad:
1773	(void)m_free(m);
1774	return (EINVAL);
1775}
1776
1777/*
1778 * XXX
1779 * The whole multicast option thing needs to be re-thought.
1780 * Several of these options are equally applicable to non-multicast
1781 * transmission, and one (IP_MULTICAST_TTL) totally duplicates a
1782 * standard option (IP_TTL).
1783 */
1784
1785/*
1786 * following RFC1724 section 3.3, 0.0.0.0/8 is interpreted as interface index.
1787 */
1788static struct ifnet *
1789ip_multicast_if(a, ifindexp)
1790	struct in_addr *a;
1791	int *ifindexp;
1792{
1793	int ifindex;
1794	struct ifnet *ifp;
1795
1796	if (ifindexp)
1797		*ifindexp = 0;
1798	if (ntohl(a->s_addr) >> 24 == 0) {
1799		ifindex = ntohl(a->s_addr) & 0xffffff;
1800		if (ifindex < 0 || if_index < ifindex)
1801			return NULL;
1802		ifp = ifnet_byindex(ifindex);
1803		if (ifindexp)
1804			*ifindexp = ifindex;
1805	} else {
1806		INADDR_TO_IFP(*a, ifp);
1807	}
1808	return ifp;
1809}
1810
1811/*
1812 * Set the IP multicast options in response to user setsockopt().
1813 */
1814static int
1815ip_setmoptions(sopt, imop)
1816	struct sockopt *sopt;
1817	struct ip_moptions **imop;
1818{
1819	int error = 0;
1820	int i;
1821	struct in_addr addr;
1822	struct ip_mreq mreq;
1823	struct ifnet *ifp;
1824	struct ip_moptions *imo = *imop;
1825	struct route ro;
1826	struct sockaddr_in *dst;
1827	int ifindex;
1828	int s;
1829
1830	if (imo == NULL) {
1831		/*
1832		 * No multicast option buffer attached to the pcb;
1833		 * allocate one and initialize to default values.
1834		 */
1835		imo = (struct ip_moptions*)malloc(sizeof(*imo), M_IPMOPTS,
1836		    M_WAITOK);
1837
1838		if (imo == NULL)
1839			return (ENOBUFS);
1840		*imop = imo;
1841		imo->imo_multicast_ifp = NULL;
1842		imo->imo_multicast_addr.s_addr = INADDR_ANY;
1843		imo->imo_multicast_vif = -1;
1844		imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
1845		imo->imo_multicast_loop = IP_DEFAULT_MULTICAST_LOOP;
1846		imo->imo_num_memberships = 0;
1847	}
1848
1849	switch (sopt->sopt_name) {
1850	/* store an index number for the vif you wanna use in the send */
1851	case IP_MULTICAST_VIF:
1852		if (legal_vif_num == 0) {
1853			error = EOPNOTSUPP;
1854			break;
1855		}
1856		error = sooptcopyin(sopt, &i, sizeof i, sizeof i);
1857		if (error)
1858			break;
1859		if (!legal_vif_num(i) && (i != -1)) {
1860			error = EINVAL;
1861			break;
1862		}
1863		imo->imo_multicast_vif = i;
1864		break;
1865
1866	case IP_MULTICAST_IF:
1867		/*
1868		 * Select the interface for outgoing multicast packets.
1869		 */
1870		error = sooptcopyin(sopt, &addr, sizeof addr, sizeof addr);
1871		if (error)
1872			break;
1873		/*
1874		 * INADDR_ANY is used to remove a previous selection.
1875		 * When no interface is selected, a default one is
1876		 * chosen every time a multicast packet is sent.
1877		 */
1878		if (addr.s_addr == INADDR_ANY) {
1879			imo->imo_multicast_ifp = NULL;
1880			break;
1881		}
1882		/*
1883		 * The selected interface is identified by its local
1884		 * IP address.  Find the interface and confirm that
1885		 * it supports multicasting.
1886		 */
1887		s = splimp();
1888		ifp = ip_multicast_if(&addr, &ifindex);
1889		if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) {
1890			splx(s);
1891			error = EADDRNOTAVAIL;
1892			break;
1893		}
1894		imo->imo_multicast_ifp = ifp;
1895		if (ifindex)
1896			imo->imo_multicast_addr = addr;
1897		else
1898			imo->imo_multicast_addr.s_addr = INADDR_ANY;
1899		splx(s);
1900		break;
1901
1902	case IP_MULTICAST_TTL:
1903		/*
1904		 * Set the IP time-to-live for outgoing multicast packets.
1905		 * The original multicast API required a char argument,
1906		 * which is inconsistent with the rest of the socket API.
1907		 * We allow either a char or an int.
1908		 */
1909		if (sopt->sopt_valsize == 1) {
1910			u_char ttl;
1911			error = sooptcopyin(sopt, &ttl, 1, 1);
1912			if (error)
1913				break;
1914			imo->imo_multicast_ttl = ttl;
1915		} else {
1916			u_int ttl;
1917			error = sooptcopyin(sopt, &ttl, sizeof ttl,
1918					    sizeof ttl);
1919			if (error)
1920				break;
1921			if (ttl > 255)
1922				error = EINVAL;
1923			else
1924				imo->imo_multicast_ttl = ttl;
1925		}
1926		break;
1927
1928	case IP_MULTICAST_LOOP:
1929		/*
1930		 * Set the loopback flag for outgoing multicast packets.
1931		 * Must be zero or one.  The original multicast API required a
1932		 * char argument, which is inconsistent with the rest
1933		 * of the socket API.  We allow either a char or an int.
1934		 */
1935		if (sopt->sopt_valsize == 1) {
1936			u_char loop;
1937			error = sooptcopyin(sopt, &loop, 1, 1);
1938			if (error)
1939				break;
1940			imo->imo_multicast_loop = !!loop;
1941		} else {
1942			u_int loop;
1943			error = sooptcopyin(sopt, &loop, sizeof loop,
1944					    sizeof loop);
1945			if (error)
1946				break;
1947			imo->imo_multicast_loop = !!loop;
1948		}
1949		break;
1950
1951	case IP_ADD_MEMBERSHIP:
1952		/*
1953		 * Add a multicast group membership.
1954		 * Group must be a valid IP multicast address.
1955		 */
1956		error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq);
1957		if (error)
1958			break;
1959
1960		if (!IN_MULTICAST(ntohl(mreq.imr_multiaddr.s_addr))) {
1961			error = EINVAL;
1962			break;
1963		}
1964		s = splimp();
1965		/*
1966		 * If no interface address was provided, use the interface of
1967		 * the route to the given multicast address.
1968		 */
1969		if (mreq.imr_interface.s_addr == INADDR_ANY) {
1970			bzero((caddr_t)&ro, sizeof(ro));
1971			dst = (struct sockaddr_in *)&ro.ro_dst;
1972			dst->sin_len = sizeof(*dst);
1973			dst->sin_family = AF_INET;
1974			dst->sin_addr = mreq.imr_multiaddr;
1975			rtalloc_ign(&ro, RTF_CLONING);
1976			if (ro.ro_rt == NULL) {
1977				error = EADDRNOTAVAIL;
1978				splx(s);
1979				break;
1980			}
1981			ifp = ro.ro_rt->rt_ifp;
1982			RTFREE(ro.ro_rt);
1983		}
1984		else {
1985			ifp = ip_multicast_if(&mreq.imr_interface, NULL);
1986		}
1987
1988		/*
1989		 * See if we found an interface, and confirm that it
1990		 * supports multicast.
1991		 */
1992		if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) {
1993			error = EADDRNOTAVAIL;
1994			splx(s);
1995			break;
1996		}
1997		/*
1998		 * See if the membership already exists or if all the
1999		 * membership slots are full.
2000		 */
2001		for (i = 0; i < imo->imo_num_memberships; ++i) {
2002			if (imo->imo_membership[i]->inm_ifp == ifp &&
2003			    imo->imo_membership[i]->inm_addr.s_addr
2004						== mreq.imr_multiaddr.s_addr)
2005				break;
2006		}
2007		if (i < imo->imo_num_memberships) {
2008			error = EADDRINUSE;
2009			splx(s);
2010			break;
2011		}
2012		if (i == IP_MAX_MEMBERSHIPS) {
2013			error = ETOOMANYREFS;
2014			splx(s);
2015			break;
2016		}
2017		/*
2018		 * Everything looks good; add a new record to the multicast
2019		 * address list for the given interface.
2020		 */
2021		if ((imo->imo_membership[i] =
2022		    in_addmulti(&mreq.imr_multiaddr, ifp)) == NULL) {
2023			error = ENOBUFS;
2024			splx(s);
2025			break;
2026		}
2027		++imo->imo_num_memberships;
2028		splx(s);
2029		break;
2030
2031	case IP_DROP_MEMBERSHIP:
2032		/*
2033		 * Drop a multicast group membership.
2034		 * Group must be a valid IP multicast address.
2035		 */
2036		error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq);
2037		if (error)
2038			break;
2039
2040		if (!IN_MULTICAST(ntohl(mreq.imr_multiaddr.s_addr))) {
2041			error = EINVAL;
2042			break;
2043		}
2044
2045		s = splimp();
2046		/*
2047		 * If an interface address was specified, get a pointer
2048		 * to its ifnet structure.
2049		 */
2050		if (mreq.imr_interface.s_addr == INADDR_ANY)
2051			ifp = NULL;
2052		else {
2053			ifp = ip_multicast_if(&mreq.imr_interface, NULL);
2054			if (ifp == NULL) {
2055				error = EADDRNOTAVAIL;
2056				splx(s);
2057				break;
2058			}
2059		}
2060		/*
2061		 * Find the membership in the membership array.
2062		 */
2063		for (i = 0; i < imo->imo_num_memberships; ++i) {
2064			if ((ifp == NULL ||
2065			     imo->imo_membership[i]->inm_ifp == ifp) &&
2066			     imo->imo_membership[i]->inm_addr.s_addr ==
2067			     mreq.imr_multiaddr.s_addr)
2068				break;
2069		}
2070		if (i == imo->imo_num_memberships) {
2071			error = EADDRNOTAVAIL;
2072			splx(s);
2073			break;
2074		}
2075		/*
2076		 * Give up the multicast address record to which the
2077		 * membership points.
2078		 */
2079		in_delmulti(imo->imo_membership[i]);
2080		/*
2081		 * Remove the gap in the membership array.
2082		 */
2083		for (++i; i < imo->imo_num_memberships; ++i)
2084			imo->imo_membership[i-1] = imo->imo_membership[i];
2085		--imo->imo_num_memberships;
2086		splx(s);
2087		break;
2088
2089	default:
2090		error = EOPNOTSUPP;
2091		break;
2092	}
2093
2094	/*
2095	 * If all options have default values, no need to keep the mbuf.
2096	 */
2097	if (imo->imo_multicast_ifp == NULL &&
2098	    imo->imo_multicast_vif == -1 &&
2099	    imo->imo_multicast_ttl == IP_DEFAULT_MULTICAST_TTL &&
2100	    imo->imo_multicast_loop == IP_DEFAULT_MULTICAST_LOOP &&
2101	    imo->imo_num_memberships == 0) {
2102		free(*imop, M_IPMOPTS);
2103		*imop = NULL;
2104	}
2105
2106	return (error);
2107}
2108
2109/*
2110 * Return the IP multicast options in response to user getsockopt().
2111 */
2112static int
2113ip_getmoptions(sopt, imo)
2114	struct sockopt *sopt;
2115	register struct ip_moptions *imo;
2116{
2117	struct in_addr addr;
2118	struct in_ifaddr *ia;
2119	int error, optval;
2120	u_char coptval;
2121
2122	error = 0;
2123	switch (sopt->sopt_name) {
2124	case IP_MULTICAST_VIF:
2125		if (imo != NULL)
2126			optval = imo->imo_multicast_vif;
2127		else
2128			optval = -1;
2129		error = sooptcopyout(sopt, &optval, sizeof optval);
2130		break;
2131
2132	case IP_MULTICAST_IF:
2133		if (imo == NULL || imo->imo_multicast_ifp == NULL)
2134			addr.s_addr = INADDR_ANY;
2135		else if (imo->imo_multicast_addr.s_addr) {
2136			/* return the value user has set */
2137			addr = imo->imo_multicast_addr;
2138		} else {
2139			IFP_TO_IA(imo->imo_multicast_ifp, ia);
2140			addr.s_addr = (ia == NULL) ? INADDR_ANY
2141				: IA_SIN(ia)->sin_addr.s_addr;
2142		}
2143		error = sooptcopyout(sopt, &addr, sizeof addr);
2144		break;
2145
2146	case IP_MULTICAST_TTL:
2147		if (imo == 0)
2148			optval = coptval = IP_DEFAULT_MULTICAST_TTL;
2149		else
2150			optval = coptval = imo->imo_multicast_ttl;
2151		if (sopt->sopt_valsize == 1)
2152			error = sooptcopyout(sopt, &coptval, 1);
2153		else
2154			error = sooptcopyout(sopt, &optval, sizeof optval);
2155		break;
2156
2157	case IP_MULTICAST_LOOP:
2158		if (imo == 0)
2159			optval = coptval = IP_DEFAULT_MULTICAST_LOOP;
2160		else
2161			optval = coptval = imo->imo_multicast_loop;
2162		if (sopt->sopt_valsize == 1)
2163			error = sooptcopyout(sopt, &coptval, 1);
2164		else
2165			error = sooptcopyout(sopt, &optval, sizeof optval);
2166		break;
2167
2168	default:
2169		error = ENOPROTOOPT;
2170		break;
2171	}
2172	return (error);
2173}
2174
2175/*
2176 * Discard the IP multicast options.
2177 */
2178void
2179ip_freemoptions(imo)
2180	register struct ip_moptions *imo;
2181{
2182	register int i;
2183
2184	if (imo != NULL) {
2185		for (i = 0; i < imo->imo_num_memberships; ++i)
2186			in_delmulti(imo->imo_membership[i]);
2187		free(imo, M_IPMOPTS);
2188	}
2189}
2190
2191/*
2192 * Routine called from ip_output() to loop back a copy of an IP multicast
2193 * packet to the input queue of a specified interface.  Note that this
2194 * calls the output routine of the loopback "driver", but with an interface
2195 * pointer that might NOT be a loopback interface -- evil, but easier than
2196 * replicating that code here.
2197 */
2198static void
2199ip_mloopback(ifp, m, dst, hlen)
2200	struct ifnet *ifp;
2201	register struct mbuf *m;
2202	register struct sockaddr_in *dst;
2203	int hlen;
2204{
2205	register struct ip *ip;
2206	struct mbuf *copym;
2207
2208	copym = m_copy(m, 0, M_COPYALL);
2209	if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < hlen))
2210		copym = m_pullup(copym, hlen);
2211	if (copym != NULL) {
2212		/*
2213		 * We don't bother to fragment if the IP length is greater
2214		 * than the interface's MTU.  Can this possibly matter?
2215		 */
2216		ip = mtod(copym, struct ip *);
2217		ip->ip_len = htons(ip->ip_len);
2218		ip->ip_off = htons(ip->ip_off);
2219		ip->ip_sum = 0;
2220		ip->ip_sum = in_cksum(copym, hlen);
2221		/*
2222		 * NB:
2223		 * It's not clear whether there are any lingering
2224		 * reentrancy problems in other areas which might
2225		 * be exposed by using ip_input directly (in
2226		 * particular, everything which modifies the packet
2227		 * in-place).  Yet another option is using the
2228		 * protosw directly to deliver the looped back
2229		 * packet.  For the moment, we'll err on the side
2230		 * of safety by using if_simloop().
2231		 */
2232#if 1 /* XXX */
2233		if (dst->sin_family != AF_INET) {
2234			printf("ip_mloopback: bad address family %d\n",
2235						dst->sin_family);
2236			dst->sin_family = AF_INET;
2237		}
2238#endif
2239
2240#ifdef notdef
2241		copym->m_pkthdr.rcvif = ifp;
2242		ip_input(copym);
2243#else
2244		/* if the checksum hasn't been computed, mark it as valid */
2245		if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
2246			copym->m_pkthdr.csum_flags |=
2247			    CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
2248			copym->m_pkthdr.csum_data = 0xffff;
2249		}
2250		if_simloop(ifp, copym, dst->sin_family, 0);
2251#endif
2252	}
2253}
2254