ip_output.c revision 126486
1/*
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 3. All advertising materials mentioning features or use of this software
14 *    must display the following acknowledgement:
15 *	This product includes software developed by the University of
16 *	California, Berkeley and its contributors.
17 * 4. Neither the name of the University nor the names of its contributors
18 *    may be used to endorse or promote products derived from this software
19 *    without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 *
33 *	@(#)ip_output.c	8.3 (Berkeley) 1/21/94
34 * $FreeBSD: head/sys/netinet/ip_output.c 126486 2004-03-02 14:37:23Z mlaier $
35 */
36
37#include "opt_ipfw.h"
38#include "opt_ipdn.h"
39#include "opt_ipdivert.h"
40#include "opt_ipfilter.h"
41#include "opt_ipsec.h"
42#include "opt_mac.h"
43#include "opt_pfil_hooks.h"
44#include "opt_random_ip_id.h"
45#include "opt_mbuf_stress_test.h"
46
47#include <sys/param.h>
48#include <sys/systm.h>
49#include <sys/kernel.h>
50#include <sys/mac.h>
51#include <sys/malloc.h>
52#include <sys/mbuf.h>
53#include <sys/protosw.h>
54#include <sys/socket.h>
55#include <sys/socketvar.h>
56#include <sys/sysctl.h>
57
58#include <net/if.h>
59#include <net/route.h>
60
61#include <netinet/in.h>
62#include <netinet/in_systm.h>
63#include <netinet/ip.h>
64#include <netinet/in_pcb.h>
65#include <netinet/in_var.h>
66#include <netinet/ip_var.h>
67
68#ifdef PFIL_HOOKS
69#include <net/pfil.h>
70#endif
71
72#include <machine/in_cksum.h>
73
74static MALLOC_DEFINE(M_IPMOPTS, "ip_moptions", "internet multicast options");
75
76#ifdef IPSEC
77#include <netinet6/ipsec.h>
78#include <netkey/key.h>
79#ifdef IPSEC_DEBUG
80#include <netkey/key_debug.h>
81#else
82#define	KEYDEBUG(lev,arg)
83#endif
84#endif /*IPSEC*/
85
86#ifdef FAST_IPSEC
87#include <netipsec/ipsec.h>
88#include <netipsec/xform.h>
89#include <netipsec/key.h>
90#endif /*FAST_IPSEC*/
91
92#include <netinet/ip_fw.h>
93#include <netinet/ip_divert.h>
94#include <netinet/ip_dummynet.h>
95
/*
 * Debug helper: print the string x, then the IPv4 address stored in
 * a.s_addr (converted with ntohl()) as a dotted quad, then the string y.
 *
 * The expansion is a single printf() expression: every argument is
 * parenthesized so that expressions such as *p may be passed for a, and
 * there is no trailing semicolon, so the macro is safe in if/else bodies
 * and the caller writes the terminating ';' itself.
 */
#define print_ip(x, a, y)	 printf("%s %d.%d.%d.%d%s",\
				(x), (int)((ntohl((a).s_addr)>>24)&0xFF),\
				  (int)((ntohl((a).s_addr)>>16)&0xFF),\
				  (int)((ntohl((a).s_addr)>>8)&0xFF),\
				  (int)((ntohl((a).s_addr))&0xFF), (y))
101
102u_short ip_id;
103
104#ifdef MBUF_STRESS_TEST
105int mbuf_frag_size = 0;
106SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW,
107	&mbuf_frag_size, 0, "Fragment outgoing mbufs to this size");
108#endif
109
110static struct mbuf *ip_insertoptions(struct mbuf *, struct mbuf *, int *);
111static struct ifnet *ip_multicast_if(struct in_addr *, int *);
112static void	ip_mloopback
113	(struct ifnet *, struct mbuf *, struct sockaddr_in *, int);
114static int	ip_getmoptions
115	(struct sockopt *, struct ip_moptions *);
116static int	ip_pcbopts(int, struct mbuf **, struct mbuf *);
117static int	ip_setmoptions
118	(struct sockopt *, struct ip_moptions **);
119
120int	ip_optcopy(struct ip *, struct ip *);
121
122
123extern	struct protosw inetsw[];
124
125/*
126 * IP output.  The packet in mbuf chain m contains a skeletal IP
127 * header (with len, off, ttl, proto, tos, src, dst).
128 * The mbuf chain containing the packet will be freed.
129 * The mbuf opt, if present, will not be freed.
130 * In the IP forwarding case, the packet will arrive with options already
131 * inserted, so must have a NULL opt pointer.
132 */
133int
134ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro,
135	int flags, struct ip_moptions *imo, struct inpcb *inp)
136{
137	struct ip *ip;
138	struct ifnet *ifp = NULL;	/* keep compiler happy */
139	struct mbuf *m0;
140	int hlen = sizeof (struct ip);
141	int len, off, error = 0;
142	struct sockaddr_in *dst = NULL;	/* keep compiler happy */
143	struct in_ifaddr *ia = NULL;
144	int isbroadcast, sw_csum;
145	struct in_addr pkt_dst;
146	struct route iproute;
147	struct m_tag *mtag, *dummytag;
148#ifdef IPSEC
149	struct secpolicy *sp = NULL;
150#endif
151#ifdef FAST_IPSEC
152	struct secpolicy *sp = NULL;
153	struct tdb_ident *tdbi;
154	int s;
155#endif /* FAST_IPSEC */
156	struct ip_fw_args args;
157	int src_was_INADDR_ANY = 0;	/* as the name says... */
158
159	args.eh = NULL;
160	args.rule = NULL;
161
162	M_ASSERTPKTHDR(m);
163
164	args.next_hop = ip_claim_next_hop(m);
165	dummytag = m_tag_find(m, PACKET_TAG_DUMMYNET, NULL);
166	if (dummytag != NULL) {
167		struct dn_pkt_tag *dt = (struct dn_pkt_tag *)(dummytag+1);
168		/*
169		 * Prevent lower layers from finding the tag
170		 * Cleanup and free is done below
171		 */
172		m_tag_unlink(m, dummytag);
173		/*
174		 * the packet was already tagged, so part of the
175		 * processing was already done, and we need to go down.
176		 * Get parameters from the header.
177		 */
178		args.rule = dt->rule;
179		ro = &(dt->ro);
180		dst = dt->dn_dst;
181		ifp = dt->ifp;
182	}
183
184	if (ro == NULL) {
185		ro = &iproute;
186		bzero(ro, sizeof (*ro));
187	}
188
189	if (inp != NULL)
190		INP_LOCK_ASSERT(inp);
191
192	if (args.rule != NULL) {	/* dummynet already saw us */
193		ip = mtod(m, struct ip *);
194		hlen = ip->ip_hl << 2 ;
195		if (ro->ro_rt)
196			ia = ifatoia(ro->ro_rt->rt_ifa);
197		goto sendit;
198	}
199
200	if (opt) {
201		len = 0;
202		m = ip_insertoptions(m, opt, &len);
203		if (len != 0)
204			hlen = len;
205	}
206	ip = mtod(m, struct ip *);
207	pkt_dst = args.next_hop ? args.next_hop->sin_addr : ip->ip_dst;
208
209	/*
210	 * Fill in IP header.  If we are not allowing fragmentation,
211	 * then the ip_id field is meaningless, but we don't set it
212	 * to zero.  Doing so causes various problems when devices along
213	 * the path (routers, load balancers, firewalls, etc.) illegally
214	 * disable DF on our packet.  Note that a 16-bit counter
215	 * will wrap around in less than 10 seconds at 100 Mbit/s on a
216	 * medium with MTU 1500.  See Steven M. Bellovin, "A Technique
217	 * for Counting NATted Hosts", Proc. IMW'02, available at
218	 * <http://www.research.att.com/~smb/papers/fnat.pdf>.
219	 */
220	if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
221		ip->ip_v = IPVERSION;
222		ip->ip_hl = hlen >> 2;
223#ifdef RANDOM_IP_ID
224		ip->ip_id = ip_randomid();
225#else
226		ip->ip_id = htons(ip_id++);
227#endif
228		ipstat.ips_localout++;
229	} else {
230		hlen = ip->ip_hl << 2;
231	}
232
233	dst = (struct sockaddr_in *)&ro->ro_dst;
234	/*
235	 * If there is a cached route,
236	 * check that it is to the same destination
237	 * and is still up.  If not, free it and try again.
238	 * The address family should also be checked in case of sharing the
239	 * cache with IPv6.
240	 */
241	if (ro->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 ||
242			  dst->sin_family != AF_INET ||
243			  dst->sin_addr.s_addr != pkt_dst.s_addr)) {
244		RTFREE(ro->ro_rt);
245		ro->ro_rt = (struct rtentry *)0;
246	}
247	if (ro->ro_rt == 0) {
248		bzero(dst, sizeof(*dst));
249		dst->sin_family = AF_INET;
250		dst->sin_len = sizeof(*dst);
251		dst->sin_addr = pkt_dst;
252	}
253	/*
254	 * If routing to interface only,
255	 * short circuit routing lookup.
256	 */
257	if (flags & IP_ROUTETOIF) {
258		if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == 0 &&
259		    (ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == 0) {
260			ipstat.ips_noroute++;
261			error = ENETUNREACH;
262			goto bad;
263		}
264		ifp = ia->ia_ifp;
265		ip->ip_ttl = 1;
266		isbroadcast = in_broadcast(dst->sin_addr, ifp);
267	} else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) &&
268	    imo != NULL && imo->imo_multicast_ifp != NULL) {
269		/*
270		 * Bypass the normal routing lookup for multicast
271		 * packets if the interface is specified.
272		 */
273		ifp = imo->imo_multicast_ifp;
274		IFP_TO_IA(ifp, ia);
275		isbroadcast = 0;	/* fool gcc */
276	} else {
277		/*
278		 * We want to do any cloning requested by the link layer,
279		 * as this is probably required in all cases for correct
280		 * operation (as it is for ARP).
281		 */
282		if (ro->ro_rt == 0)
283			rtalloc(ro);
284		if (ro->ro_rt == 0) {
285			ipstat.ips_noroute++;
286			error = EHOSTUNREACH;
287			goto bad;
288		}
289		ia = ifatoia(ro->ro_rt->rt_ifa);
290		ifp = ro->ro_rt->rt_ifp;
291		ro->ro_rt->rt_rmx.rmx_pksent++;
292		if (ro->ro_rt->rt_flags & RTF_GATEWAY)
293			dst = (struct sockaddr_in *)ro->ro_rt->rt_gateway;
294		if (ro->ro_rt->rt_flags & RTF_HOST)
295			isbroadcast = (ro->ro_rt->rt_flags & RTF_BROADCAST);
296		else
297			isbroadcast = in_broadcast(dst->sin_addr, ifp);
298	}
299	if (IN_MULTICAST(ntohl(pkt_dst.s_addr))) {
300		struct in_multi *inm;
301
302		m->m_flags |= M_MCAST;
303		/*
304		 * IP destination address is multicast.  Make sure "dst"
305		 * still points to the address in "ro".  (It may have been
306		 * changed to point to a gateway address, above.)
307		 */
308		dst = (struct sockaddr_in *)&ro->ro_dst;
309		/*
310		 * See if the caller provided any multicast options
311		 */
312		if (imo != NULL) {
313			ip->ip_ttl = imo->imo_multicast_ttl;
314			if (imo->imo_multicast_vif != -1)
315				ip->ip_src.s_addr =
316				    ip_mcast_src ?
317				    ip_mcast_src(imo->imo_multicast_vif) :
318				    INADDR_ANY;
319		} else
320			ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL;
321		/*
322		 * Confirm that the outgoing interface supports multicast.
323		 */
324		if ((imo == NULL) || (imo->imo_multicast_vif == -1)) {
325			if ((ifp->if_flags & IFF_MULTICAST) == 0) {
326				ipstat.ips_noroute++;
327				error = ENETUNREACH;
328				goto bad;
329			}
330		}
331		/*
332		 * If source address not specified yet, use address
333		 * of outgoing interface.
334		 */
335		if (ip->ip_src.s_addr == INADDR_ANY) {
336			/* Interface may have no addresses. */
337			if (ia != NULL)
338				ip->ip_src = IA_SIN(ia)->sin_addr;
339		}
340
341		if (ip_mrouter && (flags & IP_FORWARDING) == 0) {
342			/*
343			 * XXX
344			 * delayed checksums are not currently
345			 * compatible with IP multicast routing
346			 */
347			if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
348				in_delayed_cksum(m);
349				m->m_pkthdr.csum_flags &=
350					~CSUM_DELAY_DATA;
351			}
352		}
353		IN_LOOKUP_MULTI(pkt_dst, ifp, inm);
354		if (inm != NULL &&
355		   (imo == NULL || imo->imo_multicast_loop)) {
356			/*
357			 * If we belong to the destination multicast group
358			 * on the outgoing interface, and the caller did not
359			 * forbid loopback, loop back a copy.
360			 */
361			ip_mloopback(ifp, m, dst, hlen);
362		}
363		else {
364			/*
365			 * If we are acting as a multicast router, perform
366			 * multicast forwarding as if the packet had just
367			 * arrived on the interface to which we are about
368			 * to send.  The multicast forwarding function
369			 * recursively calls this function, using the
370			 * IP_FORWARDING flag to prevent infinite recursion.
371			 *
372			 * Multicasts that are looped back by ip_mloopback(),
373			 * above, will be forwarded by the ip_input() routine,
374			 * if necessary.
375			 */
376			if (ip_mrouter && (flags & IP_FORWARDING) == 0) {
377				/*
378				 * If rsvp daemon is not running, do not
379				 * set ip_moptions. This ensures that the packet
380				 * is multicast and not just sent down one link
381				 * as prescribed by rsvpd.
382				 */
383				if (!rsvp_on)
384					imo = NULL;
385				if (ip_mforward &&
386				    ip_mforward(ip, ifp, m, imo) != 0) {
387					m_freem(m);
388					goto done;
389				}
390			}
391		}
392
393		/*
394		 * Multicasts with a time-to-live of zero may be looped-
395		 * back, above, but must not be transmitted on a network.
396		 * Also, multicasts addressed to the loopback interface
397		 * are not sent -- the above call to ip_mloopback() will
398		 * loop back a copy if this host actually belongs to the
399		 * destination group on the loopback interface.
400		 */
401		if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) {
402			m_freem(m);
403			goto done;
404		}
405
406		goto sendit;
407	}
408#ifndef notdef
409	/*
410	 * If the source address is not specified yet, use the address
411	 * of the outoing interface. In case, keep note we did that, so
412	 * if the the firewall changes the next-hop causing the output
413	 * interface to change, we can fix that.
414	 */
415	if (ip->ip_src.s_addr == INADDR_ANY) {
416		/* Interface may have no addresses. */
417		if (ia != NULL) {
418			ip->ip_src = IA_SIN(ia)->sin_addr;
419			src_was_INADDR_ANY = 1;
420		}
421	}
422#endif /* notdef */
423	/*
424	 * Verify that we have any chance at all of being able to queue
425	 *      the packet or packet fragments
426	 */
427	if ((ifp->if_snd.ifq_len + ip->ip_len / ifp->if_mtu + 1) >=
428		ifp->if_snd.ifq_maxlen) {
429			error = ENOBUFS;
430			ipstat.ips_odropped++;
431			goto bad;
432	}
433
434	/*
435	 * Look for broadcast address and
436	 * verify user is allowed to send
437	 * such a packet.
438	 */
439	if (isbroadcast) {
440		if ((ifp->if_flags & IFF_BROADCAST) == 0) {
441			error = EADDRNOTAVAIL;
442			goto bad;
443		}
444		if ((flags & IP_ALLOWBROADCAST) == 0) {
445			error = EACCES;
446			goto bad;
447		}
448		/* don't allow broadcast messages to be fragmented */
449		if (ip->ip_len > ifp->if_mtu) {
450			error = EMSGSIZE;
451			goto bad;
452		}
453		if (flags & IP_SENDONES)
454			ip->ip_dst.s_addr = INADDR_BROADCAST;
455		m->m_flags |= M_BCAST;
456	} else {
457		m->m_flags &= ~M_BCAST;
458	}
459
460sendit:
461#ifdef IPSEC
462	/* get SP for this packet */
463	if (inp == NULL)
464		sp = ipsec4_getpolicybyaddr(m, IPSEC_DIR_OUTBOUND,
465		    flags, &error);
466	else
467		sp = ipsec4_getpolicybypcb(m, IPSEC_DIR_OUTBOUND, inp, &error);
468
469	if (sp == NULL) {
470		ipsecstat.out_inval++;
471		goto bad;
472	}
473
474	error = 0;
475
476	/* check policy */
477	switch (sp->policy) {
478	case IPSEC_POLICY_DISCARD:
479		/*
480		 * This packet is just discarded.
481		 */
482		ipsecstat.out_polvio++;
483		goto bad;
484
485	case IPSEC_POLICY_BYPASS:
486	case IPSEC_POLICY_NONE:
487	case IPSEC_POLICY_TCP:
488		/* no need to do IPsec. */
489		goto skip_ipsec;
490
491	case IPSEC_POLICY_IPSEC:
492		if (sp->req == NULL) {
493			/* acquire a policy */
494			error = key_spdacquire(sp);
495			goto bad;
496		}
497		break;
498
499	case IPSEC_POLICY_ENTRUST:
500	default:
501		printf("ip_output: Invalid policy found. %d\n", sp->policy);
502	}
503    {
504	struct ipsec_output_state state;
505	bzero(&state, sizeof(state));
506	state.m = m;
507	if (flags & IP_ROUTETOIF) {
508		state.ro = &iproute;
509		bzero(&iproute, sizeof(iproute));
510	} else
511		state.ro = ro;
512	state.dst = (struct sockaddr *)dst;
513
514	ip->ip_sum = 0;
515
516	/*
517	 * XXX
518	 * delayed checksums are not currently compatible with IPsec
519	 */
520	if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
521		in_delayed_cksum(m);
522		m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
523	}
524
525	ip->ip_len = htons(ip->ip_len);
526	ip->ip_off = htons(ip->ip_off);
527
528	error = ipsec4_output(&state, sp, flags);
529
530	m = state.m;
531	if (flags & IP_ROUTETOIF) {
532		/*
533		 * if we have tunnel mode SA, we may need to ignore
534		 * IP_ROUTETOIF.
535		 */
536		if (state.ro != &iproute || state.ro->ro_rt != NULL) {
537			flags &= ~IP_ROUTETOIF;
538			ro = state.ro;
539		}
540	} else
541		ro = state.ro;
542	dst = (struct sockaddr_in *)state.dst;
543	if (error) {
544		/* mbuf is already reclaimed in ipsec4_output. */
545		m = NULL;
546		switch (error) {
547		case EHOSTUNREACH:
548		case ENETUNREACH:
549		case EMSGSIZE:
550		case ENOBUFS:
551		case ENOMEM:
552			break;
553		default:
554			printf("ip4_output (ipsec): error code %d\n", error);
555			/*fall through*/
556		case ENOENT:
557			/* don't show these error codes to the user */
558			error = 0;
559			break;
560		}
561		goto bad;
562	}
563
564	/* be sure to update variables that are affected by ipsec4_output() */
565	ip = mtod(m, struct ip *);
566	hlen = ip->ip_hl << 2;
567	if (ro->ro_rt == NULL) {
568		if ((flags & IP_ROUTETOIF) == 0) {
569			printf("ip_output: "
570				"can't update route after IPsec processing\n");
571			error = EHOSTUNREACH;	/*XXX*/
572			goto bad;
573		}
574	} else {
575		if (state.encap) {
576			ia = ifatoia(ro->ro_rt->rt_ifa);
577			ifp = ro->ro_rt->rt_ifp;
578		}
579	}
580    }
581
582	/* make it flipped, again. */
583	ip->ip_len = ntohs(ip->ip_len);
584	ip->ip_off = ntohs(ip->ip_off);
585skip_ipsec:
586#endif /*IPSEC*/
587#ifdef FAST_IPSEC
588	/*
589	 * Check the security policy (SP) for the packet and, if
590	 * required, do IPsec-related processing.  There are two
591	 * cases here; the first time a packet is sent through
592	 * it will be untagged and handled by ipsec4_checkpolicy.
593	 * If the packet is resubmitted to ip_output (e.g. after
594	 * AH, ESP, etc. processing), there will be a tag to bypass
595	 * the lookup and related policy checking.
596	 */
597	mtag = m_tag_find(m, PACKET_TAG_IPSEC_PENDING_TDB, NULL);
598	s = splnet();
599	if (mtag != NULL) {
600		tdbi = (struct tdb_ident *)(mtag + 1);
601		sp = ipsec_getpolicy(tdbi, IPSEC_DIR_OUTBOUND);
602		if (sp == NULL)
603			error = -EINVAL;	/* force silent drop */
604		m_tag_delete(m, mtag);
605	} else {
606		sp = ipsec4_checkpolicy(m, IPSEC_DIR_OUTBOUND, flags,
607					&error, inp);
608	}
609	/*
610	 * There are four return cases:
611	 *    sp != NULL	 	    apply IPsec policy
612	 *    sp == NULL, error == 0	    no IPsec handling needed
613	 *    sp == NULL, error == -EINVAL  discard packet w/o error
614	 *    sp == NULL, error != 0	    discard packet, report error
615	 */
616	if (sp != NULL) {
617		/* Loop detection, check if ipsec processing already done */
618		KASSERT(sp->req != NULL, ("ip_output: no ipsec request"));
619		for (mtag = m_tag_first(m); mtag != NULL;
620		     mtag = m_tag_next(m, mtag)) {
621			if (mtag->m_tag_cookie != MTAG_ABI_COMPAT)
622				continue;
623			if (mtag->m_tag_id != PACKET_TAG_IPSEC_OUT_DONE &&
624			    mtag->m_tag_id != PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED)
625				continue;
626			/*
627			 * Check if policy has an SA associated with it.
628			 * This can happen when an SP has yet to acquire
629			 * an SA; e.g. on first reference.  If it occurs,
630			 * then we let ipsec4_process_packet do its thing.
631			 */
632			if (sp->req->sav == NULL)
633				break;
634			tdbi = (struct tdb_ident *)(mtag + 1);
635			if (tdbi->spi == sp->req->sav->spi &&
636			    tdbi->proto == sp->req->sav->sah->saidx.proto &&
637			    bcmp(&tdbi->dst, &sp->req->sav->sah->saidx.dst,
638				 sizeof (union sockaddr_union)) == 0) {
639				/*
640				 * No IPsec processing is needed, free
641				 * reference to SP.
642				 *
643				 * NB: null pointer to avoid free at
644				 *     done: below.
645				 */
646				KEY_FREESP(&sp), sp = NULL;
647				splx(s);
648				goto spd_done;
649			}
650		}
651
652		/*
653		 * Do delayed checksums now because we send before
654		 * this is done in the normal processing path.
655		 */
656		if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
657			in_delayed_cksum(m);
658			m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
659		}
660
661		ip->ip_len = htons(ip->ip_len);
662		ip->ip_off = htons(ip->ip_off);
663
664		/* NB: callee frees mbuf */
665		error = ipsec4_process_packet(m, sp->req, flags, 0);
666		/*
667		 * Preserve KAME behaviour: ENOENT can be returned
668		 * when an SA acquire is in progress.  Don't propagate
669		 * this to user-level; it confuses applications.
670		 *
671		 * XXX this will go away when the SADB is redone.
672		 */
673		if (error == ENOENT)
674			error = 0;
675		splx(s);
676		goto done;
677	} else {
678		splx(s);
679
680		if (error != 0) {
681			/*
682			 * Hack: -EINVAL is used to signal that a packet
683			 * should be silently discarded.  This is typically
684			 * because we asked key management for an SA and
685			 * it was delayed (e.g. kicked up to IKE).
686			 */
687			if (error == -EINVAL)
688				error = 0;
689			goto bad;
690		} else {
691			/* No IPsec processing for this packet. */
692		}
693#ifdef notyet
694		/*
695		 * If deferred crypto processing is needed, check that
696		 * the interface supports it.
697		 */
698		mtag = m_tag_find(m, PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED, NULL);
699		if (mtag != NULL && (ifp->if_capenable & IFCAP_IPSEC) == 0) {
700			/* notify IPsec to do its own crypto */
701			ipsp_skipcrypto_unmark((struct tdb_ident *)(mtag + 1));
702			error = EHOSTUNREACH;
703			goto bad;
704		}
705#endif
706	}
707spd_done:
708#endif /* FAST_IPSEC */
709
710	/*
711	 * IpHack's section.
712	 * - Xlate: translate packet's addr/port (NAT).
713	 * - Firewall: deny/allow/etc.
714	 * - Wrap: fake packet's addr/port <unimpl.>
715	 * - Encapsulate: put it in another IP and send out. <unimp.>
716	 */
717#ifdef PFIL_HOOKS
718	/*
719	 * Run through list of hooks for output packets.
720	 */
721	error = pfil_run_hooks(&inet_pfil_hook, &m, ifp, PFIL_OUT);
722	if (error != 0 || m == NULL)
723		goto done;
724	ip = mtod(m, struct ip *);
725#endif /* PFIL_HOOKS */
726
727	/*
728	 * Check with the firewall...
729	 * but not if we are already being fwd'd from a firewall.
730	 */
731	if (fw_enable && IPFW_LOADED && !args.next_hop) {
732		struct sockaddr_in *old = dst;
733
734		args.m = m;
735		args.next_hop = dst;
736		args.oif = ifp;
737		off = ip_fw_chk_ptr(&args);
738		m = args.m;
739		dst = args.next_hop;
740
741                /*
742		 * On return we must do the following:
743		 * m == NULL	-> drop the pkt (old interface, deprecated)
744		 * (off & IP_FW_PORT_DENY_FLAG)	-> drop the pkt (new interface)
745		 * 1<=off<= 0xffff		-> DIVERT
746		 * (off & IP_FW_PORT_DYNT_FLAG)	-> send to a DUMMYNET pipe
747		 * (off & IP_FW_PORT_TEE_FLAG)	-> TEE the packet
748		 * dst != old			-> IPFIREWALL_FORWARD
749		 * off==0, dst==old		-> accept
750		 * If some of the above modules are not compiled in, then
751		 * we should't have to check the corresponding condition
752		 * (because the ipfw control socket should not accept
753		 * unsupported rules), but better play safe and drop
754		 * packets in case of doubt.
755		 */
756		if ( (off & IP_FW_PORT_DENY_FLAG) || m == NULL) {
757			if (m)
758				m_freem(m);
759			error = EACCES;
760			goto done;
761		}
762		ip = mtod(m, struct ip *);
763		if (off == 0 && dst == old)		/* common case */
764			goto pass;
765                if (DUMMYNET_LOADED && (off & IP_FW_PORT_DYNT_FLAG) != 0) {
766			/*
767			 * pass the pkt to dummynet. Need to include
768			 * pipe number, m, ifp, ro, dst because these are
769			 * not recomputed in the next pass.
770			 * All other parameters have been already used and
771			 * so they are not needed anymore.
772			 * XXX note: if the ifp or ro entry are deleted
773			 * while a pkt is in dummynet, we are in trouble!
774			 */
775			args.ro = ro;
776			args.dst = dst;
777			args.flags = flags;
778
779			error = ip_dn_io_ptr(m, off & 0xffff, DN_TO_IP_OUT,
780				&args);
781			goto done;
782		}
783#ifdef IPDIVERT
784		if (off != 0 && (off & IP_FW_PORT_DYNT_FLAG) == 0) {
785			struct mbuf *clone;
786
787			/* Clone packet if we're doing a 'tee' */
788			if ((off & IP_FW_PORT_TEE_FLAG) != 0)
789				clone = divert_clone(m);
790			else
791				clone = NULL;
792
793			/*
794			 * XXX
795			 * delayed checksums are not currently compatible
796			 * with divert sockets.
797			 */
798			if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
799				in_delayed_cksum(m);
800				m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
801			}
802
803			/* Restore packet header fields to original values */
804			ip->ip_len = htons(ip->ip_len);
805			ip->ip_off = htons(ip->ip_off);
806
807			/* Deliver packet to divert input routine */
808			divert_packet(m, 0);
809
810			/* If 'tee', continue with original packet */
811			if (clone != NULL) {
812				m = clone;
813				ip = mtod(m, struct ip *);
814				goto pass;
815			}
816			goto done;
817		}
818#endif
819
820		/* IPFIREWALL_FORWARD */
821		/*
822		 * Check dst to make sure it is directly reachable on the
823		 * interface we previously thought it was.
824		 * If it isn't (which may be likely in some situations) we have
825		 * to re-route it (ie, find a route for the next-hop and the
826		 * associated interface) and set them here. This is nested
827		 * forwarding which in most cases is undesirable, except where
828		 * such control is nigh impossible. So we do it here.
829		 * And I'm babbling.
830		 */
831		if (off == 0 && old != dst) { /* FORWARD, dst has changed */
832#if 0
833			/*
834			 * XXX To improve readability, this block should be
835			 * changed into a function call as below:
836			 */
837			error = ip_ipforward(&m, &dst, &ifp);
838			if (error)
839				goto bad;
840			if (m == NULL) /* ip_input consumed the mbuf */
841				goto done;
842#else
843			struct in_ifaddr *ia;
844
845			/*
846			 * XXX sro_fwd below is static, and a pointer
847			 * to it gets passed to routines downstream.
848			 * This could have surprisingly bad results in
849			 * practice, because its content is overwritten
850			 * by subsequent packets.
851			 */
852			/* There must be a better way to do this next line... */
853			static struct route sro_fwd;
854			struct route *ro_fwd = &sro_fwd;
855
856#if 0
857			print_ip("IPFIREWALL_FORWARD: New dst ip: ",
858			    dst->sin_addr, "\n");
859#endif
860
861			/*
862			 * We need to figure out if we have been forwarded
863			 * to a local socket. If so, then we should somehow
864			 * "loop back" to ip_input, and get directed to the
865			 * PCB as if we had received this packet. This is
866			 * because it may be dificult to identify the packets
867			 * you want to forward until they are being output
868			 * and have selected an interface. (e.g. locally
869			 * initiated packets) If we used the loopback inteface,
870			 * we would not be able to control what happens
871			 * as the packet runs through ip_input() as
872			 * it is done through an ISR.
873			 */
874			LIST_FOREACH(ia,
875			    INADDR_HASH(dst->sin_addr.s_addr), ia_hash) {
876				/*
877				 * If the addr to forward to is one
878				 * of ours, we pretend to
879				 * be the destination for this packet.
880				 */
881				if (IA_SIN(ia)->sin_addr.s_addr ==
882						 dst->sin_addr.s_addr)
883					break;
884			}
885			if (ia) {	/* tell ip_input "dont filter" */
886				mtag = m_tag_get(
887				    PACKET_TAG_IPFORWARD,
888				    sizeof(struct sockaddr_in *), M_NOWAIT);
889				if (mtag == NULL) {
890					error = ENOBUFS;
891					goto bad;
892				}
893				*(struct sockaddr_in **)(mtag+1) =
894				    args.next_hop;
895				m_tag_prepend(m, mtag);
896
897				if (m->m_pkthdr.rcvif == NULL)
898					m->m_pkthdr.rcvif = ifunit("lo0");
899				if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
900					m->m_pkthdr.csum_flags |=
901					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
902					m->m_pkthdr.csum_data = 0xffff;
903				}
904				m->m_pkthdr.csum_flags |=
905				    CSUM_IP_CHECKED | CSUM_IP_VALID;
906				ip->ip_len = htons(ip->ip_len);
907				ip->ip_off = htons(ip->ip_off);
908				ip_input(m);
909				goto done;
910			}
911			/*
912			 * Some of the logic for this was
913			 * nicked from above.
914			 */
915			bcopy(dst, &ro_fwd->ro_dst, sizeof(*dst));
916
917			ro_fwd->ro_rt = 0;
918			rtalloc_ign(ro_fwd, RTF_CLONING);
919
920			if (ro_fwd->ro_rt == 0) {
921				ipstat.ips_noroute++;
922				error = EHOSTUNREACH;
923				goto bad;
924			}
925
926			ia = ifatoia(ro_fwd->ro_rt->rt_ifa);
927			ifp = ro_fwd->ro_rt->rt_ifp;
928			ro_fwd->ro_rt->rt_rmx.rmx_pksent++;
929			if (ro_fwd->ro_rt->rt_flags & RTF_GATEWAY)
930				dst = (struct sockaddr_in *)
931					ro_fwd->ro_rt->rt_gateway;
932			if (ro_fwd->ro_rt->rt_flags & RTF_HOST)
933				isbroadcast =
934				    (ro_fwd->ro_rt->rt_flags & RTF_BROADCAST);
935			else
936				isbroadcast = in_broadcast(dst->sin_addr, ifp);
937			if (ro->ro_rt)
938				RTFREE(ro->ro_rt);
939			ro->ro_rt = ro_fwd->ro_rt;
940			dst = (struct sockaddr_in *)&ro_fwd->ro_dst;
941
942#endif	/* ... block to be put into a function */
943			/*
944			 * If we added a default src ip earlier,
945			 * which would have been gotten from the-then
946			 * interface, do it again, from the new one.
947			 */
948			if (src_was_INADDR_ANY)
949				ip->ip_src = IA_SIN(ia)->sin_addr;
950			goto pass ;
951		}
952
953                /*
954                 * if we get here, none of the above matches, and
955                 * we have to drop the pkt
956                 */
957		m_freem(m);
958                error = EACCES; /* not sure this is the right error msg */
959                goto done;
960	}
961
962pass:
963	/* 127/8 must not appear on wire - RFC1122. */
964	if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
965	    (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
966		if ((ifp->if_flags & IFF_LOOPBACK) == 0) {
967			ipstat.ips_badaddr++;
968			error = EADDRNOTAVAIL;
969			goto bad;
970		}
971	}
972
973	m->m_pkthdr.csum_flags |= CSUM_IP;
974	sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_hwassist;
975	if (sw_csum & CSUM_DELAY_DATA) {
976		in_delayed_cksum(m);
977		sw_csum &= ~CSUM_DELAY_DATA;
978	}
979	m->m_pkthdr.csum_flags &= ifp->if_hwassist;
980
981	/*
982	 * If small enough for interface, or the interface will take
983	 * care of the fragmentation for us, can just send directly.
984	 */
985	if (ip->ip_len <= ifp->if_mtu || (ifp->if_hwassist & CSUM_FRAGMENT &&
986	    ((ip->ip_off & IP_DF) == 0))) {
987		ip->ip_len = htons(ip->ip_len);
988		ip->ip_off = htons(ip->ip_off);
989		ip->ip_sum = 0;
990		if (sw_csum & CSUM_DELAY_IP)
991			ip->ip_sum = in_cksum(m, hlen);
992
993		/* Record statistics for this interface address. */
994		if (!(flags & IP_FORWARDING) && ia) {
995			ia->ia_ifa.if_opackets++;
996			ia->ia_ifa.if_obytes += m->m_pkthdr.len;
997		}
998
999#ifdef IPSEC
1000		/* clean ipsec history once it goes out of the node */
1001		ipsec_delaux(m);
1002#endif
1003
1004#ifdef MBUF_STRESS_TEST
1005		if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size)
1006			m = m_fragment(m, M_DONTWAIT, mbuf_frag_size);
1007#endif
1008		error = (*ifp->if_output)(ifp, m,
1009				(struct sockaddr *)dst, ro->ro_rt);
1010		goto done;
1011	}
1012
1013	if (ip->ip_off & IP_DF) {
1014		error = EMSGSIZE;
1015		/*
1016		 * This case can happen if the user changed the MTU
1017		 * of an interface after enabling IP on it.  Because
1018		 * most netifs don't keep track of routes pointing to
1019		 * them, there is no way for one to update all its
1020		 * routes when the MTU is changed.
1021		 */
1022		if ((ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST)) &&
1023		    (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)) {
1024			ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu;
1025		}
1026		ipstat.ips_cantfrag++;
1027		goto bad;
1028	}
1029
1030	/*
1031	 * Too large for interface; fragment if possible. If successful,
1032	 * on return, m will point to a list of packets to be sent.
1033	 */
1034	error = ip_fragment(ip, &m, ifp->if_mtu, ifp->if_hwassist, sw_csum);
1035	if (error)
1036		goto bad;
1037	for (; m; m = m0) {
1038		m0 = m->m_nextpkt;
1039		m->m_nextpkt = 0;
1040#ifdef IPSEC
1041		/* clean ipsec history once it goes out of the node */
1042		ipsec_delaux(m);
1043#endif
1044		if (error == 0) {
1045			/* Record statistics for this interface address. */
1046			if (ia != NULL) {
1047				ia->ia_ifa.if_opackets++;
1048				ia->ia_ifa.if_obytes += m->m_pkthdr.len;
1049			}
1050
1051			error = (*ifp->if_output)(ifp, m,
1052			    (struct sockaddr *)dst, ro->ro_rt);
1053		} else
1054			m_freem(m);
1055	}
1056
1057	if (error == 0)
1058		ipstat.ips_fragmented++;
1059
1060done:
1061	if (ro == &iproute && ro->ro_rt) {
1062		RTFREE(ro->ro_rt);
1063		ro->ro_rt = NULL;
1064	}
1065	if (dummytag) {
1066		struct dn_pkt_tag *dt = (struct dn_pkt_tag *)(dummytag+1);
1067		if (dt->ro.ro_rt)
1068			RTFREE(dt->ro.ro_rt);
1069		m_tag_free(dummytag);
1070	}
1071#ifdef IPSEC
1072	if (sp != NULL) {
1073		KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
1074			printf("DP ip_output call free SP:%p\n", sp));
1075		key_freesp(sp);
1076	}
1077#endif
1078#ifdef FAST_IPSEC
1079	if (sp != NULL)
1080		KEY_FREESP(&sp);
1081#endif
1082	return (error);
1083bad:
1084	m_freem(m);
1085	goto done;
1086}
1087
1088/*
1089 * Create a chain of fragments which fit the given mtu. m_frag points to the
1090 * mbuf to be fragmented; on return it points to the chain with the fragments.
1091 * Return 0 if no error. If error, m_frag may contain a partially built
1092 * chain of fragments that should be freed by the caller.
1093 *
1094 * if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist)
1095 * sw_csum contains the delayed checksums flags (e.g., CSUM_DELAY_IP).
1096 */
1097int
1098ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu,
1099	    u_long if_hwassist_flags, int sw_csum)
1100{
1101	int error = 0;
1102	int hlen = ip->ip_hl << 2;
1103	int len = (mtu - hlen) & ~7;	/* size of payload in each fragment */
1104	int off;
1105	struct mbuf *m0 = *m_frag;	/* the original packet		*/
1106	int firstlen;
1107	struct mbuf **mnext;
1108	int nfrags;
1109
1110	if (ip->ip_off & IP_DF) {	/* Fragmentation not allowed */
1111		ipstat.ips_cantfrag++;
1112		return EMSGSIZE;
1113	}
1114
1115	/*
1116	 * Must be able to put at least 8 bytes per fragment.
1117	 */
1118	if (len < 8)
1119		return EMSGSIZE;
1120
1121	/*
1122	 * If the interface will not calculate checksums on
1123	 * fragmented packets, then do it here.
1124	 */
1125	if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA &&
1126	    (if_hwassist_flags & CSUM_IP_FRAGS) == 0) {
1127		in_delayed_cksum(m0);
1128		m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
1129	}
1130
1131	if (len > PAGE_SIZE) {
1132		/*
1133		 * Fragment large datagrams such that each segment
1134		 * contains a multiple of PAGE_SIZE amount of data,
1135		 * plus headers. This enables a receiver to perform
1136		 * page-flipping zero-copy optimizations.
1137		 *
1138		 * XXX When does this help given that sender and receiver
1139		 * could have different page sizes, and also mtu could
1140		 * be less than the receiver's page size ?
1141		 */
1142		int newlen;
1143		struct mbuf *m;
1144
1145		for (m = m0, off = 0; m && (off+m->m_len) <= mtu; m = m->m_next)
1146			off += m->m_len;
1147
1148		/*
1149		 * firstlen (off - hlen) must be aligned on an
1150		 * 8-byte boundary
1151		 */
1152		if (off < hlen)
1153			goto smart_frag_failure;
1154		off = ((off - hlen) & ~7) + hlen;
1155		newlen = (~PAGE_MASK) & mtu;
1156		if ((newlen + sizeof (struct ip)) > mtu) {
1157			/* we failed, go back the default */
1158smart_frag_failure:
1159			newlen = len;
1160			off = hlen + len;
1161		}
1162		len = newlen;
1163
1164	} else {
1165		off = hlen + len;
1166	}
1167
1168	firstlen = off - hlen;
1169	mnext = &m0->m_nextpkt;		/* pointer to next packet */
1170
1171	/*
1172	 * Loop through length of segment after first fragment,
1173	 * make new header and copy data of each part and link onto chain.
1174	 * Here, m0 is the original packet, m is the fragment being created.
1175	 * The fragments are linked off the m_nextpkt of the original
1176	 * packet, which after processing serves as the first fragment.
1177	 */
1178	for (nfrags = 1; off < ip->ip_len; off += len, nfrags++) {
1179		struct ip *mhip;	/* ip header on the fragment */
1180		struct mbuf *m;
1181		int mhlen = sizeof (struct ip);
1182
1183		MGETHDR(m, M_DONTWAIT, MT_HEADER);
1184		if (m == 0) {
1185			error = ENOBUFS;
1186			ipstat.ips_odropped++;
1187			goto done;
1188		}
1189		m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG;
1190		/*
1191		 * In the first mbuf, leave room for the link header, then
1192		 * copy the original IP header including options. The payload
1193		 * goes into an additional mbuf chain returned by m_copy().
1194		 */
1195		m->m_data += max_linkhdr;
1196		mhip = mtod(m, struct ip *);
1197		*mhip = *ip;
1198		if (hlen > sizeof (struct ip)) {
1199			mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip);
1200			mhip->ip_v = IPVERSION;
1201			mhip->ip_hl = mhlen >> 2;
1202		}
1203		m->m_len = mhlen;
1204		/* XXX do we need to add ip->ip_off below ? */
1205		mhip->ip_off = ((off - hlen) >> 3) + ip->ip_off;
1206		if (off + len >= ip->ip_len) {	/* last fragment */
1207			len = ip->ip_len - off;
1208			m->m_flags |= M_LASTFRAG;
1209		} else
1210			mhip->ip_off |= IP_MF;
1211		mhip->ip_len = htons((u_short)(len + mhlen));
1212		m->m_next = m_copy(m0, off, len);
1213		if (m->m_next == 0) {		/* copy failed */
1214			m_free(m);
1215			error = ENOBUFS;	/* ??? */
1216			ipstat.ips_odropped++;
1217			goto done;
1218		}
1219		m->m_pkthdr.len = mhlen + len;
1220		m->m_pkthdr.rcvif = (struct ifnet *)0;
1221#ifdef MAC
1222		mac_create_fragment(m0, m);
1223#endif
1224		m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags;
1225		mhip->ip_off = htons(mhip->ip_off);
1226		mhip->ip_sum = 0;
1227		if (sw_csum & CSUM_DELAY_IP)
1228			mhip->ip_sum = in_cksum(m, mhlen);
1229		*mnext = m;
1230		mnext = &m->m_nextpkt;
1231	}
1232	ipstat.ips_ofragments += nfrags;
1233
1234	/* set first marker for fragment chain */
1235	m0->m_flags |= M_FIRSTFRAG | M_FRAG;
1236	m0->m_pkthdr.csum_data = nfrags;
1237
1238	/*
1239	 * Update first fragment by trimming what's been copied out
1240	 * and updating header.
1241	 */
1242	m_adj(m0, hlen + firstlen - ip->ip_len);
1243	m0->m_pkthdr.len = hlen + firstlen;
1244	ip->ip_len = htons((u_short)m0->m_pkthdr.len);
1245	ip->ip_off |= IP_MF;
1246	ip->ip_off = htons(ip->ip_off);
1247	ip->ip_sum = 0;
1248	if (sw_csum & CSUM_DELAY_IP)
1249		ip->ip_sum = in_cksum(m0, hlen);
1250
1251done:
1252	*m_frag = m0;
1253	return error;
1254}
1255
1256void
1257in_delayed_cksum(struct mbuf *m)
1258{
1259	struct ip *ip;
1260	u_short csum, offset;
1261
1262	ip = mtod(m, struct ip *);
1263	offset = ip->ip_hl << 2 ;
1264	csum = in_cksum_skip(m, ip->ip_len, offset);
1265	if (m->m_pkthdr.csum_flags & CSUM_UDP && csum == 0)
1266		csum = 0xffff;
1267	offset += m->m_pkthdr.csum_data;	/* checksum offset */
1268
1269	if (offset + sizeof(u_short) > m->m_len) {
1270		printf("delayed m_pullup, m->len: %d  off: %d  p: %d\n",
1271		    m->m_len, offset, ip->ip_p);
1272		/*
1273		 * XXX
1274		 * this shouldn't happen, but if it does, the
1275		 * correct behavior may be to insert the checksum
1276		 * in the existing chain instead of rearranging it.
1277		 */
1278		m = m_pullup(m, offset + sizeof(u_short));
1279	}
1280	*(u_short *)(m->m_data + offset) = csum;
1281}
1282
1283/*
1284 * Insert IP options into preformed packet.
1285 * Adjust IP destination as required for IP source routing,
1286 * as indicated by a non-zero in_addr at the start of the options.
1287 *
1288 * XXX This routine assumes that the packet has no options in place.
1289 */
1290static struct mbuf *
1291ip_insertoptions(m, opt, phlen)
1292	register struct mbuf *m;
1293	struct mbuf *opt;
1294	int *phlen;
1295{
1296	register struct ipoption *p = mtod(opt, struct ipoption *);
1297	struct mbuf *n;
1298	register struct ip *ip = mtod(m, struct ip *);
1299	unsigned optlen;
1300
1301	optlen = opt->m_len - sizeof(p->ipopt_dst);
1302	if (optlen + ip->ip_len > IP_MAXPACKET) {
1303		*phlen = 0;
1304		return (m);		/* XXX should fail */
1305	}
1306	if (p->ipopt_dst.s_addr)
1307		ip->ip_dst = p->ipopt_dst;
1308	if (m->m_flags & M_EXT || m->m_data - optlen < m->m_pktdat) {
1309		MGETHDR(n, M_DONTWAIT, MT_HEADER);
1310		if (n == 0) {
1311			*phlen = 0;
1312			return (m);
1313		}
1314		n->m_pkthdr.rcvif = (struct ifnet *)0;
1315#ifdef MAC
1316		mac_create_mbuf_from_mbuf(m, n);
1317#endif
1318		n->m_pkthdr.len = m->m_pkthdr.len + optlen;
1319		m->m_len -= sizeof(struct ip);
1320		m->m_data += sizeof(struct ip);
1321		n->m_next = m;
1322		m = n;
1323		m->m_len = optlen + sizeof(struct ip);
1324		m->m_data += max_linkhdr;
1325		bcopy(ip, mtod(m, void *), sizeof(struct ip));
1326	} else {
1327		m->m_data -= optlen;
1328		m->m_len += optlen;
1329		m->m_pkthdr.len += optlen;
1330		bcopy(ip, mtod(m, void *), sizeof(struct ip));
1331	}
1332	ip = mtod(m, struct ip *);
1333	bcopy(p->ipopt_list, ip + 1, optlen);
1334	*phlen = sizeof(struct ip) + optlen;
1335	ip->ip_v = IPVERSION;
1336	ip->ip_hl = *phlen >> 2;
1337	ip->ip_len += optlen;
1338	return (m);
1339}
1340
1341/*
1342 * Copy options from ip to jp,
1343 * omitting those not copied during fragmentation.
1344 */
1345int
1346ip_optcopy(ip, jp)
1347	struct ip *ip, *jp;
1348{
1349	register u_char *cp, *dp;
1350	int opt, optlen, cnt;
1351
1352	cp = (u_char *)(ip + 1);
1353	dp = (u_char *)(jp + 1);
1354	cnt = (ip->ip_hl << 2) - sizeof (struct ip);
1355	for (; cnt > 0; cnt -= optlen, cp += optlen) {
1356		opt = cp[0];
1357		if (opt == IPOPT_EOL)
1358			break;
1359		if (opt == IPOPT_NOP) {
1360			/* Preserve for IP mcast tunnel's LSRR alignment. */
1361			*dp++ = IPOPT_NOP;
1362			optlen = 1;
1363			continue;
1364		}
1365
1366		KASSERT(cnt >= IPOPT_OLEN + sizeof(*cp),
1367		    ("ip_optcopy: malformed ipv4 option"));
1368		optlen = cp[IPOPT_OLEN];
1369		KASSERT(optlen >= IPOPT_OLEN + sizeof(*cp) && optlen <= cnt,
1370		    ("ip_optcopy: malformed ipv4 option"));
1371
1372		/* bogus lengths should have been caught by ip_dooptions */
1373		if (optlen > cnt)
1374			optlen = cnt;
1375		if (IPOPT_COPIED(opt)) {
1376			bcopy(cp, dp, optlen);
1377			dp += optlen;
1378		}
1379	}
1380	for (optlen = dp - (u_char *)(jp+1); optlen & 0x3; optlen++)
1381		*dp++ = IPOPT_EOL;
1382	return (optlen);
1383}
1384
1385/*
1386 * IP socket option processing.
1387 */
1388int
1389ip_ctloutput(so, sopt)
1390	struct socket *so;
1391	struct sockopt *sopt;
1392{
1393	struct	inpcb *inp = sotoinpcb(so);
1394	int	error, optval;
1395
1396	error = optval = 0;
1397	if (sopt->sopt_level != IPPROTO_IP) {
1398		return (EINVAL);
1399	}
1400
1401	switch (sopt->sopt_dir) {
1402	case SOPT_SET:
1403		switch (sopt->sopt_name) {
1404		case IP_OPTIONS:
1405#ifdef notyet
1406		case IP_RETOPTS:
1407#endif
1408		{
1409			struct mbuf *m;
1410			if (sopt->sopt_valsize > MLEN) {
1411				error = EMSGSIZE;
1412				break;
1413			}
1414			MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_HEADER);
1415			if (m == 0) {
1416				error = ENOBUFS;
1417				break;
1418			}
1419			m->m_len = sopt->sopt_valsize;
1420			error = sooptcopyin(sopt, mtod(m, char *), m->m_len,
1421					    m->m_len);
1422
1423			return (ip_pcbopts(sopt->sopt_name, &inp->inp_options,
1424					   m));
1425		}
1426
1427		case IP_TOS:
1428		case IP_TTL:
1429		case IP_RECVOPTS:
1430		case IP_RECVRETOPTS:
1431		case IP_RECVDSTADDR:
1432		case IP_RECVTTL:
1433		case IP_RECVIF:
1434		case IP_FAITH:
1435		case IP_ONESBCAST:
1436			error = sooptcopyin(sopt, &optval, sizeof optval,
1437					    sizeof optval);
1438			if (error)
1439				break;
1440
1441			switch (sopt->sopt_name) {
1442			case IP_TOS:
1443				inp->inp_ip_tos = optval;
1444				break;
1445
1446			case IP_TTL:
1447				inp->inp_ip_ttl = optval;
1448				break;
1449#define	OPTSET(bit) \
1450	if (optval) \
1451		inp->inp_flags |= bit; \
1452	else \
1453		inp->inp_flags &= ~bit;
1454
1455			case IP_RECVOPTS:
1456				OPTSET(INP_RECVOPTS);
1457				break;
1458
1459			case IP_RECVRETOPTS:
1460				OPTSET(INP_RECVRETOPTS);
1461				break;
1462
1463			case IP_RECVDSTADDR:
1464				OPTSET(INP_RECVDSTADDR);
1465				break;
1466
1467			case IP_RECVTTL:
1468				OPTSET(INP_RECVTTL);
1469				break;
1470
1471			case IP_RECVIF:
1472				OPTSET(INP_RECVIF);
1473				break;
1474
1475			case IP_FAITH:
1476				OPTSET(INP_FAITH);
1477				break;
1478
1479			case IP_ONESBCAST:
1480				OPTSET(INP_ONESBCAST);
1481				break;
1482			}
1483			break;
1484#undef OPTSET
1485
1486		case IP_MULTICAST_IF:
1487		case IP_MULTICAST_VIF:
1488		case IP_MULTICAST_TTL:
1489		case IP_MULTICAST_LOOP:
1490		case IP_ADD_MEMBERSHIP:
1491		case IP_DROP_MEMBERSHIP:
1492			error = ip_setmoptions(sopt, &inp->inp_moptions);
1493			break;
1494
1495		case IP_PORTRANGE:
1496			error = sooptcopyin(sopt, &optval, sizeof optval,
1497					    sizeof optval);
1498			if (error)
1499				break;
1500
1501			switch (optval) {
1502			case IP_PORTRANGE_DEFAULT:
1503				inp->inp_flags &= ~(INP_LOWPORT);
1504				inp->inp_flags &= ~(INP_HIGHPORT);
1505				break;
1506
1507			case IP_PORTRANGE_HIGH:
1508				inp->inp_flags &= ~(INP_LOWPORT);
1509				inp->inp_flags |= INP_HIGHPORT;
1510				break;
1511
1512			case IP_PORTRANGE_LOW:
1513				inp->inp_flags &= ~(INP_HIGHPORT);
1514				inp->inp_flags |= INP_LOWPORT;
1515				break;
1516
1517			default:
1518				error = EINVAL;
1519				break;
1520			}
1521			break;
1522
1523#if defined(IPSEC) || defined(FAST_IPSEC)
1524		case IP_IPSEC_POLICY:
1525		{
1526			caddr_t req;
1527			size_t len = 0;
1528			int priv;
1529			struct mbuf *m;
1530			int optname;
1531
1532			if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */
1533				break;
1534			if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */
1535				break;
1536			priv = (sopt->sopt_td != NULL &&
1537				suser(sopt->sopt_td) != 0) ? 0 : 1;
1538			req = mtod(m, caddr_t);
1539			len = m->m_len;
1540			optname = sopt->sopt_name;
1541			error = ipsec4_set_policy(inp, optname, req, len, priv);
1542			m_freem(m);
1543			break;
1544		}
1545#endif /*IPSEC*/
1546
1547		default:
1548			error = ENOPROTOOPT;
1549			break;
1550		}
1551		break;
1552
1553	case SOPT_GET:
1554		switch (sopt->sopt_name) {
1555		case IP_OPTIONS:
1556		case IP_RETOPTS:
1557			if (inp->inp_options)
1558				error = sooptcopyout(sopt,
1559						     mtod(inp->inp_options,
1560							  char *),
1561						     inp->inp_options->m_len);
1562			else
1563				sopt->sopt_valsize = 0;
1564			break;
1565
1566		case IP_TOS:
1567		case IP_TTL:
1568		case IP_RECVOPTS:
1569		case IP_RECVRETOPTS:
1570		case IP_RECVDSTADDR:
1571		case IP_RECVTTL:
1572		case IP_RECVIF:
1573		case IP_PORTRANGE:
1574		case IP_FAITH:
1575		case IP_ONESBCAST:
1576			switch (sopt->sopt_name) {
1577
1578			case IP_TOS:
1579				optval = inp->inp_ip_tos;
1580				break;
1581
1582			case IP_TTL:
1583				optval = inp->inp_ip_ttl;
1584				break;
1585
1586#define	OPTBIT(bit)	(inp->inp_flags & bit ? 1 : 0)
1587
1588			case IP_RECVOPTS:
1589				optval = OPTBIT(INP_RECVOPTS);
1590				break;
1591
1592			case IP_RECVRETOPTS:
1593				optval = OPTBIT(INP_RECVRETOPTS);
1594				break;
1595
1596			case IP_RECVDSTADDR:
1597				optval = OPTBIT(INP_RECVDSTADDR);
1598				break;
1599
1600			case IP_RECVTTL:
1601				optval = OPTBIT(INP_RECVTTL);
1602				break;
1603
1604			case IP_RECVIF:
1605				optval = OPTBIT(INP_RECVIF);
1606				break;
1607
1608			case IP_PORTRANGE:
1609				if (inp->inp_flags & INP_HIGHPORT)
1610					optval = IP_PORTRANGE_HIGH;
1611				else if (inp->inp_flags & INP_LOWPORT)
1612					optval = IP_PORTRANGE_LOW;
1613				else
1614					optval = 0;
1615				break;
1616
1617			case IP_FAITH:
1618				optval = OPTBIT(INP_FAITH);
1619				break;
1620
1621			case IP_ONESBCAST:
1622				optval = OPTBIT(INP_ONESBCAST);
1623				break;
1624			}
1625			error = sooptcopyout(sopt, &optval, sizeof optval);
1626			break;
1627
1628		case IP_MULTICAST_IF:
1629		case IP_MULTICAST_VIF:
1630		case IP_MULTICAST_TTL:
1631		case IP_MULTICAST_LOOP:
1632		case IP_ADD_MEMBERSHIP:
1633		case IP_DROP_MEMBERSHIP:
1634			error = ip_getmoptions(sopt, inp->inp_moptions);
1635			break;
1636
1637#if defined(IPSEC) || defined(FAST_IPSEC)
1638		case IP_IPSEC_POLICY:
1639		{
1640			struct mbuf *m = NULL;
1641			caddr_t req = NULL;
1642			size_t len = 0;
1643
1644			if (m != 0) {
1645				req = mtod(m, caddr_t);
1646				len = m->m_len;
1647			}
1648			error = ipsec4_get_policy(sotoinpcb(so), req, len, &m);
1649			if (error == 0)
1650				error = soopt_mcopyout(sopt, m); /* XXX */
1651			if (error == 0)
1652				m_freem(m);
1653			break;
1654		}
1655#endif /*IPSEC*/
1656
1657		default:
1658			error = ENOPROTOOPT;
1659			break;
1660		}
1661		break;
1662	}
1663	return (error);
1664}
1665
1666/*
1667 * Set up IP options in pcb for insertion in output packets.
1668 * Store in mbuf with pointer in pcbopt, adding pseudo-option
1669 * with destination address if source routed.
1670 */
1671static int
1672ip_pcbopts(optname, pcbopt, m)
1673	int optname;
1674	struct mbuf **pcbopt;
1675	register struct mbuf *m;
1676{
1677	register int cnt, optlen;
1678	register u_char *cp;
1679	u_char opt;
1680
1681	/* turn off any old options */
1682	if (*pcbopt)
1683		(void)m_free(*pcbopt);
1684	*pcbopt = 0;
1685	if (m == (struct mbuf *)0 || m->m_len == 0) {
1686		/*
1687		 * Only turning off any previous options.
1688		 */
1689		if (m)
1690			(void)m_free(m);
1691		return (0);
1692	}
1693
1694	if (m->m_len % sizeof(int32_t))
1695		goto bad;
1696	/*
1697	 * IP first-hop destination address will be stored before
1698	 * actual options; move other options back
1699	 * and clear it when none present.
1700	 */
1701	if (m->m_data + m->m_len + sizeof(struct in_addr) >= &m->m_dat[MLEN])
1702		goto bad;
1703	cnt = m->m_len;
1704	m->m_len += sizeof(struct in_addr);
1705	cp = mtod(m, u_char *) + sizeof(struct in_addr);
1706	bcopy(mtod(m, void *), cp, (unsigned)cnt);
1707	bzero(mtod(m, void *), sizeof(struct in_addr));
1708
1709	for (; cnt > 0; cnt -= optlen, cp += optlen) {
1710		opt = cp[IPOPT_OPTVAL];
1711		if (opt == IPOPT_EOL)
1712			break;
1713		if (opt == IPOPT_NOP)
1714			optlen = 1;
1715		else {
1716			if (cnt < IPOPT_OLEN + sizeof(*cp))
1717				goto bad;
1718			optlen = cp[IPOPT_OLEN];
1719			if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt)
1720				goto bad;
1721		}
1722		switch (opt) {
1723
1724		default:
1725			break;
1726
1727		case IPOPT_LSRR:
1728		case IPOPT_SSRR:
1729			/*
1730			 * user process specifies route as:
1731			 *	->A->B->C->D
1732			 * D must be our final destination (but we can't
1733			 * check that since we may not have connected yet).
1734			 * A is first hop destination, which doesn't appear in
1735			 * actual IP option, but is stored before the options.
1736			 */
1737			if (optlen < IPOPT_MINOFF - 1 + sizeof(struct in_addr))
1738				goto bad;
1739			m->m_len -= sizeof(struct in_addr);
1740			cnt -= sizeof(struct in_addr);
1741			optlen -= sizeof(struct in_addr);
1742			cp[IPOPT_OLEN] = optlen;
1743			/*
1744			 * Move first hop before start of options.
1745			 */
1746			bcopy((caddr_t)&cp[IPOPT_OFFSET+1], mtod(m, caddr_t),
1747			    sizeof(struct in_addr));
1748			/*
1749			 * Then copy rest of options back
1750			 * to close up the deleted entry.
1751			 */
1752			bcopy((&cp[IPOPT_OFFSET+1] + sizeof(struct in_addr)),
1753			    &cp[IPOPT_OFFSET+1],
1754			    (unsigned)cnt + sizeof(struct in_addr));
1755			break;
1756		}
1757	}
1758	if (m->m_len > MAX_IPOPTLEN + sizeof(struct in_addr))
1759		goto bad;
1760	*pcbopt = m;
1761	return (0);
1762
1763bad:
1764	(void)m_free(m);
1765	return (EINVAL);
1766}
1767
1768/*
1769 * XXX
1770 * The whole multicast option thing needs to be re-thought.
1771 * Several of these options are equally applicable to non-multicast
1772 * transmission, and one (IP_MULTICAST_TTL) totally duplicates a
1773 * standard option (IP_TTL).
1774 */
1775
1776/*
1777 * following RFC1724 section 3.3, 0.0.0.0/8 is interpreted as interface index.
1778 */
1779static struct ifnet *
1780ip_multicast_if(a, ifindexp)
1781	struct in_addr *a;
1782	int *ifindexp;
1783{
1784	int ifindex;
1785	struct ifnet *ifp;
1786
1787	if (ifindexp)
1788		*ifindexp = 0;
1789	if (ntohl(a->s_addr) >> 24 == 0) {
1790		ifindex = ntohl(a->s_addr) & 0xffffff;
1791		if (ifindex < 0 || if_index < ifindex)
1792			return NULL;
1793		ifp = ifnet_byindex(ifindex);
1794		if (ifindexp)
1795			*ifindexp = ifindex;
1796	} else {
1797		INADDR_TO_IFP(*a, ifp);
1798	}
1799	return ifp;
1800}
1801
1802/*
1803 * Set the IP multicast options in response to user setsockopt().
1804 */
1805static int
1806ip_setmoptions(sopt, imop)
1807	struct sockopt *sopt;
1808	struct ip_moptions **imop;
1809{
1810	int error = 0;
1811	int i;
1812	struct in_addr addr;
1813	struct ip_mreq mreq;
1814	struct ifnet *ifp;
1815	struct ip_moptions *imo = *imop;
1816	struct route ro;
1817	struct sockaddr_in *dst;
1818	int ifindex;
1819	int s;
1820
1821	if (imo == NULL) {
1822		/*
1823		 * No multicast option buffer attached to the pcb;
1824		 * allocate one and initialize to default values.
1825		 */
1826		imo = (struct ip_moptions*)malloc(sizeof(*imo), M_IPMOPTS,
1827		    M_WAITOK);
1828
1829		if (imo == NULL)
1830			return (ENOBUFS);
1831		*imop = imo;
1832		imo->imo_multicast_ifp = NULL;
1833		imo->imo_multicast_addr.s_addr = INADDR_ANY;
1834		imo->imo_multicast_vif = -1;
1835		imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
1836		imo->imo_multicast_loop = IP_DEFAULT_MULTICAST_LOOP;
1837		imo->imo_num_memberships = 0;
1838	}
1839
1840	switch (sopt->sopt_name) {
1841	/* store an index number for the vif you wanna use in the send */
1842	case IP_MULTICAST_VIF:
1843		if (legal_vif_num == 0) {
1844			error = EOPNOTSUPP;
1845			break;
1846		}
1847		error = sooptcopyin(sopt, &i, sizeof i, sizeof i);
1848		if (error)
1849			break;
1850		if (!legal_vif_num(i) && (i != -1)) {
1851			error = EINVAL;
1852			break;
1853		}
1854		imo->imo_multicast_vif = i;
1855		break;
1856
1857	case IP_MULTICAST_IF:
1858		/*
1859		 * Select the interface for outgoing multicast packets.
1860		 */
1861		error = sooptcopyin(sopt, &addr, sizeof addr, sizeof addr);
1862		if (error)
1863			break;
1864		/*
1865		 * INADDR_ANY is used to remove a previous selection.
1866		 * When no interface is selected, a default one is
1867		 * chosen every time a multicast packet is sent.
1868		 */
1869		if (addr.s_addr == INADDR_ANY) {
1870			imo->imo_multicast_ifp = NULL;
1871			break;
1872		}
1873		/*
1874		 * The selected interface is identified by its local
1875		 * IP address.  Find the interface and confirm that
1876		 * it supports multicasting.
1877		 */
1878		s = splimp();
1879		ifp = ip_multicast_if(&addr, &ifindex);
1880		if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) {
1881			splx(s);
1882			error = EADDRNOTAVAIL;
1883			break;
1884		}
1885		imo->imo_multicast_ifp = ifp;
1886		if (ifindex)
1887			imo->imo_multicast_addr = addr;
1888		else
1889			imo->imo_multicast_addr.s_addr = INADDR_ANY;
1890		splx(s);
1891		break;
1892
1893	case IP_MULTICAST_TTL:
1894		/*
1895		 * Set the IP time-to-live for outgoing multicast packets.
1896		 * The original multicast API required a char argument,
1897		 * which is inconsistent with the rest of the socket API.
1898		 * We allow either a char or an int.
1899		 */
1900		if (sopt->sopt_valsize == 1) {
1901			u_char ttl;
1902			error = sooptcopyin(sopt, &ttl, 1, 1);
1903			if (error)
1904				break;
1905			imo->imo_multicast_ttl = ttl;
1906		} else {
1907			u_int ttl;
1908			error = sooptcopyin(sopt, &ttl, sizeof ttl,
1909					    sizeof ttl);
1910			if (error)
1911				break;
1912			if (ttl > 255)
1913				error = EINVAL;
1914			else
1915				imo->imo_multicast_ttl = ttl;
1916		}
1917		break;
1918
1919	case IP_MULTICAST_LOOP:
1920		/*
1921		 * Set the loopback flag for outgoing multicast packets.
1922		 * Must be zero or one.  The original multicast API required a
1923		 * char argument, which is inconsistent with the rest
1924		 * of the socket API.  We allow either a char or an int.
1925		 */
1926		if (sopt->sopt_valsize == 1) {
1927			u_char loop;
1928			error = sooptcopyin(sopt, &loop, 1, 1);
1929			if (error)
1930				break;
1931			imo->imo_multicast_loop = !!loop;
1932		} else {
1933			u_int loop;
1934			error = sooptcopyin(sopt, &loop, sizeof loop,
1935					    sizeof loop);
1936			if (error)
1937				break;
1938			imo->imo_multicast_loop = !!loop;
1939		}
1940		break;
1941
1942	case IP_ADD_MEMBERSHIP:
1943		/*
1944		 * Add a multicast group membership.
1945		 * Group must be a valid IP multicast address.
1946		 */
1947		error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq);
1948		if (error)
1949			break;
1950
1951		if (!IN_MULTICAST(ntohl(mreq.imr_multiaddr.s_addr))) {
1952			error = EINVAL;
1953			break;
1954		}
1955		s = splimp();
1956		/*
1957		 * If no interface address was provided, use the interface of
1958		 * the route to the given multicast address.
1959		 */
1960		if (mreq.imr_interface.s_addr == INADDR_ANY) {
1961			bzero((caddr_t)&ro, sizeof(ro));
1962			dst = (struct sockaddr_in *)&ro.ro_dst;
1963			dst->sin_len = sizeof(*dst);
1964			dst->sin_family = AF_INET;
1965			dst->sin_addr = mreq.imr_multiaddr;
1966			rtalloc_ign(&ro, RTF_CLONING);
1967			if (ro.ro_rt == NULL) {
1968				error = EADDRNOTAVAIL;
1969				splx(s);
1970				break;
1971			}
1972			ifp = ro.ro_rt->rt_ifp;
1973			RTFREE(ro.ro_rt);
1974		}
1975		else {
1976			ifp = ip_multicast_if(&mreq.imr_interface, NULL);
1977		}
1978
1979		/*
1980		 * See if we found an interface, and confirm that it
1981		 * supports multicast.
1982		 */
1983		if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) {
1984			error = EADDRNOTAVAIL;
1985			splx(s);
1986			break;
1987		}
1988		/*
1989		 * See if the membership already exists or if all the
1990		 * membership slots are full.
1991		 */
1992		for (i = 0; i < imo->imo_num_memberships; ++i) {
1993			if (imo->imo_membership[i]->inm_ifp == ifp &&
1994			    imo->imo_membership[i]->inm_addr.s_addr
1995						== mreq.imr_multiaddr.s_addr)
1996				break;
1997		}
1998		if (i < imo->imo_num_memberships) {
1999			error = EADDRINUSE;
2000			splx(s);
2001			break;
2002		}
2003		if (i == IP_MAX_MEMBERSHIPS) {
2004			error = ETOOMANYREFS;
2005			splx(s);
2006			break;
2007		}
2008		/*
2009		 * Everything looks good; add a new record to the multicast
2010		 * address list for the given interface.
2011		 */
2012		if ((imo->imo_membership[i] =
2013		    in_addmulti(&mreq.imr_multiaddr, ifp)) == NULL) {
2014			error = ENOBUFS;
2015			splx(s);
2016			break;
2017		}
2018		++imo->imo_num_memberships;
2019		splx(s);
2020		break;
2021
2022	case IP_DROP_MEMBERSHIP:
2023		/*
2024		 * Drop a multicast group membership.
2025		 * Group must be a valid IP multicast address.
2026		 */
2027		error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq);
2028		if (error)
2029			break;
2030
2031		if (!IN_MULTICAST(ntohl(mreq.imr_multiaddr.s_addr))) {
2032			error = EINVAL;
2033			break;
2034		}
2035
2036		s = splimp();
2037		/*
2038		 * If an interface address was specified, get a pointer
2039		 * to its ifnet structure.
2040		 */
2041		if (mreq.imr_interface.s_addr == INADDR_ANY)
2042			ifp = NULL;
2043		else {
2044			ifp = ip_multicast_if(&mreq.imr_interface, NULL);
2045			if (ifp == NULL) {
2046				error = EADDRNOTAVAIL;
2047				splx(s);
2048				break;
2049			}
2050		}
2051		/*
2052		 * Find the membership in the membership array.
2053		 */
2054		for (i = 0; i < imo->imo_num_memberships; ++i) {
2055			if ((ifp == NULL ||
2056			     imo->imo_membership[i]->inm_ifp == ifp) &&
2057			     imo->imo_membership[i]->inm_addr.s_addr ==
2058			     mreq.imr_multiaddr.s_addr)
2059				break;
2060		}
2061		if (i == imo->imo_num_memberships) {
2062			error = EADDRNOTAVAIL;
2063			splx(s);
2064			break;
2065		}
2066		/*
2067		 * Give up the multicast address record to which the
2068		 * membership points.
2069		 */
2070		in_delmulti(imo->imo_membership[i]);
2071		/*
2072		 * Remove the gap in the membership array.
2073		 */
2074		for (++i; i < imo->imo_num_memberships; ++i)
2075			imo->imo_membership[i-1] = imo->imo_membership[i];
2076		--imo->imo_num_memberships;
2077		splx(s);
2078		break;
2079
2080	default:
2081		error = EOPNOTSUPP;
2082		break;
2083	}
2084
2085	/*
2086	 * If all options have default values, no need to keep the mbuf.
2087	 */
2088	if (imo->imo_multicast_ifp == NULL &&
2089	    imo->imo_multicast_vif == -1 &&
2090	    imo->imo_multicast_ttl == IP_DEFAULT_MULTICAST_TTL &&
2091	    imo->imo_multicast_loop == IP_DEFAULT_MULTICAST_LOOP &&
2092	    imo->imo_num_memberships == 0) {
2093		free(*imop, M_IPMOPTS);
2094		*imop = NULL;
2095	}
2096
2097	return (error);
2098}
2099
2100/*
2101 * Return the IP multicast options in response to user getsockopt().
2102 */
2103static int
2104ip_getmoptions(sopt, imo)
2105	struct sockopt *sopt;
2106	register struct ip_moptions *imo;
2107{
2108	struct in_addr addr;
2109	struct in_ifaddr *ia;
2110	int error, optval;
2111	u_char coptval;
2112
2113	error = 0;
2114	switch (sopt->sopt_name) {
2115	case IP_MULTICAST_VIF:
2116		if (imo != NULL)
2117			optval = imo->imo_multicast_vif;
2118		else
2119			optval = -1;
2120		error = sooptcopyout(sopt, &optval, sizeof optval);
2121		break;
2122
2123	case IP_MULTICAST_IF:
2124		if (imo == NULL || imo->imo_multicast_ifp == NULL)
2125			addr.s_addr = INADDR_ANY;
2126		else if (imo->imo_multicast_addr.s_addr) {
2127			/* return the value user has set */
2128			addr = imo->imo_multicast_addr;
2129		} else {
2130			IFP_TO_IA(imo->imo_multicast_ifp, ia);
2131			addr.s_addr = (ia == NULL) ? INADDR_ANY
2132				: IA_SIN(ia)->sin_addr.s_addr;
2133		}
2134		error = sooptcopyout(sopt, &addr, sizeof addr);
2135		break;
2136
2137	case IP_MULTICAST_TTL:
2138		if (imo == 0)
2139			optval = coptval = IP_DEFAULT_MULTICAST_TTL;
2140		else
2141			optval = coptval = imo->imo_multicast_ttl;
2142		if (sopt->sopt_valsize == 1)
2143			error = sooptcopyout(sopt, &coptval, 1);
2144		else
2145			error = sooptcopyout(sopt, &optval, sizeof optval);
2146		break;
2147
2148	case IP_MULTICAST_LOOP:
2149		if (imo == 0)
2150			optval = coptval = IP_DEFAULT_MULTICAST_LOOP;
2151		else
2152			optval = coptval = imo->imo_multicast_loop;
2153		if (sopt->sopt_valsize == 1)
2154			error = sooptcopyout(sopt, &coptval, 1);
2155		else
2156			error = sooptcopyout(sopt, &optval, sizeof optval);
2157		break;
2158
2159	default:
2160		error = ENOPROTOOPT;
2161		break;
2162	}
2163	return (error);
2164}
2165
2166/*
2167 * Discard the IP multicast options.
2168 */
2169void
2170ip_freemoptions(imo)
2171	register struct ip_moptions *imo;
2172{
2173	register int i;
2174
2175	if (imo != NULL) {
2176		for (i = 0; i < imo->imo_num_memberships; ++i)
2177			in_delmulti(imo->imo_membership[i]);
2178		free(imo, M_IPMOPTS);
2179	}
2180}
2181
2182/*
2183 * Routine called from ip_output() to loop back a copy of an IP multicast
2184 * packet to the input queue of a specified interface.  Note that this
2185 * calls the output routine of the loopback "driver", but with an interface
2186 * pointer that might NOT be a loopback interface -- evil, but easier than
2187 * replicating that code here.
2188 */
2189static void
2190ip_mloopback(ifp, m, dst, hlen)
2191	struct ifnet *ifp;
2192	register struct mbuf *m;
2193	register struct sockaddr_in *dst;
2194	int hlen;
2195{
2196	register struct ip *ip;
2197	struct mbuf *copym;
2198
2199	copym = m_copy(m, 0, M_COPYALL);
2200	if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < hlen))
2201		copym = m_pullup(copym, hlen);
2202	if (copym != NULL) {
2203		/*
2204		 * We don't bother to fragment if the IP length is greater
2205		 * than the interface's MTU.  Can this possibly matter?
2206		 */
2207		ip = mtod(copym, struct ip *);
2208		ip->ip_len = htons(ip->ip_len);
2209		ip->ip_off = htons(ip->ip_off);
2210		ip->ip_sum = 0;
2211		ip->ip_sum = in_cksum(copym, hlen);
2212		/*
2213		 * NB:
2214		 * It's not clear whether there are any lingering
2215		 * reentrancy problems in other areas which might
2216		 * be exposed by using ip_input directly (in
2217		 * particular, everything which modifies the packet
2218		 * in-place).  Yet another option is using the
2219		 * protosw directly to deliver the looped back
2220		 * packet.  For the moment, we'll err on the side
2221		 * of safety by using if_simloop().
2222		 */
2223#if 1 /* XXX */
2224		if (dst->sin_family != AF_INET) {
2225			printf("ip_mloopback: bad address family %d\n",
2226						dst->sin_family);
2227			dst->sin_family = AF_INET;
2228		}
2229#endif
2230
2231#ifdef notdef
2232		copym->m_pkthdr.rcvif = ifp;
2233		ip_input(copym);
2234#else
2235		/* if the checksum hasn't been computed, mark it as valid */
2236		if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
2237			copym->m_pkthdr.csum_flags |=
2238			    CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
2239			copym->m_pkthdr.csum_data = 0xffff;
2240		}
2241		if_simloop(ifp, copym, dst->sin_family, 0);
2242#endif
2243	}
2244}
2245