ip_output.c revision 154520
/*-
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ip_output.c	8.3 (Berkeley) 1/21/94
 * $FreeBSD: head/sys/netinet/ip_output.c 154520 2006-01-18 15:05:05Z andre $
 */
32
33#include "opt_ipfw.h"
34#include "opt_ipsec.h"
35#include "opt_mac.h"
36#include "opt_mbuf_stress_test.h"
37
38#include <sys/param.h>
39#include <sys/systm.h>
40#include <sys/kernel.h>
41#include <sys/mac.h>
42#include <sys/malloc.h>
43#include <sys/mbuf.h>
44#include <sys/protosw.h>
45#include <sys/socket.h>
46#include <sys/socketvar.h>
47#include <sys/sysctl.h>
48
49#include <net/if.h>
50#include <net/netisr.h>
51#include <net/pfil.h>
52#include <net/route.h>
53
54#include <netinet/in.h>
55#include <netinet/in_systm.h>
56#include <netinet/ip.h>
57#include <netinet/in_pcb.h>
58#include <netinet/in_var.h>
59#include <netinet/ip_var.h>
60#include <netinet/ip_options.h>
61
62#include <machine/in_cksum.h>
63
64static MALLOC_DEFINE(M_IPMOPTS, "ip_moptions", "internet multicast options");
65
66#ifdef IPSEC
67#include <netinet6/ipsec.h>
68#include <netkey/key.h>
69#ifdef IPSEC_DEBUG
70#include <netkey/key_debug.h>
71#else
72#define	KEYDEBUG(lev,arg)
73#endif
74#endif /*IPSEC*/
75
76#ifdef FAST_IPSEC
77#include <netipsec/ipsec.h>
78#include <netipsec/xform.h>
79#include <netipsec/key.h>
80#endif /*FAST_IPSEC*/
81
/*
 * Debug helper: print string x, the IPv4 address in struct in_addr a
 * (stored in network byte order) as a dotted quad, then string y.
 *
 * The arguments are parenthesized so that expressions such as *p can be
 * passed for a.  NB: the expansion ends in a semicolon; this is kept for
 * compatibility with existing users, so do not use the macro as the
 * unbraced body of an if/else.
 */
#define print_ip(x, a, y)	 printf("%s %d.%d.%d.%d%s",\
				(x), (ntohl((a).s_addr)>>24)&0xFF,\
				  (ntohl((a).s_addr)>>16)&0xFF,\
				  (ntohl((a).s_addr)>>8)&0xFF,\
				  (ntohl((a).s_addr))&0xFF, (y));
87
88u_short ip_id;
89
#ifdef MBUF_STRESS_TEST
/*
 * Stress-test knob (net.inet.ip.mbuf_frag_size): when non-zero, ip_output()
 * passes outgoing packets longer than this through m_fragment() with this
 * size before handing them to the interface.  0 disables the feature.
 */
int mbuf_frag_size = 0;
SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW,
	&mbuf_frag_size, 0, "Fragment outgoing mbufs to this size");
#endif
95
96static struct ifnet *ip_multicast_if(struct in_addr *, int *);
97static void	ip_mloopback
98	(struct ifnet *, struct mbuf *, struct sockaddr_in *, int);
99static int	ip_getmoptions(struct inpcb *, struct sockopt *);
100static int	ip_setmoptions(struct inpcb *, struct sockopt *);
101
102
103extern	struct protosw inetsw[];
104
105/*
106 * IP output.  The packet in mbuf chain m contains a skeletal IP
107 * header (with len, off, ttl, proto, tos, src, dst).
108 * The mbuf chain containing the packet will be freed.
109 * The mbuf opt, if present, will not be freed.
110 * In the IP forwarding case, the packet will arrive with options already
111 * inserted, so must have a NULL opt pointer.
112 */
113int
114ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro,
115	int flags, struct ip_moptions *imo, struct inpcb *inp)
116{
117	struct ip *ip;
118	struct ifnet *ifp = NULL;	/* keep compiler happy */
119	struct mbuf *m0;
120	int hlen = sizeof (struct ip);
121	int len, error = 0;
122	struct sockaddr_in *dst = NULL;	/* keep compiler happy */
123	struct in_ifaddr *ia = NULL;
124	int isbroadcast, sw_csum;
125	struct route iproute;
126	struct in_addr odst;
127#ifdef IPFIREWALL_FORWARD
128	struct m_tag *fwd_tag = NULL;
129#endif
130#ifdef IPSEC
131	struct secpolicy *sp = NULL;
132#endif
133#ifdef FAST_IPSEC
134	struct secpolicy *sp = NULL;
135	struct tdb_ident *tdbi;
136	struct m_tag *mtag;
137	int s;
138#endif /* FAST_IPSEC */
139
140	M_ASSERTPKTHDR(m);
141
142	if (ro == NULL) {
143		ro = &iproute;
144		bzero(ro, sizeof (*ro));
145	}
146
147	if (inp != NULL)
148		INP_LOCK_ASSERT(inp);
149
150	if (opt) {
151		len = 0;
152		m = ip_insertoptions(m, opt, &len);
153		if (len != 0)
154			hlen = len;
155	}
156	ip = mtod(m, struct ip *);
157
158	/*
159	 * Fill in IP header.  If we are not allowing fragmentation,
160	 * then the ip_id field is meaningless, but we don't set it
161	 * to zero.  Doing so causes various problems when devices along
162	 * the path (routers, load balancers, firewalls, etc.) illegally
163	 * disable DF on our packet.  Note that a 16-bit counter
164	 * will wrap around in less than 10 seconds at 100 Mbit/s on a
165	 * medium with MTU 1500.  See Steven M. Bellovin, "A Technique
166	 * for Counting NATted Hosts", Proc. IMW'02, available at
167	 * <http://www.research.att.com/~smb/papers/fnat.pdf>.
168	 */
169	if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
170		ip->ip_v = IPVERSION;
171		ip->ip_hl = hlen >> 2;
172		ip->ip_id = ip_newid();
173		ipstat.ips_localout++;
174	} else {
175		hlen = ip->ip_hl << 2;
176	}
177
178	dst = (struct sockaddr_in *)&ro->ro_dst;
179again:
180	/*
181	 * If there is a cached route,
182	 * check that it is to the same destination
183	 * and is still up.  If not, free it and try again.
184	 * The address family should also be checked in case of sharing the
185	 * cache with IPv6.
186	 */
187	if (ro->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 ||
188			  dst->sin_family != AF_INET ||
189			  dst->sin_addr.s_addr != ip->ip_dst.s_addr)) {
190		RTFREE(ro->ro_rt);
191		ro->ro_rt = (struct rtentry *)0;
192	}
193#ifdef IPFIREWALL_FORWARD
194	if (ro->ro_rt == NULL && fwd_tag == NULL) {
195#else
196	if (ro->ro_rt == NULL) {
197#endif
198		bzero(dst, sizeof(*dst));
199		dst->sin_family = AF_INET;
200		dst->sin_len = sizeof(*dst);
201		dst->sin_addr = ip->ip_dst;
202	}
203	/*
204	 * If routing to interface only,
205	 * short circuit routing lookup.
206	 */
207	if (flags & IP_ROUTETOIF) {
208		if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL &&
209		    (ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == NULL) {
210			ipstat.ips_noroute++;
211			error = ENETUNREACH;
212			goto bad;
213		}
214		ifp = ia->ia_ifp;
215		ip->ip_ttl = 1;
216		isbroadcast = in_broadcast(dst->sin_addr, ifp);
217	} else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) &&
218	    imo != NULL && imo->imo_multicast_ifp != NULL) {
219		/*
220		 * Bypass the normal routing lookup for multicast
221		 * packets if the interface is specified.
222		 */
223		ifp = imo->imo_multicast_ifp;
224		IFP_TO_IA(ifp, ia);
225		isbroadcast = 0;	/* fool gcc */
226	} else {
227		/*
228		 * We want to do any cloning requested by the link layer,
229		 * as this is probably required in all cases for correct
230		 * operation (as it is for ARP).
231		 */
232		if (ro->ro_rt == NULL)
233			rtalloc_ign(ro, 0);
234		if (ro->ro_rt == NULL) {
235			ipstat.ips_noroute++;
236			error = EHOSTUNREACH;
237			goto bad;
238		}
239		ia = ifatoia(ro->ro_rt->rt_ifa);
240		ifp = ro->ro_rt->rt_ifp;
241		ro->ro_rt->rt_rmx.rmx_pksent++;
242		if (ro->ro_rt->rt_flags & RTF_GATEWAY)
243			dst = (struct sockaddr_in *)ro->ro_rt->rt_gateway;
244		if (ro->ro_rt->rt_flags & RTF_HOST)
245			isbroadcast = (ro->ro_rt->rt_flags & RTF_BROADCAST);
246		else
247			isbroadcast = in_broadcast(dst->sin_addr, ifp);
248	}
249	if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
250		struct in_multi *inm;
251
252		m->m_flags |= M_MCAST;
253		/*
254		 * IP destination address is multicast.  Make sure "dst"
255		 * still points to the address in "ro".  (It may have been
256		 * changed to point to a gateway address, above.)
257		 */
258		dst = (struct sockaddr_in *)&ro->ro_dst;
259		/*
260		 * See if the caller provided any multicast options
261		 */
262		if (imo != NULL) {
263			ip->ip_ttl = imo->imo_multicast_ttl;
264			if (imo->imo_multicast_vif != -1)
265				ip->ip_src.s_addr =
266				    ip_mcast_src ?
267				    ip_mcast_src(imo->imo_multicast_vif) :
268				    INADDR_ANY;
269		} else
270			ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL;
271		/*
272		 * Confirm that the outgoing interface supports multicast.
273		 */
274		if ((imo == NULL) || (imo->imo_multicast_vif == -1)) {
275			if ((ifp->if_flags & IFF_MULTICAST) == 0) {
276				ipstat.ips_noroute++;
277				error = ENETUNREACH;
278				goto bad;
279			}
280		}
281		/*
282		 * If source address not specified yet, use address
283		 * of outgoing interface.
284		 */
285		if (ip->ip_src.s_addr == INADDR_ANY) {
286			/* Interface may have no addresses. */
287			if (ia != NULL)
288				ip->ip_src = IA_SIN(ia)->sin_addr;
289		}
290
291		IN_MULTI_LOCK();
292		IN_LOOKUP_MULTI(ip->ip_dst, ifp, inm);
293		if (inm != NULL &&
294		   (imo == NULL || imo->imo_multicast_loop)) {
295			IN_MULTI_UNLOCK();
296			/*
297			 * If we belong to the destination multicast group
298			 * on the outgoing interface, and the caller did not
299			 * forbid loopback, loop back a copy.
300			 */
301			ip_mloopback(ifp, m, dst, hlen);
302		}
303		else {
304			IN_MULTI_UNLOCK();
305			/*
306			 * If we are acting as a multicast router, perform
307			 * multicast forwarding as if the packet had just
308			 * arrived on the interface to which we are about
309			 * to send.  The multicast forwarding function
310			 * recursively calls this function, using the
311			 * IP_FORWARDING flag to prevent infinite recursion.
312			 *
313			 * Multicasts that are looped back by ip_mloopback(),
314			 * above, will be forwarded by the ip_input() routine,
315			 * if necessary.
316			 */
317			if (ip_mrouter && (flags & IP_FORWARDING) == 0) {
318				/*
319				 * If rsvp daemon is not running, do not
320				 * set ip_moptions. This ensures that the packet
321				 * is multicast and not just sent down one link
322				 * as prescribed by rsvpd.
323				 */
324				if (!rsvp_on)
325					imo = NULL;
326				if (ip_mforward &&
327				    ip_mforward(ip, ifp, m, imo) != 0) {
328					m_freem(m);
329					goto done;
330				}
331			}
332		}
333
334		/*
335		 * Multicasts with a time-to-live of zero may be looped-
336		 * back, above, but must not be transmitted on a network.
337		 * Also, multicasts addressed to the loopback interface
338		 * are not sent -- the above call to ip_mloopback() will
339		 * loop back a copy if this host actually belongs to the
340		 * destination group on the loopback interface.
341		 */
342		if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) {
343			m_freem(m);
344			goto done;
345		}
346
347		goto sendit;
348	}
349#ifndef notdef
350	/*
351	 * If the source address is not specified yet, use the address
352	 * of the outoing interface.
353	 */
354	if (ip->ip_src.s_addr == INADDR_ANY) {
355		/* Interface may have no addresses. */
356		if (ia != NULL) {
357			ip->ip_src = IA_SIN(ia)->sin_addr;
358		}
359	}
360#endif /* notdef */
361	/*
362	 * Verify that we have any chance at all of being able to queue the
363	 * packet or packet fragments, unless ALTQ is enabled on the given
364	 * interface in which case packetdrop should be done by queueing.
365	 */
366#ifdef ALTQ
367	if ((!ALTQ_IS_ENABLED(&ifp->if_snd)) &&
368	    ((ifp->if_snd.ifq_len + ip->ip_len / ifp->if_mtu + 1) >=
369	    ifp->if_snd.ifq_maxlen))
370#else
371	if ((ifp->if_snd.ifq_len + ip->ip_len / ifp->if_mtu + 1) >=
372	    ifp->if_snd.ifq_maxlen)
373#endif /* ALTQ */
374	{
375		error = ENOBUFS;
376		ipstat.ips_odropped++;
377		ifp->if_snd.ifq_drops += (ip->ip_len / ifp->if_mtu + 1);
378		goto bad;
379	}
380
381	/*
382	 * Look for broadcast address and
383	 * verify user is allowed to send
384	 * such a packet.
385	 */
386	if (isbroadcast) {
387		if ((ifp->if_flags & IFF_BROADCAST) == 0) {
388			error = EADDRNOTAVAIL;
389			goto bad;
390		}
391		if ((flags & IP_ALLOWBROADCAST) == 0) {
392			error = EACCES;
393			goto bad;
394		}
395		/* don't allow broadcast messages to be fragmented */
396		if (ip->ip_len > ifp->if_mtu) {
397			error = EMSGSIZE;
398			goto bad;
399		}
400		if (flags & IP_SENDONES)
401			ip->ip_dst.s_addr = INADDR_BROADCAST;
402		m->m_flags |= M_BCAST;
403	} else {
404		m->m_flags &= ~M_BCAST;
405	}
406
407sendit:
408#ifdef IPSEC
409	/* get SP for this packet */
410	if (inp == NULL)
411		sp = ipsec4_getpolicybyaddr(m, IPSEC_DIR_OUTBOUND,
412		    flags, &error);
413	else
414		sp = ipsec4_getpolicybypcb(m, IPSEC_DIR_OUTBOUND, inp, &error);
415
416	if (sp == NULL) {
417		ipsecstat.out_inval++;
418		goto bad;
419	}
420
421	error = 0;
422
423	/* check policy */
424	switch (sp->policy) {
425	case IPSEC_POLICY_DISCARD:
426		/*
427		 * This packet is just discarded.
428		 */
429		ipsecstat.out_polvio++;
430		goto bad;
431
432	case IPSEC_POLICY_BYPASS:
433	case IPSEC_POLICY_NONE:
434	case IPSEC_POLICY_TCP:
435		/* no need to do IPsec. */
436		goto skip_ipsec;
437
438	case IPSEC_POLICY_IPSEC:
439		if (sp->req == NULL) {
440			/* acquire a policy */
441			error = key_spdacquire(sp);
442			goto bad;
443		}
444		break;
445
446	case IPSEC_POLICY_ENTRUST:
447	default:
448		printf("ip_output: Invalid policy found. %d\n", sp->policy);
449	}
450    {
451	struct ipsec_output_state state;
452	bzero(&state, sizeof(state));
453	state.m = m;
454	if (flags & IP_ROUTETOIF) {
455		state.ro = &iproute;
456		bzero(&iproute, sizeof(iproute));
457	} else
458		state.ro = ro;
459	state.dst = (struct sockaddr *)dst;
460
461	ip->ip_sum = 0;
462
463	/*
464	 * XXX
465	 * delayed checksums are not currently compatible with IPsec
466	 */
467	if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
468		in_delayed_cksum(m);
469		m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
470	}
471
472	ip->ip_len = htons(ip->ip_len);
473	ip->ip_off = htons(ip->ip_off);
474
475	error = ipsec4_output(&state, sp, flags);
476
477	m = state.m;
478	if (flags & IP_ROUTETOIF) {
479		/*
480		 * if we have tunnel mode SA, we may need to ignore
481		 * IP_ROUTETOIF.
482		 */
483		if (state.ro != &iproute || state.ro->ro_rt != NULL) {
484			flags &= ~IP_ROUTETOIF;
485			ro = state.ro;
486		}
487	} else
488		ro = state.ro;
489	dst = (struct sockaddr_in *)state.dst;
490	if (error) {
491		/* mbuf is already reclaimed in ipsec4_output. */
492		m = NULL;
493		switch (error) {
494		case EHOSTUNREACH:
495		case ENETUNREACH:
496		case EMSGSIZE:
497		case ENOBUFS:
498		case ENOMEM:
499			break;
500		default:
501			printf("ip4_output (ipsec): error code %d\n", error);
502			/*fall through*/
503		case ENOENT:
504			/* don't show these error codes to the user */
505			error = 0;
506			break;
507		}
508		goto bad;
509	}
510
511	/* be sure to update variables that are affected by ipsec4_output() */
512	ip = mtod(m, struct ip *);
513	hlen = ip->ip_hl << 2;
514	if (ro->ro_rt == NULL) {
515		if ((flags & IP_ROUTETOIF) == 0) {
516			printf("ip_output: "
517				"can't update route after IPsec processing\n");
518			error = EHOSTUNREACH;	/*XXX*/
519			goto bad;
520		}
521	} else {
522		if (state.encap) {
523			ia = ifatoia(ro->ro_rt->rt_ifa);
524			ifp = ro->ro_rt->rt_ifp;
525		}
526	}
527    }
528
529	/* make it flipped, again. */
530	ip->ip_len = ntohs(ip->ip_len);
531	ip->ip_off = ntohs(ip->ip_off);
532skip_ipsec:
533#endif /*IPSEC*/
534#ifdef FAST_IPSEC
535	/*
536	 * Check the security policy (SP) for the packet and, if
537	 * required, do IPsec-related processing.  There are two
538	 * cases here; the first time a packet is sent through
539	 * it will be untagged and handled by ipsec4_checkpolicy.
540	 * If the packet is resubmitted to ip_output (e.g. after
541	 * AH, ESP, etc. processing), there will be a tag to bypass
542	 * the lookup and related policy checking.
543	 */
544	mtag = m_tag_find(m, PACKET_TAG_IPSEC_PENDING_TDB, NULL);
545	s = splnet();
546	if (mtag != NULL) {
547		tdbi = (struct tdb_ident *)(mtag + 1);
548		sp = ipsec_getpolicy(tdbi, IPSEC_DIR_OUTBOUND);
549		if (sp == NULL)
550			error = -EINVAL;	/* force silent drop */
551		m_tag_delete(m, mtag);
552	} else {
553		sp = ipsec4_checkpolicy(m, IPSEC_DIR_OUTBOUND, flags,
554					&error, inp);
555	}
556	/*
557	 * There are four return cases:
558	 *    sp != NULL	 	    apply IPsec policy
559	 *    sp == NULL, error == 0	    no IPsec handling needed
560	 *    sp == NULL, error == -EINVAL  discard packet w/o error
561	 *    sp == NULL, error != 0	    discard packet, report error
562	 */
563	if (sp != NULL) {
564		/* Loop detection, check if ipsec processing already done */
565		KASSERT(sp->req != NULL, ("ip_output: no ipsec request"));
566		for (mtag = m_tag_first(m); mtag != NULL;
567		     mtag = m_tag_next(m, mtag)) {
568			if (mtag->m_tag_cookie != MTAG_ABI_COMPAT)
569				continue;
570			if (mtag->m_tag_id != PACKET_TAG_IPSEC_OUT_DONE &&
571			    mtag->m_tag_id != PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED)
572				continue;
573			/*
574			 * Check if policy has an SA associated with it.
575			 * This can happen when an SP has yet to acquire
576			 * an SA; e.g. on first reference.  If it occurs,
577			 * then we let ipsec4_process_packet do its thing.
578			 */
579			if (sp->req->sav == NULL)
580				break;
581			tdbi = (struct tdb_ident *)(mtag + 1);
582			if (tdbi->spi == sp->req->sav->spi &&
583			    tdbi->proto == sp->req->sav->sah->saidx.proto &&
584			    bcmp(&tdbi->dst, &sp->req->sav->sah->saidx.dst,
585				 sizeof (union sockaddr_union)) == 0) {
586				/*
587				 * No IPsec processing is needed, free
588				 * reference to SP.
589				 *
590				 * NB: null pointer to avoid free at
591				 *     done: below.
592				 */
593				KEY_FREESP(&sp), sp = NULL;
594				splx(s);
595				goto spd_done;
596			}
597		}
598
599		/*
600		 * Do delayed checksums now because we send before
601		 * this is done in the normal processing path.
602		 */
603		if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
604			in_delayed_cksum(m);
605			m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
606		}
607
608		ip->ip_len = htons(ip->ip_len);
609		ip->ip_off = htons(ip->ip_off);
610
611		/* NB: callee frees mbuf */
612		error = ipsec4_process_packet(m, sp->req, flags, 0);
613		/*
614		 * Preserve KAME behaviour: ENOENT can be returned
615		 * when an SA acquire is in progress.  Don't propagate
616		 * this to user-level; it confuses applications.
617		 *
618		 * XXX this will go away when the SADB is redone.
619		 */
620		if (error == ENOENT)
621			error = 0;
622		splx(s);
623		goto done;
624	} else {
625		splx(s);
626
627		if (error != 0) {
628			/*
629			 * Hack: -EINVAL is used to signal that a packet
630			 * should be silently discarded.  This is typically
631			 * because we asked key management for an SA and
632			 * it was delayed (e.g. kicked up to IKE).
633			 */
634			if (error == -EINVAL)
635				error = 0;
636			goto bad;
637		} else {
638			/* No IPsec processing for this packet. */
639		}
640#ifdef notyet
641		/*
642		 * If deferred crypto processing is needed, check that
643		 * the interface supports it.
644		 */
645		mtag = m_tag_find(m, PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED, NULL);
646		if (mtag != NULL && (ifp->if_capenable & IFCAP_IPSEC) == 0) {
647			/* notify IPsec to do its own crypto */
648			ipsp_skipcrypto_unmark((struct tdb_ident *)(mtag + 1));
649			error = EHOSTUNREACH;
650			goto bad;
651		}
652#endif
653	}
654spd_done:
655#endif /* FAST_IPSEC */
656
657	/* Jump over all PFIL processing if hooks are not active. */
658	if (inet_pfil_hook.ph_busy_count == -1)
659		goto passout;
660
661	/* Run through list of hooks for output packets. */
662	odst.s_addr = ip->ip_dst.s_addr;
663	error = pfil_run_hooks(&inet_pfil_hook, &m, ifp, PFIL_OUT, inp);
664	if (error != 0 || m == NULL)
665		goto done;
666
667	ip = mtod(m, struct ip *);
668
669	/* See if destination IP address was changed by packet filter. */
670	if (odst.s_addr != ip->ip_dst.s_addr) {
671		m->m_flags |= M_SKIP_FIREWALL;
672		/* If destination is now ourself drop to ip_input(). */
673		if (in_localip(ip->ip_dst)) {
674			m->m_flags |= M_FASTFWD_OURS;
675			if (m->m_pkthdr.rcvif == NULL)
676				m->m_pkthdr.rcvif = loif;
677			if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
678				m->m_pkthdr.csum_flags |=
679				    CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
680				m->m_pkthdr.csum_data = 0xffff;
681			}
682			m->m_pkthdr.csum_flags |=
683			    CSUM_IP_CHECKED | CSUM_IP_VALID;
684
685			error = netisr_queue(NETISR_IP, m);
686			goto done;
687		} else
688			goto again;	/* Redo the routing table lookup. */
689	}
690
691#ifdef IPFIREWALL_FORWARD
692	/* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */
693	if (m->m_flags & M_FASTFWD_OURS) {
694		if (m->m_pkthdr.rcvif == NULL)
695			m->m_pkthdr.rcvif = loif;
696		if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
697			m->m_pkthdr.csum_flags |=
698			    CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
699			m->m_pkthdr.csum_data = 0xffff;
700		}
701		m->m_pkthdr.csum_flags |=
702			    CSUM_IP_CHECKED | CSUM_IP_VALID;
703
704		error = netisr_queue(NETISR_IP, m);
705		goto done;
706	}
707	/* Or forward to some other address? */
708	fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL);
709	if (fwd_tag) {
710#ifndef IPFIREWALL_FORWARD_EXTENDED
711		if (!in_localip(ip->ip_src) && !in_localaddr(ip->ip_dst)) {
712#endif
713			dst = (struct sockaddr_in *)&ro->ro_dst;
714			bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in));
715			m->m_flags |= M_SKIP_FIREWALL;
716			m_tag_delete(m, fwd_tag);
717			goto again;
718#ifndef IPFIREWALL_FORWARD_EXTENDED
719		} else {
720			m_tag_delete(m, fwd_tag);
721			/* Continue. */
722		}
723#endif
724	}
725#endif /* IPFIREWALL_FORWARD */
726
727passout:
728	/* 127/8 must not appear on wire - RFC1122. */
729	if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
730	    (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
731		if ((ifp->if_flags & IFF_LOOPBACK) == 0) {
732			ipstat.ips_badaddr++;
733			error = EADDRNOTAVAIL;
734			goto bad;
735		}
736	}
737
738	m->m_pkthdr.csum_flags |= CSUM_IP;
739	sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_hwassist;
740	if (sw_csum & CSUM_DELAY_DATA) {
741		in_delayed_cksum(m);
742		sw_csum &= ~CSUM_DELAY_DATA;
743	}
744	m->m_pkthdr.csum_flags &= ifp->if_hwassist;
745
746	/*
747	 * If small enough for interface, or the interface will take
748	 * care of the fragmentation for us, can just send directly.
749	 */
750	if (ip->ip_len <= ifp->if_mtu || (ifp->if_hwassist & CSUM_FRAGMENT &&
751	    ((ip->ip_off & IP_DF) == 0))) {
752		ip->ip_len = htons(ip->ip_len);
753		ip->ip_off = htons(ip->ip_off);
754		ip->ip_sum = 0;
755		if (sw_csum & CSUM_DELAY_IP)
756			ip->ip_sum = in_cksum(m, hlen);
757
758		/* Record statistics for this interface address. */
759		if (!(flags & IP_FORWARDING) && ia) {
760			ia->ia_ifa.if_opackets++;
761			ia->ia_ifa.if_obytes += m->m_pkthdr.len;
762		}
763
764#ifdef IPSEC
765		/* clean ipsec history once it goes out of the node */
766		ipsec_delaux(m);
767#endif
768
769#ifdef MBUF_STRESS_TEST
770		if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size)
771			m = m_fragment(m, M_DONTWAIT, mbuf_frag_size);
772#endif
773		/*
774		 * Reset layer specific mbuf flags
775		 * to avoid confusing lower layers.
776		 */
777		m->m_flags &= ~(M_PROTOFLAGS);
778
779		error = (*ifp->if_output)(ifp, m,
780				(struct sockaddr *)dst, ro->ro_rt);
781		goto done;
782	}
783
784	if (ip->ip_off & IP_DF) {
785		error = EMSGSIZE;
786		/*
787		 * This case can happen if the user changed the MTU
788		 * of an interface after enabling IP on it.  Because
789		 * most netifs don't keep track of routes pointing to
790		 * them, there is no way for one to update all its
791		 * routes when the MTU is changed.
792		 */
793		if (ro != NULL &&
794		    (ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST)) &&
795		    (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)) {
796			ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu;
797		}
798		ipstat.ips_cantfrag++;
799		goto bad;
800	}
801
802	/*
803	 * Too large for interface; fragment if possible. If successful,
804	 * on return, m will point to a list of packets to be sent.
805	 */
806	error = ip_fragment(ip, &m, ifp->if_mtu, ifp->if_hwassist, sw_csum);
807	if (error)
808		goto bad;
809	for (; m; m = m0) {
810		m0 = m->m_nextpkt;
811		m->m_nextpkt = 0;
812#ifdef IPSEC
813		/* clean ipsec history once it goes out of the node */
814		ipsec_delaux(m);
815#endif
816		if (error == 0) {
817			/* Record statistics for this interface address. */
818			if (ia != NULL) {
819				ia->ia_ifa.if_opackets++;
820				ia->ia_ifa.if_obytes += m->m_pkthdr.len;
821			}
822			/*
823			 * Reset layer specific mbuf flags
824			 * to avoid confusing upper layers.
825			 */
826			m->m_flags &= ~(M_PROTOFLAGS);
827
828			error = (*ifp->if_output)(ifp, m,
829			    (struct sockaddr *)dst, ro->ro_rt);
830		} else
831			m_freem(m);
832	}
833
834	if (error == 0)
835		ipstat.ips_fragmented++;
836
837done:
838	if (ro == &iproute && ro->ro_rt) {
839		RTFREE(ro->ro_rt);
840	}
841#ifdef IPSEC
842	if (sp != NULL) {
843		KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
844			printf("DP ip_output call free SP:%p\n", sp));
845		key_freesp(sp);
846	}
847#endif
848#ifdef FAST_IPSEC
849	if (sp != NULL)
850		KEY_FREESP(&sp);
851#endif
852	return (error);
853bad:
854	m_freem(m);
855	goto done;
856}
857
858/*
859 * Create a chain of fragments which fit the given mtu. m_frag points to the
860 * mbuf to be fragmented; on return it points to the chain with the fragments.
861 * Return 0 if no error. If error, m_frag may contain a partially built
862 * chain of fragments that should be freed by the caller.
863 *
864 * if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist)
865 * sw_csum contains the delayed checksums flags (e.g., CSUM_DELAY_IP).
866 */
867int
868ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu,
869	    u_long if_hwassist_flags, int sw_csum)
870{
871	int error = 0;
872	int hlen = ip->ip_hl << 2;
873	int len = (mtu - hlen) & ~7;	/* size of payload in each fragment */
874	int off;
875	struct mbuf *m0 = *m_frag;	/* the original packet		*/
876	int firstlen;
877	struct mbuf **mnext;
878	int nfrags;
879
880	if (ip->ip_off & IP_DF) {	/* Fragmentation not allowed */
881		ipstat.ips_cantfrag++;
882		return EMSGSIZE;
883	}
884
885	/*
886	 * Must be able to put at least 8 bytes per fragment.
887	 */
888	if (len < 8)
889		return EMSGSIZE;
890
891	/*
892	 * If the interface will not calculate checksums on
893	 * fragmented packets, then do it here.
894	 */
895	if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA &&
896	    (if_hwassist_flags & CSUM_IP_FRAGS) == 0) {
897		in_delayed_cksum(m0);
898		m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
899	}
900
901	if (len > PAGE_SIZE) {
902		/*
903		 * Fragment large datagrams such that each segment
904		 * contains a multiple of PAGE_SIZE amount of data,
905		 * plus headers. This enables a receiver to perform
906		 * page-flipping zero-copy optimizations.
907		 *
908		 * XXX When does this help given that sender and receiver
909		 * could have different page sizes, and also mtu could
910		 * be less than the receiver's page size ?
911		 */
912		int newlen;
913		struct mbuf *m;
914
915		for (m = m0, off = 0; m && (off+m->m_len) <= mtu; m = m->m_next)
916			off += m->m_len;
917
918		/*
919		 * firstlen (off - hlen) must be aligned on an
920		 * 8-byte boundary
921		 */
922		if (off < hlen)
923			goto smart_frag_failure;
924		off = ((off - hlen) & ~7) + hlen;
925		newlen = (~PAGE_MASK) & mtu;
926		if ((newlen + sizeof (struct ip)) > mtu) {
927			/* we failed, go back the default */
928smart_frag_failure:
929			newlen = len;
930			off = hlen + len;
931		}
932		len = newlen;
933
934	} else {
935		off = hlen + len;
936	}
937
938	firstlen = off - hlen;
939	mnext = &m0->m_nextpkt;		/* pointer to next packet */
940
941	/*
942	 * Loop through length of segment after first fragment,
943	 * make new header and copy data of each part and link onto chain.
944	 * Here, m0 is the original packet, m is the fragment being created.
945	 * The fragments are linked off the m_nextpkt of the original
946	 * packet, which after processing serves as the first fragment.
947	 */
948	for (nfrags = 1; off < ip->ip_len; off += len, nfrags++) {
949		struct ip *mhip;	/* ip header on the fragment */
950		struct mbuf *m;
951		int mhlen = sizeof (struct ip);
952
953		MGETHDR(m, M_DONTWAIT, MT_DATA);
954		if (m == NULL) {
955			error = ENOBUFS;
956			ipstat.ips_odropped++;
957			goto done;
958		}
959		m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG;
960		/*
961		 * In the first mbuf, leave room for the link header, then
962		 * copy the original IP header including options. The payload
963		 * goes into an additional mbuf chain returned by m_copy().
964		 */
965		m->m_data += max_linkhdr;
966		mhip = mtod(m, struct ip *);
967		*mhip = *ip;
968		if (hlen > sizeof (struct ip)) {
969			mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip);
970			mhip->ip_v = IPVERSION;
971			mhip->ip_hl = mhlen >> 2;
972		}
973		m->m_len = mhlen;
974		/* XXX do we need to add ip->ip_off below ? */
975		mhip->ip_off = ((off - hlen) >> 3) + ip->ip_off;
976		if (off + len >= ip->ip_len) {	/* last fragment */
977			len = ip->ip_len - off;
978			m->m_flags |= M_LASTFRAG;
979		} else
980			mhip->ip_off |= IP_MF;
981		mhip->ip_len = htons((u_short)(len + mhlen));
982		m->m_next = m_copy(m0, off, len);
983		if (m->m_next == NULL) {	/* copy failed */
984			m_free(m);
985			error = ENOBUFS;	/* ??? */
986			ipstat.ips_odropped++;
987			goto done;
988		}
989		m->m_pkthdr.len = mhlen + len;
990		m->m_pkthdr.rcvif = NULL;
991#ifdef MAC
992		mac_create_fragment(m0, m);
993#endif
994		m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags;
995		mhip->ip_off = htons(mhip->ip_off);
996		mhip->ip_sum = 0;
997		if (sw_csum & CSUM_DELAY_IP)
998			mhip->ip_sum = in_cksum(m, mhlen);
999		*mnext = m;
1000		mnext = &m->m_nextpkt;
1001	}
1002	ipstat.ips_ofragments += nfrags;
1003
1004	/* set first marker for fragment chain */
1005	m0->m_flags |= M_FIRSTFRAG | M_FRAG;
1006	m0->m_pkthdr.csum_data = nfrags;
1007
1008	/*
1009	 * Update first fragment by trimming what's been copied out
1010	 * and updating header.
1011	 */
1012	m_adj(m0, hlen + firstlen - ip->ip_len);
1013	m0->m_pkthdr.len = hlen + firstlen;
1014	ip->ip_len = htons((u_short)m0->m_pkthdr.len);
1015	ip->ip_off |= IP_MF;
1016	ip->ip_off = htons(ip->ip_off);
1017	ip->ip_sum = 0;
1018	if (sw_csum & CSUM_DELAY_IP)
1019		ip->ip_sum = in_cksum(m0, hlen);
1020
1021done:
1022	*m_frag = m0;
1023	return error;
1024}
1025
1026void
1027in_delayed_cksum(struct mbuf *m)
1028{
1029	struct ip *ip;
1030	u_short csum, offset;
1031
1032	ip = mtod(m, struct ip *);
1033	offset = ip->ip_hl << 2 ;
1034	csum = in_cksum_skip(m, ip->ip_len, offset);
1035	if (m->m_pkthdr.csum_flags & CSUM_UDP && csum == 0)
1036		csum = 0xffff;
1037	offset += m->m_pkthdr.csum_data;	/* checksum offset */
1038
1039	if (offset + sizeof(u_short) > m->m_len) {
1040		printf("delayed m_pullup, m->len: %d  off: %d  p: %d\n",
1041		    m->m_len, offset, ip->ip_p);
1042		/*
1043		 * XXX
1044		 * this shouldn't happen, but if it does, the
1045		 * correct behavior may be to insert the checksum
1046		 * in the existing chain instead of rearranging it.
1047		 */
1048		m = m_pullup(m, offset + sizeof(u_short));
1049	}
1050	*(u_short *)(m->m_data + offset) = csum;
1051}
1052
1053/*
1054 * IP socket option processing.
1055 */
1056int
1057ip_ctloutput(so, sopt)
1058	struct socket *so;
1059	struct sockopt *sopt;
1060{
1061	struct	inpcb *inp = sotoinpcb(so);
1062	int	error, optval;
1063
1064	error = optval = 0;
1065	if (sopt->sopt_level != IPPROTO_IP) {
1066		return (EINVAL);
1067	}
1068
1069	switch (sopt->sopt_dir) {
1070	case SOPT_SET:
1071		switch (sopt->sopt_name) {
1072		case IP_OPTIONS:
1073#ifdef notyet
1074		case IP_RETOPTS:
1075#endif
1076		{
1077			struct mbuf *m;
1078			if (sopt->sopt_valsize > MLEN) {
1079				error = EMSGSIZE;
1080				break;
1081			}
1082			MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
1083			if (m == NULL) {
1084				error = ENOBUFS;
1085				break;
1086			}
1087			m->m_len = sopt->sopt_valsize;
1088			error = sooptcopyin(sopt, mtod(m, char *), m->m_len,
1089					    m->m_len);
1090			INP_LOCK(inp);
1091			error = ip_pcbopts(inp, sopt->sopt_name, m);
1092			INP_UNLOCK(inp);
1093			return (error);
1094		}
1095
1096		case IP_TOS:
1097		case IP_TTL:
1098		case IP_MINTTL:
1099		case IP_RECVOPTS:
1100		case IP_RECVRETOPTS:
1101		case IP_RECVDSTADDR:
1102		case IP_RECVTTL:
1103		case IP_RECVIF:
1104		case IP_FAITH:
1105		case IP_ONESBCAST:
1106		case IP_DONTFRAG:
1107			error = sooptcopyin(sopt, &optval, sizeof optval,
1108					    sizeof optval);
1109			if (error)
1110				break;
1111
1112			switch (sopt->sopt_name) {
1113			case IP_TOS:
1114				inp->inp_ip_tos = optval;
1115				break;
1116
1117			case IP_TTL:
1118				inp->inp_ip_ttl = optval;
1119				break;
1120
1121			case IP_MINTTL:
1122				if (optval > 0 && optval <= MAXTTL)
1123					inp->inp_ip_minttl = optval;
1124				else
1125					error = EINVAL;
1126				break;
1127
1128#define	OPTSET(bit) do {						\
1129	INP_LOCK(inp);							\
1130	if (optval)							\
1131		inp->inp_flags |= bit;					\
1132	else								\
1133		inp->inp_flags &= ~bit;					\
1134	INP_UNLOCK(inp);						\
1135} while (0)
1136
1137			case IP_RECVOPTS:
1138				OPTSET(INP_RECVOPTS);
1139				break;
1140
1141			case IP_RECVRETOPTS:
1142				OPTSET(INP_RECVRETOPTS);
1143				break;
1144
1145			case IP_RECVDSTADDR:
1146				OPTSET(INP_RECVDSTADDR);
1147				break;
1148
1149			case IP_RECVTTL:
1150				OPTSET(INP_RECVTTL);
1151				break;
1152
1153			case IP_RECVIF:
1154				OPTSET(INP_RECVIF);
1155				break;
1156
1157			case IP_FAITH:
1158				OPTSET(INP_FAITH);
1159				break;
1160
1161			case IP_ONESBCAST:
1162				OPTSET(INP_ONESBCAST);
1163				break;
1164			case IP_DONTFRAG:
1165				OPTSET(INP_DONTFRAG);
1166				break;
1167			}
1168			break;
1169#undef OPTSET
1170
1171		case IP_MULTICAST_IF:
1172		case IP_MULTICAST_VIF:
1173		case IP_MULTICAST_TTL:
1174		case IP_MULTICAST_LOOP:
1175		case IP_ADD_MEMBERSHIP:
1176		case IP_DROP_MEMBERSHIP:
1177			error = ip_setmoptions(inp, sopt);
1178			break;
1179
1180		case IP_PORTRANGE:
1181			error = sooptcopyin(sopt, &optval, sizeof optval,
1182					    sizeof optval);
1183			if (error)
1184				break;
1185
1186			INP_LOCK(inp);
1187			switch (optval) {
1188			case IP_PORTRANGE_DEFAULT:
1189				inp->inp_flags &= ~(INP_LOWPORT);
1190				inp->inp_flags &= ~(INP_HIGHPORT);
1191				break;
1192
1193			case IP_PORTRANGE_HIGH:
1194				inp->inp_flags &= ~(INP_LOWPORT);
1195				inp->inp_flags |= INP_HIGHPORT;
1196				break;
1197
1198			case IP_PORTRANGE_LOW:
1199				inp->inp_flags &= ~(INP_HIGHPORT);
1200				inp->inp_flags |= INP_LOWPORT;
1201				break;
1202
1203			default:
1204				error = EINVAL;
1205				break;
1206			}
1207			INP_UNLOCK(inp);
1208			break;
1209
1210#if defined(IPSEC) || defined(FAST_IPSEC)
1211		case IP_IPSEC_POLICY:
1212		{
1213			caddr_t req;
1214			size_t len = 0;
1215			int priv;
1216			struct mbuf *m;
1217			int optname;
1218
1219			if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */
1220				break;
1221			if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */
1222				break;
1223			priv = (sopt->sopt_td != NULL &&
1224				suser(sopt->sopt_td) != 0) ? 0 : 1;
1225			req = mtod(m, caddr_t);
1226			len = m->m_len;
1227			optname = sopt->sopt_name;
1228			error = ipsec4_set_policy(inp, optname, req, len, priv);
1229			m_freem(m);
1230			break;
1231		}
1232#endif /*IPSEC*/
1233
1234		default:
1235			error = ENOPROTOOPT;
1236			break;
1237		}
1238		break;
1239
1240	case SOPT_GET:
1241		switch (sopt->sopt_name) {
1242		case IP_OPTIONS:
1243		case IP_RETOPTS:
1244			if (inp->inp_options)
1245				error = sooptcopyout(sopt,
1246						     mtod(inp->inp_options,
1247							  char *),
1248						     inp->inp_options->m_len);
1249			else
1250				sopt->sopt_valsize = 0;
1251			break;
1252
1253		case IP_TOS:
1254		case IP_TTL:
1255		case IP_MINTTL:
1256		case IP_RECVOPTS:
1257		case IP_RECVRETOPTS:
1258		case IP_RECVDSTADDR:
1259		case IP_RECVTTL:
1260		case IP_RECVIF:
1261		case IP_PORTRANGE:
1262		case IP_FAITH:
1263		case IP_ONESBCAST:
1264		case IP_DONTFRAG:
1265			switch (sopt->sopt_name) {
1266
1267			case IP_TOS:
1268				optval = inp->inp_ip_tos;
1269				break;
1270
1271			case IP_TTL:
1272				optval = inp->inp_ip_ttl;
1273				break;
1274
1275			case IP_MINTTL:
1276				optval = inp->inp_ip_minttl;
1277				break;
1278
1279#define	OPTBIT(bit)	(inp->inp_flags & bit ? 1 : 0)
1280
1281			case IP_RECVOPTS:
1282				optval = OPTBIT(INP_RECVOPTS);
1283				break;
1284
1285			case IP_RECVRETOPTS:
1286				optval = OPTBIT(INP_RECVRETOPTS);
1287				break;
1288
1289			case IP_RECVDSTADDR:
1290				optval = OPTBIT(INP_RECVDSTADDR);
1291				break;
1292
1293			case IP_RECVTTL:
1294				optval = OPTBIT(INP_RECVTTL);
1295				break;
1296
1297			case IP_RECVIF:
1298				optval = OPTBIT(INP_RECVIF);
1299				break;
1300
1301			case IP_PORTRANGE:
1302				if (inp->inp_flags & INP_HIGHPORT)
1303					optval = IP_PORTRANGE_HIGH;
1304				else if (inp->inp_flags & INP_LOWPORT)
1305					optval = IP_PORTRANGE_LOW;
1306				else
1307					optval = 0;
1308				break;
1309
1310			case IP_FAITH:
1311				optval = OPTBIT(INP_FAITH);
1312				break;
1313
1314			case IP_ONESBCAST:
1315				optval = OPTBIT(INP_ONESBCAST);
1316				break;
1317			case IP_DONTFRAG:
1318				optval = OPTBIT(INP_DONTFRAG);
1319				break;
1320			}
1321			error = sooptcopyout(sopt, &optval, sizeof optval);
1322			break;
1323
1324		case IP_MULTICAST_IF:
1325		case IP_MULTICAST_VIF:
1326		case IP_MULTICAST_TTL:
1327		case IP_MULTICAST_LOOP:
1328		case IP_ADD_MEMBERSHIP:
1329		case IP_DROP_MEMBERSHIP:
1330			error = ip_getmoptions(inp, sopt);
1331			break;
1332
1333#if defined(IPSEC) || defined(FAST_IPSEC)
1334		case IP_IPSEC_POLICY:
1335		{
1336			struct mbuf *m = NULL;
1337			caddr_t req = NULL;
1338			size_t len = 0;
1339
1340			if (m != 0) {
1341				req = mtod(m, caddr_t);
1342				len = m->m_len;
1343			}
1344			error = ipsec4_get_policy(sotoinpcb(so), req, len, &m);
1345			if (error == 0)
1346				error = soopt_mcopyout(sopt, m); /* XXX */
1347			if (error == 0)
1348				m_freem(m);
1349			break;
1350		}
1351#endif /*IPSEC*/
1352
1353		default:
1354			error = ENOPROTOOPT;
1355			break;
1356		}
1357		break;
1358	}
1359	return (error);
1360}
1361
1362/*
1363 * XXX
1364 * The whole multicast option thing needs to be re-thought.
1365 * Several of these options are equally applicable to non-multicast
1366 * transmission, and one (IP_MULTICAST_TTL) totally duplicates a
1367 * standard option (IP_TTL).
1368 */
1369
1370/*
1371 * following RFC1724 section 3.3, 0.0.0.0/8 is interpreted as interface index.
1372 */
1373static struct ifnet *
1374ip_multicast_if(a, ifindexp)
1375	struct in_addr *a;
1376	int *ifindexp;
1377{
1378	int ifindex;
1379	struct ifnet *ifp;
1380
1381	if (ifindexp)
1382		*ifindexp = 0;
1383	if (ntohl(a->s_addr) >> 24 == 0) {
1384		ifindex = ntohl(a->s_addr) & 0xffffff;
1385		if (ifindex < 0 || if_index < ifindex)
1386			return NULL;
1387		ifp = ifnet_byindex(ifindex);
1388		if (ifindexp)
1389			*ifindexp = ifindex;
1390	} else {
1391		INADDR_TO_IFP(*a, ifp);
1392	}
1393	return ifp;
1394}
1395
1396/*
1397 * Given an inpcb, return its multicast options structure pointer.  Accepts
1398 * an unlocked inpcb pointer, but will return it locked.  May sleep.
1399 */
1400static struct ip_moptions *
1401ip_findmoptions(struct inpcb *inp)
1402{
1403	struct ip_moptions *imo;
1404
1405	INP_LOCK(inp);
1406	if (inp->inp_moptions != NULL)
1407		return (inp->inp_moptions);
1408
1409	INP_UNLOCK(inp);
1410
1411	imo = (struct ip_moptions*)malloc(sizeof(*imo), M_IPMOPTS, M_WAITOK);
1412
1413	imo->imo_multicast_ifp = NULL;
1414	imo->imo_multicast_addr.s_addr = INADDR_ANY;
1415	imo->imo_multicast_vif = -1;
1416	imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
1417	imo->imo_multicast_loop = IP_DEFAULT_MULTICAST_LOOP;
1418	imo->imo_num_memberships = 0;
1419
1420	INP_LOCK(inp);
1421	if (inp->inp_moptions != NULL) {
1422		free(imo, M_IPMOPTS);
1423		return (inp->inp_moptions);
1424	}
1425	inp->inp_moptions = imo;
1426	return (imo);
1427}
1428
1429/*
1430 * Set the IP multicast options in response to user setsockopt().
1431 */
1432static int
1433ip_setmoptions(struct inpcb *inp, struct sockopt *sopt)
1434{
1435	int error = 0;
1436	int i;
1437	struct in_addr addr;
1438	struct ip_mreq mreq;
1439	struct ifnet *ifp;
1440	struct ip_moptions *imo;
1441	struct route ro;
1442	struct sockaddr_in *dst;
1443	int ifindex;
1444	int s;
1445
1446	switch (sopt->sopt_name) {
1447	/* store an index number for the vif you wanna use in the send */
1448	case IP_MULTICAST_VIF:
1449		if (legal_vif_num == 0) {
1450			error = EOPNOTSUPP;
1451			break;
1452		}
1453		error = sooptcopyin(sopt, &i, sizeof i, sizeof i);
1454		if (error)
1455			break;
1456		if (!legal_vif_num(i) && (i != -1)) {
1457			error = EINVAL;
1458			break;
1459		}
1460		imo = ip_findmoptions(inp);
1461		imo->imo_multicast_vif = i;
1462		INP_UNLOCK(inp);
1463		break;
1464
1465	case IP_MULTICAST_IF:
1466		/*
1467		 * Select the interface for outgoing multicast packets.
1468		 */
1469		error = sooptcopyin(sopt, &addr, sizeof addr, sizeof addr);
1470		if (error)
1471			break;
1472		/*
1473		 * INADDR_ANY is used to remove a previous selection.
1474		 * When no interface is selected, a default one is
1475		 * chosen every time a multicast packet is sent.
1476		 */
1477		imo = ip_findmoptions(inp);
1478		if (addr.s_addr == INADDR_ANY) {
1479			imo->imo_multicast_ifp = NULL;
1480			INP_UNLOCK(inp);
1481			break;
1482		}
1483		/*
1484		 * The selected interface is identified by its local
1485		 * IP address.  Find the interface and confirm that
1486		 * it supports multicasting.
1487		 */
1488		s = splimp();
1489		ifp = ip_multicast_if(&addr, &ifindex);
1490		if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) {
1491			INP_UNLOCK(inp);
1492			splx(s);
1493			error = EADDRNOTAVAIL;
1494			break;
1495		}
1496		imo->imo_multicast_ifp = ifp;
1497		if (ifindex)
1498			imo->imo_multicast_addr = addr;
1499		else
1500			imo->imo_multicast_addr.s_addr = INADDR_ANY;
1501		INP_UNLOCK(inp);
1502		splx(s);
1503		break;
1504
1505	case IP_MULTICAST_TTL:
1506		/*
1507		 * Set the IP time-to-live for outgoing multicast packets.
1508		 * The original multicast API required a char argument,
1509		 * which is inconsistent with the rest of the socket API.
1510		 * We allow either a char or an int.
1511		 */
1512		if (sopt->sopt_valsize == 1) {
1513			u_char ttl;
1514			error = sooptcopyin(sopt, &ttl, 1, 1);
1515			if (error)
1516				break;
1517			imo = ip_findmoptions(inp);
1518			imo->imo_multicast_ttl = ttl;
1519			INP_UNLOCK(inp);
1520		} else {
1521			u_int ttl;
1522			error = sooptcopyin(sopt, &ttl, sizeof ttl,
1523					    sizeof ttl);
1524			if (error)
1525				break;
1526			if (ttl > 255)
1527				error = EINVAL;
1528			else {
1529				imo = ip_findmoptions(inp);
1530				imo->imo_multicast_ttl = ttl;
1531				INP_UNLOCK(inp);
1532			}
1533		}
1534		break;
1535
1536	case IP_MULTICAST_LOOP:
1537		/*
1538		 * Set the loopback flag for outgoing multicast packets.
1539		 * Must be zero or one.  The original multicast API required a
1540		 * char argument, which is inconsistent with the rest
1541		 * of the socket API.  We allow either a char or an int.
1542		 */
1543		if (sopt->sopt_valsize == 1) {
1544			u_char loop;
1545			error = sooptcopyin(sopt, &loop, 1, 1);
1546			if (error)
1547				break;
1548			imo = ip_findmoptions(inp);
1549			imo->imo_multicast_loop = !!loop;
1550			INP_UNLOCK(inp);
1551		} else {
1552			u_int loop;
1553			error = sooptcopyin(sopt, &loop, sizeof loop,
1554					    sizeof loop);
1555			if (error)
1556				break;
1557			imo = ip_findmoptions(inp);
1558			imo->imo_multicast_loop = !!loop;
1559			INP_UNLOCK(inp);
1560		}
1561		break;
1562
1563	case IP_ADD_MEMBERSHIP:
1564		/*
1565		 * Add a multicast group membership.
1566		 * Group must be a valid IP multicast address.
1567		 */
1568		error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq);
1569		if (error)
1570			break;
1571
1572		if (!IN_MULTICAST(ntohl(mreq.imr_multiaddr.s_addr))) {
1573			error = EINVAL;
1574			break;
1575		}
1576		s = splimp();
1577		/*
1578		 * If no interface address was provided, use the interface of
1579		 * the route to the given multicast address.
1580		 */
1581		if (mreq.imr_interface.s_addr == INADDR_ANY) {
1582			bzero((caddr_t)&ro, sizeof(ro));
1583			dst = (struct sockaddr_in *)&ro.ro_dst;
1584			dst->sin_len = sizeof(*dst);
1585			dst->sin_family = AF_INET;
1586			dst->sin_addr = mreq.imr_multiaddr;
1587			rtalloc_ign(&ro, RTF_CLONING);
1588			if (ro.ro_rt == NULL) {
1589				error = EADDRNOTAVAIL;
1590				splx(s);
1591				break;
1592			}
1593			ifp = ro.ro_rt->rt_ifp;
1594			RTFREE(ro.ro_rt);
1595		}
1596		else {
1597			ifp = ip_multicast_if(&mreq.imr_interface, NULL);
1598		}
1599
1600		/*
1601		 * See if we found an interface, and confirm that it
1602		 * supports multicast.
1603		 */
1604		if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) {
1605			error = EADDRNOTAVAIL;
1606			splx(s);
1607			break;
1608		}
1609		/*
1610		 * See if the membership already exists or if all the
1611		 * membership slots are full.
1612		 */
1613		imo = ip_findmoptions(inp);
1614		for (i = 0; i < imo->imo_num_memberships; ++i) {
1615			if (imo->imo_membership[i]->inm_ifp == ifp &&
1616			    imo->imo_membership[i]->inm_addr.s_addr
1617						== mreq.imr_multiaddr.s_addr)
1618				break;
1619		}
1620		if (i < imo->imo_num_memberships) {
1621			INP_UNLOCK(inp);
1622			error = EADDRINUSE;
1623			splx(s);
1624			break;
1625		}
1626		if (i == IP_MAX_MEMBERSHIPS) {
1627			INP_UNLOCK(inp);
1628			error = ETOOMANYREFS;
1629			splx(s);
1630			break;
1631		}
1632		/*
1633		 * Everything looks good; add a new record to the multicast
1634		 * address list for the given interface.
1635		 */
1636		if ((imo->imo_membership[i] =
1637		    in_addmulti(&mreq.imr_multiaddr, ifp)) == NULL) {
1638			INP_UNLOCK(inp);
1639			error = ENOBUFS;
1640			splx(s);
1641			break;
1642		}
1643		++imo->imo_num_memberships;
1644		INP_UNLOCK(inp);
1645		splx(s);
1646		break;
1647
1648	case IP_DROP_MEMBERSHIP:
1649		/*
1650		 * Drop a multicast group membership.
1651		 * Group must be a valid IP multicast address.
1652		 */
1653		error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq);
1654		if (error)
1655			break;
1656
1657		if (!IN_MULTICAST(ntohl(mreq.imr_multiaddr.s_addr))) {
1658			error = EINVAL;
1659			break;
1660		}
1661
1662		s = splimp();
1663		/*
1664		 * If an interface address was specified, get a pointer
1665		 * to its ifnet structure.
1666		 */
1667		if (mreq.imr_interface.s_addr == INADDR_ANY)
1668			ifp = NULL;
1669		else {
1670			ifp = ip_multicast_if(&mreq.imr_interface, NULL);
1671			if (ifp == NULL) {
1672				error = EADDRNOTAVAIL;
1673				splx(s);
1674				break;
1675			}
1676		}
1677		/*
1678		 * Find the membership in the membership array.
1679		 */
1680		imo = ip_findmoptions(inp);
1681		for (i = 0; i < imo->imo_num_memberships; ++i) {
1682			if ((ifp == NULL ||
1683			     imo->imo_membership[i]->inm_ifp == ifp) &&
1684			     imo->imo_membership[i]->inm_addr.s_addr ==
1685			     mreq.imr_multiaddr.s_addr)
1686				break;
1687		}
1688		if (i == imo->imo_num_memberships) {
1689			INP_UNLOCK(inp);
1690			error = EADDRNOTAVAIL;
1691			splx(s);
1692			break;
1693		}
1694		/*
1695		 * Give up the multicast address record to which the
1696		 * membership points.
1697		 */
1698		in_delmulti(imo->imo_membership[i]);
1699		/*
1700		 * Remove the gap in the membership array.
1701		 */
1702		for (++i; i < imo->imo_num_memberships; ++i)
1703			imo->imo_membership[i-1] = imo->imo_membership[i];
1704		--imo->imo_num_memberships;
1705		INP_UNLOCK(inp);
1706		splx(s);
1707		break;
1708
1709	default:
1710		error = EOPNOTSUPP;
1711		break;
1712	}
1713
1714	return (error);
1715}
1716
1717/*
1718 * Return the IP multicast options in response to user getsockopt().
1719 */
1720static int
1721ip_getmoptions(struct inpcb *inp, struct sockopt *sopt)
1722{
1723	struct ip_moptions *imo;
1724	struct in_addr addr;
1725	struct in_ifaddr *ia;
1726	int error, optval;
1727	u_char coptval;
1728
1729	INP_LOCK(inp);
1730	imo = inp->inp_moptions;
1731
1732	error = 0;
1733	switch (sopt->sopt_name) {
1734	case IP_MULTICAST_VIF:
1735		if (imo != NULL)
1736			optval = imo->imo_multicast_vif;
1737		else
1738			optval = -1;
1739		INP_UNLOCK(inp);
1740		error = sooptcopyout(sopt, &optval, sizeof optval);
1741		break;
1742
1743	case IP_MULTICAST_IF:
1744		if (imo == NULL || imo->imo_multicast_ifp == NULL)
1745			addr.s_addr = INADDR_ANY;
1746		else if (imo->imo_multicast_addr.s_addr) {
1747			/* return the value user has set */
1748			addr = imo->imo_multicast_addr;
1749		} else {
1750			IFP_TO_IA(imo->imo_multicast_ifp, ia);
1751			addr.s_addr = (ia == NULL) ? INADDR_ANY
1752				: IA_SIN(ia)->sin_addr.s_addr;
1753		}
1754		INP_UNLOCK(inp);
1755		error = sooptcopyout(sopt, &addr, sizeof addr);
1756		break;
1757
1758	case IP_MULTICAST_TTL:
1759		if (imo == 0)
1760			optval = coptval = IP_DEFAULT_MULTICAST_TTL;
1761		else
1762			optval = coptval = imo->imo_multicast_ttl;
1763		INP_UNLOCK(inp);
1764		if (sopt->sopt_valsize == 1)
1765			error = sooptcopyout(sopt, &coptval, 1);
1766		else
1767			error = sooptcopyout(sopt, &optval, sizeof optval);
1768		break;
1769
1770	case IP_MULTICAST_LOOP:
1771		if (imo == 0)
1772			optval = coptval = IP_DEFAULT_MULTICAST_LOOP;
1773		else
1774			optval = coptval = imo->imo_multicast_loop;
1775		INP_UNLOCK(inp);
1776		if (sopt->sopt_valsize == 1)
1777			error = sooptcopyout(sopt, &coptval, 1);
1778		else
1779			error = sooptcopyout(sopt, &optval, sizeof optval);
1780		break;
1781
1782	default:
1783		INP_UNLOCK(inp);
1784		error = ENOPROTOOPT;
1785		break;
1786	}
1787	INP_UNLOCK_ASSERT(inp);
1788
1789	return (error);
1790}
1791
1792/*
1793 * Discard the IP multicast options.
1794 */
1795void
1796ip_freemoptions(imo)
1797	register struct ip_moptions *imo;
1798{
1799	register int i;
1800
1801	if (imo != NULL) {
1802		for (i = 0; i < imo->imo_num_memberships; ++i)
1803			in_delmulti(imo->imo_membership[i]);
1804		free(imo, M_IPMOPTS);
1805	}
1806}
1807
1808/*
1809 * Routine called from ip_output() to loop back a copy of an IP multicast
1810 * packet to the input queue of a specified interface.  Note that this
1811 * calls the output routine of the loopback "driver", but with an interface
1812 * pointer that might NOT be a loopback interface -- evil, but easier than
1813 * replicating that code here.
1814 */
1815static void
1816ip_mloopback(ifp, m, dst, hlen)
1817	struct ifnet *ifp;
1818	register struct mbuf *m;
1819	register struct sockaddr_in *dst;
1820	int hlen;
1821{
1822	register struct ip *ip;
1823	struct mbuf *copym;
1824
1825	copym = m_copy(m, 0, M_COPYALL);
1826	if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < hlen))
1827		copym = m_pullup(copym, hlen);
1828	if (copym != NULL) {
1829		/* If needed, compute the checksum and mark it as valid. */
1830		if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
1831			in_delayed_cksum(copym);
1832			copym->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
1833			copym->m_pkthdr.csum_flags |=
1834			    CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
1835			copym->m_pkthdr.csum_data = 0xffff;
1836		}
1837		/*
1838		 * We don't bother to fragment if the IP length is greater
1839		 * than the interface's MTU.  Can this possibly matter?
1840		 */
1841		ip = mtod(copym, struct ip *);
1842		ip->ip_len = htons(ip->ip_len);
1843		ip->ip_off = htons(ip->ip_off);
1844		ip->ip_sum = 0;
1845		ip->ip_sum = in_cksum(copym, hlen);
1846		/*
1847		 * NB:
1848		 * It's not clear whether there are any lingering
1849		 * reentrancy problems in other areas which might
1850		 * be exposed by using ip_input directly (in
1851		 * particular, everything which modifies the packet
1852		 * in-place).  Yet another option is using the
1853		 * protosw directly to deliver the looped back
1854		 * packet.  For the moment, we'll err on the side
1855		 * of safety by using if_simloop().
1856		 */
1857#if 1 /* XXX */
1858		if (dst->sin_family != AF_INET) {
1859			printf("ip_mloopback: bad address family %d\n",
1860						dst->sin_family);
1861			dst->sin_family = AF_INET;
1862		}
1863#endif
1864
1865#ifdef notdef
1866		copym->m_pkthdr.rcvif = ifp;
1867		ip_input(copym);
1868#else
1869		if_simloop(ifp, copym, dst->sin_family, 0);
1870#endif
1871	}
1872}
1873