ip_output.c revision 153164
1/*-
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 4. Neither the name of the University nor the names of its contributors
14 *    may be used to endorse or promote products derived from this software
15 *    without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 *
29 *	@(#)ip_output.c	8.3 (Berkeley) 1/21/94
30 * $FreeBSD: head/sys/netinet/ip_output.c 153164 2005-12-06 11:16:11Z glebius $
31 */
32
33#include "opt_ipfw.h"
34#include "opt_ipsec.h"
35#include "opt_mac.h"
36#include "opt_mbuf_stress_test.h"
37
38#include <sys/param.h>
39#include <sys/systm.h>
40#include <sys/kernel.h>
41#include <sys/mac.h>
42#include <sys/malloc.h>
43#include <sys/mbuf.h>
44#include <sys/protosw.h>
45#include <sys/socket.h>
46#include <sys/socketvar.h>
47#include <sys/sysctl.h>
48
49#include <net/if.h>
50#include <net/netisr.h>
51#include <net/pfil.h>
52#include <net/route.h>
53
54#include <netinet/in.h>
55#include <netinet/in_systm.h>
56#include <netinet/ip.h>
57#include <netinet/in_pcb.h>
58#include <netinet/in_var.h>
59#include <netinet/ip_var.h>
60#include <netinet/ip_options.h>
61
62#include <machine/in_cksum.h>
63
64static MALLOC_DEFINE(M_IPMOPTS, "ip_moptions", "internet multicast options");
65
66#ifdef IPSEC
67#include <netinet6/ipsec.h>
68#include <netkey/key.h>
69#ifdef IPSEC_DEBUG
70#include <netkey/key_debug.h>
71#else
72#define	KEYDEBUG(lev,arg)
73#endif
74#endif /*IPSEC*/
75
76#ifdef FAST_IPSEC
77#include <netipsec/ipsec.h>
78#include <netipsec/xform.h>
79#include <netipsec/key.h>
80#endif /*FAST_IPSEC*/
81
/*
 * Debugging aid: print the string x, then the IPv4 address held in the
 * struct in_addr a (network byte order) as a dotted quad, then the string y.
 *
 * The arguments are parenthesized so that expressions such as *p or
 * arr[i] may be passed for a.  Note that a is evaluated four times, so it
 * must not have side effects.  The expansion deliberately still ends in a
 * semicolon, as it always has, so existing uses keep compiling.
 */
#define print_ip(x, a, y)	 printf("%s %d.%d.%d.%d%s",\
				(x), (ntohl((a).s_addr)>>24)&0xFF,\
				  (ntohl((a).s_addr)>>16)&0xFF,\
				  (ntohl((a).s_addr)>>8)&0xFF,\
				  (ntohl((a).s_addr))&0xFF, (y));
87
88u_short ip_id;
89
90#ifdef MBUF_STRESS_TEST
91int mbuf_frag_size = 0;
92SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW,
93	&mbuf_frag_size, 0, "Fragment outgoing mbufs to this size");
94#endif
95
96static struct ifnet *ip_multicast_if(struct in_addr *, int *);
97static void	ip_mloopback
98	(struct ifnet *, struct mbuf *, struct sockaddr_in *, int);
99static int	ip_getmoptions(struct inpcb *, struct sockopt *);
100static int	ip_setmoptions(struct inpcb *, struct sockopt *);
101
102
103extern	struct protosw inetsw[];
104
105/*
106 * IP output.  The packet in mbuf chain m contains a skeletal IP
107 * header (with len, off, ttl, proto, tos, src, dst).
108 * The mbuf chain containing the packet will be freed.
109 * The mbuf opt, if present, will not be freed.
110 * In the IP forwarding case, the packet will arrive with options already
111 * inserted, so must have a NULL opt pointer.
112 */
113int
114ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro,
115	int flags, struct ip_moptions *imo, struct inpcb *inp)
116{
117	struct ip *ip;
118	struct ifnet *ifp = NULL;	/* keep compiler happy */
119	struct mbuf *m0;
120	int hlen = sizeof (struct ip);
121	int len, error = 0;
122	struct sockaddr_in *dst = NULL;	/* keep compiler happy */
123	struct in_ifaddr *ia = NULL;
124	int isbroadcast, sw_csum;
125	struct route iproute;
126	struct in_addr odst;
127#ifdef IPFIREWALL_FORWARD
128	struct m_tag *fwd_tag = NULL;
129#endif
130#ifdef IPSEC
131	struct secpolicy *sp = NULL;
132#endif
133#ifdef FAST_IPSEC
134	struct secpolicy *sp = NULL;
135	struct tdb_ident *tdbi;
136	struct m_tag *mtag;
137	int s;
138#endif /* FAST_IPSEC */
139
140	M_ASSERTPKTHDR(m);
141
142	if (ro == NULL) {
143		ro = &iproute;
144		bzero(ro, sizeof (*ro));
145	}
146
147	if (inp != NULL)
148		INP_LOCK_ASSERT(inp);
149
150	if (opt) {
151		len = 0;
152		m = ip_insertoptions(m, opt, &len);
153		if (len != 0)
154			hlen = len;
155	}
156	ip = mtod(m, struct ip *);
157
158	/*
159	 * Fill in IP header.  If we are not allowing fragmentation,
160	 * then the ip_id field is meaningless, but we don't set it
161	 * to zero.  Doing so causes various problems when devices along
162	 * the path (routers, load balancers, firewalls, etc.) illegally
163	 * disable DF on our packet.  Note that a 16-bit counter
164	 * will wrap around in less than 10 seconds at 100 Mbit/s on a
165	 * medium with MTU 1500.  See Steven M. Bellovin, "A Technique
166	 * for Counting NATted Hosts", Proc. IMW'02, available at
167	 * <http://www.research.att.com/~smb/papers/fnat.pdf>.
168	 */
169	if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
170		ip->ip_v = IPVERSION;
171		ip->ip_hl = hlen >> 2;
172		ip->ip_id = ip_newid();
173		ipstat.ips_localout++;
174	} else {
175		hlen = ip->ip_hl << 2;
176	}
177
178	dst = (struct sockaddr_in *)&ro->ro_dst;
179again:
180	/*
181	 * If there is a cached route,
182	 * check that it is to the same destination
183	 * and is still up.  If not, free it and try again.
184	 * The address family should also be checked in case of sharing the
185	 * cache with IPv6.
186	 */
187	if (ro->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 ||
188			  dst->sin_family != AF_INET ||
189			  dst->sin_addr.s_addr != ip->ip_dst.s_addr)) {
190		RTFREE(ro->ro_rt);
191		ro->ro_rt = (struct rtentry *)0;
192	}
193#ifdef IPFIREWALL_FORWARD
194	if (ro->ro_rt == NULL && fwd_tag == NULL) {
195#else
196	if (ro->ro_rt == NULL) {
197#endif
198		bzero(dst, sizeof(*dst));
199		dst->sin_family = AF_INET;
200		dst->sin_len = sizeof(*dst);
201		dst->sin_addr = ip->ip_dst;
202	}
203	/*
204	 * If routing to interface only,
205	 * short circuit routing lookup.
206	 */
207	if (flags & IP_ROUTETOIF) {
208		if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL &&
209		    (ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == NULL) {
210			ipstat.ips_noroute++;
211			error = ENETUNREACH;
212			goto bad;
213		}
214		ifp = ia->ia_ifp;
215		ip->ip_ttl = 1;
216		isbroadcast = in_broadcast(dst->sin_addr, ifp);
217	} else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) &&
218	    imo != NULL && imo->imo_multicast_ifp != NULL) {
219		/*
220		 * Bypass the normal routing lookup for multicast
221		 * packets if the interface is specified.
222		 */
223		ifp = imo->imo_multicast_ifp;
224		IFP_TO_IA(ifp, ia);
225		isbroadcast = 0;	/* fool gcc */
226	} else {
227		/*
228		 * We want to do any cloning requested by the link layer,
229		 * as this is probably required in all cases for correct
230		 * operation (as it is for ARP).
231		 */
232		if (ro->ro_rt == NULL)
233			rtalloc_ign(ro, 0);
234		if (ro->ro_rt == NULL) {
235			ipstat.ips_noroute++;
236			error = EHOSTUNREACH;
237			goto bad;
238		}
239		ia = ifatoia(ro->ro_rt->rt_ifa);
240		ifp = ro->ro_rt->rt_ifp;
241		ro->ro_rt->rt_rmx.rmx_pksent++;
242		if (ro->ro_rt->rt_flags & RTF_GATEWAY)
243			dst = (struct sockaddr_in *)ro->ro_rt->rt_gateway;
244		if (ro->ro_rt->rt_flags & RTF_HOST)
245			isbroadcast = (ro->ro_rt->rt_flags & RTF_BROADCAST);
246		else
247			isbroadcast = in_broadcast(dst->sin_addr, ifp);
248	}
249	if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
250		struct in_multi *inm;
251
252		m->m_flags |= M_MCAST;
253		/*
254		 * IP destination address is multicast.  Make sure "dst"
255		 * still points to the address in "ro".  (It may have been
256		 * changed to point to a gateway address, above.)
257		 */
258		dst = (struct sockaddr_in *)&ro->ro_dst;
259		/*
260		 * See if the caller provided any multicast options
261		 */
262		if (imo != NULL) {
263			ip->ip_ttl = imo->imo_multicast_ttl;
264			if (imo->imo_multicast_vif != -1)
265				ip->ip_src.s_addr =
266				    ip_mcast_src ?
267				    ip_mcast_src(imo->imo_multicast_vif) :
268				    INADDR_ANY;
269		} else
270			ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL;
271		/*
272		 * Confirm that the outgoing interface supports multicast.
273		 */
274		if ((imo == NULL) || (imo->imo_multicast_vif == -1)) {
275			if ((ifp->if_flags & IFF_MULTICAST) == 0) {
276				ipstat.ips_noroute++;
277				error = ENETUNREACH;
278				goto bad;
279			}
280		}
281		/*
282		 * If source address not specified yet, use address
283		 * of outgoing interface.
284		 */
285		if (ip->ip_src.s_addr == INADDR_ANY) {
286			/* Interface may have no addresses. */
287			if (ia != NULL)
288				ip->ip_src = IA_SIN(ia)->sin_addr;
289		}
290
291		IN_MULTI_LOCK();
292		IN_LOOKUP_MULTI(ip->ip_dst, ifp, inm);
293		if (inm != NULL &&
294		   (imo == NULL || imo->imo_multicast_loop)) {
295			IN_MULTI_UNLOCK();
296			/*
297			 * If we belong to the destination multicast group
298			 * on the outgoing interface, and the caller did not
299			 * forbid loopback, loop back a copy.
300			 */
301			ip_mloopback(ifp, m, dst, hlen);
302		}
303		else {
304			IN_MULTI_UNLOCK();
305			/*
306			 * If we are acting as a multicast router, perform
307			 * multicast forwarding as if the packet had just
308			 * arrived on the interface to which we are about
309			 * to send.  The multicast forwarding function
310			 * recursively calls this function, using the
311			 * IP_FORWARDING flag to prevent infinite recursion.
312			 *
313			 * Multicasts that are looped back by ip_mloopback(),
314			 * above, will be forwarded by the ip_input() routine,
315			 * if necessary.
316			 */
317			if (ip_mrouter && (flags & IP_FORWARDING) == 0) {
318				/*
319				 * If rsvp daemon is not running, do not
320				 * set ip_moptions. This ensures that the packet
321				 * is multicast and not just sent down one link
322				 * as prescribed by rsvpd.
323				 */
324				if (!rsvp_on)
325					imo = NULL;
326				if (ip_mforward &&
327				    ip_mforward(ip, ifp, m, imo) != 0) {
328					m_freem(m);
329					goto done;
330				}
331			}
332		}
333
334		/*
335		 * Multicasts with a time-to-live of zero may be looped-
336		 * back, above, but must not be transmitted on a network.
337		 * Also, multicasts addressed to the loopback interface
338		 * are not sent -- the above call to ip_mloopback() will
339		 * loop back a copy if this host actually belongs to the
340		 * destination group on the loopback interface.
341		 */
342		if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) {
343			m_freem(m);
344			goto done;
345		}
346
347		goto sendit;
348	}
349#ifndef notdef
350	/*
351	 * If the source address is not specified yet, use the address
352	 * of the outoing interface.
353	 */
354	if (ip->ip_src.s_addr == INADDR_ANY) {
355		/* Interface may have no addresses. */
356		if (ia != NULL) {
357			ip->ip_src = IA_SIN(ia)->sin_addr;
358		}
359	}
360#endif /* notdef */
361	/*
362	 * Verify that we have any chance at all of being able to queue the
363	 * packet or packet fragments, unless ALTQ is enabled on the given
364	 * interface in which case packetdrop should be done by queueing.
365	 */
366#ifdef ALTQ
367	if ((!ALTQ_IS_ENABLED(&ifp->if_snd)) &&
368	    ((ifp->if_snd.ifq_len + ip->ip_len / ifp->if_mtu + 1) >=
369	    ifp->if_snd.ifq_maxlen))
370#else
371	if ((ifp->if_snd.ifq_len + ip->ip_len / ifp->if_mtu + 1) >=
372	    ifp->if_snd.ifq_maxlen)
373#endif /* ALTQ */
374	{
375		error = ENOBUFS;
376		ipstat.ips_odropped++;
377		ifp->if_snd.ifq_drops += (ip->ip_len / ifp->if_mtu + 1);
378		goto bad;
379	}
380
381	/*
382	 * Look for broadcast address and
383	 * verify user is allowed to send
384	 * such a packet.
385	 */
386	if (isbroadcast) {
387		if ((ifp->if_flags & IFF_BROADCAST) == 0) {
388			error = EADDRNOTAVAIL;
389			goto bad;
390		}
391		if ((flags & IP_ALLOWBROADCAST) == 0) {
392			error = EACCES;
393			goto bad;
394		}
395		/* don't allow broadcast messages to be fragmented */
396		if (ip->ip_len > ifp->if_mtu) {
397			error = EMSGSIZE;
398			goto bad;
399		}
400		if (flags & IP_SENDONES)
401			ip->ip_dst.s_addr = INADDR_BROADCAST;
402		m->m_flags |= M_BCAST;
403	} else {
404		m->m_flags &= ~M_BCAST;
405	}
406
407sendit:
408#ifdef IPSEC
409	/* get SP for this packet */
410	if (inp == NULL)
411		sp = ipsec4_getpolicybyaddr(m, IPSEC_DIR_OUTBOUND,
412		    flags, &error);
413	else
414		sp = ipsec4_getpolicybypcb(m, IPSEC_DIR_OUTBOUND, inp, &error);
415
416	if (sp == NULL) {
417		ipsecstat.out_inval++;
418		goto bad;
419	}
420
421	error = 0;
422
423	/* check policy */
424	switch (sp->policy) {
425	case IPSEC_POLICY_DISCARD:
426		/*
427		 * This packet is just discarded.
428		 */
429		ipsecstat.out_polvio++;
430		goto bad;
431
432	case IPSEC_POLICY_BYPASS:
433	case IPSEC_POLICY_NONE:
434	case IPSEC_POLICY_TCP:
435		/* no need to do IPsec. */
436		goto skip_ipsec;
437
438	case IPSEC_POLICY_IPSEC:
439		if (sp->req == NULL) {
440			/* acquire a policy */
441			error = key_spdacquire(sp);
442			goto bad;
443		}
444		break;
445
446	case IPSEC_POLICY_ENTRUST:
447	default:
448		printf("ip_output: Invalid policy found. %d\n", sp->policy);
449	}
450    {
451	struct ipsec_output_state state;
452	bzero(&state, sizeof(state));
453	state.m = m;
454	if (flags & IP_ROUTETOIF) {
455		state.ro = &iproute;
456		bzero(&iproute, sizeof(iproute));
457	} else
458		state.ro = ro;
459	state.dst = (struct sockaddr *)dst;
460
461	ip->ip_sum = 0;
462
463	/*
464	 * XXX
465	 * delayed checksums are not currently compatible with IPsec
466	 */
467	if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
468		in_delayed_cksum(m);
469		m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
470	}
471
472	ip->ip_len = htons(ip->ip_len);
473	ip->ip_off = htons(ip->ip_off);
474
475	error = ipsec4_output(&state, sp, flags);
476
477	m = state.m;
478	if (flags & IP_ROUTETOIF) {
479		/*
480		 * if we have tunnel mode SA, we may need to ignore
481		 * IP_ROUTETOIF.
482		 */
483		if (state.ro != &iproute || state.ro->ro_rt != NULL) {
484			flags &= ~IP_ROUTETOIF;
485			ro = state.ro;
486		}
487	} else
488		ro = state.ro;
489	dst = (struct sockaddr_in *)state.dst;
490	if (error) {
491		/* mbuf is already reclaimed in ipsec4_output. */
492		m = NULL;
493		switch (error) {
494		case EHOSTUNREACH:
495		case ENETUNREACH:
496		case EMSGSIZE:
497		case ENOBUFS:
498		case ENOMEM:
499			break;
500		default:
501			printf("ip4_output (ipsec): error code %d\n", error);
502			/*fall through*/
503		case ENOENT:
504			/* don't show these error codes to the user */
505			error = 0;
506			break;
507		}
508		goto bad;
509	}
510
511	/* be sure to update variables that are affected by ipsec4_output() */
512	ip = mtod(m, struct ip *);
513	hlen = ip->ip_hl << 2;
514	if (ro->ro_rt == NULL) {
515		if ((flags & IP_ROUTETOIF) == 0) {
516			printf("ip_output: "
517				"can't update route after IPsec processing\n");
518			error = EHOSTUNREACH;	/*XXX*/
519			goto bad;
520		}
521	} else {
522		if (state.encap) {
523			ia = ifatoia(ro->ro_rt->rt_ifa);
524			ifp = ro->ro_rt->rt_ifp;
525		}
526	}
527    }
528
529	/* make it flipped, again. */
530	ip->ip_len = ntohs(ip->ip_len);
531	ip->ip_off = ntohs(ip->ip_off);
532skip_ipsec:
533#endif /*IPSEC*/
534#ifdef FAST_IPSEC
535	/*
536	 * Check the security policy (SP) for the packet and, if
537	 * required, do IPsec-related processing.  There are two
538	 * cases here; the first time a packet is sent through
539	 * it will be untagged and handled by ipsec4_checkpolicy.
540	 * If the packet is resubmitted to ip_output (e.g. after
541	 * AH, ESP, etc. processing), there will be a tag to bypass
542	 * the lookup and related policy checking.
543	 */
544	mtag = m_tag_find(m, PACKET_TAG_IPSEC_PENDING_TDB, NULL);
545	s = splnet();
546	if (mtag != NULL) {
547		tdbi = (struct tdb_ident *)(mtag + 1);
548		sp = ipsec_getpolicy(tdbi, IPSEC_DIR_OUTBOUND);
549		if (sp == NULL)
550			error = -EINVAL;	/* force silent drop */
551		m_tag_delete(m, mtag);
552	} else {
553		sp = ipsec4_checkpolicy(m, IPSEC_DIR_OUTBOUND, flags,
554					&error, inp);
555	}
556	/*
557	 * There are four return cases:
558	 *    sp != NULL	 	    apply IPsec policy
559	 *    sp == NULL, error == 0	    no IPsec handling needed
560	 *    sp == NULL, error == -EINVAL  discard packet w/o error
561	 *    sp == NULL, error != 0	    discard packet, report error
562	 */
563	if (sp != NULL) {
564		/* Loop detection, check if ipsec processing already done */
565		KASSERT(sp->req != NULL, ("ip_output: no ipsec request"));
566		for (mtag = m_tag_first(m); mtag != NULL;
567		     mtag = m_tag_next(m, mtag)) {
568			if (mtag->m_tag_cookie != MTAG_ABI_COMPAT)
569				continue;
570			if (mtag->m_tag_id != PACKET_TAG_IPSEC_OUT_DONE &&
571			    mtag->m_tag_id != PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED)
572				continue;
573			/*
574			 * Check if policy has an SA associated with it.
575			 * This can happen when an SP has yet to acquire
576			 * an SA; e.g. on first reference.  If it occurs,
577			 * then we let ipsec4_process_packet do its thing.
578			 */
579			if (sp->req->sav == NULL)
580				break;
581			tdbi = (struct tdb_ident *)(mtag + 1);
582			if (tdbi->spi == sp->req->sav->spi &&
583			    tdbi->proto == sp->req->sav->sah->saidx.proto &&
584			    bcmp(&tdbi->dst, &sp->req->sav->sah->saidx.dst,
585				 sizeof (union sockaddr_union)) == 0) {
586				/*
587				 * No IPsec processing is needed, free
588				 * reference to SP.
589				 *
590				 * NB: null pointer to avoid free at
591				 *     done: below.
592				 */
593				KEY_FREESP(&sp), sp = NULL;
594				splx(s);
595				goto spd_done;
596			}
597		}
598
599		/*
600		 * Do delayed checksums now because we send before
601		 * this is done in the normal processing path.
602		 */
603		if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
604			in_delayed_cksum(m);
605			m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
606		}
607
608		ip->ip_len = htons(ip->ip_len);
609		ip->ip_off = htons(ip->ip_off);
610
611		/* NB: callee frees mbuf */
612		error = ipsec4_process_packet(m, sp->req, flags, 0);
613		/*
614		 * Preserve KAME behaviour: ENOENT can be returned
615		 * when an SA acquire is in progress.  Don't propagate
616		 * this to user-level; it confuses applications.
617		 *
618		 * XXX this will go away when the SADB is redone.
619		 */
620		if (error == ENOENT)
621			error = 0;
622		splx(s);
623		goto done;
624	} else {
625		splx(s);
626
627		if (error != 0) {
628			/*
629			 * Hack: -EINVAL is used to signal that a packet
630			 * should be silently discarded.  This is typically
631			 * because we asked key management for an SA and
632			 * it was delayed (e.g. kicked up to IKE).
633			 */
634			if (error == -EINVAL)
635				error = 0;
636			goto bad;
637		} else {
638			/* No IPsec processing for this packet. */
639		}
640#ifdef notyet
641		/*
642		 * If deferred crypto processing is needed, check that
643		 * the interface supports it.
644		 */
645		mtag = m_tag_find(m, PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED, NULL);
646		if (mtag != NULL && (ifp->if_capenable & IFCAP_IPSEC) == 0) {
647			/* notify IPsec to do its own crypto */
648			ipsp_skipcrypto_unmark((struct tdb_ident *)(mtag + 1));
649			error = EHOSTUNREACH;
650			goto bad;
651		}
652#endif
653	}
654spd_done:
655#endif /* FAST_IPSEC */
656
657	/* Jump over all PFIL processing if hooks are not active. */
658	if (inet_pfil_hook.ph_busy_count == -1)
659		goto passout;
660
661	/* Run through list of hooks for output packets. */
662	odst.s_addr = ip->ip_dst.s_addr;
663	error = pfil_run_hooks(&inet_pfil_hook, &m, ifp, PFIL_OUT, inp);
664	if (error != 0 || m == NULL)
665		goto done;
666
667	ip = mtod(m, struct ip *);
668
669	/* See if destination IP address was changed by packet filter. */
670	if (odst.s_addr != ip->ip_dst.s_addr) {
671		m->m_flags |= M_SKIP_FIREWALL;
672		/* If destination is now ourself drop to ip_input(). */
673		if (in_localip(ip->ip_dst)) {
674			m->m_flags |= M_FASTFWD_OURS;
675			if (m->m_pkthdr.rcvif == NULL)
676				m->m_pkthdr.rcvif = loif;
677			if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
678				m->m_pkthdr.csum_flags |=
679				    CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
680				m->m_pkthdr.csum_data = 0xffff;
681			}
682			m->m_pkthdr.csum_flags |=
683			    CSUM_IP_CHECKED | CSUM_IP_VALID;
684
685			error = netisr_queue(NETISR_IP, m);
686			goto done;
687		} else
688			goto again;	/* Redo the routing table lookup. */
689	}
690
691#ifdef IPFIREWALL_FORWARD
692	/* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */
693	if (m->m_flags & M_FASTFWD_OURS) {
694		if (m->m_pkthdr.rcvif == NULL)
695			m->m_pkthdr.rcvif = loif;
696		if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
697			m->m_pkthdr.csum_flags |=
698			    CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
699			m->m_pkthdr.csum_data = 0xffff;
700		}
701		m->m_pkthdr.csum_flags |=
702			    CSUM_IP_CHECKED | CSUM_IP_VALID;
703
704		error = netisr_queue(NETISR_IP, m);
705		goto done;
706	}
707	/* Or forward to some other address? */
708	fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL);
709	if (fwd_tag) {
710#ifndef IPFIREWALL_FORWARD_EXTENDED
711		if (!in_localip(ip->ip_src) && !in_localaddr(ip->ip_dst)) {
712#endif
713			dst = (struct sockaddr_in *)&ro->ro_dst;
714			bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in));
715			m->m_flags |= M_SKIP_FIREWALL;
716			m_tag_delete(m, fwd_tag);
717			goto again;
718#ifndef IPFIREWALL_FORWARD_EXTENDED
719		} else {
720			m_tag_delete(m, fwd_tag);
721			/* Continue. */
722		}
723#endif
724	}
725#endif /* IPFIREWALL_FORWARD */
726
727passout:
728	/* 127/8 must not appear on wire - RFC1122. */
729	if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
730	    (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
731		if ((ifp->if_flags & IFF_LOOPBACK) == 0) {
732			ipstat.ips_badaddr++;
733			error = EADDRNOTAVAIL;
734			goto bad;
735		}
736	}
737
738	m->m_pkthdr.csum_flags |= CSUM_IP;
739	sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_hwassist;
740	if (sw_csum & CSUM_DELAY_DATA) {
741		in_delayed_cksum(m);
742		sw_csum &= ~CSUM_DELAY_DATA;
743	}
744	m->m_pkthdr.csum_flags &= ifp->if_hwassist;
745
746	/*
747	 * If small enough for interface, or the interface will take
748	 * care of the fragmentation for us, can just send directly.
749	 */
750	if (ip->ip_len <= ifp->if_mtu || (ifp->if_hwassist & CSUM_FRAGMENT &&
751	    ((ip->ip_off & IP_DF) == 0))) {
752		ip->ip_len = htons(ip->ip_len);
753		ip->ip_off = htons(ip->ip_off);
754		ip->ip_sum = 0;
755		if (sw_csum & CSUM_DELAY_IP)
756			ip->ip_sum = in_cksum(m, hlen);
757
758		/* Record statistics for this interface address. */
759		if (!(flags & IP_FORWARDING) && ia) {
760			ia->ia_ifa.if_opackets++;
761			ia->ia_ifa.if_obytes += m->m_pkthdr.len;
762		}
763
764#ifdef IPSEC
765		/* clean ipsec history once it goes out of the node */
766		ipsec_delaux(m);
767#endif
768
769#ifdef MBUF_STRESS_TEST
770		if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size)
771			m = m_fragment(m, M_DONTWAIT, mbuf_frag_size);
772#endif
773		/*
774		 * Reset layer specific mbuf flags
775		 * to avoid confusing lower layers.
776		 */
777		m->m_flags &= ~(M_PROTOFLAGS);
778
779		error = (*ifp->if_output)(ifp, m,
780				(struct sockaddr *)dst, ro->ro_rt);
781		goto done;
782	}
783
784	if (ip->ip_off & IP_DF) {
785		error = EMSGSIZE;
786		/*
787		 * This case can happen if the user changed the MTU
788		 * of an interface after enabling IP on it.  Because
789		 * most netifs don't keep track of routes pointing to
790		 * them, there is no way for one to update all its
791		 * routes when the MTU is changed.
792		 */
793		if ((ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST)) &&
794		    (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)) {
795			ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu;
796		}
797		ipstat.ips_cantfrag++;
798		goto bad;
799	}
800
801	/*
802	 * Too large for interface; fragment if possible. If successful,
803	 * on return, m will point to a list of packets to be sent.
804	 */
805	error = ip_fragment(ip, &m, ifp->if_mtu, ifp->if_hwassist, sw_csum);
806	if (error)
807		goto bad;
808	for (; m; m = m0) {
809		m0 = m->m_nextpkt;
810		m->m_nextpkt = 0;
811#ifdef IPSEC
812		/* clean ipsec history once it goes out of the node */
813		ipsec_delaux(m);
814#endif
815		if (error == 0) {
816			/* Record statistics for this interface address. */
817			if (ia != NULL) {
818				ia->ia_ifa.if_opackets++;
819				ia->ia_ifa.if_obytes += m->m_pkthdr.len;
820			}
821			/*
822			 * Reset layer specific mbuf flags
823			 * to avoid confusing upper layers.
824			 */
825			m->m_flags &= ~(M_PROTOFLAGS);
826
827			error = (*ifp->if_output)(ifp, m,
828			    (struct sockaddr *)dst, ro->ro_rt);
829		} else
830			m_freem(m);
831	}
832
833	if (error == 0)
834		ipstat.ips_fragmented++;
835
836done:
837	if (ro == &iproute && ro->ro_rt) {
838		RTFREE(ro->ro_rt);
839	}
840#ifdef IPSEC
841	if (sp != NULL) {
842		KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
843			printf("DP ip_output call free SP:%p\n", sp));
844		key_freesp(sp);
845	}
846#endif
847#ifdef FAST_IPSEC
848	if (sp != NULL)
849		KEY_FREESP(&sp);
850#endif
851	return (error);
852bad:
853	m_freem(m);
854	goto done;
855}
856
857/*
858 * Create a chain of fragments which fit the given mtu. m_frag points to the
859 * mbuf to be fragmented; on return it points to the chain with the fragments.
860 * Return 0 if no error. If error, m_frag may contain a partially built
861 * chain of fragments that should be freed by the caller.
862 *
863 * if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist)
864 * sw_csum contains the delayed checksums flags (e.g., CSUM_DELAY_IP).
865 */
866int
867ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu,
868	    u_long if_hwassist_flags, int sw_csum)
869{
870	int error = 0;
871	int hlen = ip->ip_hl << 2;
872	int len = (mtu - hlen) & ~7;	/* size of payload in each fragment */
873	int off;
874	struct mbuf *m0 = *m_frag;	/* the original packet		*/
875	int firstlen;
876	struct mbuf **mnext;
877	int nfrags;
878
879	if (ip->ip_off & IP_DF) {	/* Fragmentation not allowed */
880		ipstat.ips_cantfrag++;
881		return EMSGSIZE;
882	}
883
884	/*
885	 * Must be able to put at least 8 bytes per fragment.
886	 */
887	if (len < 8)
888		return EMSGSIZE;
889
890	/*
891	 * If the interface will not calculate checksums on
892	 * fragmented packets, then do it here.
893	 */
894	if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA &&
895	    (if_hwassist_flags & CSUM_IP_FRAGS) == 0) {
896		in_delayed_cksum(m0);
897		m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
898	}
899
900	if (len > PAGE_SIZE) {
901		/*
902		 * Fragment large datagrams such that each segment
903		 * contains a multiple of PAGE_SIZE amount of data,
904		 * plus headers. This enables a receiver to perform
905		 * page-flipping zero-copy optimizations.
906		 *
907		 * XXX When does this help given that sender and receiver
908		 * could have different page sizes, and also mtu could
909		 * be less than the receiver's page size ?
910		 */
911		int newlen;
912		struct mbuf *m;
913
914		for (m = m0, off = 0; m && (off+m->m_len) <= mtu; m = m->m_next)
915			off += m->m_len;
916
917		/*
918		 * firstlen (off - hlen) must be aligned on an
919		 * 8-byte boundary
920		 */
921		if (off < hlen)
922			goto smart_frag_failure;
923		off = ((off - hlen) & ~7) + hlen;
924		newlen = (~PAGE_MASK) & mtu;
925		if ((newlen + sizeof (struct ip)) > mtu) {
926			/* we failed, go back the default */
927smart_frag_failure:
928			newlen = len;
929			off = hlen + len;
930		}
931		len = newlen;
932
933	} else {
934		off = hlen + len;
935	}
936
937	firstlen = off - hlen;
938	mnext = &m0->m_nextpkt;		/* pointer to next packet */
939
940	/*
941	 * Loop through length of segment after first fragment,
942	 * make new header and copy data of each part and link onto chain.
943	 * Here, m0 is the original packet, m is the fragment being created.
944	 * The fragments are linked off the m_nextpkt of the original
945	 * packet, which after processing serves as the first fragment.
946	 */
947	for (nfrags = 1; off < ip->ip_len; off += len, nfrags++) {
948		struct ip *mhip;	/* ip header on the fragment */
949		struct mbuf *m;
950		int mhlen = sizeof (struct ip);
951
952		MGETHDR(m, M_DONTWAIT, MT_DATA);
953		if (m == NULL) {
954			error = ENOBUFS;
955			ipstat.ips_odropped++;
956			goto done;
957		}
958		m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG;
959		/*
960		 * In the first mbuf, leave room for the link header, then
961		 * copy the original IP header including options. The payload
962		 * goes into an additional mbuf chain returned by m_copy().
963		 */
964		m->m_data += max_linkhdr;
965		mhip = mtod(m, struct ip *);
966		*mhip = *ip;
967		if (hlen > sizeof (struct ip)) {
968			mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip);
969			mhip->ip_v = IPVERSION;
970			mhip->ip_hl = mhlen >> 2;
971		}
972		m->m_len = mhlen;
973		/* XXX do we need to add ip->ip_off below ? */
974		mhip->ip_off = ((off - hlen) >> 3) + ip->ip_off;
975		if (off + len >= ip->ip_len) {	/* last fragment */
976			len = ip->ip_len - off;
977			m->m_flags |= M_LASTFRAG;
978		} else
979			mhip->ip_off |= IP_MF;
980		mhip->ip_len = htons((u_short)(len + mhlen));
981		m->m_next = m_copy(m0, off, len);
982		if (m->m_next == NULL) {	/* copy failed */
983			m_free(m);
984			error = ENOBUFS;	/* ??? */
985			ipstat.ips_odropped++;
986			goto done;
987		}
988		m->m_pkthdr.len = mhlen + len;
989		m->m_pkthdr.rcvif = NULL;
990#ifdef MAC
991		mac_create_fragment(m0, m);
992#endif
993		m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags;
994		mhip->ip_off = htons(mhip->ip_off);
995		mhip->ip_sum = 0;
996		if (sw_csum & CSUM_DELAY_IP)
997			mhip->ip_sum = in_cksum(m, mhlen);
998		*mnext = m;
999		mnext = &m->m_nextpkt;
1000	}
1001	ipstat.ips_ofragments += nfrags;
1002
1003	/* set first marker for fragment chain */
1004	m0->m_flags |= M_FIRSTFRAG | M_FRAG;
1005	m0->m_pkthdr.csum_data = nfrags;
1006
1007	/*
1008	 * Update first fragment by trimming what's been copied out
1009	 * and updating header.
1010	 */
1011	m_adj(m0, hlen + firstlen - ip->ip_len);
1012	m0->m_pkthdr.len = hlen + firstlen;
1013	ip->ip_len = htons((u_short)m0->m_pkthdr.len);
1014	ip->ip_off |= IP_MF;
1015	ip->ip_off = htons(ip->ip_off);
1016	ip->ip_sum = 0;
1017	if (sw_csum & CSUM_DELAY_IP)
1018		ip->ip_sum = in_cksum(m0, hlen);
1019
1020done:
1021	*m_frag = m0;
1022	return error;
1023}
1024
1025void
1026in_delayed_cksum(struct mbuf *m)
1027{
1028	struct ip *ip;
1029	u_short csum, offset;
1030
1031	ip = mtod(m, struct ip *);
1032	offset = ip->ip_hl << 2 ;
1033	csum = in_cksum_skip(m, ip->ip_len, offset);
1034	if (m->m_pkthdr.csum_flags & CSUM_UDP && csum == 0)
1035		csum = 0xffff;
1036	offset += m->m_pkthdr.csum_data;	/* checksum offset */
1037
1038	if (offset + sizeof(u_short) > m->m_len) {
1039		printf("delayed m_pullup, m->len: %d  off: %d  p: %d\n",
1040		    m->m_len, offset, ip->ip_p);
1041		/*
1042		 * XXX
1043		 * this shouldn't happen, but if it does, the
1044		 * correct behavior may be to insert the checksum
1045		 * in the existing chain instead of rearranging it.
1046		 */
1047		m = m_pullup(m, offset + sizeof(u_short));
1048	}
1049	*(u_short *)(m->m_data + offset) = csum;
1050}
1051
1052/*
1053 * IP socket option processing.
1054 */
1055int
1056ip_ctloutput(so, sopt)
1057	struct socket *so;
1058	struct sockopt *sopt;
1059{
1060	struct	inpcb *inp = sotoinpcb(so);
1061	int	error, optval;
1062
1063	error = optval = 0;
1064	if (sopt->sopt_level != IPPROTO_IP) {
1065		return (EINVAL);
1066	}
1067
1068	switch (sopt->sopt_dir) {
1069	case SOPT_SET:
1070		switch (sopt->sopt_name) {
1071		case IP_OPTIONS:
1072#ifdef notyet
1073		case IP_RETOPTS:
1074#endif
1075		{
1076			struct mbuf *m;
1077			if (sopt->sopt_valsize > MLEN) {
1078				error = EMSGSIZE;
1079				break;
1080			}
1081			MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
1082			if (m == NULL) {
1083				error = ENOBUFS;
1084				break;
1085			}
1086			m->m_len = sopt->sopt_valsize;
1087			error = sooptcopyin(sopt, mtod(m, char *), m->m_len,
1088					    m->m_len);
1089			INP_LOCK(inp);
1090			error = ip_pcbopts(inp, sopt->sopt_name, m);
1091			INP_UNLOCK(inp);
1092			return (error);
1093		}
1094
1095		case IP_TOS:
1096		case IP_TTL:
1097		case IP_MINTTL:
1098		case IP_RECVOPTS:
1099		case IP_RECVRETOPTS:
1100		case IP_RECVDSTADDR:
1101		case IP_RECVTTL:
1102		case IP_RECVIF:
1103		case IP_FAITH:
1104		case IP_ONESBCAST:
1105		case IP_DONTFRAG:
1106			error = sooptcopyin(sopt, &optval, sizeof optval,
1107					    sizeof optval);
1108			if (error)
1109				break;
1110
1111			switch (sopt->sopt_name) {
1112			case IP_TOS:
1113				inp->inp_ip_tos = optval;
1114				break;
1115
1116			case IP_TTL:
1117				inp->inp_ip_ttl = optval;
1118				break;
1119
1120			case IP_MINTTL:
1121				if (optval > 0 && optval <= MAXTTL)
1122					inp->inp_ip_minttl = optval;
1123				else
1124					error = EINVAL;
1125				break;
1126
1127#define	OPTSET(bit) do {						\
1128	INP_LOCK(inp);							\
1129	if (optval)							\
1130		inp->inp_flags |= bit;					\
1131	else								\
1132		inp->inp_flags &= ~bit;					\
1133	INP_UNLOCK(inp);						\
1134} while (0)
1135
1136			case IP_RECVOPTS:
1137				OPTSET(INP_RECVOPTS);
1138				break;
1139
1140			case IP_RECVRETOPTS:
1141				OPTSET(INP_RECVRETOPTS);
1142				break;
1143
1144			case IP_RECVDSTADDR:
1145				OPTSET(INP_RECVDSTADDR);
1146				break;
1147
1148			case IP_RECVTTL:
1149				OPTSET(INP_RECVTTL);
1150				break;
1151
1152			case IP_RECVIF:
1153				OPTSET(INP_RECVIF);
1154				break;
1155
1156			case IP_FAITH:
1157				OPTSET(INP_FAITH);
1158				break;
1159
1160			case IP_ONESBCAST:
1161				OPTSET(INP_ONESBCAST);
1162				break;
1163			case IP_DONTFRAG:
1164				OPTSET(INP_DONTFRAG);
1165				break;
1166			}
1167			break;
1168#undef OPTSET
1169
1170		case IP_MULTICAST_IF:
1171		case IP_MULTICAST_VIF:
1172		case IP_MULTICAST_TTL:
1173		case IP_MULTICAST_LOOP:
1174		case IP_ADD_MEMBERSHIP:
1175		case IP_DROP_MEMBERSHIP:
1176			error = ip_setmoptions(inp, sopt);
1177			break;
1178
1179		case IP_PORTRANGE:
1180			error = sooptcopyin(sopt, &optval, sizeof optval,
1181					    sizeof optval);
1182			if (error)
1183				break;
1184
1185			INP_LOCK(inp);
1186			switch (optval) {
1187			case IP_PORTRANGE_DEFAULT:
1188				inp->inp_flags &= ~(INP_LOWPORT);
1189				inp->inp_flags &= ~(INP_HIGHPORT);
1190				break;
1191
1192			case IP_PORTRANGE_HIGH:
1193				inp->inp_flags &= ~(INP_LOWPORT);
1194				inp->inp_flags |= INP_HIGHPORT;
1195				break;
1196
1197			case IP_PORTRANGE_LOW:
1198				inp->inp_flags &= ~(INP_HIGHPORT);
1199				inp->inp_flags |= INP_LOWPORT;
1200				break;
1201
1202			default:
1203				error = EINVAL;
1204				break;
1205			}
1206			INP_UNLOCK(inp);
1207			break;
1208
1209#if defined(IPSEC) || defined(FAST_IPSEC)
1210		case IP_IPSEC_POLICY:
1211		{
1212			caddr_t req;
1213			size_t len = 0;
1214			int priv;
1215			struct mbuf *m;
1216			int optname;
1217
1218			if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */
1219				break;
1220			if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */
1221				break;
1222			priv = (sopt->sopt_td != NULL &&
1223				suser(sopt->sopt_td) != 0) ? 0 : 1;
1224			req = mtod(m, caddr_t);
1225			len = m->m_len;
1226			optname = sopt->sopt_name;
1227			error = ipsec4_set_policy(inp, optname, req, len, priv);
1228			m_freem(m);
1229			break;
1230		}
1231#endif /*IPSEC*/
1232
1233		default:
1234			error = ENOPROTOOPT;
1235			break;
1236		}
1237		break;
1238
1239	case SOPT_GET:
1240		switch (sopt->sopt_name) {
1241		case IP_OPTIONS:
1242		case IP_RETOPTS:
1243			if (inp->inp_options)
1244				error = sooptcopyout(sopt,
1245						     mtod(inp->inp_options,
1246							  char *),
1247						     inp->inp_options->m_len);
1248			else
1249				sopt->sopt_valsize = 0;
1250			break;
1251
1252		case IP_TOS:
1253		case IP_TTL:
1254		case IP_MINTTL:
1255		case IP_RECVOPTS:
1256		case IP_RECVRETOPTS:
1257		case IP_RECVDSTADDR:
1258		case IP_RECVTTL:
1259		case IP_RECVIF:
1260		case IP_PORTRANGE:
1261		case IP_FAITH:
1262		case IP_ONESBCAST:
1263		case IP_DONTFRAG:
1264			switch (sopt->sopt_name) {
1265
1266			case IP_TOS:
1267				optval = inp->inp_ip_tos;
1268				break;
1269
1270			case IP_TTL:
1271				optval = inp->inp_ip_ttl;
1272				break;
1273
1274			case IP_MINTTL:
1275				optval = inp->inp_ip_minttl;
1276				break;
1277
1278#define	OPTBIT(bit)	(inp->inp_flags & bit ? 1 : 0)
1279
1280			case IP_RECVOPTS:
1281				optval = OPTBIT(INP_RECVOPTS);
1282				break;
1283
1284			case IP_RECVRETOPTS:
1285				optval = OPTBIT(INP_RECVRETOPTS);
1286				break;
1287
1288			case IP_RECVDSTADDR:
1289				optval = OPTBIT(INP_RECVDSTADDR);
1290				break;
1291
1292			case IP_RECVTTL:
1293				optval = OPTBIT(INP_RECVTTL);
1294				break;
1295
1296			case IP_RECVIF:
1297				optval = OPTBIT(INP_RECVIF);
1298				break;
1299
1300			case IP_PORTRANGE:
1301				if (inp->inp_flags & INP_HIGHPORT)
1302					optval = IP_PORTRANGE_HIGH;
1303				else if (inp->inp_flags & INP_LOWPORT)
1304					optval = IP_PORTRANGE_LOW;
1305				else
1306					optval = 0;
1307				break;
1308
1309			case IP_FAITH:
1310				optval = OPTBIT(INP_FAITH);
1311				break;
1312
1313			case IP_ONESBCAST:
1314				optval = OPTBIT(INP_ONESBCAST);
1315				break;
1316			case IP_DONTFRAG:
1317				optval = OPTBIT(INP_DONTFRAG);
1318				break;
1319			}
1320			error = sooptcopyout(sopt, &optval, sizeof optval);
1321			break;
1322
1323		case IP_MULTICAST_IF:
1324		case IP_MULTICAST_VIF:
1325		case IP_MULTICAST_TTL:
1326		case IP_MULTICAST_LOOP:
1327		case IP_ADD_MEMBERSHIP:
1328		case IP_DROP_MEMBERSHIP:
1329			error = ip_getmoptions(inp, sopt);
1330			break;
1331
1332#if defined(IPSEC) || defined(FAST_IPSEC)
1333		case IP_IPSEC_POLICY:
1334		{
1335			struct mbuf *m = NULL;
1336			caddr_t req = NULL;
1337			size_t len = 0;
1338
1339			if (m != 0) {
1340				req = mtod(m, caddr_t);
1341				len = m->m_len;
1342			}
1343			error = ipsec4_get_policy(sotoinpcb(so), req, len, &m);
1344			if (error == 0)
1345				error = soopt_mcopyout(sopt, m); /* XXX */
1346			if (error == 0)
1347				m_freem(m);
1348			break;
1349		}
1350#endif /*IPSEC*/
1351
1352		default:
1353			error = ENOPROTOOPT;
1354			break;
1355		}
1356		break;
1357	}
1358	return (error);
1359}
1360
1361/*
1362 * XXX
1363 * The whole multicast option thing needs to be re-thought.
1364 * Several of these options are equally applicable to non-multicast
1365 * transmission, and one (IP_MULTICAST_TTL) totally duplicates a
1366 * standard option (IP_TTL).
1367 */
1368
1369/*
1370 * following RFC1724 section 3.3, 0.0.0.0/8 is interpreted as interface index.
1371 */
1372static struct ifnet *
1373ip_multicast_if(a, ifindexp)
1374	struct in_addr *a;
1375	int *ifindexp;
1376{
1377	int ifindex;
1378	struct ifnet *ifp;
1379
1380	if (ifindexp)
1381		*ifindexp = 0;
1382	if (ntohl(a->s_addr) >> 24 == 0) {
1383		ifindex = ntohl(a->s_addr) & 0xffffff;
1384		if (ifindex < 0 || if_index < ifindex)
1385			return NULL;
1386		ifp = ifnet_byindex(ifindex);
1387		if (ifindexp)
1388			*ifindexp = ifindex;
1389	} else {
1390		INADDR_TO_IFP(*a, ifp);
1391	}
1392	return ifp;
1393}
1394
1395/*
1396 * Given an inpcb, return its multicast options structure pointer.  Accepts
1397 * an unlocked inpcb pointer, but will return it locked.  May sleep.
1398 */
1399static struct ip_moptions *
1400ip_findmoptions(struct inpcb *inp)
1401{
1402	struct ip_moptions *imo;
1403
1404	INP_LOCK(inp);
1405	if (inp->inp_moptions != NULL)
1406		return (inp->inp_moptions);
1407
1408	INP_UNLOCK(inp);
1409
1410	imo = (struct ip_moptions*)malloc(sizeof(*imo), M_IPMOPTS, M_WAITOK);
1411
1412	imo->imo_multicast_ifp = NULL;
1413	imo->imo_multicast_addr.s_addr = INADDR_ANY;
1414	imo->imo_multicast_vif = -1;
1415	imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
1416	imo->imo_multicast_loop = IP_DEFAULT_MULTICAST_LOOP;
1417	imo->imo_num_memberships = 0;
1418
1419	INP_LOCK(inp);
1420	if (inp->inp_moptions != NULL) {
1421		free(imo, M_IPMOPTS);
1422		return (inp->inp_moptions);
1423	}
1424	inp->inp_moptions = imo;
1425	return (imo);
1426}
1427
1428/*
1429 * Set the IP multicast options in response to user setsockopt().
1430 */
1431static int
1432ip_setmoptions(struct inpcb *inp, struct sockopt *sopt)
1433{
1434	int error = 0;
1435	int i;
1436	struct in_addr addr;
1437	struct ip_mreq mreq;
1438	struct ifnet *ifp;
1439	struct ip_moptions *imo;
1440	struct route ro;
1441	struct sockaddr_in *dst;
1442	int ifindex;
1443	int s;
1444
1445	switch (sopt->sopt_name) {
1446	/* store an index number for the vif you wanna use in the send */
1447	case IP_MULTICAST_VIF:
1448		if (legal_vif_num == 0) {
1449			error = EOPNOTSUPP;
1450			break;
1451		}
1452		error = sooptcopyin(sopt, &i, sizeof i, sizeof i);
1453		if (error)
1454			break;
1455		if (!legal_vif_num(i) && (i != -1)) {
1456			error = EINVAL;
1457			break;
1458		}
1459		imo = ip_findmoptions(inp);
1460		imo->imo_multicast_vif = i;
1461		INP_UNLOCK(inp);
1462		break;
1463
1464	case IP_MULTICAST_IF:
1465		/*
1466		 * Select the interface for outgoing multicast packets.
1467		 */
1468		error = sooptcopyin(sopt, &addr, sizeof addr, sizeof addr);
1469		if (error)
1470			break;
1471		/*
1472		 * INADDR_ANY is used to remove a previous selection.
1473		 * When no interface is selected, a default one is
1474		 * chosen every time a multicast packet is sent.
1475		 */
1476		imo = ip_findmoptions(inp);
1477		if (addr.s_addr == INADDR_ANY) {
1478			imo->imo_multicast_ifp = NULL;
1479			INP_UNLOCK(inp);
1480			break;
1481		}
1482		/*
1483		 * The selected interface is identified by its local
1484		 * IP address.  Find the interface and confirm that
1485		 * it supports multicasting.
1486		 */
1487		s = splimp();
1488		ifp = ip_multicast_if(&addr, &ifindex);
1489		if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) {
1490			INP_UNLOCK(inp);
1491			splx(s);
1492			error = EADDRNOTAVAIL;
1493			break;
1494		}
1495		imo->imo_multicast_ifp = ifp;
1496		if (ifindex)
1497			imo->imo_multicast_addr = addr;
1498		else
1499			imo->imo_multicast_addr.s_addr = INADDR_ANY;
1500		INP_UNLOCK(inp);
1501		splx(s);
1502		break;
1503
1504	case IP_MULTICAST_TTL:
1505		/*
1506		 * Set the IP time-to-live for outgoing multicast packets.
1507		 * The original multicast API required a char argument,
1508		 * which is inconsistent with the rest of the socket API.
1509		 * We allow either a char or an int.
1510		 */
1511		if (sopt->sopt_valsize == 1) {
1512			u_char ttl;
1513			error = sooptcopyin(sopt, &ttl, 1, 1);
1514			if (error)
1515				break;
1516			imo = ip_findmoptions(inp);
1517			imo->imo_multicast_ttl = ttl;
1518			INP_UNLOCK(inp);
1519		} else {
1520			u_int ttl;
1521			error = sooptcopyin(sopt, &ttl, sizeof ttl,
1522					    sizeof ttl);
1523			if (error)
1524				break;
1525			if (ttl > 255)
1526				error = EINVAL;
1527			else {
1528				imo = ip_findmoptions(inp);
1529				imo->imo_multicast_ttl = ttl;
1530				INP_UNLOCK(inp);
1531			}
1532		}
1533		break;
1534
1535	case IP_MULTICAST_LOOP:
1536		/*
1537		 * Set the loopback flag for outgoing multicast packets.
1538		 * Must be zero or one.  The original multicast API required a
1539		 * char argument, which is inconsistent with the rest
1540		 * of the socket API.  We allow either a char or an int.
1541		 */
1542		if (sopt->sopt_valsize == 1) {
1543			u_char loop;
1544			error = sooptcopyin(sopt, &loop, 1, 1);
1545			if (error)
1546				break;
1547			imo = ip_findmoptions(inp);
1548			imo->imo_multicast_loop = !!loop;
1549			INP_UNLOCK(inp);
1550		} else {
1551			u_int loop;
1552			error = sooptcopyin(sopt, &loop, sizeof loop,
1553					    sizeof loop);
1554			if (error)
1555				break;
1556			imo = ip_findmoptions(inp);
1557			imo->imo_multicast_loop = !!loop;
1558			INP_UNLOCK(inp);
1559		}
1560		break;
1561
1562	case IP_ADD_MEMBERSHIP:
1563		/*
1564		 * Add a multicast group membership.
1565		 * Group must be a valid IP multicast address.
1566		 */
1567		error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq);
1568		if (error)
1569			break;
1570
1571		if (!IN_MULTICAST(ntohl(mreq.imr_multiaddr.s_addr))) {
1572			error = EINVAL;
1573			break;
1574		}
1575		s = splimp();
1576		/*
1577		 * If no interface address was provided, use the interface of
1578		 * the route to the given multicast address.
1579		 */
1580		if (mreq.imr_interface.s_addr == INADDR_ANY) {
1581			bzero((caddr_t)&ro, sizeof(ro));
1582			dst = (struct sockaddr_in *)&ro.ro_dst;
1583			dst->sin_len = sizeof(*dst);
1584			dst->sin_family = AF_INET;
1585			dst->sin_addr = mreq.imr_multiaddr;
1586			rtalloc_ign(&ro, RTF_CLONING);
1587			if (ro.ro_rt == NULL) {
1588				error = EADDRNOTAVAIL;
1589				splx(s);
1590				break;
1591			}
1592			ifp = ro.ro_rt->rt_ifp;
1593			RTFREE(ro.ro_rt);
1594		}
1595		else {
1596			ifp = ip_multicast_if(&mreq.imr_interface, NULL);
1597		}
1598
1599		/*
1600		 * See if we found an interface, and confirm that it
1601		 * supports multicast.
1602		 */
1603		if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) {
1604			error = EADDRNOTAVAIL;
1605			splx(s);
1606			break;
1607		}
1608		/*
1609		 * See if the membership already exists or if all the
1610		 * membership slots are full.
1611		 */
1612		imo = ip_findmoptions(inp);
1613		for (i = 0; i < imo->imo_num_memberships; ++i) {
1614			if (imo->imo_membership[i]->inm_ifp == ifp &&
1615			    imo->imo_membership[i]->inm_addr.s_addr
1616						== mreq.imr_multiaddr.s_addr)
1617				break;
1618		}
1619		if (i < imo->imo_num_memberships) {
1620			INP_UNLOCK(inp);
1621			error = EADDRINUSE;
1622			splx(s);
1623			break;
1624		}
1625		if (i == IP_MAX_MEMBERSHIPS) {
1626			INP_UNLOCK(inp);
1627			error = ETOOMANYREFS;
1628			splx(s);
1629			break;
1630		}
1631		/*
1632		 * Everything looks good; add a new record to the multicast
1633		 * address list for the given interface.
1634		 */
1635		if ((imo->imo_membership[i] =
1636		    in_addmulti(&mreq.imr_multiaddr, ifp)) == NULL) {
1637			INP_UNLOCK(inp);
1638			error = ENOBUFS;
1639			splx(s);
1640			break;
1641		}
1642		++imo->imo_num_memberships;
1643		INP_UNLOCK(inp);
1644		splx(s);
1645		break;
1646
1647	case IP_DROP_MEMBERSHIP:
1648		/*
1649		 * Drop a multicast group membership.
1650		 * Group must be a valid IP multicast address.
1651		 */
1652		error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq);
1653		if (error)
1654			break;
1655
1656		if (!IN_MULTICAST(ntohl(mreq.imr_multiaddr.s_addr))) {
1657			error = EINVAL;
1658			break;
1659		}
1660
1661		s = splimp();
1662		/*
1663		 * If an interface address was specified, get a pointer
1664		 * to its ifnet structure.
1665		 */
1666		if (mreq.imr_interface.s_addr == INADDR_ANY)
1667			ifp = NULL;
1668		else {
1669			ifp = ip_multicast_if(&mreq.imr_interface, NULL);
1670			if (ifp == NULL) {
1671				error = EADDRNOTAVAIL;
1672				splx(s);
1673				break;
1674			}
1675		}
1676		/*
1677		 * Find the membership in the membership array.
1678		 */
1679		imo = ip_findmoptions(inp);
1680		for (i = 0; i < imo->imo_num_memberships; ++i) {
1681			if ((ifp == NULL ||
1682			     imo->imo_membership[i]->inm_ifp == ifp) &&
1683			     imo->imo_membership[i]->inm_addr.s_addr ==
1684			     mreq.imr_multiaddr.s_addr)
1685				break;
1686		}
1687		if (i == imo->imo_num_memberships) {
1688			INP_UNLOCK(inp);
1689			error = EADDRNOTAVAIL;
1690			splx(s);
1691			break;
1692		}
1693		/*
1694		 * Give up the multicast address record to which the
1695		 * membership points.
1696		 */
1697		in_delmulti(imo->imo_membership[i]);
1698		/*
1699		 * Remove the gap in the membership array.
1700		 */
1701		for (++i; i < imo->imo_num_memberships; ++i)
1702			imo->imo_membership[i-1] = imo->imo_membership[i];
1703		--imo->imo_num_memberships;
1704		INP_UNLOCK(inp);
1705		splx(s);
1706		break;
1707
1708	default:
1709		error = EOPNOTSUPP;
1710		break;
1711	}
1712
1713	return (error);
1714}
1715
1716/*
1717 * Return the IP multicast options in response to user getsockopt().
1718 */
1719static int
1720ip_getmoptions(struct inpcb *inp, struct sockopt *sopt)
1721{
1722	struct ip_moptions *imo;
1723	struct in_addr addr;
1724	struct in_ifaddr *ia;
1725	int error, optval;
1726	u_char coptval;
1727
1728	INP_LOCK(inp);
1729	imo = inp->inp_moptions;
1730
1731	error = 0;
1732	switch (sopt->sopt_name) {
1733	case IP_MULTICAST_VIF:
1734		if (imo != NULL)
1735			optval = imo->imo_multicast_vif;
1736		else
1737			optval = -1;
1738		INP_UNLOCK(inp);
1739		error = sooptcopyout(sopt, &optval, sizeof optval);
1740		break;
1741
1742	case IP_MULTICAST_IF:
1743		if (imo == NULL || imo->imo_multicast_ifp == NULL)
1744			addr.s_addr = INADDR_ANY;
1745		else if (imo->imo_multicast_addr.s_addr) {
1746			/* return the value user has set */
1747			addr = imo->imo_multicast_addr;
1748		} else {
1749			IFP_TO_IA(imo->imo_multicast_ifp, ia);
1750			addr.s_addr = (ia == NULL) ? INADDR_ANY
1751				: IA_SIN(ia)->sin_addr.s_addr;
1752		}
1753		INP_UNLOCK(inp);
1754		error = sooptcopyout(sopt, &addr, sizeof addr);
1755		break;
1756
1757	case IP_MULTICAST_TTL:
1758		if (imo == 0)
1759			optval = coptval = IP_DEFAULT_MULTICAST_TTL;
1760		else
1761			optval = coptval = imo->imo_multicast_ttl;
1762		INP_UNLOCK(inp);
1763		if (sopt->sopt_valsize == 1)
1764			error = sooptcopyout(sopt, &coptval, 1);
1765		else
1766			error = sooptcopyout(sopt, &optval, sizeof optval);
1767		break;
1768
1769	case IP_MULTICAST_LOOP:
1770		if (imo == 0)
1771			optval = coptval = IP_DEFAULT_MULTICAST_LOOP;
1772		else
1773			optval = coptval = imo->imo_multicast_loop;
1774		INP_UNLOCK(inp);
1775		if (sopt->sopt_valsize == 1)
1776			error = sooptcopyout(sopt, &coptval, 1);
1777		else
1778			error = sooptcopyout(sopt, &optval, sizeof optval);
1779		break;
1780
1781	default:
1782		INP_UNLOCK(inp);
1783		error = ENOPROTOOPT;
1784		break;
1785	}
1786	INP_UNLOCK_ASSERT(inp);
1787
1788	return (error);
1789}
1790
1791/*
1792 * Discard the IP multicast options.
1793 */
1794void
1795ip_freemoptions(imo)
1796	register struct ip_moptions *imo;
1797{
1798	register int i;
1799
1800	if (imo != NULL) {
1801		for (i = 0; i < imo->imo_num_memberships; ++i)
1802			in_delmulti(imo->imo_membership[i]);
1803		free(imo, M_IPMOPTS);
1804	}
1805}
1806
1807/*
1808 * Routine called from ip_output() to loop back a copy of an IP multicast
1809 * packet to the input queue of a specified interface.  Note that this
1810 * calls the output routine of the loopback "driver", but with an interface
1811 * pointer that might NOT be a loopback interface -- evil, but easier than
1812 * replicating that code here.
1813 */
1814static void
1815ip_mloopback(ifp, m, dst, hlen)
1816	struct ifnet *ifp;
1817	register struct mbuf *m;
1818	register struct sockaddr_in *dst;
1819	int hlen;
1820{
1821	register struct ip *ip;
1822	struct mbuf *copym;
1823
1824	copym = m_copy(m, 0, M_COPYALL);
1825	if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < hlen))
1826		copym = m_pullup(copym, hlen);
1827	if (copym != NULL) {
1828		/* If needed, compute the checksum and mark it as valid. */
1829		if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
1830			in_delayed_cksum(copym);
1831			copym->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
1832			copym->m_pkthdr.csum_flags |=
1833			    CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
1834			copym->m_pkthdr.csum_data = 0xffff;
1835		}
1836		/*
1837		 * We don't bother to fragment if the IP length is greater
1838		 * than the interface's MTU.  Can this possibly matter?
1839		 */
1840		ip = mtod(copym, struct ip *);
1841		ip->ip_len = htons(ip->ip_len);
1842		ip->ip_off = htons(ip->ip_off);
1843		ip->ip_sum = 0;
1844		ip->ip_sum = in_cksum(copym, hlen);
1845		/*
1846		 * NB:
1847		 * It's not clear whether there are any lingering
1848		 * reentrancy problems in other areas which might
1849		 * be exposed by using ip_input directly (in
1850		 * particular, everything which modifies the packet
1851		 * in-place).  Yet another option is using the
1852		 * protosw directly to deliver the looped back
1853		 * packet.  For the moment, we'll err on the side
1854		 * of safety by using if_simloop().
1855		 */
1856#if 1 /* XXX */
1857		if (dst->sin_family != AF_INET) {
1858			printf("ip_mloopback: bad address family %d\n",
1859						dst->sin_family);
1860			dst->sin_family = AF_INET;
1861		}
1862#endif
1863
1864#ifdef notdef
1865		copym->m_pkthdr.rcvif = ifp;
1866		ip_input(copym);
1867#else
1868		if_simloop(ifp, copym, dst->sin_family, 0);
1869#endif
1870	}
1871}
1872