1/*	$NetBSD: ip_output.c,v 1.212 2011/12/31 20:41:59 christos Exp $	*/
2
3/*
4 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 * 3. Neither the name of the project nor the names of its contributors
16 *    may be used to endorse or promote products derived from this software
17 *    without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE.
30 */
31
32/*-
33 * Copyright (c) 1998 The NetBSD Foundation, Inc.
34 * All rights reserved.
35 *
36 * This code is derived from software contributed to The NetBSD Foundation
37 * by Public Access Networks Corporation ("Panix").  It was developed under
38 * contract to Panix by Eric Haszlakiewicz and Thor Lancelot Simon.
39 *
40 * Redistribution and use in source and binary forms, with or without
41 * modification, are permitted provided that the following conditions
42 * are met:
43 * 1. Redistributions of source code must retain the above copyright
44 *    notice, this list of conditions and the following disclaimer.
45 * 2. Redistributions in binary form must reproduce the above copyright
46 *    notice, this list of conditions and the following disclaimer in the
47 *    documentation and/or other materials provided with the distribution.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
50 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
51 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
52 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
53 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
54 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
55 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
56 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
57 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
58 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
59 * POSSIBILITY OF SUCH DAMAGE.
60 */
61
62/*
63 * Copyright (c) 1982, 1986, 1988, 1990, 1993
64 *	The Regents of the University of California.  All rights reserved.
65 *
66 * Redistribution and use in source and binary forms, with or without
67 * modification, are permitted provided that the following conditions
68 * are met:
69 * 1. Redistributions of source code must retain the above copyright
70 *    notice, this list of conditions and the following disclaimer.
71 * 2. Redistributions in binary form must reproduce the above copyright
72 *    notice, this list of conditions and the following disclaimer in the
73 *    documentation and/or other materials provided with the distribution.
74 * 3. Neither the name of the University nor the names of its contributors
75 *    may be used to endorse or promote products derived from this software
76 *    without specific prior written permission.
77 *
78 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
79 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
80 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
81 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
82 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
83 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
84 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
85 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
86 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
87 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
88 * SUCH DAMAGE.
89 *
90 *	@(#)ip_output.c	8.3 (Berkeley) 1/21/94
91 */
92
93#include <sys/cdefs.h>
94__KERNEL_RCSID(0, "$NetBSD: ip_output.c,v 1.212 2011/12/31 20:41:59 christos Exp $");
95
96#include "opt_pfil_hooks.h"
97#include "opt_inet.h"
98#include "opt_ipsec.h"
99#include "opt_mrouting.h"
100
101#include <sys/param.h>
102#include <sys/malloc.h>
103#include <sys/mbuf.h>
104#include <sys/errno.h>
105#include <sys/protosw.h>
106#include <sys/socket.h>
107#include <sys/socketvar.h>
108#include <sys/kauth.h>
109#ifdef FAST_IPSEC
110#include <sys/domain.h>
111#endif
112#include <sys/systm.h>
113#include <sys/proc.h>
114
115#include <net/if.h>
116#include <net/route.h>
117#include <net/pfil.h>
118
119#include <netinet/in.h>
120#include <netinet/in_systm.h>
121#include <netinet/ip.h>
122#include <netinet/in_pcb.h>
123#include <netinet/in_var.h>
124#include <netinet/ip_var.h>
125#include <netinet/ip_private.h>
126#include <netinet/in_offload.h>
127
128#ifdef MROUTING
129#include <netinet/ip_mroute.h>
130#endif
131
132#ifdef KAME_IPSEC
133#include <netinet6/ipsec.h>
134#include <netinet6/ipsec_private.h>
135#include <netkey/key.h>
136#include <netkey/key_debug.h>
137#endif /*KAME_IPSEC*/
138
139#ifdef FAST_IPSEC
140#include <netipsec/ipsec.h>
141#include <netipsec/key.h>
142#include <netipsec/xform.h>
143#endif	/* FAST_IPSEC*/
144
145#ifdef IPSEC_NAT_T
146#include <netinet/udp.h>
147#endif
148
/*
 * Prototypes for helpers that are static (file-local) to this file.
 */
149static struct mbuf *ip_insertoptions(struct mbuf *, struct mbuf *, int *);
150static struct ifnet *ip_multicast_if(struct in_addr *, int *);
151static void ip_mloopback(struct ifnet *, struct mbuf *,
152    const struct sockaddr_in *);
153
154#ifdef PFIL_HOOKS
/* Hook list that ip_output() runs with PFIL_OUT; defined elsewhere. */
155extern struct pfil_head inet_pfil_hook;			/* XXX */
156#endif
157
/*
 * Starts out as 0.  NOTE(review): presumably consulted by
 * IN_NEED_CHECKSUM() for loopback interfaces -- confirm against the
 * macro's definition, which is not visible here.
 */
158int	ip_do_loopback_cksum = 0;
159
160/*
161 * IP output.  The packet in mbuf chain m contains a skeletal IP
162 * header (with len, off, ttl, proto, tos, src, dst).
163 * The mbuf chain containing the packet will be freed.
164 * The mbuf opt, if present, will not be freed.
165 */
166int
167ip_output(struct mbuf *m0, ...)
168{
169	struct rtentry *rt;
170	struct ip *ip;
171	struct ifnet *ifp;
172	struct mbuf *m = m0;
173	int hlen = sizeof (struct ip);
174	int len, error = 0;
175	struct route iproute;
176	const struct sockaddr_in *dst;
177	struct in_ifaddr *ia;
178	struct ifaddr *xifa;
179	struct mbuf *opt;
180	struct route *ro;
181	int flags, sw_csum;
182	int *mtu_p;
183	u_long mtu;
184	struct ip_moptions *imo;
185	struct socket *so;
186	va_list ap;
187#ifdef IPSEC_NAT_T
188	int natt_frag = 0;
189#endif
190#ifdef KAME_IPSEC
191	struct secpolicy *sp = NULL;
192#endif /*KAME_IPSEC*/
193#ifdef FAST_IPSEC
194	struct inpcb *inp;
195	struct secpolicy *sp = NULL;
196	int s;
197#endif
198	u_int16_t ip_len;
199	union {
200		struct sockaddr		dst;
201		struct sockaddr_in	dst4;
202	} u;
203	struct sockaddr *rdst = &u.dst;	/* real IP destination, as opposed
204					 * to the nexthop
205					 */
206
207	len = 0;
208	va_start(ap, m0);
209	opt = va_arg(ap, struct mbuf *);
210	ro = va_arg(ap, struct route *);
211	flags = va_arg(ap, int);
212	imo = va_arg(ap, struct ip_moptions *);
213	so = va_arg(ap, struct socket *);
214	if (flags & IP_RETURNMTU)
215		mtu_p = va_arg(ap, int *);
216	else
217		mtu_p = NULL;
218	va_end(ap);
219
220	MCLAIM(m, &ip_tx_mowner);
221#ifdef FAST_IPSEC
222	if (so != NULL && so->so_proto->pr_domain->dom_family == AF_INET)
223		inp = (struct inpcb *)so->so_pcb;
224	else
225		inp = NULL;
226#endif /* FAST_IPSEC */
227
228#ifdef	DIAGNOSTIC
229	if ((m->m_flags & M_PKTHDR) == 0)
230		panic("ip_output: no HDR");
231
232	if ((m->m_pkthdr.csum_flags & (M_CSUM_TCPv6|M_CSUM_UDPv6)) != 0) {
233		panic("ip_output: IPv6 checksum offload flags: %d",
234		    m->m_pkthdr.csum_flags);
235	}
236
237	if ((m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) ==
238	    (M_CSUM_TCPv4|M_CSUM_UDPv4)) {
239		panic("ip_output: conflicting checksum offload flags: %d",
240		    m->m_pkthdr.csum_flags);
241	}
242#endif
243	if (opt) {
244		m = ip_insertoptions(m, opt, &len);
245		if (len >= sizeof(struct ip))
246			hlen = len;
247	}
248	ip = mtod(m, struct ip *);
249	/*
250	 * Fill in IP header.
251	 */
252	if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
253		ip->ip_v = IPVERSION;
254		ip->ip_off = htons(0);
255		/* ip->ip_id filled in after we find out source ia */
256		ip->ip_hl = hlen >> 2;
257		IP_STATINC(IP_STAT_LOCALOUT);
258	} else {
259		hlen = ip->ip_hl << 2;
260	}
261	/*
262	 * Route packet.
263	 */
264	memset(&iproute, 0, sizeof(iproute));
265	if (ro == NULL)
266		ro = &iproute;
267	sockaddr_in_init(&u.dst4, &ip->ip_dst, 0);
268	dst = satocsin(rtcache_getdst(ro));
269	/*
270	 * If there is a cached route,
271	 * check that it is to the same destination
272	 * and is still up.  If not, free it and try again.
273	 * The address family should also be checked in case of sharing the
274	 * cache with IPv6.
275	 */
276	if (dst == NULL)
277		;
278	else if (dst->sin_family != AF_INET ||
279		 !in_hosteq(dst->sin_addr, ip->ip_dst))
280		rtcache_free(ro);
281
282	if ((rt = rtcache_validate(ro)) == NULL &&
283	    (rt = rtcache_update(ro, 1)) == NULL) {
284		dst = &u.dst4;
285		rtcache_setdst(ro, &u.dst);
286	}
287	/*
288	 * If routing to interface only,
289	 * short circuit routing lookup.
290	 */
291	if (flags & IP_ROUTETOIF) {
292		if ((ia = ifatoia(ifa_ifwithladdr(sintocsa(dst)))) == NULL) {
293			IP_STATINC(IP_STAT_NOROUTE);
294			error = ENETUNREACH;
295			goto bad;
296		}
297		ifp = ia->ia_ifp;
298		mtu = ifp->if_mtu;
299		ip->ip_ttl = 1;
300	} else if ((IN_MULTICAST(ip->ip_dst.s_addr) ||
301	    ip->ip_dst.s_addr == INADDR_BROADCAST) &&
302	    imo != NULL && imo->imo_multicast_ifp != NULL) {
303		ifp = imo->imo_multicast_ifp;
304		mtu = ifp->if_mtu;
305		IFP_TO_IA(ifp, ia);
306	} else {
307		if (rt == NULL)
308			rt = rtcache_init(ro);
309		if (rt == NULL) {
310			IP_STATINC(IP_STAT_NOROUTE);
311			error = EHOSTUNREACH;
312			goto bad;
313		}
314		ia = ifatoia(rt->rt_ifa);
315		ifp = rt->rt_ifp;
316		if ((mtu = rt->rt_rmx.rmx_mtu) == 0)
317			mtu = ifp->if_mtu;
318		rt->rt_use++;
319		if (rt->rt_flags & RTF_GATEWAY)
320			dst = satosin(rt->rt_gateway);
321	}
322	if (IN_MULTICAST(ip->ip_dst.s_addr) ||
323	    (ip->ip_dst.s_addr == INADDR_BROADCAST)) {
324		struct in_multi *inm;
325
326		m->m_flags |= (ip->ip_dst.s_addr == INADDR_BROADCAST) ?
327			M_BCAST : M_MCAST;
328		/*
329		 * See if the caller provided any multicast options
330		 */
331		if (imo != NULL)
332			ip->ip_ttl = imo->imo_multicast_ttl;
333		else
334			ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL;
335
336		/*
337		 * if we don't know the outgoing ifp yet, we can't generate
338		 * output
339		 */
340		if (!ifp) {
341			IP_STATINC(IP_STAT_NOROUTE);
342			error = ENETUNREACH;
343			goto bad;
344		}
345
346		/*
347		 * If the packet is multicast or broadcast, confirm that
348		 * the outgoing interface can transmit it.
349		 */
350		if (((m->m_flags & M_MCAST) &&
351		     (ifp->if_flags & IFF_MULTICAST) == 0) ||
352		    ((m->m_flags & M_BCAST) &&
353		     (ifp->if_flags & (IFF_BROADCAST|IFF_POINTOPOINT)) == 0))  {
354			IP_STATINC(IP_STAT_NOROUTE);
355			error = ENETUNREACH;
356			goto bad;
357		}
358		/*
359		 * If source address not specified yet, use an address
360		 * of outgoing interface.
361		 */
362		if (in_nullhost(ip->ip_src)) {
363			struct in_ifaddr *xia;
364
365			IFP_TO_IA(ifp, xia);
366			if (!xia) {
367				error = EADDRNOTAVAIL;
368				goto bad;
369			}
370			xifa = &xia->ia_ifa;
371			if (xifa->ifa_getifa != NULL) {
372				xia = ifatoia((*xifa->ifa_getifa)(xifa, rdst));
373			}
374			ip->ip_src = xia->ia_addr.sin_addr;
375		}
376
377		IN_LOOKUP_MULTI(ip->ip_dst, ifp, inm);
378		if (inm != NULL &&
379		   (imo == NULL || imo->imo_multicast_loop)) {
380			/*
381			 * If we belong to the destination multicast group
382			 * on the outgoing interface, and the caller did not
383			 * forbid loopback, loop back a copy.
384			 */
385			ip_mloopback(ifp, m, &u.dst4);
386		}
387#ifdef MROUTING
388		else {
389			/*
390			 * If we are acting as a multicast router, perform
391			 * multicast forwarding as if the packet had just
392			 * arrived on the interface to which we are about
393			 * to send.  The multicast forwarding function
394			 * recursively calls this function, using the
395			 * IP_FORWARDING flag to prevent infinite recursion.
396			 *
397			 * Multicasts that are looped back by ip_mloopback(),
398			 * above, will be forwarded by the ip_input() routine,
399			 * if necessary.
400			 */
401			extern struct socket *ip_mrouter;
402
403			if (ip_mrouter && (flags & IP_FORWARDING) == 0) {
404				if (ip_mforward(m, ifp) != 0) {
405					m_freem(m);
406					goto done;
407				}
408			}
409		}
410#endif
411		/*
412		 * Multicasts with a time-to-live of zero may be looped-
413		 * back, above, but must not be transmitted on a network.
414		 * Also, multicasts addressed to the loopback interface
415		 * are not sent -- the above call to ip_mloopback() will
416		 * loop back a copy if this host actually belongs to the
417		 * destination group on the loopback interface.
418		 */
419		if (ip->ip_ttl == 0 || (ifp->if_flags & IFF_LOOPBACK) != 0) {
420			m_freem(m);
421			goto done;
422		}
423
424		goto sendit;
425	}
426	/*
427	 * If source address not specified yet, use address
428	 * of outgoing interface.
429	 */
430	if (in_nullhost(ip->ip_src)) {
431		xifa = &ia->ia_ifa;
432		if (xifa->ifa_getifa != NULL)
433			ia = ifatoia((*xifa->ifa_getifa)(xifa, rdst));
434		ip->ip_src = ia->ia_addr.sin_addr;
435	}
436
437	/*
438	 * packets with Class-D address as source are not valid per
439	 * RFC 1112
440	 */
441	if (IN_MULTICAST(ip->ip_src.s_addr)) {
442		IP_STATINC(IP_STAT_ODROPPED);
443		error = EADDRNOTAVAIL;
444		goto bad;
445	}
446
447	/*
448	 * Look for broadcast address and
449	 * and verify user is allowed to send
450	 * such a packet.
451	 */
452	if (in_broadcast(dst->sin_addr, ifp)) {
453		if ((ifp->if_flags & IFF_BROADCAST) == 0) {
454			error = EADDRNOTAVAIL;
455			goto bad;
456		}
457		if ((flags & IP_ALLOWBROADCAST) == 0) {
458			error = EACCES;
459			goto bad;
460		}
461		/* don't allow broadcast messages to be fragmented */
462		if (ntohs(ip->ip_len) > ifp->if_mtu) {
463			error = EMSGSIZE;
464			goto bad;
465		}
466		m->m_flags |= M_BCAST;
467	} else
468		m->m_flags &= ~M_BCAST;
469
470sendit:
471	if ((flags & (IP_FORWARDING|IP_NOIPNEWID)) == 0) {
472		if (m->m_pkthdr.len < IP_MINFRAGSIZE) {
473			ip->ip_id = 0;
474		} else if ((m->m_pkthdr.csum_flags & M_CSUM_TSOv4) == 0) {
475			ip->ip_id = ip_newid(ia);
476		} else {
477
478			/*
479			 * TSO capable interfaces (typically?) increment
480			 * ip_id for each segment.
481			 * "allocate" enough ids here to increase the chance
482			 * for them to be unique.
483			 *
484			 * note that the following calculation is not
485			 * needed to be precise.  wasting some ip_id is fine.
486			 */
487
488			unsigned int segsz = m->m_pkthdr.segsz;
489			unsigned int datasz = ntohs(ip->ip_len) - hlen;
490			unsigned int num = howmany(datasz, segsz);
491
492			ip->ip_id = ip_newid_range(ia, num);
493		}
494	}
495	/*
496	 * If we're doing Path MTU Discovery, we need to set DF unless
497	 * the route's MTU is locked.
498	 */
499	if ((flags & IP_MTUDISC) != 0 && rt != NULL &&
500	    (rt->rt_rmx.rmx_locks & RTV_MTU) == 0)
501		ip->ip_off |= htons(IP_DF);
502
503	/* Remember the current ip_len */
504	ip_len = ntohs(ip->ip_len);
505
506#ifdef KAME_IPSEC
507	/* get SP for this packet */
508	if (so == NULL)
509		sp = ipsec4_getpolicybyaddr(m, IPSEC_DIR_OUTBOUND,
510		    flags, &error);
511	else {
512		if (IPSEC_PCB_SKIP_IPSEC(sotoinpcb_hdr(so)->inph_sp,
513					 IPSEC_DIR_OUTBOUND))
514			goto skip_ipsec;
515		sp = ipsec4_getpolicybysock(m, IPSEC_DIR_OUTBOUND, so, &error);
516	}
517
518	if (sp == NULL) {
519		IPSEC_STATINC(IPSEC_STAT_IN_INVAL);
520		goto bad;
521	}
522
523	error = 0;
524
525	/* check policy */
526	switch (sp->policy) {
527	case IPSEC_POLICY_DISCARD:
528		/*
529		 * This packet is just discarded.
530		 */
531		IPSEC_STATINC(IPSEC_STAT_OUT_POLVIO);
532		goto bad;
533
534	case IPSEC_POLICY_BYPASS:
535	case IPSEC_POLICY_NONE:
536		/* no need to do IPsec. */
537		goto skip_ipsec;
538
539	case IPSEC_POLICY_IPSEC:
540		if (sp->req == NULL) {
541			/* XXX should be panic ? */
542			printf("ip_output: No IPsec request specified.\n");
543			error = EINVAL;
544			goto bad;
545		}
546		break;
547
548	case IPSEC_POLICY_ENTRUST:
549	default:
550		printf("ip_output: Invalid policy found. %d\n", sp->policy);
551	}
552
553#ifdef IPSEC_NAT_T
554	/*
555	 * NAT-T ESP fragmentation: don't do IPSec processing now,
556	 * we'll do it on each fragmented packet.
557	 */
558	if (sp->req->sav &&
559	    ((sp->req->sav->natt_type & UDP_ENCAP_ESPINUDP) ||
560	     (sp->req->sav->natt_type & UDP_ENCAP_ESPINUDP_NON_IKE))) {
561		if (ntohs(ip->ip_len) > sp->req->sav->esp_frag) {
562			natt_frag = 1;
563			mtu = sp->req->sav->esp_frag;
564			goto skip_ipsec;
565		}
566	}
567#endif /* IPSEC_NAT_T */
568
569	/*
570	 * ipsec4_output() expects ip_len and ip_off in network
571	 * order.  They have been set to network order above.
572	 */
573
574    {
575	struct ipsec_output_state state;
576	memset(&state, 0, sizeof(state));
577	state.m = m;
578	if (flags & IP_ROUTETOIF) {
579		state.ro = &iproute;
580		memset(&iproute, 0, sizeof(iproute));
581	} else
582		state.ro = ro;
583	state.dst = sintocsa(dst);
584
585	/*
586	 * We can't defer the checksum of payload data if
587	 * we're about to encrypt/authenticate it.
588	 *
589	 * XXX When we support crypto offloading functions of
590	 * XXX network interfaces, we need to reconsider this,
591	 * XXX since it's likely that they'll support checksumming,
592	 * XXX as well.
593	 */
594	if (m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) {
595		in_delayed_cksum(m);
596		m->m_pkthdr.csum_flags &= ~(M_CSUM_TCPv4|M_CSUM_UDPv4);
597	}
598
599	error = ipsec4_output(&state, sp, flags);
600
601	m = state.m;
602	if (flags & IP_ROUTETOIF) {
603		/*
604		 * if we have tunnel mode SA, we may need to ignore
605		 * IP_ROUTETOIF.
606		 */
607		if (state.ro != &iproute ||
608		    rtcache_validate(state.ro) != NULL) {
609			flags &= ~IP_ROUTETOIF;
610			ro = state.ro;
611		}
612	} else
613		ro = state.ro;
614	dst = satocsin(state.dst);
615	if (error) {
616		/* mbuf is already reclaimed in ipsec4_output. */
617		m0 = NULL;
618		switch (error) {
619		case EHOSTUNREACH:
620		case ENETUNREACH:
621		case EMSGSIZE:
622		case ENOBUFS:
623		case ENOMEM:
624			break;
625		default:
626			printf("ip4_output (ipsec): error code %d\n", error);
627			/*fall through*/
628		case ENOENT:
629			/* don't show these error codes to the user */
630			error = 0;
631			break;
632		}
633		goto bad;
634	}
635
636	/* be sure to update variables that are affected by ipsec4_output() */
637	ip = mtod(m, struct ip *);
638	hlen = ip->ip_hl << 2;
639	ip_len = ntohs(ip->ip_len);
640
641	if ((rt = rtcache_validate(ro)) == NULL) {
642		if ((flags & IP_ROUTETOIF) == 0) {
643			printf("ip_output: "
644				"can't update route after IPsec processing\n");
645			error = EHOSTUNREACH;	/*XXX*/
646			goto bad;
647		}
648	} else {
649		/* nobody uses ia beyond here */
650		if (state.encap) {
651			ifp = rt->rt_ifp;
652			if ((mtu = rt->rt_rmx.rmx_mtu) == 0)
653				mtu = ifp->if_mtu;
654		}
655	}
656    }
657skip_ipsec:
658#endif /*KAME_IPSEC*/
659#ifdef FAST_IPSEC
660	/*
661	 * Check the security policy (SP) for the packet and, if
662	 * required, do IPsec-related processing.  There are two
663	 * cases here; the first time a packet is sent through
664	 * it will be untagged and handled by ipsec4_checkpolicy.
665	 * If the packet is resubmitted to ip_output (e.g. after
666	 * AH, ESP, etc. processing), there will be a tag to bypass
667	 * the lookup and related policy checking.
668	 */
669	if (!ipsec_outdone(m)) {
670		s = splsoftnet();
671		if (inp != NULL &&
672		    IPSEC_PCB_SKIP_IPSEC(inp->inp_sp, IPSEC_DIR_OUTBOUND)) {
673			splx(s);
674			goto spd_done;
675		}
676		sp = ipsec4_checkpolicy(m, IPSEC_DIR_OUTBOUND, flags,
677				&error, inp);
678		/*
679		 * There are four return cases:
680		 *    sp != NULL	 	    apply IPsec policy
681		 *    sp == NULL, error == 0	    no IPsec handling needed
682		 *    sp == NULL, error == -EINVAL  discard packet w/o error
683		 *    sp == NULL, error != 0	    discard packet, report error
684		 */
685		if (sp != NULL) {
686#ifdef IPSEC_NAT_T
687			/*
688			 * NAT-T ESP fragmentation: don't do IPSec processing now,
689			 * we'll do it on each fragmented packet.
690			 */
691			if (sp->req->sav &&
692					((sp->req->sav->natt_type & UDP_ENCAP_ESPINUDP) ||
693					 (sp->req->sav->natt_type & UDP_ENCAP_ESPINUDP_NON_IKE))) {
694				if (ntohs(ip->ip_len) > sp->req->sav->esp_frag) {
695					natt_frag = 1;
696					mtu = sp->req->sav->esp_frag;
697					splx(s);
698					goto spd_done;
699				}
700			}
701#endif /* IPSEC_NAT_T */
702
703			/*
704			 * Do delayed checksums now because we send before
705			 * this is done in the normal processing path.
706			 */
707			if (m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) {
708				in_delayed_cksum(m);
709				m->m_pkthdr.csum_flags &= ~(M_CSUM_TCPv4|M_CSUM_UDPv4);
710			}
711
712#ifdef __FreeBSD__
713			ip->ip_len = htons(ip->ip_len);
714			ip->ip_off = htons(ip->ip_off);
715#endif
716
717			/* NB: callee frees mbuf */
718			error = ipsec4_process_packet(m, sp->req, flags, 0);
719			/*
720			 * Preserve KAME behaviour: ENOENT can be returned
721			 * when an SA acquire is in progress.  Don't propagate
722			 * this to user-level; it confuses applications.
723			 *
724			 * XXX this will go away when the SADB is redone.
725			 */
726			if (error == ENOENT)
727				error = 0;
728			splx(s);
729			goto done;
730		} else {
731			splx(s);
732
733			if (error != 0) {
734				/*
735				 * Hack: -EINVAL is used to signal that a packet
736				 * should be silently discarded.  This is typically
737				 * because we asked key management for an SA and
738				 * it was delayed (e.g. kicked up to IKE).
739				 */
740				if (error == -EINVAL)
741					error = 0;
742				goto bad;
743			} else {
744				/* No IPsec processing for this packet. */
745			}
746		}
747	}
748spd_done:
749#endif /* FAST_IPSEC */
750
751#ifdef PFIL_HOOKS
752	/*
753	 * Run through list of hooks for output packets.
754	 */
755	if ((error = pfil_run_hooks(&inet_pfil_hook, &m, ifp, PFIL_OUT)) != 0)
756		goto done;
757	if (m == NULL)
758		goto done;
759
760	ip = mtod(m, struct ip *);
761	hlen = ip->ip_hl << 2;
762	ip_len = ntohs(ip->ip_len);
763#endif /* PFIL_HOOKS */
764
765	m->m_pkthdr.csum_data |= hlen << 16;
766
767#if IFA_STATS
768	/*
769	 * search for the source address structure to
770	 * maintain output statistics.
771	 */
772	INADDR_TO_IA(ip->ip_src, ia);
773#endif
774
775	/* Maybe skip checksums on loopback interfaces. */
776	if (IN_NEED_CHECKSUM(ifp, M_CSUM_IPv4)) {
777		m->m_pkthdr.csum_flags |= M_CSUM_IPv4;
778	}
779	sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_csum_flags_tx;
780	/*
781	 * If small enough for mtu of path, or if using TCP segmentation
782	 * offload, can just send directly.
783	 */
784	if (ip_len <= mtu ||
785	    (m->m_pkthdr.csum_flags & M_CSUM_TSOv4) != 0) {
786#if IFA_STATS
787		if (ia)
788			ia->ia_ifa.ifa_data.ifad_outbytes += ip_len;
789#endif
790		/*
791		 * Always initialize the sum to 0!  Some HW assisted
792		 * checksumming requires this.
793		 */
794		ip->ip_sum = 0;
795
796		if ((m->m_pkthdr.csum_flags & M_CSUM_TSOv4) == 0) {
797			/*
798			 * Perform any checksums that the hardware can't do
799			 * for us.
800			 *
801			 * XXX Does any hardware require the {th,uh}_sum
802			 * XXX fields to be 0?
803			 */
804			if (sw_csum & M_CSUM_IPv4) {
805				KASSERT(IN_NEED_CHECKSUM(ifp, M_CSUM_IPv4));
806				ip->ip_sum = in_cksum(m, hlen);
807				m->m_pkthdr.csum_flags &= ~M_CSUM_IPv4;
808			}
809			if (sw_csum & (M_CSUM_TCPv4|M_CSUM_UDPv4)) {
810				if (IN_NEED_CHECKSUM(ifp,
811				    sw_csum & (M_CSUM_TCPv4|M_CSUM_UDPv4))) {
812					in_delayed_cksum(m);
813				}
814				m->m_pkthdr.csum_flags &=
815				    ~(M_CSUM_TCPv4|M_CSUM_UDPv4);
816			}
817		}
818
819#ifdef KAME_IPSEC
820		/* clean ipsec history once it goes out of the node */
821		ipsec_delaux(m);
822#endif
823
824		if (__predict_true(
825		    (m->m_pkthdr.csum_flags & M_CSUM_TSOv4) == 0 ||
826		    (ifp->if_capenable & IFCAP_TSOv4) != 0)) {
827			KERNEL_LOCK(1, NULL);
828			error =
829			    (*ifp->if_output)(ifp, m,
830				(m->m_flags & M_MCAST) ?
831				    sintocsa(rdst) : sintocsa(dst),
832				rt);
833			KERNEL_UNLOCK_ONE(NULL);
834		} else {
835			error =
836			    ip_tso_output(ifp, m,
837				(m->m_flags & M_MCAST) ?
838				    sintocsa(rdst) : sintocsa(dst),
839				rt);
840		}
841		goto done;
842	}
843
844	/*
845	 * We can't use HW checksumming if we're about to
846	 * to fragment the packet.
847	 *
848	 * XXX Some hardware can do this.
849	 */
850	if (m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) {
851		if (IN_NEED_CHECKSUM(ifp,
852		    m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4))) {
853			in_delayed_cksum(m);
854		}
855		m->m_pkthdr.csum_flags &= ~(M_CSUM_TCPv4|M_CSUM_UDPv4);
856	}
857
858	/*
859	 * Too large for interface; fragment if possible.
860	 * Must be able to put at least 8 bytes per fragment.
861	 */
862	if (ntohs(ip->ip_off) & IP_DF) {
863		if (flags & IP_RETURNMTU)
864			*mtu_p = mtu;
865		error = EMSGSIZE;
866		IP_STATINC(IP_STAT_CANTFRAG);
867		goto bad;
868	}
869
870	error = ip_fragment(m, ifp, mtu);
871	if (error) {
872		m = NULL;
873		goto bad;
874	}
875
876	for (; m; m = m0) {
877		m0 = m->m_nextpkt;
878		m->m_nextpkt = 0;
879		if (error == 0) {
880#if IFA_STATS
881			if (ia)
882				ia->ia_ifa.ifa_data.ifad_outbytes +=
883				    ntohs(ip->ip_len);
884#endif
885#ifdef KAME_IPSEC
886			/* clean ipsec history once it goes out of the node */
887			ipsec_delaux(m);
888#endif /* KAME_IPSEC */
889
890#ifdef IPSEC_NAT_T
891			/*
892			 * If we get there, the packet has not been handeld by
893			 * IPSec whereas it should have. Now that it has been
894			 * fragmented, re-inject it in ip_output so that IPsec
895			 * processing can occur.
896			 */
897			if (natt_frag) {
898				error = ip_output(m, opt,
899				    ro, flags | IP_RAWOUTPUT | IP_NOIPNEWID, imo, so, mtu_p);
900			} else
901#endif /* IPSEC_NAT_T */
902			{
903				KASSERT((m->m_pkthdr.csum_flags &
904				    (M_CSUM_UDPv4 | M_CSUM_TCPv4)) == 0);
905				KERNEL_LOCK(1, NULL);
906				error = (*ifp->if_output)(ifp, m,
907				    (m->m_flags & M_MCAST) ?
908					sintocsa(rdst) : sintocsa(dst),
909				    rt);
910				KERNEL_UNLOCK_ONE(NULL);
911			}
912		} else
913			m_freem(m);
914	}
915
916	if (error == 0)
917		IP_STATINC(IP_STAT_FRAGMENTED);
918done:
919	rtcache_free(&iproute);
920
921#ifdef KAME_IPSEC
922	if (sp != NULL) {
923		KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
924			printf("DP ip_output call free SP:%p\n", sp));
925		key_freesp(sp);
926	}
927#endif /* KAME_IPSEC */
928#ifdef FAST_IPSEC
929	if (sp != NULL)
930		KEY_FREESP(&sp);
931#endif /* FAST_IPSEC */
932
933	return (error);
934bad:
935	m_freem(m);
936	goto done;
937}
938
939int
940ip_fragment(struct mbuf *m, struct ifnet *ifp, u_long mtu)
941{
942	struct ip *ip, *mhip;
943	struct mbuf *m0;
944	int len, hlen, off;
945	int mhlen, firstlen;
946	struct mbuf **mnext;
947	int sw_csum = m->m_pkthdr.csum_flags;
948	int fragments = 0;
949	int s;
950	int error = 0;
951
952	ip = mtod(m, struct ip *);
953	hlen = ip->ip_hl << 2;
954	if (ifp != NULL)
955		sw_csum &= ~ifp->if_csum_flags_tx;
956
957	len = (mtu - hlen) &~ 7;
958	if (len < 8) {
959		m_freem(m);
960		return (EMSGSIZE);
961	}
962
963	firstlen = len;
964	mnext = &m->m_nextpkt;
965
966	/*
967	 * Loop through length of segment after first fragment,
968	 * make new header and copy data of each part and link onto chain.
969	 */
970	m0 = m;
971	mhlen = sizeof (struct ip);
972	for (off = hlen + len; off < ntohs(ip->ip_len); off += len) {
973		MGETHDR(m, M_DONTWAIT, MT_HEADER);
974		if (m == 0) {
975			error = ENOBUFS;
976			IP_STATINC(IP_STAT_ODROPPED);
977			goto sendorfree;
978		}
979		MCLAIM(m, m0->m_owner);
980		*mnext = m;
981		mnext = &m->m_nextpkt;
982		m->m_data += max_linkhdr;
983		mhip = mtod(m, struct ip *);
984		*mhip = *ip;
985		/* we must inherit MCAST and BCAST flags */
986		m->m_flags |= m0->m_flags & (M_MCAST|M_BCAST);
987		if (hlen > sizeof (struct ip)) {
988			mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip);
989			mhip->ip_hl = mhlen >> 2;
990		}
991		m->m_len = mhlen;
992		mhip->ip_off = ((off - hlen) >> 3) +
993		    (ntohs(ip->ip_off) & ~IP_MF);
994		if (ip->ip_off & htons(IP_MF))
995			mhip->ip_off |= IP_MF;
996		if (off + len >= ntohs(ip->ip_len))
997			len = ntohs(ip->ip_len) - off;
998		else
999			mhip->ip_off |= IP_MF;
1000		HTONS(mhip->ip_off);
1001		mhip->ip_len = htons((u_int16_t)(len + mhlen));
1002		m->m_next = m_copym(m0, off, len, M_DONTWAIT);
1003		if (m->m_next == 0) {
1004			error = ENOBUFS;	/* ??? */
1005			IP_STATINC(IP_STAT_ODROPPED);
1006			goto sendorfree;
1007		}
1008		m->m_pkthdr.len = mhlen + len;
1009		m->m_pkthdr.rcvif = NULL;
1010		mhip->ip_sum = 0;
1011		KASSERT((m->m_pkthdr.csum_flags & M_CSUM_IPv4) == 0);
1012		if (sw_csum & M_CSUM_IPv4) {
1013			mhip->ip_sum = in_cksum(m, mhlen);
1014		} else {
1015			/*
1016			 * checksum is hw-offloaded or not necessary.
1017			 */
1018			m->m_pkthdr.csum_flags |=
1019			    m0->m_pkthdr.csum_flags & M_CSUM_IPv4;
1020			m->m_pkthdr.csum_data |= mhlen << 16;
1021			KASSERT(!(ifp != NULL &&
1022			    IN_NEED_CHECKSUM(ifp, M_CSUM_IPv4))
1023			    || (m->m_pkthdr.csum_flags & M_CSUM_IPv4) != 0);
1024		}
1025		IP_STATINC(IP_STAT_OFRAGMENTS);
1026		fragments++;
1027	}
1028	/*
1029	 * Update first fragment by trimming what's been copied out
1030	 * and updating header, then send each fragment (in order).
1031	 */
1032	m = m0;
1033	m_adj(m, hlen + firstlen - ntohs(ip->ip_len));
1034	m->m_pkthdr.len = hlen + firstlen;
1035	ip->ip_len = htons((u_int16_t)m->m_pkthdr.len);
1036	ip->ip_off |= htons(IP_MF);
1037	ip->ip_sum = 0;
1038	if (sw_csum & M_CSUM_IPv4) {
1039		ip->ip_sum = in_cksum(m, hlen);
1040		m->m_pkthdr.csum_flags &= ~M_CSUM_IPv4;
1041	} else {
1042		/*
1043		 * checksum is hw-offloaded or not necessary.
1044		 */
1045		KASSERT(!(ifp != NULL && IN_NEED_CHECKSUM(ifp, M_CSUM_IPv4))
1046		   || (m->m_pkthdr.csum_flags & M_CSUM_IPv4) != 0);
1047		KASSERT(M_CSUM_DATA_IPv4_IPHL(m->m_pkthdr.csum_data) >=
1048			sizeof(struct ip));
1049	}
1050sendorfree:
1051	/*
1052	 * If there is no room for all the fragments, don't queue
1053	 * any of them.
1054	 */
1055	if (ifp != NULL) {
1056		s = splnet();
1057		if (ifp->if_snd.ifq_maxlen - ifp->if_snd.ifq_len < fragments &&
1058		    error == 0) {
1059			error = ENOBUFS;
1060			IP_STATINC(IP_STAT_ODROPPED);
1061			IFQ_INC_DROPS(&ifp->if_snd);
1062		}
1063		splx(s);
1064	}
1065	if (error) {
1066		for (m = m0; m; m = m0) {
1067			m0 = m->m_nextpkt;
1068			m->m_nextpkt = NULL;
1069			m_freem(m);
1070		}
1071	}
1072	return (error);
1073}
1074
1075/*
1076 * Process a delayed payload checksum calculation.
1077 */
1078void
1079in_delayed_cksum(struct mbuf *m)
1080{
1081	struct ip *ip;
1082	u_int16_t csum, offset;
1083
1084	ip = mtod(m, struct ip *);
1085	offset = ip->ip_hl << 2;
1086	csum = in4_cksum(m, 0, offset, ntohs(ip->ip_len) - offset);
1087	if (csum == 0 && (m->m_pkthdr.csum_flags & M_CSUM_UDPv4) != 0)
1088		csum = 0xffff;
1089
1090	offset += M_CSUM_DATA_IPv4_OFFSET(m->m_pkthdr.csum_data);
1091
1092	if ((offset + sizeof(u_int16_t)) > m->m_len) {
1093		/* This happen when ip options were inserted
1094		printf("in_delayed_cksum: pullup len %d off %d proto %d\n",
1095		    m->m_len, offset, ip->ip_p);
1096		 */
1097		m_copyback(m, offset, sizeof(csum), (void *) &csum);
1098	} else
1099		*(u_int16_t *)(mtod(m, char *) + offset) = csum;
1100}
1101
1102/*
1103 * Determine the maximum length of the options to be inserted;
1104 * we would far rather allocate too much space rather than too little.
1105 */
1106
1107u_int
1108ip_optlen(struct inpcb *inp)
1109{
1110	struct mbuf *m = inp->inp_options;
1111
1112	if (m && m->m_len > offsetof(struct ipoption, ipopt_dst))
1113		return (m->m_len - offsetof(struct ipoption, ipopt_dst));
1114	else
1115		return 0;
1116}
1117
1118
1119/*
1120 * Insert IP options into preformed packet.
1121 * Adjust IP destination as required for IP source routing,
1122 * as indicated by a non-zero in_addr at the start of the options.
1123 */
1124static struct mbuf *
1125ip_insertoptions(struct mbuf *m, struct mbuf *opt, int *phlen)
1126{
1127	struct ipoption *p = mtod(opt, struct ipoption *);
1128	struct mbuf *n;
1129	struct ip *ip = mtod(m, struct ip *);
1130	unsigned optlen;
1131
1132	optlen = opt->m_len - sizeof(p->ipopt_dst);
1133	if (optlen + ntohs(ip->ip_len) > IP_MAXPACKET)
1134		return (m);		/* XXX should fail */
1135	if (!in_nullhost(p->ipopt_dst))
1136		ip->ip_dst = p->ipopt_dst;
1137	if (M_READONLY(m) || M_LEADINGSPACE(m) < optlen) {
1138		MGETHDR(n, M_DONTWAIT, MT_HEADER);
1139		if (n == 0)
1140			return (m);
1141		MCLAIM(n, m->m_owner);
1142		M_MOVE_PKTHDR(n, m);
1143		m->m_len -= sizeof(struct ip);
1144		m->m_data += sizeof(struct ip);
1145		n->m_next = m;
1146		m = n;
1147		m->m_len = optlen + sizeof(struct ip);
1148		m->m_data += max_linkhdr;
1149		bcopy((void *)ip, mtod(m, void *), sizeof(struct ip));
1150	} else {
1151		m->m_data -= optlen;
1152		m->m_len += optlen;
1153		memmove(mtod(m, void *), ip, sizeof(struct ip));
1154	}
1155	m->m_pkthdr.len += optlen;
1156	ip = mtod(m, struct ip *);
1157	bcopy((void *)p->ipopt_list, (void *)(ip + 1), (unsigned)optlen);
1158	*phlen = sizeof(struct ip) + optlen;
1159	ip->ip_len = htons(ntohs(ip->ip_len) + optlen);
1160	return (m);
1161}
1162
1163/*
1164 * Copy options from ip to jp,
1165 * omitting those not copied during fragmentation.
1166 */
1167int
1168ip_optcopy(struct ip *ip, struct ip *jp)
1169{
1170	u_char *cp, *dp;
1171	int opt, optlen, cnt;
1172
1173	cp = (u_char *)(ip + 1);
1174	dp = (u_char *)(jp + 1);
1175	cnt = (ip->ip_hl << 2) - sizeof (struct ip);
1176	for (; cnt > 0; cnt -= optlen, cp += optlen) {
1177		opt = cp[0];
1178		if (opt == IPOPT_EOL)
1179			break;
1180		if (opt == IPOPT_NOP) {
1181			/* Preserve for IP mcast tunnel's LSRR alignment. */
1182			*dp++ = IPOPT_NOP;
1183			optlen = 1;
1184			continue;
1185		}
1186#ifdef DIAGNOSTIC
1187		if (cnt < IPOPT_OLEN + sizeof(*cp))
1188			panic("malformed IPv4 option passed to ip_optcopy");
1189#endif
1190		optlen = cp[IPOPT_OLEN];
1191#ifdef DIAGNOSTIC
1192		if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt)
1193			panic("malformed IPv4 option passed to ip_optcopy");
1194#endif
1195		/* bogus lengths should have been caught by ip_dooptions */
1196		if (optlen > cnt)
1197			optlen = cnt;
1198		if (IPOPT_COPIED(opt)) {
1199			bcopy((void *)cp, (void *)dp, (unsigned)optlen);
1200			dp += optlen;
1201		}
1202	}
1203	for (optlen = dp - (u_char *)(jp+1); optlen & 0x3; optlen++)
1204		*dp++ = IPOPT_EOL;
1205	return (optlen);
1206}
1207
/*
 * IP socket option processing.
 *
 * op selects PRCO_SETOPT or PRCO_GETOPT; any other value falls through
 * the outer switch and 0 is returned.  Requests whose level is not
 * IPPROTO_IP fail with ENOPROTOOPT, except SOL_SOCKET/SO_NOHEADER which
 * is accepted and ignored.  Option names not handled for the given op
 * yield ENOPROTOOPT.  Otherwise the return value is 0 or the errno
 * value produced while getting or setting the option.
 */
int
ip_ctloutput(int op, struct socket *so, struct sockopt *sopt)
{
	struct inpcb *inp = sotoinpcb(so);
	int optval = 0;
	int error = 0;
#if defined(KAME_IPSEC) || defined(FAST_IPSEC)
	struct lwp *l = curlwp;	/*XXX*/
#endif

	if (sopt->sopt_level != IPPROTO_IP) {
		if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_name == SO_NOHEADER)
			return 0;
		return ENOPROTOOPT;
	}

	switch (op) {
	case PRCO_SETOPT:
		switch (sopt->sopt_name) {
		case IP_OPTIONS:
#ifdef notyet
		case IP_RETOPTS:
#endif
			/* Replace the pcb's stored IP options. */
			error = ip_pcbopts(&inp->inp_options, sopt);
			break;

		/*
		 * Options carrying a single int: fetch the value once,
		 * then dispatch on the option name again.
		 */
		case IP_TOS:
		case IP_TTL:
		case IP_MINTTL:
		case IP_RECVOPTS:
		case IP_RECVRETOPTS:
		case IP_RECVDSTADDR:
		case IP_RECVIF:
		case IP_RECVTTL:
			error = sockopt_getint(sopt, &optval);
			if (error)
				break;

			switch (sopt->sopt_name) {
			case IP_TOS:
				inp->inp_ip.ip_tos = optval;
				break;

			case IP_TTL:
				inp->inp_ip.ip_ttl = optval;
				break;

			case IP_MINTTL:
				/* Only 1..MAXTTL is accepted. */
				if (optval > 0 && optval <= MAXTTL)
					inp->inp_ip_minttl = optval;
				else
					error = EINVAL;
				break;
/* Set the pcb flag when optval is non-zero, clear it otherwise. */
#define	OPTSET(bit) \
	if (optval) \
		inp->inp_flags |= bit; \
	else \
		inp->inp_flags &= ~bit;

			case IP_RECVOPTS:
				OPTSET(INP_RECVOPTS);
				break;

			case IP_RECVRETOPTS:
				OPTSET(INP_RECVRETOPTS);
				break;

			case IP_RECVDSTADDR:
				OPTSET(INP_RECVDSTADDR);
				break;

			case IP_RECVIF:
				OPTSET(INP_RECVIF);
				break;

			case IP_RECVTTL:
				OPTSET(INP_RECVTTL);
				break;
			}
		break;
#undef OPTSET

		case IP_MULTICAST_IF:
		case IP_MULTICAST_TTL:
		case IP_MULTICAST_LOOP:
		case IP_ADD_MEMBERSHIP:
		case IP_DROP_MEMBERSHIP:
			/* Multicast options are handled by ip_setmoptions(). */
			error = ip_setmoptions(&inp->inp_moptions, sopt);
			break;

		case IP_PORTRANGE:
			error = sockopt_getint(sopt, &optval);
			if (error)
				break;

			/* INP_LOCK(inp); */
			/*
			 * DEFAULT and HIGH both clear INP_LOWPORT; only LOW
			 * sets it.  Any other value is rejected.
			 */
			switch (optval) {
			case IP_PORTRANGE_DEFAULT:
			case IP_PORTRANGE_HIGH:
				inp->inp_flags &= ~(INP_LOWPORT);
				break;

			case IP_PORTRANGE_LOW:
				inp->inp_flags |= INP_LOWPORT;
				break;

			default:
				error = EINVAL;
				break;
			}
			/* INP_UNLOCK(inp); */
			break;

#if defined(KAME_IPSEC) || defined(FAST_IPSEC)
		case IP_IPSEC_POLICY:
		    {
			/* Hand the raw option data to the IPsec code. */
			error = ipsec4_set_policy(inp, sopt->sopt_name,
			    sopt->sopt_data, sopt->sopt_size, l->l_cred);
			break;
		    }
#endif /*IPSEC*/

		default:
			error = ENOPROTOOPT;
			break;
		}
		break;

	case PRCO_GETOPT:
		switch (sopt->sopt_name) {
		case IP_OPTIONS:
		case IP_RETOPTS:
			/*
			 * Return a copy of the stored options mbuf.  When no
			 * options are stored nothing is set and the call
			 * succeeds.
			 */
			if (inp->inp_options) {
				struct mbuf *m;

				m = m_copym(inp->inp_options, 0, M_COPYALL,
				    M_DONTWAIT);
				if (m == NULL) {
					error = ENOBUFS;
					break;
				}

				error = sockopt_setmbuf(sopt, m);
			}
			break;

		/*
		 * Options returned as a single int: pick the value, then
		 * store it with one sockopt_setint() call.  IP_ERRORMTU is
		 * handled here only; the set path has no case for it.
		 */
		case IP_TOS:
		case IP_TTL:
		case IP_MINTTL:
		case IP_RECVOPTS:
		case IP_RECVRETOPTS:
		case IP_RECVDSTADDR:
		case IP_RECVIF:
		case IP_RECVTTL:
		case IP_ERRORMTU:
			switch (sopt->sopt_name) {
			case IP_TOS:
				optval = inp->inp_ip.ip_tos;
				break;

			case IP_TTL:
				optval = inp->inp_ip.ip_ttl;
				break;

			case IP_MINTTL:
				optval = inp->inp_ip_minttl;
				break;

			case IP_ERRORMTU:
				optval = inp->inp_errormtu;
				break;

/*
 * 1 if the pcb flag is set, else 0.  Note that this macro is not
 * #undef'd below, so it stays defined past the end of the function.
 */
#define	OPTBIT(bit)	(inp->inp_flags & bit ? 1 : 0)

			case IP_RECVOPTS:
				optval = OPTBIT(INP_RECVOPTS);
				break;

			case IP_RECVRETOPTS:
				optval = OPTBIT(INP_RECVRETOPTS);
				break;

			case IP_RECVDSTADDR:
				optval = OPTBIT(INP_RECVDSTADDR);
				break;

			case IP_RECVIF:
				optval = OPTBIT(INP_RECVIF);
				break;

			case IP_RECVTTL:
				optval = OPTBIT(INP_RECVTTL);
				break;
			}
			error = sockopt_setint(sopt, optval);
			break;

#if 0	/* defined(KAME_IPSEC) || defined(FAST_IPSEC) */
		case IP_IPSEC_POLICY:
		{
			struct mbuf *m = NULL;

			/* XXX this will return EINVAL as sopt is empty */
			error = ipsec4_get_policy(inp, sopt->sopt_data,
			    sopt->sopt_size, &m);
			if (error == 0)
				error = sockopt_setmbuf(sopt, m);
			break;
		}
#endif /*IPSEC*/

		case IP_MULTICAST_IF:
		case IP_MULTICAST_TTL:
		case IP_MULTICAST_LOOP:
		case IP_ADD_MEMBERSHIP:
		case IP_DROP_MEMBERSHIP:
			/* Multicast options are handled by ip_getmoptions(). */
			error = ip_getmoptions(inp->inp_moptions, sopt);
			break;

		case IP_PORTRANGE:
			/*
			 * Only INP_LOWPORT is recorded, so the answer is
			 * either LOW or DEFAULT, never HIGH.
			 */
			if (inp->inp_flags & INP_LOWPORT)
				optval = IP_PORTRANGE_LOW;
			else
				optval = IP_PORTRANGE_DEFAULT;

			error = sockopt_setint(sopt, optval);

			break;

		default:
			error = ENOPROTOOPT;
			break;
		}
		break;
	}
	return (error);
}
1448
1449/*
1450 * Set up IP options in pcb for insertion in output packets.
1451 * Store in mbuf with pointer in pcbopt, adding pseudo-option
1452 * with destination address if source routed.
1453 */
1454int
1455ip_pcbopts(struct mbuf **pcbopt, const struct sockopt *sopt)
1456{
1457	struct mbuf *m;
1458	const u_char *cp;
1459	u_char *dp;
1460	int cnt;
1461	uint8_t optval, olen, offset;
1462
1463	/* turn off any old options */
1464	if (*pcbopt)
1465		(void)m_free(*pcbopt);
1466	*pcbopt = NULL;
1467
1468	cp = sopt->sopt_data;
1469	cnt = sopt->sopt_size;
1470
1471	if (cnt == 0)
1472		return (0);	/* Only turning off any previous options */
1473
1474#ifndef	__vax__
1475	if (cnt % sizeof(int32_t))
1476		return (EINVAL);
1477#endif
1478
1479	m = m_get(M_DONTWAIT, MT_SOOPTS);
1480	if (m == NULL)
1481		return (ENOBUFS);
1482
1483	dp = mtod(m, u_char *);
1484	memset(dp, 0, sizeof(struct in_addr));
1485	dp += sizeof(struct in_addr);
1486	m->m_len = sizeof(struct in_addr);
1487
1488	/*
1489	 * IP option list according to RFC791. Each option is of the form
1490	 *
1491	 *	[optval] [olen] [(olen - 2) data bytes]
1492	 *
1493	 * we validate the list and copy options to an mbuf for prepending
1494	 * to data packets. The IP first-hop destination address will be
1495	 * stored before actual options and is zero if unset.
1496	 */
1497	while (cnt > 0) {
1498		optval = cp[IPOPT_OPTVAL];
1499
1500		if (optval == IPOPT_EOL || optval == IPOPT_NOP) {
1501			olen = 1;
1502		} else {
1503			if (cnt < IPOPT_OLEN + 1)
1504				goto bad;
1505
1506			olen = cp[IPOPT_OLEN];
1507			if (olen < IPOPT_OLEN + 1 || olen > cnt)
1508				goto bad;
1509		}
1510
1511		if (optval == IPOPT_LSRR || optval == IPOPT_SSRR) {
1512			/*
1513			 * user process specifies route as:
1514			 *	->A->B->C->D
1515			 * D must be our final destination (but we can't
1516			 * check that since we may not have connected yet).
1517			 * A is first hop destination, which doesn't appear in
1518			 * actual IP option, but is stored before the options.
1519			 */
1520			if (olen < IPOPT_OFFSET + 1 + sizeof(struct in_addr))
1521				goto bad;
1522
1523			offset = cp[IPOPT_OFFSET];
1524			memcpy(mtod(m, u_char *), cp + IPOPT_OFFSET + 1,
1525			    sizeof(struct in_addr));
1526
1527			cp += sizeof(struct in_addr);
1528			cnt -= sizeof(struct in_addr);
1529			olen -= sizeof(struct in_addr);
1530
1531			if (m->m_len + olen > MAX_IPOPTLEN + sizeof(struct in_addr))
1532				goto bad;
1533
1534			memcpy(dp, cp, olen);
1535			dp[IPOPT_OPTVAL] = optval;
1536			dp[IPOPT_OLEN] = olen;
1537			dp[IPOPT_OFFSET] = offset;
1538			break;
1539		} else {
1540			if (m->m_len + olen > MAX_IPOPTLEN + sizeof(struct in_addr))
1541				goto bad;
1542
1543			memcpy(dp, cp, olen);
1544			break;
1545		}
1546
1547		dp += olen;
1548		m->m_len += olen;
1549
1550		if (optval == IPOPT_EOL)
1551			break;
1552
1553		cp += olen;
1554		cnt -= olen;
1555	}
1556
1557	*pcbopt = m;
1558	return (0);
1559
1560bad:
1561	(void)m_free(m);
1562	return (EINVAL);
1563}
1564
1565/*
1566 * following RFC1724 section 3.3, 0.0.0.0/8 is interpreted as interface index.
1567 */
1568static struct ifnet *
1569ip_multicast_if(struct in_addr *a, int *ifindexp)
1570{
1571	int ifindex;
1572	struct ifnet *ifp = NULL;
1573	struct in_ifaddr *ia;
1574
1575	if (ifindexp)
1576		*ifindexp = 0;
1577	if (ntohl(a->s_addr) >> 24 == 0) {
1578		ifindex = ntohl(a->s_addr) & 0xffffff;
1579		if (ifindex < 0 || if_indexlim <= ifindex)
1580			return NULL;
1581		ifp = ifindex2ifnet[ifindex];
1582		if (!ifp)
1583			return NULL;
1584		if (ifindexp)
1585			*ifindexp = ifindex;
1586	} else {
1587		LIST_FOREACH(ia, &IN_IFADDR_HASH(a->s_addr), ia_hash) {
1588			if (in_hosteq(ia->ia_addr.sin_addr, *a) &&
1589			    (ia->ia_ifp->if_flags & IFF_MULTICAST) != 0) {
1590				ifp = ia->ia_ifp;
1591				break;
1592			}
1593		}
1594	}
1595	return ifp;
1596}
1597
1598static int
1599ip_getoptval(const struct sockopt *sopt, u_int8_t *val, u_int maxval)
1600{
1601	u_int tval;
1602	u_char cval;
1603	int error;
1604
1605	if (sopt == NULL)
1606		return EINVAL;
1607
1608	switch (sopt->sopt_size) {
1609	case sizeof(u_char):
1610		error = sockopt_get(sopt, &cval, sizeof(u_char));
1611		tval = cval;
1612		break;
1613
1614	case sizeof(u_int):
1615		error = sockopt_get(sopt, &tval, sizeof(u_int));
1616		break;
1617
1618	default:
1619		error = EINVAL;
1620	}
1621
1622	if (error)
1623		return error;
1624
1625	if (tval > maxval)
1626		return EINVAL;
1627
1628	*val = tval;
1629	return 0;
1630}
1631
/*
 * Set the IP multicast options in response to user setsockopt().
 *
 * *imop is the pcb's multicast option block.  It is allocated here on
 * first use (M_NOWAIT, so ENOBUFS is possible) and, before returning,
 * freed again with *imop reset to NULL whenever every field holds its
 * default value and there are no memberships.
 *
 * Returns 0 on success or an errno value; unsupported option names
 * yield EOPNOTSUPP.
 */
int
ip_setmoptions(struct ip_moptions **imop, const struct sockopt *sopt)
{
	int error = 0;
	int i;
	struct in_addr addr;
	struct ip_mreq lmreq, *mreq;
	struct ifnet *ifp;
	struct ip_moptions *imo = *imop;
	int ifindex;

	if (imo == NULL) {
		/*
		 * No multicast option buffer attached to the pcb;
		 * allocate one and initialize to default values.
		 */
		imo = malloc(sizeof(*imo), M_IPMOPTS, M_NOWAIT);
		if (imo == NULL)
			return (ENOBUFS);

		*imop = imo;
		imo->imo_multicast_ifp = NULL;
		imo->imo_multicast_addr.s_addr = INADDR_ANY;
		imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
		imo->imo_multicast_loop = IP_DEFAULT_MULTICAST_LOOP;
		imo->imo_num_memberships = 0;
	}

	switch (sopt->sopt_name) {
	case IP_MULTICAST_IF:
		/*
		 * Select the interface for outgoing multicast packets.
		 */
		error = sockopt_get(sopt, &addr, sizeof(addr));
		if (error)
			break;

		/*
		 * INADDR_ANY is used to remove a previous selection.
		 * When no interface is selected, a default one is
		 * chosen every time a multicast packet is sent.
		 */
		if (in_nullhost(addr)) {
			imo->imo_multicast_ifp = NULL;
			break;
		}
		/*
		 * The selected interface is identified by its local
		 * IP address, or by an interface index encoded as an
		 * address in 0.0.0.0/8 (see ip_multicast_if()).  Find
		 * the interface and confirm that it supports
		 * multicasting.
		 */
		ifp = ip_multicast_if(&addr, &ifindex);
		if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) {
			error = EADDRNOTAVAIL;
			break;
		}
		imo->imo_multicast_ifp = ifp;
		/*
		 * Remember the caller's value only when the interface was
		 * selected by index; otherwise store INADDR_ANY.
		 */
		if (ifindex)
			imo->imo_multicast_addr = addr;
		else
			imo->imo_multicast_addr.s_addr = INADDR_ANY;
		break;

	case IP_MULTICAST_TTL:
		/*
		 * Set the IP time-to-live for outgoing multicast packets.
		 */
		error = ip_getoptval(sopt, &imo->imo_multicast_ttl, MAXTTL);
		break;

	case IP_MULTICAST_LOOP:
		/*
		 * Set the loopback flag for outgoing multicast packets.
		 * Must be zero or one.
		 */
		error = ip_getoptval(sopt, &imo->imo_multicast_loop, 1);
		break;

	case IP_ADD_MEMBERSHIP:
		/*
		 * Add a multicast group membership.
		 * Group must be a valid IP multicast address.
		 */
		error = sockopt_get(sopt, &lmreq, sizeof(lmreq));
		if (error)
			break;

		mreq = &lmreq;

		if (!IN_MULTICAST(mreq->imr_multiaddr.s_addr)) {
			error = EINVAL;
			break;
		}
		/*
		 * If no interface address was provided, use the interface of
		 * the route to the given multicast address.
		 */
		if (in_nullhost(mreq->imr_interface)) {
			struct rtentry *rt;
			union {
				struct sockaddr		dst;
				struct sockaddr_in	dst4;
			} u;
			struct route ro;

			memset(&ro, 0, sizeof(ro));

			/* Temporary route cache, released right after use. */
			sockaddr_in_init(&u.dst4, &mreq->imr_multiaddr, 0);
			rtcache_setdst(&ro, &u.dst);
			ifp = (rt = rtcache_init(&ro)) != NULL ? rt->rt_ifp
			                                        : NULL;
			rtcache_free(&ro);
		} else {
			ifp = ip_multicast_if(&mreq->imr_interface, NULL);
		}
		/*
		 * See if we found an interface, and confirm that it
		 * supports multicast.
		 */
		if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) {
			error = EADDRNOTAVAIL;
			break;
		}
		/*
		 * See if the membership already exists or if all the
		 * membership slots are full.
		 */
		for (i = 0; i < imo->imo_num_memberships; ++i) {
			if (imo->imo_membership[i]->inm_ifp == ifp &&
			    in_hosteq(imo->imo_membership[i]->inm_addr,
				      mreq->imr_multiaddr))
				break;
		}
		if (i < imo->imo_num_memberships) {
			error = EADDRINUSE;
			break;
		}
		/* Here i == imo_num_memberships, the first free slot. */
		if (i == IP_MAX_MEMBERSHIPS) {
			error = ETOOMANYREFS;
			break;
		}
		/*
		 * Everything looks good; add a new record to the multicast
		 * address list for the given interface.
		 */
		if ((imo->imo_membership[i] =
		    in_addmulti(&mreq->imr_multiaddr, ifp)) == NULL) {
			error = ENOBUFS;
			break;
		}
		++imo->imo_num_memberships;
		break;

	case IP_DROP_MEMBERSHIP:
		/*
		 * Drop a multicast group membership.
		 * Group must be a valid IP multicast address.
		 */
		error = sockopt_get(sopt, &lmreq, sizeof(lmreq));
		if (error)
			break;

		mreq = &lmreq;

		if (!IN_MULTICAST(mreq->imr_multiaddr.s_addr)) {
			error = EINVAL;
			break;
		}
		/*
		 * If an interface address was specified, get a pointer
		 * to its ifnet structure.
		 */
		if (in_nullhost(mreq->imr_interface))
			ifp = NULL;
		else {
			ifp = ip_multicast_if(&mreq->imr_interface, NULL);
			if (ifp == NULL) {
				error = EADDRNOTAVAIL;
				break;
			}
		}
		/*
		 * Find the membership in the membership array.
		 * A NULL ifp matches the group on any interface.
		 */
		for (i = 0; i < imo->imo_num_memberships; ++i) {
			if ((ifp == NULL ||
			     imo->imo_membership[i]->inm_ifp == ifp) &&
			     in_hosteq(imo->imo_membership[i]->inm_addr,
				       mreq->imr_multiaddr))
				break;
		}
		if (i == imo->imo_num_memberships) {
			error = EADDRNOTAVAIL;
			break;
		}
		/*
		 * Give up the multicast address record to which the
		 * membership points.
		 */
		in_delmulti(imo->imo_membership[i]);
		/*
		 * Remove the gap in the membership array.
		 */
		for (++i; i < imo->imo_num_memberships; ++i)
			imo->imo_membership[i-1] = imo->imo_membership[i];
		--imo->imo_num_memberships;
		break;

	default:
		error = EOPNOTSUPP;
		break;
	}

	/*
	 * If all options have default values, no need to keep the
	 * option structure.  This also releases a block that was
	 * allocated above when the request then failed or left
	 * everything at its default.
	 */
	if (imo->imo_multicast_ifp == NULL &&
	    imo->imo_multicast_ttl == IP_DEFAULT_MULTICAST_TTL &&
	    imo->imo_multicast_loop == IP_DEFAULT_MULTICAST_LOOP &&
	    imo->imo_num_memberships == 0) {
		free(*imop, M_IPMOPTS);
		*imop = NULL;
	}

	return (error);
}
1861
1862/*
1863 * Return the IP multicast options in response to user getsockopt().
1864 */
1865int
1866ip_getmoptions(struct ip_moptions *imo, struct sockopt *sopt)
1867{
1868	struct in_addr addr;
1869	struct in_ifaddr *ia;
1870	int error;
1871	uint8_t optval;
1872
1873	error = 0;
1874
1875	switch (sopt->sopt_name) {
1876	case IP_MULTICAST_IF:
1877		if (imo == NULL || imo->imo_multicast_ifp == NULL)
1878			addr = zeroin_addr;
1879		else if (imo->imo_multicast_addr.s_addr) {
1880			/* return the value user has set */
1881			addr = imo->imo_multicast_addr;
1882		} else {
1883			IFP_TO_IA(imo->imo_multicast_ifp, ia);
1884			addr = ia ? ia->ia_addr.sin_addr : zeroin_addr;
1885		}
1886		error = sockopt_set(sopt, &addr, sizeof(addr));
1887		break;
1888
1889	case IP_MULTICAST_TTL:
1890		optval = imo ? imo->imo_multicast_ttl
1891			     : IP_DEFAULT_MULTICAST_TTL;
1892
1893		error = sockopt_set(sopt, &optval, sizeof(optval));
1894		break;
1895
1896	case IP_MULTICAST_LOOP:
1897		optval = imo ? imo->imo_multicast_loop
1898			     : IP_DEFAULT_MULTICAST_LOOP;
1899
1900		error = sockopt_set(sopt, &optval, sizeof(optval));
1901		break;
1902
1903	default:
1904		error = EOPNOTSUPP;
1905	}
1906
1907	return (error);
1908}
1909
1910/*
1911 * Discard the IP multicast options.
1912 */
1913void
1914ip_freemoptions(struct ip_moptions *imo)
1915{
1916	int i;
1917
1918	if (imo != NULL) {
1919		for (i = 0; i < imo->imo_num_memberships; ++i)
1920			in_delmulti(imo->imo_membership[i]);
1921		free(imo, M_IPMOPTS);
1922	}
1923}
1924
1925/*
1926 * Routine called from ip_output() to loop back a copy of an IP multicast
1927 * packet to the input queue of a specified interface.  Note that this
1928 * calls the output routine of the loopback "driver", but with an interface
1929 * pointer that might NOT be lo0ifp -- easier than replicating that code here.
1930 */
1931static void
1932ip_mloopback(struct ifnet *ifp, struct mbuf *m, const struct sockaddr_in *dst)
1933{
1934	struct ip *ip;
1935	struct mbuf *copym;
1936
1937	copym = m_copypacket(m, M_DONTWAIT);
1938	if (copym != NULL
1939	 && (copym->m_flags & M_EXT || copym->m_len < sizeof(struct ip)))
1940		copym = m_pullup(copym, sizeof(struct ip));
1941	if (copym == NULL)
1942		return;
1943	/*
1944	 * We don't bother to fragment if the IP length is greater
1945	 * than the interface's MTU.  Can this possibly matter?
1946	 */
1947	ip = mtod(copym, struct ip *);
1948
1949	if (copym->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) {
1950		in_delayed_cksum(copym);
1951		copym->m_pkthdr.csum_flags &=
1952		    ~(M_CSUM_TCPv4|M_CSUM_UDPv4);
1953	}
1954
1955	ip->ip_sum = 0;
1956	ip->ip_sum = in_cksum(copym, ip->ip_hl << 2);
1957	(void)looutput(ifp, copym, sintocsa(dst), NULL);
1958}
1959