ip6_output.c revision 196864
1/*-
2 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 3. Neither the name of the project nor the names of its contributors
14 *    may be used to endorse or promote products derived from this software
15 *    without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 *
29 *	$KAME: ip6_output.c,v 1.279 2002/01/26 06:12:30 jinmei Exp $
30 */
31
32/*-
33 * Copyright (c) 1982, 1986, 1988, 1990, 1993
34 *	The Regents of the University of California.  All rights reserved.
35 *
36 * Redistribution and use in source and binary forms, with or without
37 * modification, are permitted provided that the following conditions
38 * are met:
39 * 1. Redistributions of source code must retain the above copyright
40 *    notice, this list of conditions and the following disclaimer.
41 * 2. Redistributions in binary form must reproduce the above copyright
42 *    notice, this list of conditions and the following disclaimer in the
43 *    documentation and/or other materials provided with the distribution.
44 * 4. Neither the name of the University nor the names of its contributors
45 *    may be used to endorse or promote products derived from this software
46 *    without specific prior written permission.
47 *
48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58 * SUCH DAMAGE.
59 *
60 *	@(#)ip_output.c	8.3 (Berkeley) 1/21/94
61 */
62
63#include <sys/cdefs.h>
64__FBSDID("$FreeBSD: head/sys/netinet6/ip6_output.c 196864 2009-09-05 16:43:16Z qingli $");
65
66#include "opt_inet.h"
67#include "opt_inet6.h"
68#include "opt_ipsec.h"
69
70#include <sys/param.h>
71#include <sys/kernel.h>
72#include <sys/malloc.h>
73#include <sys/mbuf.h>
74#include <sys/errno.h>
75#include <sys/priv.h>
76#include <sys/proc.h>
77#include <sys/protosw.h>
78#include <sys/socket.h>
79#include <sys/socketvar.h>
80#include <sys/syslog.h>
81#include <sys/ucred.h>
82
83#include <net/if.h>
84#include <net/netisr.h>
85#include <net/route.h>
86#include <net/pfil.h>
87#include <net/vnet.h>
88
89#include <netinet/in.h>
90#include <netinet/in_var.h>
91#include <netinet6/in6_var.h>
92#include <netinet/ip6.h>
93#include <netinet/icmp6.h>
94#include <netinet6/ip6_var.h>
95#include <netinet/in_pcb.h>
96#include <netinet/tcp_var.h>
97#include <netinet6/nd6.h>
98
99#ifdef IPSEC
100#include <netipsec/ipsec.h>
101#include <netipsec/ipsec6.h>
102#include <netipsec/key.h>
103#include <netinet6/ip6_ipsec.h>
104#endif /* IPSEC */
105
106#include <netinet6/ip6protosw.h>
107#include <netinet6/scope6_var.h>
108
109extern int in6_mcast_loop;
110
/*
 * Set of mbufs that make up the header part of an outgoing packet while
 * ip6_output() is assembling it.  Each member is either NULL (header not
 * present) or an mbuf holding exactly one header.
 */
struct ip6_exthdrs {
	struct mbuf *ip6e_ip6;		/* IPv6 header, after the header split */
	struct mbuf *ip6e_hbh;		/* Hop-by-Hop options header */
	struct mbuf *ip6e_dest1;	/* Destination options header, 1st part */
	struct mbuf *ip6e_rthdr;	/* Routing header */
	struct mbuf *ip6e_dest2;	/* Destination options header, 2nd part */
};
118
119static int ip6_pcbopt __P((int, u_char *, int, struct ip6_pktopts **,
120			   struct ucred *, int));
121static int ip6_pcbopts __P((struct ip6_pktopts **, struct mbuf *,
122	struct socket *, struct sockopt *));
123static int ip6_getpcbopt(struct ip6_pktopts *, int, struct sockopt *);
124static int ip6_setpktopt __P((int, u_char *, int, struct ip6_pktopts *,
125	struct ucred *, int, int, int));
126
127static int ip6_copyexthdr(struct mbuf **, caddr_t, int);
128static int ip6_insertfraghdr __P((struct mbuf *, struct mbuf *, int,
129	struct ip6_frag **));
130static int ip6_insert_jumboopt(struct ip6_exthdrs *, u_int32_t);
131static int ip6_splithdr(struct mbuf *, struct ip6_exthdrs *);
132static int ip6_getpmtu __P((struct route_in6 *, struct route_in6 *,
133	struct ifnet *, struct in6_addr *, u_long *, int *));
134static int copypktopts(struct ip6_pktopts *, struct ip6_pktopts *, int);
135
136
/*
 * Make an extension header from option data.  hp is the source, and
 * mp is the destination (a struct mbuf ** that receives the new mbuf).
 *
 * Nothing is done when hp is NULL.  Otherwise (ip6e_len + 1) * 8 bytes
 * starting at hp are handed to ip6_copyexthdr().
 *
 * The macro is not self-contained: it stores the result in a variable
 * named "error" and jumps to a label named "freehdrs" on failure, so both
 * must exist in the calling function.  hp is evaluated more than once.
 */
#define MAKE_EXTHDR(hp, mp)						\
    do {								\
	if (hp) {							\
		struct ip6_ext *eh = (struct ip6_ext *)(hp);		\
		error = ip6_copyexthdr((mp), (caddr_t)(hp),		\
		    ((eh)->ip6e_len + 1) << 3);				\
		if (error)						\
			goto freehdrs;					\
	}								\
    } while (/*CONSTCOND*/ 0)
151
/*
 * Form a chain of extension headers.
 * m is the extension header mbuf (nothing is done when it is NULL)
 * mp is the previous mbuf in the chain; m is linked in right after it
 *    and mp is then advanced to m
 * p points at the "next header" byte of the preceding header; its old
 *    value is copied into the first byte of m, it is overwritten with i,
 *    and p is then advanced to the first byte of m
 * i is the type of option.
 *
 * The macro reads a variable named "hdrsplit" from the calling function
 * and panics when it is zero.  mp and p must be modifiable lvalues, and
 * every argument is evaluated more than once.
 */
#define MAKE_CHAIN(m, mp, p, i)\
    do {\
	if (m) {\
		if (!hdrsplit) \
			panic("assumption failed: hdr not split"); \
		*mtod((m), u_char *) = *(p);\
		*(p) = (i);\
		(p) = mtod((m), u_char *);\
		(m)->m_next = (mp)->m_next;\
		(mp)->m_next = (m);\
		(mp) = (m);\
	}\
    } while (/*CONSTCOND*/ 0)
172
173/*
174 * IP6 output. The packet in mbuf chain m contains a skeletal IP6
175 * header (with pri, len, nxt, hlim, src, dst).
176 * This function may modify ver and hlim only.
177 * The mbuf chain containing the packet will be freed.
178 * The mbuf opt, if present, will not be freed.
179 *
180 * type of "mtu": rt_rmx.rmx_mtu is u_long, ifnet.ifr_mtu is int, and
181 * nd_ifinfo.linkmtu is u_int32_t.  so we use u_long to hold largest one,
182 * which is rt_rmx.rmx_mtu.
183 *
184 * ifpp - XXX: just for statistics
185 */
186int
187ip6_output(struct mbuf *m0, struct ip6_pktopts *opt,
188    struct route_in6 *ro, int flags, struct ip6_moptions *im6o,
189    struct ifnet **ifpp, struct inpcb *inp)
190{
191	struct ip6_hdr *ip6, *mhip6;
192	struct ifnet *ifp, *origifp;
193	struct mbuf *m = m0;
194	struct mbuf *mprev = NULL;
195	int hlen, tlen, len, off;
196	struct route_in6 ip6route;
197	struct rtentry *rt = NULL;
198	struct sockaddr_in6 *dst, src_sa, dst_sa;
199	struct in6_addr odst;
200	int error = 0;
201	struct in6_ifaddr *ia = NULL;
202	u_long mtu;
203	int alwaysfrag, dontfrag;
204	u_int32_t optlen = 0, plen = 0, unfragpartlen = 0;
205	struct ip6_exthdrs exthdrs;
206	struct in6_addr finaldst, src0, dst0;
207	u_int32_t zone;
208	struct route_in6 *ro_pmtu = NULL;
209	int hdrsplit = 0;
210	int needipsec = 0;
211#ifdef IPSEC
212	struct ipsec_output_state state;
213	struct ip6_rthdr *rh = NULL;
214	int needipsectun = 0;
215	int segleft_org = 0;
216	struct secpolicy *sp = NULL;
217#endif /* IPSEC */
218
219	ip6 = mtod(m, struct ip6_hdr *);
220	if (ip6 == NULL) {
221		printf ("ip6 is NULL");
222		goto bad;
223	}
224
225	finaldst = ip6->ip6_dst;
226
227	bzero(&exthdrs, sizeof(exthdrs));
228
229	if (opt) {
230		/* Hop-by-Hop options header */
231		MAKE_EXTHDR(opt->ip6po_hbh, &exthdrs.ip6e_hbh);
232		/* Destination options header(1st part) */
233		if (opt->ip6po_rthdr) {
234			/*
235			 * Destination options header(1st part)
236			 * This only makes sense with a routing header.
237			 * See Section 9.2 of RFC 3542.
238			 * Disabling this part just for MIP6 convenience is
239			 * a bad idea.  We need to think carefully about a
240			 * way to make the advanced API coexist with MIP6
241			 * options, which might automatically be inserted in
242			 * the kernel.
243			 */
244			MAKE_EXTHDR(opt->ip6po_dest1, &exthdrs.ip6e_dest1);
245		}
246		/* Routing header */
247		MAKE_EXTHDR(opt->ip6po_rthdr, &exthdrs.ip6e_rthdr);
248		/* Destination options header(2nd part) */
249		MAKE_EXTHDR(opt->ip6po_dest2, &exthdrs.ip6e_dest2);
250	}
251
252	/*
253	 * IPSec checking which handles several cases.
254	 * FAST IPSEC: We re-injected the packet.
255	 */
256#ifdef IPSEC
257	switch(ip6_ipsec_output(&m, inp, &flags, &error, &ifp, &sp))
258	{
259	case 1:                 /* Bad packet */
260		goto freehdrs;
261	case -1:                /* Do IPSec */
262		needipsec = 1;
263	case 0:                 /* No IPSec */
264	default:
265		break;
266	}
267#endif /* IPSEC */
268
269	/*
270	 * Calculate the total length of the extension header chain.
271	 * Keep the length of the unfragmentable part for fragmentation.
272	 */
273	optlen = 0;
274	if (exthdrs.ip6e_hbh)
275		optlen += exthdrs.ip6e_hbh->m_len;
276	if (exthdrs.ip6e_dest1)
277		optlen += exthdrs.ip6e_dest1->m_len;
278	if (exthdrs.ip6e_rthdr)
279		optlen += exthdrs.ip6e_rthdr->m_len;
280	unfragpartlen = optlen + sizeof(struct ip6_hdr);
281
282	/* NOTE: we don't add AH/ESP length here. do that later. */
283	if (exthdrs.ip6e_dest2)
284		optlen += exthdrs.ip6e_dest2->m_len;
285
286	/*
287	 * If we need IPsec, or there is at least one extension header,
288	 * separate IP6 header from the payload.
289	 */
290	if ((needipsec || optlen) && !hdrsplit) {
291		if ((error = ip6_splithdr(m, &exthdrs)) != 0) {
292			m = NULL;
293			goto freehdrs;
294		}
295		m = exthdrs.ip6e_ip6;
296		hdrsplit++;
297	}
298
299	/* adjust pointer */
300	ip6 = mtod(m, struct ip6_hdr *);
301
302	/* adjust mbuf packet header length */
303	m->m_pkthdr.len += optlen;
304	plen = m->m_pkthdr.len - sizeof(*ip6);
305
306	/* If this is a jumbo payload, insert a jumbo payload option. */
307	if (plen > IPV6_MAXPACKET) {
308		if (!hdrsplit) {
309			if ((error = ip6_splithdr(m, &exthdrs)) != 0) {
310				m = NULL;
311				goto freehdrs;
312			}
313			m = exthdrs.ip6e_ip6;
314			hdrsplit++;
315		}
316		/* adjust pointer */
317		ip6 = mtod(m, struct ip6_hdr *);
318		if ((error = ip6_insert_jumboopt(&exthdrs, plen)) != 0)
319			goto freehdrs;
320		ip6->ip6_plen = 0;
321	} else
322		ip6->ip6_plen = htons(plen);
323
324	/*
325	 * Concatenate headers and fill in next header fields.
326	 * Here we have, on "m"
327	 *	IPv6 payload
328	 * and we insert headers accordingly.  Finally, we should be getting:
329	 *	IPv6 hbh dest1 rthdr ah* [esp* dest2 payload]
330	 *
331	 * during the header composing process, "m" points to IPv6 header.
332	 * "mprev" points to an extension header prior to esp.
333	 */
334	u_char *nexthdrp = &ip6->ip6_nxt;
335	mprev = m;
336
337	/*
338	 * we treat dest2 specially.  this makes IPsec processing
339	 * much easier.  the goal here is to make mprev point the
340	 * mbuf prior to dest2.
341	 *
342	 * result: IPv6 dest2 payload
343	 * m and mprev will point to IPv6 header.
344	 */
345	if (exthdrs.ip6e_dest2) {
346		if (!hdrsplit)
347			panic("assumption failed: hdr not split");
348		exthdrs.ip6e_dest2->m_next = m->m_next;
349		m->m_next = exthdrs.ip6e_dest2;
350		*mtod(exthdrs.ip6e_dest2, u_char *) = ip6->ip6_nxt;
351		ip6->ip6_nxt = IPPROTO_DSTOPTS;
352	}
353
354	/*
355	 * result: IPv6 hbh dest1 rthdr dest2 payload
356	 * m will point to IPv6 header.  mprev will point to the
357	 * extension header prior to dest2 (rthdr in the above case).
358	 */
359	MAKE_CHAIN(exthdrs.ip6e_hbh, mprev, nexthdrp, IPPROTO_HOPOPTS);
360	MAKE_CHAIN(exthdrs.ip6e_dest1, mprev, nexthdrp,
361		   IPPROTO_DSTOPTS);
362	MAKE_CHAIN(exthdrs.ip6e_rthdr, mprev, nexthdrp,
363		   IPPROTO_ROUTING);
364
365#ifdef IPSEC
366	if (!needipsec)
367		goto skip_ipsec2;
368
369	/*
370	 * pointers after IPsec headers are not valid any more.
371	 * other pointers need a great care too.
372	 * (IPsec routines should not mangle mbufs prior to AH/ESP)
373	 */
374	exthdrs.ip6e_dest2 = NULL;
375
376	if (exthdrs.ip6e_rthdr) {
377		rh = mtod(exthdrs.ip6e_rthdr, struct ip6_rthdr *);
378		segleft_org = rh->ip6r_segleft;
379		rh->ip6r_segleft = 0;
380	}
381
382	bzero(&state, sizeof(state));
383	state.m = m;
384	error = ipsec6_output_trans(&state, nexthdrp, mprev, sp, flags,
385				    &needipsectun);
386	m = state.m;
387	if (error == EJUSTRETURN) {
388		/*
389		 * We had a SP with a level of 'use' and no SA. We
390		 * will just continue to process the packet without
391		 * IPsec processing.
392		 */
393		;
394	} else if (error) {
395		/* mbuf is already reclaimed in ipsec6_output_trans. */
396		m = NULL;
397		switch (error) {
398		case EHOSTUNREACH:
399		case ENETUNREACH:
400		case EMSGSIZE:
401		case ENOBUFS:
402		case ENOMEM:
403			break;
404		default:
405			printf("[%s:%d] (ipsec): error code %d\n",
406			    __func__, __LINE__, error);
407			/* FALLTHROUGH */
408		case ENOENT:
409			/* don't show these error codes to the user */
410			error = 0;
411			break;
412		}
413		goto bad;
414	} else if (!needipsectun) {
415		/*
416		 * In the FAST IPSec case we have already
417		 * re-injected the packet and it has been freed
418		 * by the ipsec_done() function.  So, just clean
419		 * up after ourselves.
420		 */
421		m = NULL;
422		goto done;
423	}
424	if (exthdrs.ip6e_rthdr) {
425		/* ah6_output doesn't modify mbuf chain */
426		rh->ip6r_segleft = segleft_org;
427	}
428skip_ipsec2:;
429#endif /* IPSEC */
430
431	/*
432	 * If there is a routing header, discard the packet.
433	 */
434	if (exthdrs.ip6e_rthdr) {
435		 error = EINVAL;
436		 goto bad;
437	}
438
439	/* Source address validation */
440	if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src) &&
441	    (flags & IPV6_UNSPECSRC) == 0) {
442		error = EOPNOTSUPP;
443		V_ip6stat.ip6s_badscope++;
444		goto bad;
445	}
446	if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) {
447		error = EOPNOTSUPP;
448		V_ip6stat.ip6s_badscope++;
449		goto bad;
450	}
451
452	V_ip6stat.ip6s_localout++;
453
454	/*
455	 * Route packet.
456	 */
457	if (ro == 0) {
458		ro = &ip6route;
459		bzero((caddr_t)ro, sizeof(*ro));
460	}
461	ro_pmtu = ro;
462	if (opt && opt->ip6po_rthdr)
463		ro = &opt->ip6po_route;
464	dst = (struct sockaddr_in6 *)&ro->ro_dst;
465
466again:
467	/*
468	 * if specified, try to fill in the traffic class field.
469	 * do not override if a non-zero value is already set.
470	 * we check the diffserv field and the ecn field separately.
471	 */
472	if (opt && opt->ip6po_tclass >= 0) {
473		int mask = 0;
474
475		if ((ip6->ip6_flow & htonl(0xfc << 20)) == 0)
476			mask |= 0xfc;
477		if ((ip6->ip6_flow & htonl(0x03 << 20)) == 0)
478			mask |= 0x03;
479		if (mask != 0)
480			ip6->ip6_flow |= htonl((opt->ip6po_tclass & mask) << 20);
481	}
482
483	/* fill in or override the hop limit field, if necessary. */
484	if (opt && opt->ip6po_hlim != -1)
485		ip6->ip6_hlim = opt->ip6po_hlim & 0xff;
486	else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
487		if (im6o != NULL)
488			ip6->ip6_hlim = im6o->im6o_multicast_hlim;
489		else
490			ip6->ip6_hlim = V_ip6_defmcasthlim;
491	}
492
493#ifdef IPSEC
494	/*
495	 * We may re-inject packets into the stack here.
496	 */
497	if (needipsec && needipsectun) {
498		struct ipsec_output_state state;
499
500		/*
501		 * All the extension headers will become inaccessible
502		 * (since they can be encrypted).
503		 * Don't panic, we need no more updates to extension headers
504		 * on inner IPv6 packet (since they are now encapsulated).
505		 *
506		 * IPv6 [ESP|AH] IPv6 [extension headers] payload
507		 */
508		bzero(&exthdrs, sizeof(exthdrs));
509		exthdrs.ip6e_ip6 = m;
510
511		bzero(&state, sizeof(state));
512		state.m = m;
513		state.ro = (struct route *)ro;
514		state.dst = (struct sockaddr *)dst;
515
516		error = ipsec6_output_tunnel(&state, sp, flags);
517
518		m = state.m;
519		ro = (struct route_in6 *)state.ro;
520		dst = (struct sockaddr_in6 *)state.dst;
521		if (error == EJUSTRETURN) {
522			/*
523			 * We had a SP with a level of 'use' and no SA. We
524			 * will just continue to process the packet without
525			 * IPsec processing.
526			 */
527			;
528		} else if (error) {
529			/* mbuf is already reclaimed in ipsec6_output_tunnel. */
530			m0 = m = NULL;
531			m = NULL;
532			switch (error) {
533			case EHOSTUNREACH:
534			case ENETUNREACH:
535			case EMSGSIZE:
536			case ENOBUFS:
537			case ENOMEM:
538				break;
539			default:
540				printf("[%s:%d] (ipsec): error code %d\n",
541				    __func__, __LINE__, error);
542				/* FALLTHROUGH */
543			case ENOENT:
544				/* don't show these error codes to the user */
545				error = 0;
546				break;
547			}
548			goto bad;
549		} else {
550			/*
551			 * In the FAST IPSec case we have already
552			 * re-injected the packet and it has been freed
553			 * by the ipsec_done() function.  So, just clean
554			 * up after ourselves.
555			 */
556			m = NULL;
557			goto done;
558		}
559
560		exthdrs.ip6e_ip6 = m;
561	}
562#endif /* IPSEC */
563
564	/* adjust pointer */
565	ip6 = mtod(m, struct ip6_hdr *);
566
567	bzero(&dst_sa, sizeof(dst_sa));
568	dst_sa.sin6_family = AF_INET6;
569	dst_sa.sin6_len = sizeof(dst_sa);
570	dst_sa.sin6_addr = ip6->ip6_dst;
571	if ((error = in6_selectroute(&dst_sa, opt, im6o, ro,
572	    &ifp, &rt)) != 0) {
573		switch (error) {
574		case EHOSTUNREACH:
575			V_ip6stat.ip6s_noroute++;
576			break;
577		case EADDRNOTAVAIL:
578		default:
579			break; /* XXX statistics? */
580		}
581		if (ifp != NULL)
582			in6_ifstat_inc(ifp, ifs6_out_discard);
583		goto bad;
584	}
585	if (rt == NULL) {
586		/*
587		 * If in6_selectroute() does not return a route entry,
588		 * dst may not have been updated.
589		 */
590		*dst = dst_sa;	/* XXX */
591	}
592
593	/*
594	 * then rt (for unicast) and ifp must be non-NULL valid values.
595	 */
596	if ((flags & IPV6_FORWARDING) == 0) {
597		/* XXX: the FORWARDING flag can be set for mrouting. */
598		in6_ifstat_inc(ifp, ifs6_out_request);
599	}
600	if (rt != NULL) {
601		ia = (struct in6_ifaddr *)(rt->rt_ifa);
602		rt->rt_use++;
603	}
604
605
606	/*
607	 * The outgoing interface must be in the zone of source and
608	 * destination addresses.
609	 */
610	origifp = ifp;
611
612	src0 = ip6->ip6_src;
613	if (in6_setscope(&src0, origifp, &zone))
614		goto badscope;
615	bzero(&src_sa, sizeof(src_sa));
616	src_sa.sin6_family = AF_INET6;
617	src_sa.sin6_len = sizeof(src_sa);
618	src_sa.sin6_addr = ip6->ip6_src;
619	if (sa6_recoverscope(&src_sa) || zone != src_sa.sin6_scope_id)
620		goto badscope;
621
622	dst0 = ip6->ip6_dst;
623	if (in6_setscope(&dst0, origifp, &zone))
624		goto badscope;
625	/* re-initialize to be sure */
626	bzero(&dst_sa, sizeof(dst_sa));
627	dst_sa.sin6_family = AF_INET6;
628	dst_sa.sin6_len = sizeof(dst_sa);
629	dst_sa.sin6_addr = ip6->ip6_dst;
630	if (sa6_recoverscope(&dst_sa) || zone != dst_sa.sin6_scope_id) {
631		goto badscope;
632	}
633
634	/* We should use ia_ifp to support the case of
635	 * sending packets to an address of our own.
636	 */
637	if (ia != NULL && ia->ia_ifp)
638		ifp = ia->ia_ifp;
639
640	/* scope check is done. */
641	goto routefound;
642
643  badscope:
644	V_ip6stat.ip6s_badscope++;
645	in6_ifstat_inc(origifp, ifs6_out_discard);
646	if (error == 0)
647		error = EHOSTUNREACH; /* XXX */
648	goto bad;
649
650  routefound:
651	if (rt && !IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
652		if (opt && opt->ip6po_nextroute.ro_rt) {
653			/*
654			 * The nexthop is explicitly specified by the
655			 * application.  We assume the next hop is an IPv6
656			 * address.
657			 */
658			dst = (struct sockaddr_in6 *)opt->ip6po_nexthop;
659		}
660		else if ((rt->rt_flags & RTF_GATEWAY))
661			dst = (struct sockaddr_in6 *)rt->rt_gateway;
662	}
663
664	if (!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
665		m->m_flags &= ~(M_BCAST | M_MCAST); /* just in case */
666	} else {
667		m->m_flags = (m->m_flags & ~M_BCAST) | M_MCAST;
668		in6_ifstat_inc(ifp, ifs6_out_mcast);
669		/*
670		 * Confirm that the outgoing interface supports multicast.
671		 */
672		if (!(ifp->if_flags & IFF_MULTICAST)) {
673			V_ip6stat.ip6s_noroute++;
674			in6_ifstat_inc(ifp, ifs6_out_discard);
675			error = ENETUNREACH;
676			goto bad;
677		}
678		if ((im6o == NULL && in6_mcast_loop) ||
679		    (im6o && im6o->im6o_multicast_loop)) {
680			/*
681			 * Loop back multicast datagram if not expressly
682			 * forbidden to do so, even if we have not joined
683			 * the address; protocols will filter it later,
684			 * thus deferring a hash lookup and lock acquisition
685			 * at the expense of an m_copym().
686			 */
687			ip6_mloopback(ifp, m, dst);
688		} else {
689			/*
690			 * If we are acting as a multicast router, perform
691			 * multicast forwarding as if the packet had just
692			 * arrived on the interface to which we are about
693			 * to send.  The multicast forwarding function
694			 * recursively calls this function, using the
695			 * IPV6_FORWARDING flag to prevent infinite recursion.
696			 *
697			 * Multicasts that are looped back by ip6_mloopback(),
698			 * above, will be forwarded by the ip6_input() routine,
699			 * if necessary.
700			 */
701			if (V_ip6_mrouter && (flags & IPV6_FORWARDING) == 0) {
702				/*
703				 * XXX: ip6_mforward expects that rcvif is NULL
704				 * when it is called from the originating path.
705				 * However, it is not always the case, since
706				 * some versions of MGETHDR() does not
707				 * initialize the field.
708				 */
709				m->m_pkthdr.rcvif = NULL;
710				if (ip6_mforward(ip6, ifp, m) != 0) {
711					m_freem(m);
712					goto done;
713				}
714			}
715		}
716		/*
717		 * Multicasts with a hoplimit of zero may be looped back,
718		 * above, but must not be transmitted on a network.
719		 * Also, multicasts addressed to the loopback interface
720		 * are not sent -- the above call to ip6_mloopback() will
721		 * loop back a copy if this host actually belongs to the
722		 * destination group on the loopback interface.
723		 */
724		if (ip6->ip6_hlim == 0 || (ifp->if_flags & IFF_LOOPBACK) ||
725		    IN6_IS_ADDR_MC_INTFACELOCAL(&ip6->ip6_dst)) {
726			m_freem(m);
727			goto done;
728		}
729	}
730
731	/*
732	 * Fill the outgoing inteface to tell the upper layer
733	 * to increment per-interface statistics.
734	 */
735	if (ifpp)
736		*ifpp = ifp;
737
738	/* Determine path MTU. */
739	if ((error = ip6_getpmtu(ro_pmtu, ro, ifp, &finaldst, &mtu,
740	    &alwaysfrag)) != 0)
741		goto bad;
742
743	/*
744	 * The caller of this function may specify to use the minimum MTU
745	 * in some cases.
746	 * An advanced API option (IPV6_USE_MIN_MTU) can also override MTU
747	 * setting.  The logic is a bit complicated; by default, unicast
748	 * packets will follow path MTU while multicast packets will be sent at
749	 * the minimum MTU.  If IP6PO_MINMTU_ALL is specified, all packets
750	 * including unicast ones will be sent at the minimum MTU.  Multicast
751	 * packets will always be sent at the minimum MTU unless
752	 * IP6PO_MINMTU_DISABLE is explicitly specified.
753	 * See RFC 3542 for more details.
754	 */
755	if (mtu > IPV6_MMTU) {
756		if ((flags & IPV6_MINMTU))
757			mtu = IPV6_MMTU;
758		else if (opt && opt->ip6po_minmtu == IP6PO_MINMTU_ALL)
759			mtu = IPV6_MMTU;
760		else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) &&
761			 (opt == NULL ||
762			  opt->ip6po_minmtu != IP6PO_MINMTU_DISABLE)) {
763			mtu = IPV6_MMTU;
764		}
765	}
766
767	/*
768	 * clear embedded scope identifiers if necessary.
769	 * in6_clearscope will touch the addresses only when necessary.
770	 */
771	in6_clearscope(&ip6->ip6_src);
772	in6_clearscope(&ip6->ip6_dst);
773
774	/*
775	 * If the outgoing packet contains a hop-by-hop options header,
776	 * it must be examined and processed even by the source node.
777	 * (RFC 2460, section 4.)
778	 */
779	if (exthdrs.ip6e_hbh) {
780		struct ip6_hbh *hbh = mtod(exthdrs.ip6e_hbh, struct ip6_hbh *);
781		u_int32_t dummy; /* XXX unused */
782		u_int32_t plen = 0; /* XXX: ip6_process will check the value */
783
784#ifdef DIAGNOSTIC
785		if ((hbh->ip6h_len + 1) << 3 > exthdrs.ip6e_hbh->m_len)
786			panic("ip6e_hbh is not continuous");
787#endif
788		/*
789		 *  XXX: if we have to send an ICMPv6 error to the sender,
790		 *       we need the M_LOOP flag since icmp6_error() expects
791		 *       the IPv6 and the hop-by-hop options header are
792		 *       continuous unless the flag is set.
793		 */
794		m->m_flags |= M_LOOP;
795		m->m_pkthdr.rcvif = ifp;
796		if (ip6_process_hopopts(m, (u_int8_t *)(hbh + 1),
797		    ((hbh->ip6h_len + 1) << 3) - sizeof(struct ip6_hbh),
798		    &dummy, &plen) < 0) {
799			/* m was already freed at this point */
800			error = EINVAL;/* better error? */
801			goto done;
802		}
803		m->m_flags &= ~M_LOOP; /* XXX */
804		m->m_pkthdr.rcvif = NULL;
805	}
806
807	/* Jump over all PFIL processing if hooks are not active. */
808	if (!PFIL_HOOKED(&inet6_pfil_hook))
809		goto passout;
810
811	odst = ip6->ip6_dst;
812	/* Run through list of hooks for output packets. */
813	error = pfil_run_hooks(&inet6_pfil_hook, &m, ifp, PFIL_OUT, inp);
814	if (error != 0 || m == NULL)
815		goto done;
816	ip6 = mtod(m, struct ip6_hdr *);
817
818	/* See if destination IP address was changed by packet filter. */
819	if (!IN6_ARE_ADDR_EQUAL(&odst, &ip6->ip6_dst)) {
820		m->m_flags |= M_SKIP_FIREWALL;
821		/* If destination is now ourself drop to ip6_input(). */
822		if (in6_localaddr(&ip6->ip6_dst)) {
823			if (m->m_pkthdr.rcvif == NULL)
824				m->m_pkthdr.rcvif = V_loif;
825			if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
826				m->m_pkthdr.csum_flags |=
827				    CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
828				m->m_pkthdr.csum_data = 0xffff;
829			}
830			m->m_pkthdr.csum_flags |=
831			    CSUM_IP_CHECKED | CSUM_IP_VALID;
832			error = netisr_queue(NETISR_IPV6, m);
833			goto done;
834		} else
835			goto again;	/* Redo the routing table lookup. */
836	}
837
838	/* XXX: IPFIREWALL_FORWARD */
839
840passout:
841	/*
842	 * Send the packet to the outgoing interface.
843	 * If necessary, do IPv6 fragmentation before sending.
844	 *
845	 * the logic here is rather complex:
846	 * 1: normal case (dontfrag == 0, alwaysfrag == 0)
847	 * 1-a:	send as is if tlen <= path mtu
848	 * 1-b:	fragment if tlen > path mtu
849	 *
850	 * 2: if user asks us not to fragment (dontfrag == 1)
851	 * 2-a:	send as is if tlen <= interface mtu
852	 * 2-b:	error if tlen > interface mtu
853	 *
854	 * 3: if we always need to attach fragment header (alwaysfrag == 1)
855	 *	always fragment
856	 *
857	 * 4: if dontfrag == 1 && alwaysfrag == 1
858	 *	error, as we cannot handle this conflicting request
859	 */
860	tlen = m->m_pkthdr.len;
861
862	if (opt && (opt->ip6po_flags & IP6PO_DONTFRAG))
863		dontfrag = 1;
864	else
865		dontfrag = 0;
866	if (dontfrag && alwaysfrag) {	/* case 4 */
867		/* conflicting request - can't transmit */
868		error = EMSGSIZE;
869		goto bad;
870	}
871	if (dontfrag && tlen > IN6_LINKMTU(ifp)) {	/* case 2-b */
872		/*
873		 * Even if the DONTFRAG option is specified, we cannot send the
874		 * packet when the data length is larger than the MTU of the
875		 * outgoing interface.
876		 * Notify the error by sending IPV6_PATHMTU ancillary data as
877		 * well as returning an error code (the latter is not described
878		 * in the API spec.)
879		 */
880		u_int32_t mtu32;
881		struct ip6ctlparam ip6cp;
882
883		mtu32 = (u_int32_t)mtu;
884		bzero(&ip6cp, sizeof(ip6cp));
885		ip6cp.ip6c_cmdarg = (void *)&mtu32;
886		pfctlinput2(PRC_MSGSIZE, (struct sockaddr *)&ro_pmtu->ro_dst,
887		    (void *)&ip6cp);
888
889		error = EMSGSIZE;
890		goto bad;
891	}
892
893	/*
894	 * transmit packet without fragmentation
895	 */
896	if (dontfrag || (!alwaysfrag && tlen <= mtu)) {	/* case 1-a and 2-a */
897		struct in6_ifaddr *ia6;
898
899		ip6 = mtod(m, struct ip6_hdr *);
900		ia6 = in6_ifawithifp(ifp, &ip6->ip6_src);
901		if (ia6) {
902			/* Record statistics for this interface address. */
903			ia6->ia_ifa.if_opackets++;
904			ia6->ia_ifa.if_obytes += m->m_pkthdr.len;
905			ifa_free(&ia6->ia_ifa);
906		}
907		error = nd6_output(ifp, origifp, m, dst, ro->ro_rt);
908		goto done;
909	}
910
911	/*
912	 * try to fragment the packet.  case 1-b and 3
913	 */
914	if (mtu < IPV6_MMTU) {
915		/* path MTU cannot be less than IPV6_MMTU */
916		error = EMSGSIZE;
917		in6_ifstat_inc(ifp, ifs6_out_fragfail);
918		goto bad;
919	} else if (ip6->ip6_plen == 0) {
920		/* jumbo payload cannot be fragmented */
921		error = EMSGSIZE;
922		in6_ifstat_inc(ifp, ifs6_out_fragfail);
923		goto bad;
924	} else {
925		struct mbuf **mnext, *m_frgpart;
926		struct ip6_frag *ip6f;
927		u_int32_t id = htonl(ip6_randomid());
928		u_char nextproto;
929
930		int qslots = ifp->if_snd.ifq_maxlen - ifp->if_snd.ifq_len;
931
932		/*
933		 * Too large for the destination or interface;
934		 * fragment if possible.
935		 * Must be able to put at least 8 bytes per fragment.
936		 */
937		hlen = unfragpartlen;
938		if (mtu > IPV6_MAXPACKET)
939			mtu = IPV6_MAXPACKET;
940
941		len = (mtu - hlen - sizeof(struct ip6_frag)) & ~7;
942		if (len < 8) {
943			error = EMSGSIZE;
944			in6_ifstat_inc(ifp, ifs6_out_fragfail);
945			goto bad;
946		}
947
948		/*
949		 * Verify that we have any chance at all of being able to queue
950		 *      the packet or packet fragments
951		 */
952		if (qslots <= 0 || ((u_int)qslots * (mtu - hlen)
953		    < tlen  /* - hlen */)) {
954			error = ENOBUFS;
955			V_ip6stat.ip6s_odropped++;
956			goto bad;
957		}
958
959		mnext = &m->m_nextpkt;
960
961		/*
962		 * Change the next header field of the last header in the
963		 * unfragmentable part.
964		 */
965		if (exthdrs.ip6e_rthdr) {
966			nextproto = *mtod(exthdrs.ip6e_rthdr, u_char *);
967			*mtod(exthdrs.ip6e_rthdr, u_char *) = IPPROTO_FRAGMENT;
968		} else if (exthdrs.ip6e_dest1) {
969			nextproto = *mtod(exthdrs.ip6e_dest1, u_char *);
970			*mtod(exthdrs.ip6e_dest1, u_char *) = IPPROTO_FRAGMENT;
971		} else if (exthdrs.ip6e_hbh) {
972			nextproto = *mtod(exthdrs.ip6e_hbh, u_char *);
973			*mtod(exthdrs.ip6e_hbh, u_char *) = IPPROTO_FRAGMENT;
974		} else {
975			nextproto = ip6->ip6_nxt;
976			ip6->ip6_nxt = IPPROTO_FRAGMENT;
977		}
978
979		/*
980		 * Loop through length of segment after first fragment,
981		 * make new header and copy data of each part and link onto
982		 * chain.
983		 */
984		m0 = m;
985		for (off = hlen; off < tlen; off += len) {
986			MGETHDR(m, M_DONTWAIT, MT_HEADER);
987			if (!m) {
988				error = ENOBUFS;
989				V_ip6stat.ip6s_odropped++;
990				goto sendorfree;
991			}
992			m->m_pkthdr.rcvif = NULL;
993			m->m_flags = m0->m_flags & M_COPYFLAGS;
994			*mnext = m;
995			mnext = &m->m_nextpkt;
996			m->m_data += max_linkhdr;
997			mhip6 = mtod(m, struct ip6_hdr *);
998			*mhip6 = *ip6;
999			m->m_len = sizeof(*mhip6);
1000			error = ip6_insertfraghdr(m0, m, hlen, &ip6f);
1001			if (error) {
1002				V_ip6stat.ip6s_odropped++;
1003				goto sendorfree;
1004			}
1005			ip6f->ip6f_offlg = htons((u_short)((off - hlen) & ~7));
1006			if (off + len >= tlen)
1007				len = tlen - off;
1008			else
1009				ip6f->ip6f_offlg |= IP6F_MORE_FRAG;
1010			mhip6->ip6_plen = htons((u_short)(len + hlen +
1011			    sizeof(*ip6f) - sizeof(struct ip6_hdr)));
1012			if ((m_frgpart = m_copy(m0, off, len)) == 0) {
1013				error = ENOBUFS;
1014				V_ip6stat.ip6s_odropped++;
1015				goto sendorfree;
1016			}
1017			m_cat(m, m_frgpart);
1018			m->m_pkthdr.len = len + hlen + sizeof(*ip6f);
1019			m->m_pkthdr.rcvif = NULL;
1020			ip6f->ip6f_reserved = 0;
1021			ip6f->ip6f_ident = id;
1022			ip6f->ip6f_nxt = nextproto;
1023			V_ip6stat.ip6s_ofragments++;
1024			in6_ifstat_inc(ifp, ifs6_out_fragcreat);
1025		}
1026
1027		in6_ifstat_inc(ifp, ifs6_out_fragok);
1028	}
1029
1030	/*
1031	 * Remove leading garbages.
1032	 */
1033sendorfree:
1034	m = m0->m_nextpkt;
1035	m0->m_nextpkt = 0;
1036	m_freem(m0);
1037	for (m0 = m; m; m = m0) {
1038		m0 = m->m_nextpkt;
1039		m->m_nextpkt = 0;
1040		if (error == 0) {
1041			/* Record statistics for this interface address. */
1042			if (ia) {
1043				ia->ia_ifa.if_opackets++;
1044				ia->ia_ifa.if_obytes += m->m_pkthdr.len;
1045			}
1046			error = nd6_output(ifp, origifp, m, dst, ro->ro_rt);
1047		} else
1048			m_freem(m);
1049	}
1050
1051	if (error == 0)
1052		V_ip6stat.ip6s_fragmented++;
1053
1054done:
1055	if (ro == &ip6route && ro->ro_rt) { /* brace necessary for RTFREE */
1056		RTFREE(ro->ro_rt);
1057	} else if (ro_pmtu == &ip6route && ro_pmtu->ro_rt) {
1058		RTFREE(ro_pmtu->ro_rt);
1059	}
1060#ifdef IPSEC
1061	if (sp != NULL)
1062		KEY_FREESP(&sp);
1063#endif
1064
1065	return (error);
1066
1067freehdrs:
1068	m_freem(exthdrs.ip6e_hbh);	/* m_freem will check if mbuf is 0 */
1069	m_freem(exthdrs.ip6e_dest1);
1070	m_freem(exthdrs.ip6e_rthdr);
1071	m_freem(exthdrs.ip6e_dest2);
1072	/* FALLTHROUGH */
1073bad:
1074	if (m)
1075		m_freem(m);
1076	goto done;
1077}
1078
1079static int
1080ip6_copyexthdr(struct mbuf **mp, caddr_t hdr, int hlen)
1081{
1082	struct mbuf *m;
1083
1084	if (hlen > MCLBYTES)
1085		return (ENOBUFS); /* XXX */
1086
1087	MGET(m, M_DONTWAIT, MT_DATA);
1088	if (!m)
1089		return (ENOBUFS);
1090
1091	if (hlen > MLEN) {
1092		MCLGET(m, M_DONTWAIT);
1093		if ((m->m_flags & M_EXT) == 0) {
1094			m_free(m);
1095			return (ENOBUFS);
1096		}
1097	}
1098	m->m_len = hlen;
1099	if (hdr)
1100		bcopy(hdr, mtod(m, caddr_t), hlen);
1101
1102	*mp = m;
1103	return (0);
1104}
1105
1106/*
1107 * Insert jumbo payload option.
1108 */
1109static int
1110ip6_insert_jumboopt(struct ip6_exthdrs *exthdrs, u_int32_t plen)
1111{
1112	struct mbuf *mopt;
1113	u_char *optbuf;
1114	u_int32_t v;
1115
1116#define JUMBOOPTLEN	8	/* length of jumbo payload option and padding */
1117
1118	/*
1119	 * If there is no hop-by-hop options header, allocate new one.
1120	 * If there is one but it doesn't have enough space to store the
1121	 * jumbo payload option, allocate a cluster to store the whole options.
1122	 * Otherwise, use it to store the options.
1123	 */
1124	if (exthdrs->ip6e_hbh == 0) {
1125		MGET(mopt, M_DONTWAIT, MT_DATA);
1126		if (mopt == 0)
1127			return (ENOBUFS);
1128		mopt->m_len = JUMBOOPTLEN;
1129		optbuf = mtod(mopt, u_char *);
1130		optbuf[1] = 0;	/* = ((JUMBOOPTLEN) >> 3) - 1 */
1131		exthdrs->ip6e_hbh = mopt;
1132	} else {
1133		struct ip6_hbh *hbh;
1134
1135		mopt = exthdrs->ip6e_hbh;
1136		if (M_TRAILINGSPACE(mopt) < JUMBOOPTLEN) {
1137			/*
1138			 * XXX assumption:
1139			 * - exthdrs->ip6e_hbh is not referenced from places
1140			 *   other than exthdrs.
1141			 * - exthdrs->ip6e_hbh is not an mbuf chain.
1142			 */
1143			int oldoptlen = mopt->m_len;
1144			struct mbuf *n;
1145
1146			/*
1147			 * XXX: give up if the whole (new) hbh header does
1148			 * not fit even in an mbuf cluster.
1149			 */
1150			if (oldoptlen + JUMBOOPTLEN > MCLBYTES)
1151				return (ENOBUFS);
1152
1153			/*
1154			 * As a consequence, we must always prepare a cluster
1155			 * at this point.
1156			 */
1157			MGET(n, M_DONTWAIT, MT_DATA);
1158			if (n) {
1159				MCLGET(n, M_DONTWAIT);
1160				if ((n->m_flags & M_EXT) == 0) {
1161					m_freem(n);
1162					n = NULL;
1163				}
1164			}
1165			if (!n)
1166				return (ENOBUFS);
1167			n->m_len = oldoptlen + JUMBOOPTLEN;
1168			bcopy(mtod(mopt, caddr_t), mtod(n, caddr_t),
1169			    oldoptlen);
1170			optbuf = mtod(n, caddr_t) + oldoptlen;
1171			m_freem(mopt);
1172			mopt = exthdrs->ip6e_hbh = n;
1173		} else {
1174			optbuf = mtod(mopt, u_char *) + mopt->m_len;
1175			mopt->m_len += JUMBOOPTLEN;
1176		}
1177		optbuf[0] = IP6OPT_PADN;
1178		optbuf[1] = 1;
1179
1180		/*
1181		 * Adjust the header length according to the pad and
1182		 * the jumbo payload option.
1183		 */
1184		hbh = mtod(mopt, struct ip6_hbh *);
1185		hbh->ip6h_len += (JUMBOOPTLEN >> 3);
1186	}
1187
1188	/* fill in the option. */
1189	optbuf[2] = IP6OPT_JUMBO;
1190	optbuf[3] = 4;
1191	v = (u_int32_t)htonl(plen + JUMBOOPTLEN);
1192	bcopy(&v, &optbuf[4], sizeof(u_int32_t));
1193
1194	/* finally, adjust the packet header length */
1195	exthdrs->ip6e_ip6->m_pkthdr.len += JUMBOOPTLEN;
1196
1197	return (0);
1198#undef JUMBOOPTLEN
1199}
1200
1201/*
1202 * Insert fragment header and copy unfragmentable header portions.
1203 */
1204static int
1205ip6_insertfraghdr(struct mbuf *m0, struct mbuf *m, int hlen,
1206    struct ip6_frag **frghdrp)
1207{
1208	struct mbuf *n, *mlast;
1209
1210	if (hlen > sizeof(struct ip6_hdr)) {
1211		n = m_copym(m0, sizeof(struct ip6_hdr),
1212		    hlen - sizeof(struct ip6_hdr), M_DONTWAIT);
1213		if (n == 0)
1214			return (ENOBUFS);
1215		m->m_next = n;
1216	} else
1217		n = m;
1218
1219	/* Search for the last mbuf of unfragmentable part. */
1220	for (mlast = n; mlast->m_next; mlast = mlast->m_next)
1221		;
1222
1223	if ((mlast->m_flags & M_EXT) == 0 &&
1224	    M_TRAILINGSPACE(mlast) >= sizeof(struct ip6_frag)) {
1225		/* use the trailing space of the last mbuf for the fragment hdr */
1226		*frghdrp = (struct ip6_frag *)(mtod(mlast, caddr_t) +
1227		    mlast->m_len);
1228		mlast->m_len += sizeof(struct ip6_frag);
1229		m->m_pkthdr.len += sizeof(struct ip6_frag);
1230	} else {
1231		/* allocate a new mbuf for the fragment header */
1232		struct mbuf *mfrg;
1233
1234		MGET(mfrg, M_DONTWAIT, MT_DATA);
1235		if (mfrg == 0)
1236			return (ENOBUFS);
1237		mfrg->m_len = sizeof(struct ip6_frag);
1238		*frghdrp = mtod(mfrg, struct ip6_frag *);
1239		mlast->m_next = mfrg;
1240	}
1241
1242	return (0);
1243}
1244
1245static int
1246ip6_getpmtu(struct route_in6 *ro_pmtu, struct route_in6 *ro,
1247    struct ifnet *ifp, struct in6_addr *dst, u_long *mtup,
1248    int *alwaysfragp)
1249{
1250	u_int32_t mtu = 0;
1251	int alwaysfrag = 0;
1252	int error = 0;
1253
1254	if (ro_pmtu != ro) {
1255		/* The first hop and the final destination may differ. */
1256		struct sockaddr_in6 *sa6_dst =
1257		    (struct sockaddr_in6 *)&ro_pmtu->ro_dst;
1258		if (ro_pmtu->ro_rt &&
1259		    ((ro_pmtu->ro_rt->rt_flags & RTF_UP) == 0 ||
1260		     !IN6_ARE_ADDR_EQUAL(&sa6_dst->sin6_addr, dst))) {
1261			RTFREE(ro_pmtu->ro_rt);
1262			ro_pmtu->ro_rt = (struct rtentry *)NULL;
1263		}
1264		if (ro_pmtu->ro_rt == NULL) {
1265			bzero(sa6_dst, sizeof(*sa6_dst));
1266			sa6_dst->sin6_family = AF_INET6;
1267			sa6_dst->sin6_len = sizeof(struct sockaddr_in6);
1268			sa6_dst->sin6_addr = *dst;
1269
1270			rtalloc((struct route *)ro_pmtu);
1271		}
1272	}
1273	if (ro_pmtu->ro_rt) {
1274		u_int32_t ifmtu;
1275		struct in_conninfo inc;
1276
1277		bzero(&inc, sizeof(inc));
1278		inc.inc_flags |= INC_ISIPV6;
1279		inc.inc6_faddr = *dst;
1280
1281		if (ifp == NULL)
1282			ifp = ro_pmtu->ro_rt->rt_ifp;
1283		ifmtu = IN6_LINKMTU(ifp);
1284		mtu = tcp_hc_getmtu(&inc);
1285		if (mtu)
1286			mtu = min(mtu, ro_pmtu->ro_rt->rt_rmx.rmx_mtu);
1287		else
1288			mtu = ro_pmtu->ro_rt->rt_rmx.rmx_mtu;
1289		if (mtu == 0)
1290			mtu = ifmtu;
1291		else if (mtu < IPV6_MMTU) {
1292			/*
1293			 * RFC2460 section 5, last paragraph:
1294			 * if we record ICMPv6 too big message with
1295			 * mtu < IPV6_MMTU, transmit packets sized IPV6_MMTU
1296			 * or smaller, with framgent header attached.
1297			 * (fragment header is needed regardless from the
1298			 * packet size, for translators to identify packets)
1299			 */
1300			alwaysfrag = 1;
1301			mtu = IPV6_MMTU;
1302		} else if (mtu > ifmtu) {
1303			/*
1304			 * The MTU on the route is larger than the MTU on
1305			 * the interface!  This shouldn't happen, unless the
1306			 * MTU of the interface has been changed after the
1307			 * interface was brought up.  Change the MTU in the
1308			 * route to match the interface MTU (as long as the
1309			 * field isn't locked).
1310			 */
1311			mtu = ifmtu;
1312			ro_pmtu->ro_rt->rt_rmx.rmx_mtu = mtu;
1313		}
1314	} else if (ifp) {
1315		mtu = IN6_LINKMTU(ifp);
1316	} else
1317		error = EHOSTUNREACH; /* XXX */
1318
1319	*mtup = mtu;
1320	if (alwaysfragp)
1321		*alwaysfragp = alwaysfrag;
1322	return (error);
1323}
1324
1325/*
1326 * IP6 socket option processing.
1327 */
1328int
1329ip6_ctloutput(struct socket *so, struct sockopt *sopt)
1330{
1331	int optdatalen, uproto;
1332	void *optdata;
1333	struct inpcb *in6p = sotoinpcb(so);
1334	int error, optval;
1335	int level, op, optname;
1336	int optlen;
1337	struct thread *td;
1338
1339	level = sopt->sopt_level;
1340	op = sopt->sopt_dir;
1341	optname = sopt->sopt_name;
1342	optlen = sopt->sopt_valsize;
1343	td = sopt->sopt_td;
1344	error = 0;
1345	optval = 0;
1346	uproto = (int)so->so_proto->pr_protocol;
1347
1348	if (level == IPPROTO_IPV6) {
1349		switch (op) {
1350
1351		case SOPT_SET:
1352			switch (optname) {
1353			case IPV6_2292PKTOPTIONS:
1354#ifdef IPV6_PKTOPTIONS
1355			case IPV6_PKTOPTIONS:
1356#endif
1357			{
1358				struct mbuf *m;
1359
1360				error = soopt_getm(sopt, &m); /* XXX */
1361				if (error != 0)
1362					break;
1363				error = soopt_mcopyin(sopt, m); /* XXX */
1364				if (error != 0)
1365					break;
1366				error = ip6_pcbopts(&in6p->in6p_outputopts,
1367						    m, so, sopt);
1368				m_freem(m); /* XXX */
1369				break;
1370			}
1371
1372			/*
1373			 * Use of some Hop-by-Hop options or some
1374			 * Destination options, might require special
1375			 * privilege.  That is, normal applications
1376			 * (without special privilege) might be forbidden
1377			 * from setting certain options in outgoing packets,
1378			 * and might never see certain options in received
1379			 * packets. [RFC 2292 Section 6]
1380			 * KAME specific note:
1381			 *  KAME prevents non-privileged users from sending or
1382			 *  receiving ANY hbh/dst options in order to avoid
1383			 *  overhead of parsing options in the kernel.
1384			 */
1385			case IPV6_RECVHOPOPTS:
1386			case IPV6_RECVDSTOPTS:
1387			case IPV6_RECVRTHDRDSTOPTS:
1388				if (td != NULL) {
1389					error = priv_check(td,
1390					    PRIV_NETINET_SETHDROPTS);
1391					if (error)
1392						break;
1393				}
1394				/* FALLTHROUGH */
1395			case IPV6_UNICAST_HOPS:
1396			case IPV6_HOPLIMIT:
1397			case IPV6_FAITH:
1398
1399			case IPV6_RECVPKTINFO:
1400			case IPV6_RECVHOPLIMIT:
1401			case IPV6_RECVRTHDR:
1402			case IPV6_RECVPATHMTU:
1403			case IPV6_RECVTCLASS:
1404			case IPV6_V6ONLY:
1405			case IPV6_AUTOFLOWLABEL:
1406			case IPV6_BINDANY:
1407				if (optname == IPV6_BINDANY && td != NULL) {
1408					error = priv_check(td,
1409					    PRIV_NETINET_BINDANY);
1410					if (error)
1411						break;
1412				}
1413
1414				if (optlen != sizeof(int)) {
1415					error = EINVAL;
1416					break;
1417				}
1418				error = sooptcopyin(sopt, &optval,
1419					sizeof optval, sizeof optval);
1420				if (error)
1421					break;
1422				switch (optname) {
1423
1424				case IPV6_UNICAST_HOPS:
1425					if (optval < -1 || optval >= 256)
1426						error = EINVAL;
1427					else {
1428						/* -1 = kernel default */
1429						in6p->in6p_hops = optval;
1430						if ((in6p->inp_vflag &
1431						     INP_IPV4) != 0)
1432							in6p->inp_ip_ttl = optval;
1433					}
1434					break;
1435#define OPTSET(bit) \
1436do { \
1437	if (optval) \
1438		in6p->inp_flags |= (bit); \
1439	else \
1440		in6p->inp_flags &= ~(bit); \
1441} while (/*CONSTCOND*/ 0)
1442#define OPTSET2292(bit) \
1443do { \
1444	in6p->inp_flags |= IN6P_RFC2292; \
1445	if (optval) \
1446		in6p->inp_flags |= (bit); \
1447	else \
1448		in6p->inp_flags &= ~(bit); \
1449} while (/*CONSTCOND*/ 0)
1450#define OPTBIT(bit) (in6p->inp_flags & (bit) ? 1 : 0)
1451
1452				case IPV6_RECVPKTINFO:
1453					/* cannot mix with RFC2292 */
1454					if (OPTBIT(IN6P_RFC2292)) {
1455						error = EINVAL;
1456						break;
1457					}
1458					OPTSET(IN6P_PKTINFO);
1459					break;
1460
1461				case IPV6_HOPLIMIT:
1462				{
1463					struct ip6_pktopts **optp;
1464
1465					/* cannot mix with RFC2292 */
1466					if (OPTBIT(IN6P_RFC2292)) {
1467						error = EINVAL;
1468						break;
1469					}
1470					optp = &in6p->in6p_outputopts;
1471					error = ip6_pcbopt(IPV6_HOPLIMIT,
1472					    (u_char *)&optval, sizeof(optval),
1473					    optp, (td != NULL) ? td->td_ucred :
1474					    NULL, uproto);
1475					break;
1476				}
1477
1478				case IPV6_RECVHOPLIMIT:
1479					/* cannot mix with RFC2292 */
1480					if (OPTBIT(IN6P_RFC2292)) {
1481						error = EINVAL;
1482						break;
1483					}
1484					OPTSET(IN6P_HOPLIMIT);
1485					break;
1486
1487				case IPV6_RECVHOPOPTS:
1488					/* cannot mix with RFC2292 */
1489					if (OPTBIT(IN6P_RFC2292)) {
1490						error = EINVAL;
1491						break;
1492					}
1493					OPTSET(IN6P_HOPOPTS);
1494					break;
1495
1496				case IPV6_RECVDSTOPTS:
1497					/* cannot mix with RFC2292 */
1498					if (OPTBIT(IN6P_RFC2292)) {
1499						error = EINVAL;
1500						break;
1501					}
1502					OPTSET(IN6P_DSTOPTS);
1503					break;
1504
1505				case IPV6_RECVRTHDRDSTOPTS:
1506					/* cannot mix with RFC2292 */
1507					if (OPTBIT(IN6P_RFC2292)) {
1508						error = EINVAL;
1509						break;
1510					}
1511					OPTSET(IN6P_RTHDRDSTOPTS);
1512					break;
1513
1514				case IPV6_RECVRTHDR:
1515					/* cannot mix with RFC2292 */
1516					if (OPTBIT(IN6P_RFC2292)) {
1517						error = EINVAL;
1518						break;
1519					}
1520					OPTSET(IN6P_RTHDR);
1521					break;
1522
1523				case IPV6_FAITH:
1524					OPTSET(INP_FAITH);
1525					break;
1526
1527				case IPV6_RECVPATHMTU:
1528					/*
1529					 * We ignore this option for TCP
1530					 * sockets.
1531					 * (RFC3542 leaves this case
1532					 * unspecified.)
1533					 */
1534					if (uproto != IPPROTO_TCP)
1535						OPTSET(IN6P_MTU);
1536					break;
1537
1538				case IPV6_V6ONLY:
1539					/*
1540					 * make setsockopt(IPV6_V6ONLY)
1541					 * available only prior to bind(2).
1542					 * see ipng mailing list, Jun 22 2001.
1543					 */
1544					if (in6p->inp_lport ||
1545					    !IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_laddr)) {
1546						error = EINVAL;
1547						break;
1548					}
1549					OPTSET(IN6P_IPV6_V6ONLY);
1550					if (optval)
1551						in6p->inp_vflag &= ~INP_IPV4;
1552					else
1553						in6p->inp_vflag |= INP_IPV4;
1554					break;
1555				case IPV6_RECVTCLASS:
1556					/* cannot mix with RFC2292 XXX */
1557					if (OPTBIT(IN6P_RFC2292)) {
1558						error = EINVAL;
1559						break;
1560					}
1561					OPTSET(IN6P_TCLASS);
1562					break;
1563				case IPV6_AUTOFLOWLABEL:
1564					OPTSET(IN6P_AUTOFLOWLABEL);
1565					break;
1566
1567				case IPV6_BINDANY:
1568					OPTSET(INP_BINDANY);
1569					break;
1570				}
1571				break;
1572
1573			case IPV6_TCLASS:
1574			case IPV6_DONTFRAG:
1575			case IPV6_USE_MIN_MTU:
1576			case IPV6_PREFER_TEMPADDR:
1577				if (optlen != sizeof(optval)) {
1578					error = EINVAL;
1579					break;
1580				}
1581				error = sooptcopyin(sopt, &optval,
1582					sizeof optval, sizeof optval);
1583				if (error)
1584					break;
1585				{
1586					struct ip6_pktopts **optp;
1587					optp = &in6p->in6p_outputopts;
1588					error = ip6_pcbopt(optname,
1589					    (u_char *)&optval, sizeof(optval),
1590					    optp, (td != NULL) ? td->td_ucred :
1591					    NULL, uproto);
1592					break;
1593				}
1594
1595			case IPV6_2292PKTINFO:
1596			case IPV6_2292HOPLIMIT:
1597			case IPV6_2292HOPOPTS:
1598			case IPV6_2292DSTOPTS:
1599			case IPV6_2292RTHDR:
1600				/* RFC 2292 */
1601				if (optlen != sizeof(int)) {
1602					error = EINVAL;
1603					break;
1604				}
1605				error = sooptcopyin(sopt, &optval,
1606					sizeof optval, sizeof optval);
1607				if (error)
1608					break;
1609				switch (optname) {
1610				case IPV6_2292PKTINFO:
1611					OPTSET2292(IN6P_PKTINFO);
1612					break;
1613				case IPV6_2292HOPLIMIT:
1614					OPTSET2292(IN6P_HOPLIMIT);
1615					break;
1616				case IPV6_2292HOPOPTS:
1617					/*
1618					 * Check super-user privilege.
1619					 * See comments for IPV6_RECVHOPOPTS.
1620					 */
1621					if (td != NULL) {
1622						error = priv_check(td,
1623						    PRIV_NETINET_SETHDROPTS);
1624						if (error)
1625							return (error);
1626					}
1627					OPTSET2292(IN6P_HOPOPTS);
1628					break;
1629				case IPV6_2292DSTOPTS:
1630					if (td != NULL) {
1631						error = priv_check(td,
1632						    PRIV_NETINET_SETHDROPTS);
1633						if (error)
1634							return (error);
1635					}
1636					OPTSET2292(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS); /* XXX */
1637					break;
1638				case IPV6_2292RTHDR:
1639					OPTSET2292(IN6P_RTHDR);
1640					break;
1641				}
1642				break;
1643			case IPV6_PKTINFO:
1644			case IPV6_HOPOPTS:
1645			case IPV6_RTHDR:
1646			case IPV6_DSTOPTS:
1647			case IPV6_RTHDRDSTOPTS:
1648			case IPV6_NEXTHOP:
1649			{
1650				/* new advanced API (RFC3542) */
1651				u_char *optbuf;
1652				u_char optbuf_storage[MCLBYTES];
1653				int optlen;
1654				struct ip6_pktopts **optp;
1655
1656				/* cannot mix with RFC2292 */
1657				if (OPTBIT(IN6P_RFC2292)) {
1658					error = EINVAL;
1659					break;
1660				}
1661
1662				/*
1663				 * We only ensure valsize is not too large
1664				 * here.  Further validation will be done
1665				 * later.
1666				 */
1667				error = sooptcopyin(sopt, optbuf_storage,
1668				    sizeof(optbuf_storage), 0);
1669				if (error)
1670					break;
1671				optlen = sopt->sopt_valsize;
1672				optbuf = optbuf_storage;
1673				optp = &in6p->in6p_outputopts;
1674				error = ip6_pcbopt(optname, optbuf, optlen,
1675				    optp, (td != NULL) ? td->td_ucred : NULL,
1676				    uproto);
1677				break;
1678			}
1679#undef OPTSET
1680
1681			case IPV6_MULTICAST_IF:
1682			case IPV6_MULTICAST_HOPS:
1683			case IPV6_MULTICAST_LOOP:
1684			case IPV6_JOIN_GROUP:
1685			case IPV6_LEAVE_GROUP:
1686			case IPV6_MSFILTER:
1687			case MCAST_BLOCK_SOURCE:
1688			case MCAST_UNBLOCK_SOURCE:
1689			case MCAST_JOIN_GROUP:
1690			case MCAST_LEAVE_GROUP:
1691			case MCAST_JOIN_SOURCE_GROUP:
1692			case MCAST_LEAVE_SOURCE_GROUP:
1693				error = ip6_setmoptions(in6p, sopt);
1694				break;
1695
1696			case IPV6_PORTRANGE:
1697				error = sooptcopyin(sopt, &optval,
1698				    sizeof optval, sizeof optval);
1699				if (error)
1700					break;
1701
1702				switch (optval) {
1703				case IPV6_PORTRANGE_DEFAULT:
1704					in6p->inp_flags &= ~(INP_LOWPORT);
1705					in6p->inp_flags &= ~(INP_HIGHPORT);
1706					break;
1707
1708				case IPV6_PORTRANGE_HIGH:
1709					in6p->inp_flags &= ~(INP_LOWPORT);
1710					in6p->inp_flags |= INP_HIGHPORT;
1711					break;
1712
1713				case IPV6_PORTRANGE_LOW:
1714					in6p->inp_flags &= ~(INP_HIGHPORT);
1715					in6p->inp_flags |= INP_LOWPORT;
1716					break;
1717
1718				default:
1719					error = EINVAL;
1720					break;
1721				}
1722				break;
1723
1724#ifdef IPSEC
1725			case IPV6_IPSEC_POLICY:
1726			{
1727				caddr_t req;
1728				struct mbuf *m;
1729
1730				if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */
1731					break;
1732				if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */
1733					break;
1734				req = mtod(m, caddr_t);
1735				error = ipsec_set_policy(in6p, optname, req,
1736				    m->m_len, (sopt->sopt_td != NULL) ?
1737				    sopt->sopt_td->td_ucred : NULL);
1738				m_freem(m);
1739				break;
1740			}
1741#endif /* IPSEC */
1742
1743			default:
1744				error = ENOPROTOOPT;
1745				break;
1746			}
1747			break;
1748
1749		case SOPT_GET:
1750			switch (optname) {
1751
1752			case IPV6_2292PKTOPTIONS:
1753#ifdef IPV6_PKTOPTIONS
1754			case IPV6_PKTOPTIONS:
1755#endif
1756				/*
1757				 * RFC3542 (effectively) deprecated the
1758				 * semantics of the 2292-style pktoptions.
1759				 * Since it was not reliable in nature (i.e.,
1760				 * applications had to expect the lack of some
1761				 * information after all), it would make sense
1762				 * to simplify this part by always returning
1763				 * empty data.
1764				 */
1765				sopt->sopt_valsize = 0;
1766				break;
1767
1768			case IPV6_RECVHOPOPTS:
1769			case IPV6_RECVDSTOPTS:
1770			case IPV6_RECVRTHDRDSTOPTS:
1771			case IPV6_UNICAST_HOPS:
1772			case IPV6_RECVPKTINFO:
1773			case IPV6_RECVHOPLIMIT:
1774			case IPV6_RECVRTHDR:
1775			case IPV6_RECVPATHMTU:
1776
1777			case IPV6_FAITH:
1778			case IPV6_V6ONLY:
1779			case IPV6_PORTRANGE:
1780			case IPV6_RECVTCLASS:
1781			case IPV6_AUTOFLOWLABEL:
1782				switch (optname) {
1783
1784				case IPV6_RECVHOPOPTS:
1785					optval = OPTBIT(IN6P_HOPOPTS);
1786					break;
1787
1788				case IPV6_RECVDSTOPTS:
1789					optval = OPTBIT(IN6P_DSTOPTS);
1790					break;
1791
1792				case IPV6_RECVRTHDRDSTOPTS:
1793					optval = OPTBIT(IN6P_RTHDRDSTOPTS);
1794					break;
1795
1796				case IPV6_UNICAST_HOPS:
1797					optval = in6p->in6p_hops;
1798					break;
1799
1800				case IPV6_RECVPKTINFO:
1801					optval = OPTBIT(IN6P_PKTINFO);
1802					break;
1803
1804				case IPV6_RECVHOPLIMIT:
1805					optval = OPTBIT(IN6P_HOPLIMIT);
1806					break;
1807
1808				case IPV6_RECVRTHDR:
1809					optval = OPTBIT(IN6P_RTHDR);
1810					break;
1811
1812				case IPV6_RECVPATHMTU:
1813					optval = OPTBIT(IN6P_MTU);
1814					break;
1815
1816				case IPV6_FAITH:
1817					optval = OPTBIT(INP_FAITH);
1818					break;
1819
1820				case IPV6_V6ONLY:
1821					optval = OPTBIT(IN6P_IPV6_V6ONLY);
1822					break;
1823
1824				case IPV6_PORTRANGE:
1825				    {
1826					int flags;
1827					flags = in6p->inp_flags;
1828					if (flags & INP_HIGHPORT)
1829						optval = IPV6_PORTRANGE_HIGH;
1830					else if (flags & INP_LOWPORT)
1831						optval = IPV6_PORTRANGE_LOW;
1832					else
1833						optval = 0;
1834					break;
1835				    }
1836				case IPV6_RECVTCLASS:
1837					optval = OPTBIT(IN6P_TCLASS);
1838					break;
1839
1840				case IPV6_AUTOFLOWLABEL:
1841					optval = OPTBIT(IN6P_AUTOFLOWLABEL);
1842					break;
1843
1844				case IPV6_BINDANY:
1845					optval = OPTBIT(INP_BINDANY);
1846					break;
1847				}
1848				if (error)
1849					break;
1850				error = sooptcopyout(sopt, &optval,
1851					sizeof optval);
1852				break;
1853
1854			case IPV6_PATHMTU:
1855			{
1856				u_long pmtu = 0;
1857				struct ip6_mtuinfo mtuinfo;
1858				struct route_in6 sro;
1859
1860				bzero(&sro, sizeof(sro));
1861
1862				if (!(so->so_state & SS_ISCONNECTED))
1863					return (ENOTCONN);
1864				/*
1865				 * XXX: we dot not consider the case of source
1866				 * routing, or optional information to specify
1867				 * the outgoing interface.
1868				 */
1869				error = ip6_getpmtu(&sro, NULL, NULL,
1870				    &in6p->in6p_faddr, &pmtu, NULL);
1871				if (sro.ro_rt)
1872					RTFREE(sro.ro_rt);
1873				if (error)
1874					break;
1875				if (pmtu > IPV6_MAXPACKET)
1876					pmtu = IPV6_MAXPACKET;
1877
1878				bzero(&mtuinfo, sizeof(mtuinfo));
1879				mtuinfo.ip6m_mtu = (u_int32_t)pmtu;
1880				optdata = (void *)&mtuinfo;
1881				optdatalen = sizeof(mtuinfo);
1882				error = sooptcopyout(sopt, optdata,
1883				    optdatalen);
1884				break;
1885			}
1886
1887			case IPV6_2292PKTINFO:
1888			case IPV6_2292HOPLIMIT:
1889			case IPV6_2292HOPOPTS:
1890			case IPV6_2292RTHDR:
1891			case IPV6_2292DSTOPTS:
1892				switch (optname) {
1893				case IPV6_2292PKTINFO:
1894					optval = OPTBIT(IN6P_PKTINFO);
1895					break;
1896				case IPV6_2292HOPLIMIT:
1897					optval = OPTBIT(IN6P_HOPLIMIT);
1898					break;
1899				case IPV6_2292HOPOPTS:
1900					optval = OPTBIT(IN6P_HOPOPTS);
1901					break;
1902				case IPV6_2292RTHDR:
1903					optval = OPTBIT(IN6P_RTHDR);
1904					break;
1905				case IPV6_2292DSTOPTS:
1906					optval = OPTBIT(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS);
1907					break;
1908				}
1909				error = sooptcopyout(sopt, &optval,
1910				    sizeof optval);
1911				break;
1912			case IPV6_PKTINFO:
1913			case IPV6_HOPOPTS:
1914			case IPV6_RTHDR:
1915			case IPV6_DSTOPTS:
1916			case IPV6_RTHDRDSTOPTS:
1917			case IPV6_NEXTHOP:
1918			case IPV6_TCLASS:
1919			case IPV6_DONTFRAG:
1920			case IPV6_USE_MIN_MTU:
1921			case IPV6_PREFER_TEMPADDR:
1922				error = ip6_getpcbopt(in6p->in6p_outputopts,
1923				    optname, sopt);
1924				break;
1925
1926			case IPV6_MULTICAST_IF:
1927			case IPV6_MULTICAST_HOPS:
1928			case IPV6_MULTICAST_LOOP:
1929			case IPV6_MSFILTER:
1930				error = ip6_getmoptions(in6p, sopt);
1931				break;
1932
1933#ifdef IPSEC
1934			case IPV6_IPSEC_POLICY:
1935			  {
1936				caddr_t req = NULL;
1937				size_t len = 0;
1938				struct mbuf *m = NULL;
1939				struct mbuf **mp = &m;
1940				size_t ovalsize = sopt->sopt_valsize;
1941				caddr_t oval = (caddr_t)sopt->sopt_val;
1942
1943				error = soopt_getm(sopt, &m); /* XXX */
1944				if (error != 0)
1945					break;
1946				error = soopt_mcopyin(sopt, m); /* XXX */
1947				if (error != 0)
1948					break;
1949				sopt->sopt_valsize = ovalsize;
1950				sopt->sopt_val = oval;
1951				if (m) {
1952					req = mtod(m, caddr_t);
1953					len = m->m_len;
1954				}
1955				error = ipsec_get_policy(in6p, req, len, mp);
1956				if (error == 0)
1957					error = soopt_mcopyout(sopt, m); /* XXX */
1958				if (error == 0 && m)
1959					m_freem(m);
1960				break;
1961			  }
1962#endif /* IPSEC */
1963
1964			default:
1965				error = ENOPROTOOPT;
1966				break;
1967			}
1968			break;
1969		}
1970	} else {		/* level != IPPROTO_IPV6 */
1971		error = EINVAL;
1972	}
1973	return (error);
1974}
1975
1976int
1977ip6_raw_ctloutput(struct socket *so, struct sockopt *sopt)
1978{
1979	int error = 0, optval, optlen;
1980	const int icmp6off = offsetof(struct icmp6_hdr, icmp6_cksum);
1981	struct inpcb *in6p = sotoinpcb(so);
1982	int level, op, optname;
1983
1984	level = sopt->sopt_level;
1985	op = sopt->sopt_dir;
1986	optname = sopt->sopt_name;
1987	optlen = sopt->sopt_valsize;
1988
1989	if (level != IPPROTO_IPV6) {
1990		return (EINVAL);
1991	}
1992
1993	switch (optname) {
1994	case IPV6_CHECKSUM:
1995		/*
1996		 * For ICMPv6 sockets, no modification allowed for checksum
1997		 * offset, permit "no change" values to help existing apps.
1998		 *
1999		 * RFC3542 says: "An attempt to set IPV6_CHECKSUM
2000		 * for an ICMPv6 socket will fail."
2001		 * The current behavior does not meet RFC3542.
2002		 */
2003		switch (op) {
2004		case SOPT_SET:
2005			if (optlen != sizeof(int)) {
2006				error = EINVAL;
2007				break;
2008			}
2009			error = sooptcopyin(sopt, &optval, sizeof(optval),
2010					    sizeof(optval));
2011			if (error)
2012				break;
2013			if ((optval % 2) != 0) {
2014				/* the API assumes even offset values */
2015				error = EINVAL;
2016			} else if (so->so_proto->pr_protocol ==
2017			    IPPROTO_ICMPV6) {
2018				if (optval != icmp6off)
2019					error = EINVAL;
2020			} else
2021				in6p->in6p_cksum = optval;
2022			break;
2023
2024		case SOPT_GET:
2025			if (so->so_proto->pr_protocol == IPPROTO_ICMPV6)
2026				optval = icmp6off;
2027			else
2028				optval = in6p->in6p_cksum;
2029
2030			error = sooptcopyout(sopt, &optval, sizeof(optval));
2031			break;
2032
2033		default:
2034			error = EINVAL;
2035			break;
2036		}
2037		break;
2038
2039	default:
2040		error = ENOPROTOOPT;
2041		break;
2042	}
2043
2044	return (error);
2045}
2046
2047/*
2048 * Set up IP6 options in pcb for insertion in output packets or
2049 * specifying behavior of outgoing packets.
2050 */
2051static int
2052ip6_pcbopts(struct ip6_pktopts **pktopt, struct mbuf *m,
2053    struct socket *so, struct sockopt *sopt)
2054{
2055	struct ip6_pktopts *opt = *pktopt;
2056	int error = 0;
2057	struct thread *td = sopt->sopt_td;
2058
2059	/* turn off any old options. */
2060	if (opt) {
2061#ifdef DIAGNOSTIC
2062		if (opt->ip6po_pktinfo || opt->ip6po_nexthop ||
2063		    opt->ip6po_hbh || opt->ip6po_dest1 || opt->ip6po_dest2 ||
2064		    opt->ip6po_rhinfo.ip6po_rhi_rthdr)
2065			printf("ip6_pcbopts: all specified options are cleared.\n");
2066#endif
2067		ip6_clearpktopts(opt, -1);
2068	} else
2069		opt = malloc(sizeof(*opt), M_IP6OPT, M_WAITOK);
2070	*pktopt = NULL;
2071
2072	if (!m || m->m_len == 0) {
2073		/*
2074		 * Only turning off any previous options, regardless of
2075		 * whether the opt is just created or given.
2076		 */
2077		free(opt, M_IP6OPT);
2078		return (0);
2079	}
2080
2081	/*  set options specified by user. */
2082	if ((error = ip6_setpktopts(m, opt, NULL, (td != NULL) ?
2083	    td->td_ucred : NULL, so->so_proto->pr_protocol)) != 0) {
2084		ip6_clearpktopts(opt, -1); /* XXX: discard all options */
2085		free(opt, M_IP6OPT);
2086		return (error);
2087	}
2088	*pktopt = opt;
2089	return (0);
2090}
2091
2092/*
2093 * initialize ip6_pktopts.  beware that there are non-zero default values in
2094 * the struct.
2095 */
2096void
2097ip6_initpktopts(struct ip6_pktopts *opt)
2098{
2099
2100	bzero(opt, sizeof(*opt));
2101	opt->ip6po_hlim = -1;	/* -1 means default hop limit */
2102	opt->ip6po_tclass = -1;	/* -1 means default traffic class */
2103	opt->ip6po_minmtu = IP6PO_MINMTU_MCASTONLY;
2104	opt->ip6po_prefer_tempaddr = IP6PO_TEMPADDR_SYSTEM;
2105}
2106
2107static int
2108ip6_pcbopt(int optname, u_char *buf, int len, struct ip6_pktopts **pktopt,
2109    struct ucred *cred, int uproto)
2110{
2111	struct ip6_pktopts *opt;
2112
2113	if (*pktopt == NULL) {
2114		*pktopt = malloc(sizeof(struct ip6_pktopts), M_IP6OPT,
2115		    M_WAITOK);
2116		ip6_initpktopts(*pktopt);
2117	}
2118	opt = *pktopt;
2119
2120	return (ip6_setpktopt(optname, buf, len, opt, cred, 1, 0, uproto));
2121}
2122
2123static int
2124ip6_getpcbopt(struct ip6_pktopts *pktopt, int optname, struct sockopt *sopt)
2125{
2126	void *optdata = NULL;
2127	int optdatalen = 0;
2128	struct ip6_ext *ip6e;
2129	int error = 0;
2130	struct in6_pktinfo null_pktinfo;
2131	int deftclass = 0, on;
2132	int defminmtu = IP6PO_MINMTU_MCASTONLY;
2133	int defpreftemp = IP6PO_TEMPADDR_SYSTEM;
2134
2135	switch (optname) {
2136	case IPV6_PKTINFO:
2137		if (pktopt && pktopt->ip6po_pktinfo)
2138			optdata = (void *)pktopt->ip6po_pktinfo;
2139		else {
2140			/* XXX: we don't have to do this every time... */
2141			bzero(&null_pktinfo, sizeof(null_pktinfo));
2142			optdata = (void *)&null_pktinfo;
2143		}
2144		optdatalen = sizeof(struct in6_pktinfo);
2145		break;
2146	case IPV6_TCLASS:
2147		if (pktopt && pktopt->ip6po_tclass >= 0)
2148			optdata = (void *)&pktopt->ip6po_tclass;
2149		else
2150			optdata = (void *)&deftclass;
2151		optdatalen = sizeof(int);
2152		break;
2153	case IPV6_HOPOPTS:
2154		if (pktopt && pktopt->ip6po_hbh) {
2155			optdata = (void *)pktopt->ip6po_hbh;
2156			ip6e = (struct ip6_ext *)pktopt->ip6po_hbh;
2157			optdatalen = (ip6e->ip6e_len + 1) << 3;
2158		}
2159		break;
2160	case IPV6_RTHDR:
2161		if (pktopt && pktopt->ip6po_rthdr) {
2162			optdata = (void *)pktopt->ip6po_rthdr;
2163			ip6e = (struct ip6_ext *)pktopt->ip6po_rthdr;
2164			optdatalen = (ip6e->ip6e_len + 1) << 3;
2165		}
2166		break;
2167	case IPV6_RTHDRDSTOPTS:
2168		if (pktopt && pktopt->ip6po_dest1) {
2169			optdata = (void *)pktopt->ip6po_dest1;
2170			ip6e = (struct ip6_ext *)pktopt->ip6po_dest1;
2171			optdatalen = (ip6e->ip6e_len + 1) << 3;
2172		}
2173		break;
2174	case IPV6_DSTOPTS:
2175		if (pktopt && pktopt->ip6po_dest2) {
2176			optdata = (void *)pktopt->ip6po_dest2;
2177			ip6e = (struct ip6_ext *)pktopt->ip6po_dest2;
2178			optdatalen = (ip6e->ip6e_len + 1) << 3;
2179		}
2180		break;
2181	case IPV6_NEXTHOP:
2182		if (pktopt && pktopt->ip6po_nexthop) {
2183			optdata = (void *)pktopt->ip6po_nexthop;
2184			optdatalen = pktopt->ip6po_nexthop->sa_len;
2185		}
2186		break;
2187	case IPV6_USE_MIN_MTU:
2188		if (pktopt)
2189			optdata = (void *)&pktopt->ip6po_minmtu;
2190		else
2191			optdata = (void *)&defminmtu;
2192		optdatalen = sizeof(int);
2193		break;
2194	case IPV6_DONTFRAG:
2195		if (pktopt && ((pktopt->ip6po_flags) & IP6PO_DONTFRAG))
2196			on = 1;
2197		else
2198			on = 0;
2199		optdata = (void *)&on;
2200		optdatalen = sizeof(on);
2201		break;
2202	case IPV6_PREFER_TEMPADDR:
2203		if (pktopt)
2204			optdata = (void *)&pktopt->ip6po_prefer_tempaddr;
2205		else
2206			optdata = (void *)&defpreftemp;
2207		optdatalen = sizeof(int);
2208		break;
2209	default:		/* should not happen */
2210#ifdef DIAGNOSTIC
2211		panic("ip6_getpcbopt: unexpected option\n");
2212#endif
2213		return (ENOPROTOOPT);
2214	}
2215
2216	error = sooptcopyout(sopt, optdata, optdatalen);
2217
2218	return (error);
2219}
2220
2221void
2222ip6_clearpktopts(struct ip6_pktopts *pktopt, int optname)
2223{
2224	if (pktopt == NULL)
2225		return;
2226
2227	if (optname == -1 || optname == IPV6_PKTINFO) {
2228		if (pktopt->ip6po_pktinfo)
2229			free(pktopt->ip6po_pktinfo, M_IP6OPT);
2230		pktopt->ip6po_pktinfo = NULL;
2231	}
2232	if (optname == -1 || optname == IPV6_HOPLIMIT)
2233		pktopt->ip6po_hlim = -1;
2234	if (optname == -1 || optname == IPV6_TCLASS)
2235		pktopt->ip6po_tclass = -1;
2236	if (optname == -1 || optname == IPV6_NEXTHOP) {
2237		if (pktopt->ip6po_nextroute.ro_rt) {
2238			RTFREE(pktopt->ip6po_nextroute.ro_rt);
2239			pktopt->ip6po_nextroute.ro_rt = NULL;
2240		}
2241		if (pktopt->ip6po_nexthop)
2242			free(pktopt->ip6po_nexthop, M_IP6OPT);
2243		pktopt->ip6po_nexthop = NULL;
2244	}
2245	if (optname == -1 || optname == IPV6_HOPOPTS) {
2246		if (pktopt->ip6po_hbh)
2247			free(pktopt->ip6po_hbh, M_IP6OPT);
2248		pktopt->ip6po_hbh = NULL;
2249	}
2250	if (optname == -1 || optname == IPV6_RTHDRDSTOPTS) {
2251		if (pktopt->ip6po_dest1)
2252			free(pktopt->ip6po_dest1, M_IP6OPT);
2253		pktopt->ip6po_dest1 = NULL;
2254	}
2255	if (optname == -1 || optname == IPV6_RTHDR) {
2256		if (pktopt->ip6po_rhinfo.ip6po_rhi_rthdr)
2257			free(pktopt->ip6po_rhinfo.ip6po_rhi_rthdr, M_IP6OPT);
2258		pktopt->ip6po_rhinfo.ip6po_rhi_rthdr = NULL;
2259		if (pktopt->ip6po_route.ro_rt) {
2260			RTFREE(pktopt->ip6po_route.ro_rt);
2261			pktopt->ip6po_route.ro_rt = NULL;
2262		}
2263	}
2264	if (optname == -1 || optname == IPV6_DSTOPTS) {
2265		if (pktopt->ip6po_dest2)
2266			free(pktopt->ip6po_dest2, M_IP6OPT);
2267		pktopt->ip6po_dest2 = NULL;
2268	}
2269}
2270
2271#define PKTOPT_EXTHDRCPY(type) \
2272do {\
2273	if (src->type) {\
2274		int hlen = (((struct ip6_ext *)src->type)->ip6e_len + 1) << 3;\
2275		dst->type = malloc(hlen, M_IP6OPT, canwait);\
2276		if (dst->type == NULL && canwait == M_NOWAIT)\
2277			goto bad;\
2278		bcopy(src->type, dst->type, hlen);\
2279	}\
2280} while (/*CONSTCOND*/ 0)
2281
2282static int
2283copypktopts(struct ip6_pktopts *dst, struct ip6_pktopts *src, int canwait)
2284{
2285	if (dst == NULL || src == NULL)  {
2286		printf("ip6_clearpktopts: invalid argument\n");
2287		return (EINVAL);
2288	}
2289
2290	dst->ip6po_hlim = src->ip6po_hlim;
2291	dst->ip6po_tclass = src->ip6po_tclass;
2292	dst->ip6po_flags = src->ip6po_flags;
2293	if (src->ip6po_pktinfo) {
2294		dst->ip6po_pktinfo = malloc(sizeof(*dst->ip6po_pktinfo),
2295		    M_IP6OPT, canwait);
2296		if (dst->ip6po_pktinfo == NULL)
2297			goto bad;
2298		*dst->ip6po_pktinfo = *src->ip6po_pktinfo;
2299	}
2300	if (src->ip6po_nexthop) {
2301		dst->ip6po_nexthop = malloc(src->ip6po_nexthop->sa_len,
2302		    M_IP6OPT, canwait);
2303		if (dst->ip6po_nexthop == NULL)
2304			goto bad;
2305		bcopy(src->ip6po_nexthop, dst->ip6po_nexthop,
2306		    src->ip6po_nexthop->sa_len);
2307	}
2308	PKTOPT_EXTHDRCPY(ip6po_hbh);
2309	PKTOPT_EXTHDRCPY(ip6po_dest1);
2310	PKTOPT_EXTHDRCPY(ip6po_dest2);
2311	PKTOPT_EXTHDRCPY(ip6po_rthdr); /* not copy the cached route */
2312	return (0);
2313
2314  bad:
2315	ip6_clearpktopts(dst, -1);
2316	return (ENOBUFS);
2317}
2318#undef PKTOPT_EXTHDRCPY
2319
2320struct ip6_pktopts *
2321ip6_copypktopts(struct ip6_pktopts *src, int canwait)
2322{
2323	int error;
2324	struct ip6_pktopts *dst;
2325
2326	dst = malloc(sizeof(*dst), M_IP6OPT, canwait);
2327	if (dst == NULL)
2328		return (NULL);
2329	ip6_initpktopts(dst);
2330
2331	if ((error = copypktopts(dst, src, canwait)) != 0) {
2332		free(dst, M_IP6OPT);
2333		return (NULL);
2334	}
2335
2336	return (dst);
2337}
2338
2339void
2340ip6_freepcbopts(struct ip6_pktopts *pktopt)
2341{
2342	if (pktopt == NULL)
2343		return;
2344
2345	ip6_clearpktopts(pktopt, -1);
2346
2347	free(pktopt, M_IP6OPT);
2348}
2349
2350/*
2351 * Set IPv6 outgoing packet options based on advanced API.
2352 */
2353int
2354ip6_setpktopts(struct mbuf *control, struct ip6_pktopts *opt,
2355    struct ip6_pktopts *stickyopt, struct ucred *cred, int uproto)
2356{
2357	struct cmsghdr *cm = 0;
2358
2359	if (control == NULL || opt == NULL)
2360		return (EINVAL);
2361
2362	ip6_initpktopts(opt);
2363	if (stickyopt) {
2364		int error;
2365
2366		/*
2367		 * If stickyopt is provided, make a local copy of the options
2368		 * for this particular packet, then override them by ancillary
2369		 * objects.
2370		 * XXX: copypktopts() does not copy the cached route to a next
2371		 * hop (if any).  This is not very good in terms of efficiency,
2372		 * but we can allow this since this option should be rarely
2373		 * used.
2374		 */
2375		if ((error = copypktopts(opt, stickyopt, M_NOWAIT)) != 0)
2376			return (error);
2377	}
2378
2379	/*
2380	 * XXX: Currently, we assume all the optional information is stored
2381	 * in a single mbuf.
2382	 */
2383	if (control->m_next)
2384		return (EINVAL);
2385
2386	for (; control->m_len > 0; control->m_data += CMSG_ALIGN(cm->cmsg_len),
2387	    control->m_len -= CMSG_ALIGN(cm->cmsg_len)) {
2388		int error;
2389
2390		if (control->m_len < CMSG_LEN(0))
2391			return (EINVAL);
2392
2393		cm = mtod(control, struct cmsghdr *);
2394		if (cm->cmsg_len == 0 || cm->cmsg_len > control->m_len)
2395			return (EINVAL);
2396		if (cm->cmsg_level != IPPROTO_IPV6)
2397			continue;
2398
2399		error = ip6_setpktopt(cm->cmsg_type, CMSG_DATA(cm),
2400		    cm->cmsg_len - CMSG_LEN(0), opt, cred, 0, 1, uproto);
2401		if (error)
2402			return (error);
2403	}
2404
2405	return (0);
2406}
2407
2408/*
2409 * Set a particular packet option, as a sticky option or an ancillary data
2410 * item.  "len" can be 0 only when it's a sticky option.
2411 * We have 4 cases of combination of "sticky" and "cmsg":
2412 * "sticky=0, cmsg=0": impossible
2413 * "sticky=0, cmsg=1": RFC2292 or RFC3542 ancillary data
2414 * "sticky=1, cmsg=0": RFC3542 socket option
2415 * "sticky=1, cmsg=1": RFC2292 socket option
2416 */
2417static int
2418ip6_setpktopt(int optname, u_char *buf, int len, struct ip6_pktopts *opt,
2419    struct ucred *cred, int sticky, int cmsg, int uproto)
2420{
2421	int minmtupolicy, preftemp;
2422	int error;
2423
2424	if (!sticky && !cmsg) {
2425#ifdef DIAGNOSTIC
2426		printf("ip6_setpktopt: impossible case\n");
2427#endif
2428		return (EINVAL);
2429	}
2430
2431	/*
2432	 * IPV6_2292xxx is for backward compatibility to RFC2292, and should
2433	 * not be specified in the context of RFC3542.  Conversely,
2434	 * RFC3542 types should not be specified in the context of RFC2292.
2435	 */
2436	if (!cmsg) {
2437		switch (optname) {
2438		case IPV6_2292PKTINFO:
2439		case IPV6_2292HOPLIMIT:
2440		case IPV6_2292NEXTHOP:
2441		case IPV6_2292HOPOPTS:
2442		case IPV6_2292DSTOPTS:
2443		case IPV6_2292RTHDR:
2444		case IPV6_2292PKTOPTIONS:
2445			return (ENOPROTOOPT);
2446		}
2447	}
2448	if (sticky && cmsg) {
2449		switch (optname) {
2450		case IPV6_PKTINFO:
2451		case IPV6_HOPLIMIT:
2452		case IPV6_NEXTHOP:
2453		case IPV6_HOPOPTS:
2454		case IPV6_DSTOPTS:
2455		case IPV6_RTHDRDSTOPTS:
2456		case IPV6_RTHDR:
2457		case IPV6_USE_MIN_MTU:
2458		case IPV6_DONTFRAG:
2459		case IPV6_TCLASS:
2460		case IPV6_PREFER_TEMPADDR: /* XXX: not an RFC3542 option */
2461			return (ENOPROTOOPT);
2462		}
2463	}
2464
2465	switch (optname) {
2466	case IPV6_2292PKTINFO:
2467	case IPV6_PKTINFO:
2468	{
2469		struct ifnet *ifp = NULL;
2470		struct in6_pktinfo *pktinfo;
2471
2472		if (len != sizeof(struct in6_pktinfo))
2473			return (EINVAL);
2474
2475		pktinfo = (struct in6_pktinfo *)buf;
2476
2477		/*
2478		 * An application can clear any sticky IPV6_PKTINFO option by
2479		 * doing a "regular" setsockopt with ipi6_addr being
2480		 * in6addr_any and ipi6_ifindex being zero.
2481		 * [RFC 3542, Section 6]
2482		 */
2483		if (optname == IPV6_PKTINFO && opt->ip6po_pktinfo &&
2484		    pktinfo->ipi6_ifindex == 0 &&
2485		    IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) {
2486			ip6_clearpktopts(opt, optname);
2487			break;
2488		}
2489
2490		if (uproto == IPPROTO_TCP && optname == IPV6_PKTINFO &&
2491		    sticky && !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) {
2492			return (EINVAL);
2493		}
2494
2495		/* validate the interface index if specified. */
2496		if (pktinfo->ipi6_ifindex > V_if_index ||
2497		    pktinfo->ipi6_ifindex < 0) {
2498			 return (ENXIO);
2499		}
2500		if (pktinfo->ipi6_ifindex) {
2501			ifp = ifnet_byindex(pktinfo->ipi6_ifindex);
2502			if (ifp == NULL)
2503				return (ENXIO);
2504		}
2505
2506		/*
2507		 * We store the address anyway, and let in6_selectsrc()
2508		 * validate the specified address.  This is because ipi6_addr
2509		 * may not have enough information about its scope zone, and
2510		 * we may need additional information (such as outgoing
2511		 * interface or the scope zone of a destination address) to
2512		 * disambiguate the scope.
2513		 * XXX: the delay of the validation may confuse the
2514		 * application when it is used as a sticky option.
2515		 */
2516		if (opt->ip6po_pktinfo == NULL) {
2517			opt->ip6po_pktinfo = malloc(sizeof(*pktinfo),
2518			    M_IP6OPT, M_NOWAIT);
2519			if (opt->ip6po_pktinfo == NULL)
2520				return (ENOBUFS);
2521		}
2522		bcopy(pktinfo, opt->ip6po_pktinfo, sizeof(*pktinfo));
2523		break;
2524	}
2525
2526	case IPV6_2292HOPLIMIT:
2527	case IPV6_HOPLIMIT:
2528	{
2529		int *hlimp;
2530
2531		/*
2532		 * RFC 3542 deprecated the usage of sticky IPV6_HOPLIMIT
2533		 * to simplify the ordering among hoplimit options.
2534		 */
2535		if (optname == IPV6_HOPLIMIT && sticky)
2536			return (ENOPROTOOPT);
2537
2538		if (len != sizeof(int))
2539			return (EINVAL);
2540		hlimp = (int *)buf;
2541		if (*hlimp < -1 || *hlimp > 255)
2542			return (EINVAL);
2543
2544		opt->ip6po_hlim = *hlimp;
2545		break;
2546	}
2547
2548	case IPV6_TCLASS:
2549	{
2550		int tclass;
2551
2552		if (len != sizeof(int))
2553			return (EINVAL);
2554		tclass = *(int *)buf;
2555		if (tclass < -1 || tclass > 255)
2556			return (EINVAL);
2557
2558		opt->ip6po_tclass = tclass;
2559		break;
2560	}
2561
2562	case IPV6_2292NEXTHOP:
2563	case IPV6_NEXTHOP:
2564		if (cred != NULL) {
2565			error = priv_check_cred(cred,
2566			    PRIV_NETINET_SETHDROPTS, 0);
2567			if (error)
2568				return (error);
2569		}
2570
2571		if (len == 0) {	/* just remove the option */
2572			ip6_clearpktopts(opt, IPV6_NEXTHOP);
2573			break;
2574		}
2575
2576		/* check if cmsg_len is large enough for sa_len */
2577		if (len < sizeof(struct sockaddr) || len < *buf)
2578			return (EINVAL);
2579
2580		switch (((struct sockaddr *)buf)->sa_family) {
2581		case AF_INET6:
2582		{
2583			struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)buf;
2584			int error;
2585
2586			if (sa6->sin6_len != sizeof(struct sockaddr_in6))
2587				return (EINVAL);
2588
2589			if (IN6_IS_ADDR_UNSPECIFIED(&sa6->sin6_addr) ||
2590			    IN6_IS_ADDR_MULTICAST(&sa6->sin6_addr)) {
2591				return (EINVAL);
2592			}
2593			if ((error = sa6_embedscope(sa6, V_ip6_use_defzone))
2594			    != 0) {
2595				return (error);
2596			}
2597			break;
2598		}
2599		case AF_LINK:	/* should eventually be supported */
2600		default:
2601			return (EAFNOSUPPORT);
2602		}
2603
2604		/* turn off the previous option, then set the new option. */
2605		ip6_clearpktopts(opt, IPV6_NEXTHOP);
2606		opt->ip6po_nexthop = malloc(*buf, M_IP6OPT, M_NOWAIT);
2607		if (opt->ip6po_nexthop == NULL)
2608			return (ENOBUFS);
2609		bcopy(buf, opt->ip6po_nexthop, *buf);
2610		break;
2611
2612	case IPV6_2292HOPOPTS:
2613	case IPV6_HOPOPTS:
2614	{
2615		struct ip6_hbh *hbh;
2616		int hbhlen;
2617
2618		/*
2619		 * XXX: We don't allow a non-privileged user to set ANY HbH
2620		 * options, since per-option restriction has too much
2621		 * overhead.
2622		 */
2623		if (cred != NULL) {
2624			error = priv_check_cred(cred,
2625			    PRIV_NETINET_SETHDROPTS, 0);
2626			if (error)
2627				return (error);
2628		}
2629
2630		if (len == 0) {
2631			ip6_clearpktopts(opt, IPV6_HOPOPTS);
2632			break;	/* just remove the option */
2633		}
2634
2635		/* message length validation */
2636		if (len < sizeof(struct ip6_hbh))
2637			return (EINVAL);
2638		hbh = (struct ip6_hbh *)buf;
2639		hbhlen = (hbh->ip6h_len + 1) << 3;
2640		if (len != hbhlen)
2641			return (EINVAL);
2642
2643		/* turn off the previous option, then set the new option. */
2644		ip6_clearpktopts(opt, IPV6_HOPOPTS);
2645		opt->ip6po_hbh = malloc(hbhlen, M_IP6OPT, M_NOWAIT);
2646		if (opt->ip6po_hbh == NULL)
2647			return (ENOBUFS);
2648		bcopy(hbh, opt->ip6po_hbh, hbhlen);
2649
2650		break;
2651	}
2652
2653	case IPV6_2292DSTOPTS:
2654	case IPV6_DSTOPTS:
2655	case IPV6_RTHDRDSTOPTS:
2656	{
2657		struct ip6_dest *dest, **newdest = NULL;
2658		int destlen;
2659
2660		if (cred != NULL) { /* XXX: see the comment for IPV6_HOPOPTS */
2661			error = priv_check_cred(cred,
2662			    PRIV_NETINET_SETHDROPTS, 0);
2663			if (error)
2664				return (error);
2665		}
2666
2667		if (len == 0) {
2668			ip6_clearpktopts(opt, optname);
2669			break;	/* just remove the option */
2670		}
2671
2672		/* message length validation */
2673		if (len < sizeof(struct ip6_dest))
2674			return (EINVAL);
2675		dest = (struct ip6_dest *)buf;
2676		destlen = (dest->ip6d_len + 1) << 3;
2677		if (len != destlen)
2678			return (EINVAL);
2679
2680		/*
2681		 * Determine the position that the destination options header
2682		 * should be inserted; before or after the routing header.
2683		 */
2684		switch (optname) {
2685		case IPV6_2292DSTOPTS:
2686			/*
2687			 * The old advacned API is ambiguous on this point.
2688			 * Our approach is to determine the position based
2689			 * according to the existence of a routing header.
2690			 * Note, however, that this depends on the order of the
2691			 * extension headers in the ancillary data; the 1st
2692			 * part of the destination options header must appear
2693			 * before the routing header in the ancillary data,
2694			 * too.
2695			 * RFC3542 solved the ambiguity by introducing
2696			 * separate ancillary data or option types.
2697			 */
2698			if (opt->ip6po_rthdr == NULL)
2699				newdest = &opt->ip6po_dest1;
2700			else
2701				newdest = &opt->ip6po_dest2;
2702			break;
2703		case IPV6_RTHDRDSTOPTS:
2704			newdest = &opt->ip6po_dest1;
2705			break;
2706		case IPV6_DSTOPTS:
2707			newdest = &opt->ip6po_dest2;
2708			break;
2709		}
2710
2711		/* turn off the previous option, then set the new option. */
2712		ip6_clearpktopts(opt, optname);
2713		*newdest = malloc(destlen, M_IP6OPT, M_NOWAIT);
2714		if (*newdest == NULL)
2715			return (ENOBUFS);
2716		bcopy(dest, *newdest, destlen);
2717
2718		break;
2719	}
2720
2721	case IPV6_2292RTHDR:
2722	case IPV6_RTHDR:
2723	{
2724		struct ip6_rthdr *rth;
2725		int rthlen;
2726
2727		if (len == 0) {
2728			ip6_clearpktopts(opt, IPV6_RTHDR);
2729			break;	/* just remove the option */
2730		}
2731
2732		/* message length validation */
2733		if (len < sizeof(struct ip6_rthdr))
2734			return (EINVAL);
2735		rth = (struct ip6_rthdr *)buf;
2736		rthlen = (rth->ip6r_len + 1) << 3;
2737		if (len != rthlen)
2738			return (EINVAL);
2739
2740		switch (rth->ip6r_type) {
2741		case IPV6_RTHDR_TYPE_0:
2742			if (rth->ip6r_len == 0)	/* must contain one addr */
2743				return (EINVAL);
2744			if (rth->ip6r_len % 2) /* length must be even */
2745				return (EINVAL);
2746			if (rth->ip6r_len / 2 != rth->ip6r_segleft)
2747				return (EINVAL);
2748			break;
2749		default:
2750			return (EINVAL);	/* not supported */
2751		}
2752
2753		/* turn off the previous option */
2754		ip6_clearpktopts(opt, IPV6_RTHDR);
2755		opt->ip6po_rthdr = malloc(rthlen, M_IP6OPT, M_NOWAIT);
2756		if (opt->ip6po_rthdr == NULL)
2757			return (ENOBUFS);
2758		bcopy(rth, opt->ip6po_rthdr, rthlen);
2759
2760		break;
2761	}
2762
2763	case IPV6_USE_MIN_MTU:
2764		if (len != sizeof(int))
2765			return (EINVAL);
2766		minmtupolicy = *(int *)buf;
2767		if (minmtupolicy != IP6PO_MINMTU_MCASTONLY &&
2768		    minmtupolicy != IP6PO_MINMTU_DISABLE &&
2769		    minmtupolicy != IP6PO_MINMTU_ALL) {
2770			return (EINVAL);
2771		}
2772		opt->ip6po_minmtu = minmtupolicy;
2773		break;
2774
2775	case IPV6_DONTFRAG:
2776		if (len != sizeof(int))
2777			return (EINVAL);
2778
2779		if (uproto == IPPROTO_TCP || *(int *)buf == 0) {
2780			/*
2781			 * we ignore this option for TCP sockets.
2782			 * (RFC3542 leaves this case unspecified.)
2783			 */
2784			opt->ip6po_flags &= ~IP6PO_DONTFRAG;
2785		} else
2786			opt->ip6po_flags |= IP6PO_DONTFRAG;
2787		break;
2788
2789	case IPV6_PREFER_TEMPADDR:
2790		if (len != sizeof(int))
2791			return (EINVAL);
2792		preftemp = *(int *)buf;
2793		if (preftemp != IP6PO_TEMPADDR_SYSTEM &&
2794		    preftemp != IP6PO_TEMPADDR_NOTPREFER &&
2795		    preftemp != IP6PO_TEMPADDR_PREFER) {
2796			return (EINVAL);
2797		}
2798		opt->ip6po_prefer_tempaddr = preftemp;
2799		break;
2800
2801	default:
2802		return (ENOPROTOOPT);
2803	} /* end of switch */
2804
2805	return (0);
2806}
2807
2808/*
2809 * Routine called from ip6_output() to loop back a copy of an IP6 multicast
2810 * packet to the input queue of a specified interface.  Note that this
2811 * calls the output routine of the loopback "driver", but with an interface
2812 * pointer that might NOT be &loif -- easier than replicating that code here.
2813 */
2814void
2815ip6_mloopback(struct ifnet *ifp, struct mbuf *m, struct sockaddr_in6 *dst)
2816{
2817	struct mbuf *copym;
2818	struct ip6_hdr *ip6;
2819
2820	copym = m_copy(m, 0, M_COPYALL);
2821	if (copym == NULL)
2822		return;
2823
2824	/*
2825	 * Make sure to deep-copy IPv6 header portion in case the data
2826	 * is in an mbuf cluster, so that we can safely override the IPv6
2827	 * header portion later.
2828	 */
2829	if ((copym->m_flags & M_EXT) != 0 ||
2830	    copym->m_len < sizeof(struct ip6_hdr)) {
2831		copym = m_pullup(copym, sizeof(struct ip6_hdr));
2832		if (copym == NULL)
2833			return;
2834	}
2835
2836#ifdef DIAGNOSTIC
2837	if (copym->m_len < sizeof(*ip6)) {
2838		m_freem(copym);
2839		return;
2840	}
2841#endif
2842
2843	ip6 = mtod(copym, struct ip6_hdr *);
2844	/*
2845	 * clear embedded scope identifiers if necessary.
2846	 * in6_clearscope will touch the addresses only when necessary.
2847	 */
2848	in6_clearscope(&ip6->ip6_src);
2849	in6_clearscope(&ip6->ip6_dst);
2850
2851	(void)if_simloop(ifp, copym, dst->sin6_family, 0);
2852}
2853
2854/*
2855 * Chop IPv6 header off from the payload.
2856 */
2857static int
2858ip6_splithdr(struct mbuf *m, struct ip6_exthdrs *exthdrs)
2859{
2860	struct mbuf *mh;
2861	struct ip6_hdr *ip6;
2862
2863	ip6 = mtod(m, struct ip6_hdr *);
2864	if (m->m_len > sizeof(*ip6)) {
2865		MGETHDR(mh, M_DONTWAIT, MT_HEADER);
2866		if (mh == 0) {
2867			m_freem(m);
2868			return ENOBUFS;
2869		}
2870		M_MOVE_PKTHDR(mh, m);
2871		MH_ALIGN(mh, sizeof(*ip6));
2872		m->m_len -= sizeof(*ip6);
2873		m->m_data += sizeof(*ip6);
2874		mh->m_next = m;
2875		m = mh;
2876		m->m_len = sizeof(*ip6);
2877		bcopy((caddr_t)ip6, mtod(m, caddr_t), sizeof(*ip6));
2878	}
2879	exthdrs->ip6e_ip6 = m;
2880	return 0;
2881}
2882
2883/*
2884 * Compute IPv6 extension header length.
2885 */
2886int
2887ip6_optlen(struct inpcb *in6p)
2888{
2889	int len;
2890
2891	if (!in6p->in6p_outputopts)
2892		return 0;
2893
2894	len = 0;
2895#define elen(x) \
2896    (((struct ip6_ext *)(x)) ? (((struct ip6_ext *)(x))->ip6e_len + 1) << 3 : 0)
2897
2898	len += elen(in6p->in6p_outputopts->ip6po_hbh);
2899	if (in6p->in6p_outputopts->ip6po_rthdr)
2900		/* dest1 is valid with rthdr only */
2901		len += elen(in6p->in6p_outputopts->ip6po_dest1);
2902	len += elen(in6p->in6p_outputopts->ip6po_rthdr);
2903	len += elen(in6p->in6p_outputopts->ip6po_dest2);
2904	return len;
2905#undef elen
2906}
2907