ip6_output.c revision 351058
1/*-
2 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 3. Neither the name of the project nor the names of its contributors
14 *    may be used to endorse or promote products derived from this software
15 *    without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 *
29 *	$KAME: ip6_output.c,v 1.279 2002/01/26 06:12:30 jinmei Exp $
30 */
31
32/*-
33 * Copyright (c) 1982, 1986, 1988, 1990, 1993
34 *	The Regents of the University of California.  All rights reserved.
35 *
36 * Redistribution and use in source and binary forms, with or without
37 * modification, are permitted provided that the following conditions
38 * are met:
39 * 1. Redistributions of source code must retain the above copyright
40 *    notice, this list of conditions and the following disclaimer.
41 * 2. Redistributions in binary form must reproduce the above copyright
42 *    notice, this list of conditions and the following disclaimer in the
43 *    documentation and/or other materials provided with the distribution.
44 * 4. Neither the name of the University nor the names of its contributors
45 *    may be used to endorse or promote products derived from this software
46 *    without specific prior written permission.
47 *
48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58 * SUCH DAMAGE.
59 *
60 *	@(#)ip_output.c	8.3 (Berkeley) 1/21/94
61 */
62
63#include <sys/cdefs.h>
64__FBSDID("$FreeBSD: stable/11/sys/netinet6/ip6_output.c 351058 2019-08-14 23:25:58Z jhb $");
65
66#include "opt_inet.h"
67#include "opt_inet6.h"
68#include "opt_ipsec.h"
69#include "opt_sctp.h"
70#include "opt_route.h"
71#include "opt_rss.h"
72
73#include <sys/param.h>
74#include <sys/kernel.h>
75#include <sys/malloc.h>
76#include <sys/mbuf.h>
77#include <sys/errno.h>
78#include <sys/priv.h>
79#include <sys/proc.h>
80#include <sys/protosw.h>
81#include <sys/socket.h>
82#include <sys/socketvar.h>
83#include <sys/syslog.h>
84#include <sys/ucred.h>
85
86#include <machine/in_cksum.h>
87
88#include <net/if.h>
89#include <net/if_var.h>
90#include <net/if_llatbl.h>
91#include <net/netisr.h>
92#include <net/route.h>
93#include <net/pfil.h>
94#include <net/rss_config.h>
95#include <net/vnet.h>
96
97#include <netinet/in.h>
98#include <netinet/in_var.h>
99#include <netinet/ip_var.h>
100#include <netinet6/in6_fib.h>
101#include <netinet6/in6_var.h>
102#include <netinet/ip6.h>
103#include <netinet/icmp6.h>
104#include <netinet6/ip6_var.h>
105#include <netinet/in_pcb.h>
106#include <netinet/tcp_var.h>
107#include <netinet6/nd6.h>
108#include <netinet6/in6_rss.h>
109
110#include <netipsec/ipsec_support.h>
111#ifdef SCTP
112#include <netinet/sctp.h>
113#include <netinet/sctp_crc32.h>
114#endif
115
116#include <netinet6/ip6protosw.h>
117#include <netinet6/scope6_var.h>
118
119#ifdef FLOWTABLE
120#include <net/flowtable.h>
121#endif
122
123extern int in6_mcast_loop;
124
/*
 * Mbufs that ip6_output() collects while it assembles the header chain
 * of an outgoing packet.  The structure is zeroed before use, so a slot
 * that is not needed stays NULL.
 */
struct ip6_exthdrs {
	struct mbuf *ip6e_ip6;		/* IPv6 header, set by ip6_splithdr() */
	struct mbuf *ip6e_hbh;		/* Hop-by-Hop options header */
	struct mbuf *ip6e_dest1;	/* Destination options, 1st part */
	struct mbuf *ip6e_rthdr;	/* Routing header */
	struct mbuf *ip6e_dest2;	/* Destination options, 2nd part */
};
132
133static MALLOC_DEFINE(M_IP6OPT, "ip6opt", "IPv6 options");
134
135static int ip6_pcbopt(int, u_char *, int, struct ip6_pktopts **,
136			   struct ucred *, int);
137static int ip6_pcbopts(struct ip6_pktopts **, struct mbuf *,
138	struct socket *, struct sockopt *);
139static int ip6_getpcbopt(struct ip6_pktopts *, int, struct sockopt *);
140static int ip6_setpktopt(int, u_char *, int, struct ip6_pktopts *,
141	struct ucred *, int, int, int);
142
143static int ip6_copyexthdr(struct mbuf **, caddr_t, int);
144static int ip6_insertfraghdr(struct mbuf *, struct mbuf *, int,
145	struct ip6_frag **);
146static int ip6_insert_jumboopt(struct ip6_exthdrs *, u_int32_t);
147static int ip6_splithdr(struct mbuf *, struct ip6_exthdrs *);
148static int ip6_getpmtu(struct route_in6 *, int,
149	struct ifnet *, const struct in6_addr *, u_long *, int *, u_int,
150	u_int);
151static int ip6_calcmtu(struct ifnet *, const struct in6_addr *, u_long,
152	u_long *, int *, u_int);
153static int ip6_getpmtu_ctl(u_int, const struct in6_addr *, u_long *);
154static int copypktopts(struct ip6_pktopts *, struct ip6_pktopts *, int);
155
156
/*
 * MAKE_EXTHDR(hp, mp): make an extension header from option data.
 *
 *   hp  source: pointer to the stored extension header; when it is NULL
 *       the macro does nothing
 *   mp  destination handed to ip6_copyexthdr()
 *
 * The byte count passed to ip6_copyexthdr() comes from the header's own
 * length field: (ip6e_len + 1) << 3.
 *
 * The macro is not self-contained: it assigns the "error" variable of
 * the enclosing function and jumps to that function's "freehdrs" label
 * when ip6_copyexthdr() reports a failure.
 */
#define MAKE_EXTHDR(hp, mp)						\
    do {								\
	if ((hp) != NULL) {						\
		error = ip6_copyexthdr((mp), (caddr_t)(hp),		\
		    (((struct ip6_ext *)(hp))->ip6e_len + 1) << 3);	\
		if (error != 0)						\
			goto freehdrs;					\
	}								\
    } while (/*CONSTCOND*/ 0)
171
/*
 * MAKE_CHAIN(m, mp, p, i): form a chain of extension headers by linking
 * one more header mbuf into the packet.  Nothing happens when m is NULL.
 *
 *   m   the extension header mbuf to insert
 *   mp  the previous mbuf in the chain; m is linked right after it and
 *       mp is then advanced to m
 *   p   pointer to the next-header byte of the preceding header; on
 *       completion it points at the first byte of m's data
 *   i   the type of option, written to the byte p pointed at on entry
 *
 * The value found in *p on entry is moved into the first byte of m's
 * data, so the chain of next-header values stays intact.
 *
 * The macro reads the "hdrsplit" variable of the enclosing function and
 * panics when it is zero.
 */
#define MAKE_CHAIN(m, mp, p, i)						\
    do {								\
	if ((m) != NULL) {						\
		if (hdrsplit == 0)					\
			panic("assumption failed: hdr not split");	\
		(m)->m_next = (mp)->m_next;				\
		(mp)->m_next = (m);					\
		(mp) = (m);						\
		*mtod((m), u_char *) = *(p);				\
		*(p) = (i);						\
		p = mtod((m), u_char *);				\
	}								\
    } while (/*CONSTCOND*/ 0)
192
193void
194in6_delayed_cksum(struct mbuf *m, uint32_t plen, u_short offset)
195{
196	u_short csum;
197
198	csum = in_cksum_skip(m, offset + plen, offset);
199	if (m->m_pkthdr.csum_flags & CSUM_UDP_IPV6 && csum == 0)
200		csum = 0xffff;
201	offset += m->m_pkthdr.csum_data;	/* checksum offset */
202
203	if (offset + sizeof(csum) > m->m_len)
204		m_copyback(m, offset, sizeof(csum), (caddr_t)&csum);
205	else
206		*(u_short *)mtodo(m, offset) = csum;
207}
208
209int
210ip6_fragment(struct ifnet *ifp, struct mbuf *m0, int hlen, u_char nextproto,
211    int mtu, uint32_t id)
212{
213	struct mbuf *m, **mnext, *m_frgpart;
214	struct ip6_hdr *ip6, *mhip6;
215	struct ip6_frag *ip6f;
216	int off;
217	int error;
218	int tlen = m0->m_pkthdr.len;
219
220	KASSERT(( mtu % 8 == 0), ("Fragment length must be a multiple of 8"));
221
222	m = m0;
223	ip6 = mtod(m, struct ip6_hdr *);
224	mnext = &m->m_nextpkt;
225
226	for (off = hlen; off < tlen; off += mtu) {
227		m = m_gethdr(M_NOWAIT, MT_DATA);
228		if (!m) {
229			IP6STAT_INC(ip6s_odropped);
230			return (ENOBUFS);
231		}
232
233		/*
234		 * Make sure the complete packet header gets copied
235		 * from the originating mbuf to the newly created
236		 * mbuf. This also ensures that existing firewall
237		 * classification(s), VLAN tags and so on get copied
238		 * to the resulting fragmented packet(s):
239		 */
240		if (m_dup_pkthdr(m, m0, M_NOWAIT) == 0) {
241			m_free(m);
242			IP6STAT_INC(ip6s_odropped);
243			return (ENOBUFS);
244		}
245
246		*mnext = m;
247		mnext = &m->m_nextpkt;
248		m->m_data += max_linkhdr;
249		mhip6 = mtod(m, struct ip6_hdr *);
250		*mhip6 = *ip6;
251		m->m_len = sizeof(*mhip6);
252		error = ip6_insertfraghdr(m0, m, hlen, &ip6f);
253		if (error) {
254			IP6STAT_INC(ip6s_odropped);
255			return (error);
256		}
257		ip6f->ip6f_offlg = htons((u_short)((off - hlen) & ~7));
258		if (off + mtu >= tlen)
259			mtu = tlen - off;
260		else
261			ip6f->ip6f_offlg |= IP6F_MORE_FRAG;
262		mhip6->ip6_plen = htons((u_short)(mtu + hlen +
263		    sizeof(*ip6f) - sizeof(struct ip6_hdr)));
264		if ((m_frgpart = m_copy(m0, off, mtu)) == NULL) {
265			IP6STAT_INC(ip6s_odropped);
266			return (ENOBUFS);
267		}
268		m_cat(m, m_frgpart);
269		m->m_pkthdr.len = mtu + hlen + sizeof(*ip6f);
270		ip6f->ip6f_reserved = 0;
271		ip6f->ip6f_ident = id;
272		ip6f->ip6f_nxt = nextproto;
273		IP6STAT_INC(ip6s_ofragments);
274		in6_ifstat_inc(ifp, ifs6_out_fragcreat);
275	}
276
277	return (0);
278}
279
280/*
281 * IP6 output. The packet in mbuf chain m contains a skeletal IP6
282 * header (with pri, len, nxt, hlim, src, dst).
283 * This function may modify ver and hlim only.
284 * The mbuf chain containing the packet will be freed.
285 * The mbuf opt, if present, will not be freed.
286 * If route_in6 ro is present and has ro_rt initialized, route lookup would be
287 * skipped and ro->ro_rt would be used. If ro is present but ro->ro_rt is NULL,
288 * then result of route lookup is stored in ro->ro_rt.
289 *
290 * type of "mtu": rt_mtu is u_long, ifnet.ifr_mtu is int, and
291 * nd_ifinfo.linkmtu is u_int32_t.  so we use u_long to hold largest one,
292 * which is rt_mtu.
293 *
294 * ifpp - XXX: just for statistics
295 */
296/*
297 * XXX TODO: no flowid is assigned for outbound flows?
298 */
299int
300ip6_output(struct mbuf *m0, struct ip6_pktopts *opt,
301    struct route_in6 *ro, int flags, struct ip6_moptions *im6o,
302    struct ifnet **ifpp, struct inpcb *inp)
303{
304	struct ip6_hdr *ip6;
305	struct ifnet *ifp, *origifp;
306	struct mbuf *m = m0;
307	struct mbuf *mprev = NULL;
308	int hlen, tlen, len;
309	struct route_in6 ip6route;
310	struct rtentry *rt = NULL;
311	struct sockaddr_in6 *dst, src_sa, dst_sa;
312	struct in6_addr odst;
313	int error = 0;
314	struct in6_ifaddr *ia = NULL;
315	u_long mtu;
316	int alwaysfrag, dontfrag;
317	u_int32_t optlen = 0, plen = 0, unfragpartlen = 0;
318	struct ip6_exthdrs exthdrs;
319	struct in6_addr src0, dst0;
320	u_int32_t zone;
321	struct route_in6 *ro_pmtu = NULL;
322	int hdrsplit = 0;
323	int sw_csum, tso;
324	int needfiblookup;
325	uint32_t fibnum;
326	struct m_tag *fwd_tag = NULL;
327	uint32_t id;
328
329	if (inp != NULL) {
330		INP_LOCK_ASSERT(inp);
331		M_SETFIB(m, inp->inp_inc.inc_fibnum);
332		if ((flags & IP_NODEFAULTFLOWID) == 0) {
333			/* unconditionally set flowid */
334			m->m_pkthdr.flowid = inp->inp_flowid;
335			M_HASHTYPE_SET(m, inp->inp_flowtype);
336		}
337	}
338
339#if defined(IPSEC) || defined(IPSEC_SUPPORT)
340	/*
341	 * IPSec checking which handles several cases.
342	 * FAST IPSEC: We re-injected the packet.
343	 * XXX: need scope argument.
344	 */
345	if (IPSEC_ENABLED(ipv6)) {
346		if ((error = IPSEC_OUTPUT(ipv6, m, inp)) != 0) {
347			if (error == EINPROGRESS)
348				error = 0;
349			goto done;
350		}
351	}
352#endif /* IPSEC */
353
354	bzero(&exthdrs, sizeof(exthdrs));
355	if (opt) {
356		/* Hop-by-Hop options header */
357		MAKE_EXTHDR(opt->ip6po_hbh, &exthdrs.ip6e_hbh);
358		/* Destination options header(1st part) */
359		if (opt->ip6po_rthdr) {
360			/*
361			 * Destination options header(1st part)
362			 * This only makes sense with a routing header.
363			 * See Section 9.2 of RFC 3542.
364			 * Disabling this part just for MIP6 convenience is
365			 * a bad idea.  We need to think carefully about a
366			 * way to make the advanced API coexist with MIP6
367			 * options, which might automatically be inserted in
368			 * the kernel.
369			 */
370			MAKE_EXTHDR(opt->ip6po_dest1, &exthdrs.ip6e_dest1);
371		}
372		/* Routing header */
373		MAKE_EXTHDR(opt->ip6po_rthdr, &exthdrs.ip6e_rthdr);
374		/* Destination options header(2nd part) */
375		MAKE_EXTHDR(opt->ip6po_dest2, &exthdrs.ip6e_dest2);
376	}
377
378	/*
379	 * Calculate the total length of the extension header chain.
380	 * Keep the length of the unfragmentable part for fragmentation.
381	 */
382	optlen = 0;
383	if (exthdrs.ip6e_hbh)
384		optlen += exthdrs.ip6e_hbh->m_len;
385	if (exthdrs.ip6e_dest1)
386		optlen += exthdrs.ip6e_dest1->m_len;
387	if (exthdrs.ip6e_rthdr)
388		optlen += exthdrs.ip6e_rthdr->m_len;
389	unfragpartlen = optlen + sizeof(struct ip6_hdr);
390
391	/* NOTE: we don't add AH/ESP length here (done in ip6_ipsec_output) */
392	if (exthdrs.ip6e_dest2)
393		optlen += exthdrs.ip6e_dest2->m_len;
394
395	/*
396	 * If there is at least one extension header,
397	 * separate IP6 header from the payload.
398	 */
399	if (optlen && !hdrsplit) {
400		if ((error = ip6_splithdr(m, &exthdrs)) != 0) {
401			m = NULL;
402			goto freehdrs;
403		}
404		m = exthdrs.ip6e_ip6;
405		hdrsplit++;
406	}
407
408	ip6 = mtod(m, struct ip6_hdr *);
409
410	/* adjust mbuf packet header length */
411	m->m_pkthdr.len += optlen;
412	plen = m->m_pkthdr.len - sizeof(*ip6);
413
414	/* If this is a jumbo payload, insert a jumbo payload option. */
415	if (plen > IPV6_MAXPACKET) {
416		if (!hdrsplit) {
417			if ((error = ip6_splithdr(m, &exthdrs)) != 0) {
418				m = NULL;
419				goto freehdrs;
420			}
421			m = exthdrs.ip6e_ip6;
422			hdrsplit++;
423		}
424		/* adjust pointer */
425		ip6 = mtod(m, struct ip6_hdr *);
426		if ((error = ip6_insert_jumboopt(&exthdrs, plen)) != 0)
427			goto freehdrs;
428		ip6->ip6_plen = 0;
429	} else
430		ip6->ip6_plen = htons(plen);
431
432	/*
433	 * Concatenate headers and fill in next header fields.
434	 * Here we have, on "m"
435	 *	IPv6 payload
436	 * and we insert headers accordingly.  Finally, we should be getting:
437	 *	IPv6 hbh dest1 rthdr ah* [esp* dest2 payload]
438	 *
439	 * during the header composing process, "m" points to IPv6 header.
440	 * "mprev" points to an extension header prior to esp.
441	 */
442	u_char *nexthdrp = &ip6->ip6_nxt;
443	mprev = m;
444
445	/*
446	 * we treat dest2 specially.  this makes IPsec processing
447	 * much easier.  the goal here is to make mprev point to the
448	 * mbuf prior to dest2.
449	 *
450	 * result: IPv6 dest2 payload
451	 * m and mprev will point to IPv6 header.
452	 */
453	if (exthdrs.ip6e_dest2) {
454		if (!hdrsplit)
455			panic("assumption failed: hdr not split");
456		exthdrs.ip6e_dest2->m_next = m->m_next;
457		m->m_next = exthdrs.ip6e_dest2;
458		*mtod(exthdrs.ip6e_dest2, u_char *) = ip6->ip6_nxt;
459		ip6->ip6_nxt = IPPROTO_DSTOPTS;
460	}
461
462	/*
463	 * result: IPv6 hbh dest1 rthdr dest2 payload
464	 * m will point to IPv6 header.  mprev will point to the
465	 * extension header prior to dest2 (rthdr in the above case).
466	 */
467	MAKE_CHAIN(exthdrs.ip6e_hbh, mprev, nexthdrp, IPPROTO_HOPOPTS);
468	MAKE_CHAIN(exthdrs.ip6e_dest1, mprev, nexthdrp,
469		   IPPROTO_DSTOPTS);
470	MAKE_CHAIN(exthdrs.ip6e_rthdr, mprev, nexthdrp,
471		   IPPROTO_ROUTING);
472
473	/*
474	 * If there is a routing header, discard the packet.
475	 */
476	if (exthdrs.ip6e_rthdr) {
477		 error = EINVAL;
478		 goto bad;
479	}
480
481	/* Source address validation */
482	if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src) &&
483	    (flags & IPV6_UNSPECSRC) == 0) {
484		error = EOPNOTSUPP;
485		IP6STAT_INC(ip6s_badscope);
486		goto bad;
487	}
488	if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) {
489		error = EOPNOTSUPP;
490		IP6STAT_INC(ip6s_badscope);
491		goto bad;
492	}
493
494	IP6STAT_INC(ip6s_localout);
495
496	/*
497	 * Route packet.
498	 */
499	if (ro == NULL) {
500		ro = &ip6route;
501		bzero((caddr_t)ro, sizeof(*ro));
502	}
503	ro_pmtu = ro;
504	if (opt && opt->ip6po_rthdr)
505		ro = &opt->ip6po_route;
506	dst = (struct sockaddr_in6 *)&ro->ro_dst;
507#ifdef FLOWTABLE
508	if (ro->ro_rt == NULL)
509		(void )flowtable_lookup(AF_INET6, m, (struct route *)ro);
510#endif
511	fibnum = (inp != NULL) ? inp->inp_inc.inc_fibnum : M_GETFIB(m);
512again:
513	/*
514	 * if specified, try to fill in the traffic class field.
515	 * do not override if a non-zero value is already set.
516	 * we check the diffserv field and the ecn field separately.
517	 */
518	if (opt && opt->ip6po_tclass >= 0) {
519		int mask = 0;
520
521		if ((ip6->ip6_flow & htonl(0xfc << 20)) == 0)
522			mask |= 0xfc;
523		if ((ip6->ip6_flow & htonl(0x03 << 20)) == 0)
524			mask |= 0x03;
525		if (mask != 0)
526			ip6->ip6_flow |= htonl((opt->ip6po_tclass & mask) << 20);
527	}
528
529	/* fill in or override the hop limit field, if necessary. */
530	if (opt && opt->ip6po_hlim != -1)
531		ip6->ip6_hlim = opt->ip6po_hlim & 0xff;
532	else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
533		if (im6o != NULL)
534			ip6->ip6_hlim = im6o->im6o_multicast_hlim;
535		else
536			ip6->ip6_hlim = V_ip6_defmcasthlim;
537	}
538	/*
539	 * Validate route against routing table additions;
540	 * a better/more specific route might have been added.
541	 * Make sure address family is set in route.
542	 */
543	if (inp) {
544		ro->ro_dst.sin6_family = AF_INET6;
545		RT_VALIDATE((struct route *)ro, &inp->inp_rt_cookie, fibnum);
546	}
547	if (ro->ro_rt && fwd_tag == NULL && (ro->ro_rt->rt_flags & RTF_UP) &&
548	    ro->ro_dst.sin6_family == AF_INET6 &&
549	    IN6_ARE_ADDR_EQUAL(&ro->ro_dst.sin6_addr, &ip6->ip6_dst)) {
550		rt = ro->ro_rt;
551		ifp = ro->ro_rt->rt_ifp;
552	} else {
553		if (ro->ro_lle)
554			LLE_FREE(ro->ro_lle);	/* zeros ro_lle */
555		ro->ro_lle = NULL;
556		if (fwd_tag == NULL) {
557			bzero(&dst_sa, sizeof(dst_sa));
558			dst_sa.sin6_family = AF_INET6;
559			dst_sa.sin6_len = sizeof(dst_sa);
560			dst_sa.sin6_addr = ip6->ip6_dst;
561		}
562		error = in6_selectroute_fib(&dst_sa, opt, im6o, ro, &ifp,
563		    &rt, fibnum);
564		if (error != 0) {
565			if (ifp != NULL)
566				in6_ifstat_inc(ifp, ifs6_out_discard);
567			goto bad;
568		}
569	}
570	if (rt == NULL) {
571		/*
572		 * If in6_selectroute() does not return a route entry,
573		 * dst may not have been updated.
574		 */
575		*dst = dst_sa;	/* XXX */
576	}
577
578	/*
579	 * then rt (for unicast) and ifp must be non-NULL valid values.
580	 */
581	if ((flags & IPV6_FORWARDING) == 0) {
582		/* XXX: the FORWARDING flag can be set for mrouting. */
583		in6_ifstat_inc(ifp, ifs6_out_request);
584	}
585	if (rt != NULL) {
586		ia = (struct in6_ifaddr *)(rt->rt_ifa);
587		counter_u64_add(rt->rt_pksent, 1);
588	}
589
590	/* Setup data structures for scope ID checks. */
591	src0 = ip6->ip6_src;
592	bzero(&src_sa, sizeof(src_sa));
593	src_sa.sin6_family = AF_INET6;
594	src_sa.sin6_len = sizeof(src_sa);
595	src_sa.sin6_addr = ip6->ip6_src;
596
597	dst0 = ip6->ip6_dst;
598	/* re-initialize to be sure */
599	bzero(&dst_sa, sizeof(dst_sa));
600	dst_sa.sin6_family = AF_INET6;
601	dst_sa.sin6_len = sizeof(dst_sa);
602	dst_sa.sin6_addr = ip6->ip6_dst;
603
604	/* Check for valid scope ID. */
605	if (in6_setscope(&src0, ifp, &zone) == 0 &&
606	    sa6_recoverscope(&src_sa) == 0 && zone == src_sa.sin6_scope_id &&
607	    in6_setscope(&dst0, ifp, &zone) == 0 &&
608	    sa6_recoverscope(&dst_sa) == 0 && zone == dst_sa.sin6_scope_id) {
609		/*
610		 * The outgoing interface is in the zone of the source
611		 * and destination addresses.
612		 *
613		 * Because the loopback interface cannot receive
614		 * packets with a different scope ID than its own,
615		 * the trick is to pretend the outgoing packet
616		 * was received by the real network interface, by
617		 * setting "origifp" different from "ifp". This is
618		 * only allowed when "ifp" is a loopback network
619		 * interface. Refer to code in nd6_output_ifp() for
620		 * more details.
621		 */
622		origifp = ifp;
623
624		/*
625		 * We should use ia_ifp to support the case of sending
626		 * packets to an address of our own.
627		 */
628		if (ia != NULL && ia->ia_ifp)
629			ifp = ia->ia_ifp;
630
631	} else if ((ifp->if_flags & IFF_LOOPBACK) == 0 ||
632	    sa6_recoverscope(&src_sa) != 0 ||
633	    sa6_recoverscope(&dst_sa) != 0 ||
634	    dst_sa.sin6_scope_id == 0 ||
635	    (src_sa.sin6_scope_id != 0 &&
636	    src_sa.sin6_scope_id != dst_sa.sin6_scope_id) ||
637	    (origifp = ifnet_byindex(dst_sa.sin6_scope_id)) == NULL) {
638		/*
639		 * If the destination network interface is not a
640		 * loopback interface, or the destination network
641		 * address has no scope ID, or the source address has
642		 * a scope ID set which is different from the
643		 * destination address one, or there is no network
644		 * interface representing this scope ID, the address
645		 * pair is considered invalid.
646		 */
647		IP6STAT_INC(ip6s_badscope);
648		in6_ifstat_inc(ifp, ifs6_out_discard);
649		if (error == 0)
650			error = EHOSTUNREACH; /* XXX */
651		goto bad;
652	}
653
654	/* All scope ID checks are successful. */
655
656	if (rt && !IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
657		if (opt && opt->ip6po_nextroute.ro_rt) {
658			/*
659			 * The nexthop is explicitly specified by the
660			 * application.  We assume the next hop is an IPv6
661			 * address.
662			 */
663			dst = (struct sockaddr_in6 *)opt->ip6po_nexthop;
664		}
665		else if ((rt->rt_flags & RTF_GATEWAY))
666			dst = (struct sockaddr_in6 *)rt->rt_gateway;
667	}
668
669	if (!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
670		m->m_flags &= ~(M_BCAST | M_MCAST); /* just in case */
671	} else {
672		m->m_flags = (m->m_flags & ~M_BCAST) | M_MCAST;
673		in6_ifstat_inc(ifp, ifs6_out_mcast);
674		/*
675		 * Confirm that the outgoing interface supports multicast.
676		 */
677		if (!(ifp->if_flags & IFF_MULTICAST)) {
678			IP6STAT_INC(ip6s_noroute);
679			in6_ifstat_inc(ifp, ifs6_out_discard);
680			error = ENETUNREACH;
681			goto bad;
682		}
683		if ((im6o == NULL && in6_mcast_loop) ||
684		    (im6o && im6o->im6o_multicast_loop)) {
685			/*
686			 * Loop back multicast datagram if not expressly
687			 * forbidden to do so, even if we have not joined
688			 * the address; protocols will filter it later,
689			 * thus deferring a hash lookup and lock acquisition
690			 * at the expense of an m_copym().
691			 */
692			ip6_mloopback(ifp, m);
693		} else {
694			/*
695			 * If we are acting as a multicast router, perform
696			 * multicast forwarding as if the packet had just
697			 * arrived on the interface to which we are about
698			 * to send.  The multicast forwarding function
699			 * recursively calls this function, using the
700			 * IPV6_FORWARDING flag to prevent infinite recursion.
701			 *
702			 * Multicasts that are looped back by ip6_mloopback(),
703			 * above, will be forwarded by the ip6_input() routine,
704			 * if necessary.
705			 */
706			if (V_ip6_mrouter && (flags & IPV6_FORWARDING) == 0) {
707				/*
708				 * XXX: ip6_mforward expects that rcvif is NULL
709				 * when it is called from the originating path.
710				 * However, it may not always be the case.
711				 */
712				m->m_pkthdr.rcvif = NULL;
713				if (ip6_mforward(ip6, ifp, m) != 0) {
714					m_freem(m);
715					goto done;
716				}
717			}
718		}
719		/*
720		 * Multicasts with a hoplimit of zero may be looped back,
721		 * above, but must not be transmitted on a network.
722		 * Also, multicasts addressed to the loopback interface
723		 * are not sent -- the above call to ip6_mloopback() will
724		 * loop back a copy if this host actually belongs to the
725		 * destination group on the loopback interface.
726		 */
727		if (ip6->ip6_hlim == 0 || (ifp->if_flags & IFF_LOOPBACK) ||
728		    IN6_IS_ADDR_MC_INTFACELOCAL(&ip6->ip6_dst)) {
729			m_freem(m);
730			goto done;
731		}
732	}
733
734	/*
735	 * Fill the outgoing interface to tell the upper layer
736	 * to increment per-interface statistics.
737	 */
738	if (ifpp)
739		*ifpp = ifp;
740
741	/* Determine path MTU. */
742	if ((error = ip6_getpmtu(ro_pmtu, ro != ro_pmtu, ifp, &ip6->ip6_dst,
743		    &mtu, &alwaysfrag, fibnum, *nexthdrp)) != 0)
744		goto bad;
745
746	/*
747	 * The caller of this function may specify to use the minimum MTU
748	 * in some cases.
749	 * An advanced API option (IPV6_USE_MIN_MTU) can also override MTU
750	 * setting.  The logic is a bit complicated; by default, unicast
751	 * packets will follow path MTU while multicast packets will be sent at
752	 * the minimum MTU.  If IP6PO_MINMTU_ALL is specified, all packets
753	 * including unicast ones will be sent at the minimum MTU.  Multicast
754	 * packets will always be sent at the minimum MTU unless
755	 * IP6PO_MINMTU_DISABLE is explicitly specified.
756	 * See RFC 3542 for more details.
757	 */
758	if (mtu > IPV6_MMTU) {
759		if ((flags & IPV6_MINMTU))
760			mtu = IPV6_MMTU;
761		else if (opt && opt->ip6po_minmtu == IP6PO_MINMTU_ALL)
762			mtu = IPV6_MMTU;
763		else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) &&
764			 (opt == NULL ||
765			  opt->ip6po_minmtu != IP6PO_MINMTU_DISABLE)) {
766			mtu = IPV6_MMTU;
767		}
768	}
769
770	/*
771	 * clear embedded scope identifiers if necessary.
772	 * in6_clearscope will touch the addresses only when necessary.
773	 */
774	in6_clearscope(&ip6->ip6_src);
775	in6_clearscope(&ip6->ip6_dst);
776
777	/*
778	 * If the outgoing packet contains a hop-by-hop options header,
779	 * it must be examined and processed even by the source node.
780	 * (RFC 2460, section 4.)
781	 */
782	if (exthdrs.ip6e_hbh) {
783		struct ip6_hbh *hbh = mtod(exthdrs.ip6e_hbh, struct ip6_hbh *);
784		u_int32_t dummy; /* XXX unused */
785		u_int32_t plen = 0; /* XXX: ip6_process will check the value */
786
787#ifdef DIAGNOSTIC
788		if ((hbh->ip6h_len + 1) << 3 > exthdrs.ip6e_hbh->m_len)
789			panic("ip6e_hbh is not contiguous");
790#endif
791		/*
792		 *  XXX: if we have to send an ICMPv6 error to the sender,
793		 *       we need the M_LOOP flag since icmp6_error() expects
794		 *       the IPv6 and the hop-by-hop options header are
795		 *       contiguous unless the flag is set.
796		 */
797		m->m_flags |= M_LOOP;
798		m->m_pkthdr.rcvif = ifp;
799		if (ip6_process_hopopts(m, (u_int8_t *)(hbh + 1),
800		    ((hbh->ip6h_len + 1) << 3) - sizeof(struct ip6_hbh),
801		    &dummy, &plen) < 0) {
802			/* m was already freed at this point */
803			error = EINVAL;/* better error? */
804			goto done;
805		}
806		m->m_flags &= ~M_LOOP; /* XXX */
807		m->m_pkthdr.rcvif = NULL;
808	}
809
810	/* Jump over all PFIL processing if hooks are not active. */
811	if (!PFIL_HOOKED(&V_inet6_pfil_hook))
812		goto passout;
813
814	odst = ip6->ip6_dst;
815	/* Run through list of hooks for output packets. */
816	error = pfil_run_hooks(&V_inet6_pfil_hook, &m, ifp, PFIL_OUT, 0, inp);
817	if (error != 0 || m == NULL)
818		goto done;
819	/* adjust pointer */
820	ip6 = mtod(m, struct ip6_hdr *);
821
822	needfiblookup = 0;
823	/* See if destination IP address was changed by packet filter. */
824	if (!IN6_ARE_ADDR_EQUAL(&odst, &ip6->ip6_dst)) {
825		m->m_flags |= M_SKIP_FIREWALL;
826		/* If destination is now ourself drop to ip6_input(). */
827		if (in6_localip(&ip6->ip6_dst)) {
828			m->m_flags |= M_FASTFWD_OURS;
829			if (m->m_pkthdr.rcvif == NULL)
830				m->m_pkthdr.rcvif = V_loif;
831			if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) {
832				m->m_pkthdr.csum_flags |=
833				    CSUM_DATA_VALID_IPV6 | CSUM_PSEUDO_HDR;
834				m->m_pkthdr.csum_data = 0xffff;
835			}
836#ifdef SCTP
837			if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6)
838				m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
839#endif
840			error = netisr_queue(NETISR_IPV6, m);
841			goto done;
842		} else {
843			RO_RTFREE(ro);
844			needfiblookup = 1; /* Redo the routing table lookup. */
845			if (ro->ro_lle)
846				LLE_FREE(ro->ro_lle);	/* zeros ro_lle */
847			ro->ro_lle = NULL;
848		}
849	}
850	/* See if fib was changed by packet filter. */
851	if (fibnum != M_GETFIB(m)) {
852		m->m_flags |= M_SKIP_FIREWALL;
853		fibnum = M_GETFIB(m);
854		RO_RTFREE(ro);
855		needfiblookup = 1;
856		if (ro->ro_lle)
857			LLE_FREE(ro->ro_lle);	/* zeros ro_lle */
858		ro->ro_lle = NULL;
859	}
860	if (needfiblookup)
861		goto again;
862
863	/* See if local, if yes, send it to netisr. */
864	if (m->m_flags & M_FASTFWD_OURS) {
865		if (m->m_pkthdr.rcvif == NULL)
866			m->m_pkthdr.rcvif = V_loif;
867		if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) {
868			m->m_pkthdr.csum_flags |=
869			    CSUM_DATA_VALID_IPV6 | CSUM_PSEUDO_HDR;
870			m->m_pkthdr.csum_data = 0xffff;
871		}
872#ifdef SCTP
873		if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6)
874			m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
875#endif
876		error = netisr_queue(NETISR_IPV6, m);
877		goto done;
878	}
879	/* Or forward to some other address? */
880	if ((m->m_flags & M_IP6_NEXTHOP) &&
881	    (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) {
882		dst = (struct sockaddr_in6 *)&ro->ro_dst;
883		bcopy((fwd_tag+1), &dst_sa, sizeof(struct sockaddr_in6));
884		m->m_flags |= M_SKIP_FIREWALL;
885		m->m_flags &= ~M_IP6_NEXTHOP;
886		m_tag_delete(m, fwd_tag);
887		goto again;
888	}
889
890passout:
891	/*
892	 * Send the packet to the outgoing interface.
893	 * If necessary, do IPv6 fragmentation before sending.
894	 *
895	 * the logic here is rather complex:
896	 * 1: normal case (dontfrag == 0, alwaysfrag == 0)
897	 * 1-a:	send as is if tlen <= path mtu
898	 * 1-b:	fragment if tlen > path mtu
899	 *
900	 * 2: if user asks us not to fragment (dontfrag == 1)
901	 * 2-a:	send as is if tlen <= interface mtu
902	 * 2-b:	error if tlen > interface mtu
903	 *
904	 * 3: if we always need to attach fragment header (alwaysfrag == 1)
905	 *	always fragment
906	 *
907	 * 4: if dontfrag == 1 && alwaysfrag == 1
908	 *	error, as we cannot handle this conflicting request
909	 */
910	sw_csum = m->m_pkthdr.csum_flags;
911	if (!hdrsplit) {
912		tso = ((sw_csum & ifp->if_hwassist & CSUM_TSO) != 0) ? 1 : 0;
913		sw_csum &= ~ifp->if_hwassist;
914	} else
915		tso = 0;
916	/*
917	 * If we added extension headers, we will not do TSO and calculate the
918	 * checksums ourselves for now.
919	 * XXX-BZ  Need a framework to know when the NIC can handle it, even
920	 * with ext. hdrs.
921	 */
922	if (sw_csum & CSUM_DELAY_DATA_IPV6) {
923		sw_csum &= ~CSUM_DELAY_DATA_IPV6;
924		in6_delayed_cksum(m, plen, sizeof(struct ip6_hdr));
925	}
926#ifdef SCTP
927	if (sw_csum & CSUM_SCTP_IPV6) {
928		sw_csum &= ~CSUM_SCTP_IPV6;
929		sctp_delayed_cksum(m, sizeof(struct ip6_hdr));
930	}
931#endif
932	m->m_pkthdr.csum_flags &= ifp->if_hwassist;
933	tlen = m->m_pkthdr.len;
934
935	if ((opt && (opt->ip6po_flags & IP6PO_DONTFRAG)) || tso)
936		dontfrag = 1;
937	else
938		dontfrag = 0;
939	if (dontfrag && alwaysfrag) {	/* case 4 */
940		/* conflicting request - can't transmit */
941		error = EMSGSIZE;
942		goto bad;
943	}
944	if (dontfrag && tlen > IN6_LINKMTU(ifp) && !tso) {	/* case 2-b */
945		/*
946		 * Even if the DONTFRAG option is specified, we cannot send the
947		 * packet when the data length is larger than the MTU of the
948		 * outgoing interface.
949		 * Notify the error by sending IPV6_PATHMTU ancillary data if
950		 * application wanted to know the MTU value. Also return an
951		 * error code (this is not described in the API spec).
952		 */
953		if (inp != NULL)
954			ip6_notify_pmtu(inp, &dst_sa, (u_int32_t)mtu);
955		error = EMSGSIZE;
956		goto bad;
957	}
958
959	/*
960	 * transmit packet without fragmentation
961	 */
962	if (dontfrag || (!alwaysfrag && tlen <= mtu)) {	/* case 1-a and 2-a */
963		struct in6_ifaddr *ia6;
964
965		ip6 = mtod(m, struct ip6_hdr *);
966		ia6 = in6_ifawithifp(ifp, &ip6->ip6_src);
967		if (ia6) {
968			/* Record statistics for this interface address. */
969			counter_u64_add(ia6->ia_ifa.ifa_opackets, 1);
970			counter_u64_add(ia6->ia_ifa.ifa_obytes,
971			    m->m_pkthdr.len);
972			ifa_free(&ia6->ia_ifa);
973		}
974		error = nd6_output_ifp(ifp, origifp, m, dst,
975		    (struct route *)ro);
976		goto done;
977	}
978
979	/*
980	 * try to fragment the packet.  case 1-b and 3
981	 */
982	if (mtu < IPV6_MMTU) {
983		/* path MTU cannot be less than IPV6_MMTU */
984		error = EMSGSIZE;
985		in6_ifstat_inc(ifp, ifs6_out_fragfail);
986		goto bad;
987	} else if (ip6->ip6_plen == 0) {
988		/* jumbo payload cannot be fragmented */
989		error = EMSGSIZE;
990		in6_ifstat_inc(ifp, ifs6_out_fragfail);
991		goto bad;
992	} else {
993		u_char nextproto;
994
995		/*
996		 * Too large for the destination or interface;
997		 * fragment if possible.
998		 * Must be able to put at least 8 bytes per fragment.
999		 */
1000		hlen = unfragpartlen;
1001		if (mtu > IPV6_MAXPACKET)
1002			mtu = IPV6_MAXPACKET;
1003
1004		len = (mtu - hlen - sizeof(struct ip6_frag)) & ~7;
1005		if (len < 8) {
1006			error = EMSGSIZE;
1007			in6_ifstat_inc(ifp, ifs6_out_fragfail);
1008			goto bad;
1009		}
1010
1011		/*
1012		 * If the interface will not calculate checksums on
1013		 * fragmented packets, then do it here.
1014		 * XXX-BZ handle the hw offloading case.  Need flags.
1015		 */
1016		if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) {
1017			in6_delayed_cksum(m, plen, hlen);
1018			m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA_IPV6;
1019		}
1020#ifdef SCTP
1021		if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6) {
1022			sctp_delayed_cksum(m, hlen);
1023			m->m_pkthdr.csum_flags &= ~CSUM_SCTP_IPV6;
1024		}
1025#endif
1026		/*
1027		 * Change the next header field of the last header in the
1028		 * unfragmentable part.
1029		 */
1030		if (exthdrs.ip6e_rthdr) {
1031			nextproto = *mtod(exthdrs.ip6e_rthdr, u_char *);
1032			*mtod(exthdrs.ip6e_rthdr, u_char *) = IPPROTO_FRAGMENT;
1033		} else if (exthdrs.ip6e_dest1) {
1034			nextproto = *mtod(exthdrs.ip6e_dest1, u_char *);
1035			*mtod(exthdrs.ip6e_dest1, u_char *) = IPPROTO_FRAGMENT;
1036		} else if (exthdrs.ip6e_hbh) {
1037			nextproto = *mtod(exthdrs.ip6e_hbh, u_char *);
1038			*mtod(exthdrs.ip6e_hbh, u_char *) = IPPROTO_FRAGMENT;
1039		} else {
1040			nextproto = ip6->ip6_nxt;
1041			ip6->ip6_nxt = IPPROTO_FRAGMENT;
1042		}
1043
1044		/*
1045		 * Loop through length of segment after first fragment,
1046		 * make new header and copy data of each part and link onto
1047		 * chain.
1048		 */
1049		m0 = m;
1050		id = htonl(ip6_randomid());
1051		if ((error = ip6_fragment(ifp, m, hlen, nextproto, len, id)))
1052			goto sendorfree;
1053
1054		in6_ifstat_inc(ifp, ifs6_out_fragok);
1055	}
1056
1057	/*
1058	 * Remove leading garbages.
1059	 */
1060sendorfree:
1061	m = m0->m_nextpkt;
1062	m0->m_nextpkt = 0;
1063	m_freem(m0);
1064	for (; m; m = m0) {
1065		m0 = m->m_nextpkt;
1066		m->m_nextpkt = 0;
1067		if (error == 0) {
1068			/* Record statistics for this interface address. */
1069			if (ia) {
1070				counter_u64_add(ia->ia_ifa.ifa_opackets, 1);
1071				counter_u64_add(ia->ia_ifa.ifa_obytes,
1072				    m->m_pkthdr.len);
1073			}
1074			error = nd6_output_ifp(ifp, origifp, m, dst,
1075			    (struct route *)ro);
1076		} else
1077			m_freem(m);
1078	}
1079
1080	if (error == 0)
1081		IP6STAT_INC(ip6s_fragmented);
1082
1083done:
1084	/*
1085	 * Release the route if using our private route, or if
1086	 * (with flowtable) we don't have our own reference.
1087	 */
1088	if (ro == &ip6route ||
1089	    (ro != NULL && ro->ro_flags & RT_NORTREF))
1090		RO_RTFREE(ro);
1091	return (error);
1092
1093freehdrs:
1094	m_freem(exthdrs.ip6e_hbh);	/* m_freem will check if mbuf is 0 */
1095	m_freem(exthdrs.ip6e_dest1);
1096	m_freem(exthdrs.ip6e_rthdr);
1097	m_freem(exthdrs.ip6e_dest2);
1098	/* FALLTHROUGH */
1099bad:
1100	if (m)
1101		m_freem(m);
1102	goto done;
1103}
1104
1105static int
1106ip6_copyexthdr(struct mbuf **mp, caddr_t hdr, int hlen)
1107{
1108	struct mbuf *m;
1109
1110	if (hlen > MCLBYTES)
1111		return (ENOBUFS); /* XXX */
1112
1113	if (hlen > MLEN)
1114		m = m_getcl(M_NOWAIT, MT_DATA, 0);
1115	else
1116		m = m_get(M_NOWAIT, MT_DATA);
1117	if (m == NULL)
1118		return (ENOBUFS);
1119	m->m_len = hlen;
1120	if (hdr)
1121		bcopy(hdr, mtod(m, caddr_t), hlen);
1122
1123	*mp = m;
1124	return (0);
1125}
1126
1127/*
1128 * Insert jumbo payload option.
1129 */
1130static int
1131ip6_insert_jumboopt(struct ip6_exthdrs *exthdrs, u_int32_t plen)
1132{
1133	struct mbuf *mopt;
1134	u_char *optbuf;
1135	u_int32_t v;
1136
1137#define JUMBOOPTLEN	8	/* length of jumbo payload option and padding */
1138
1139	/*
1140	 * If there is no hop-by-hop options header, allocate new one.
1141	 * If there is one but it doesn't have enough space to store the
1142	 * jumbo payload option, allocate a cluster to store the whole options.
1143	 * Otherwise, use it to store the options.
1144	 */
1145	if (exthdrs->ip6e_hbh == NULL) {
1146		mopt = m_get(M_NOWAIT, MT_DATA);
1147		if (mopt == NULL)
1148			return (ENOBUFS);
1149		mopt->m_len = JUMBOOPTLEN;
1150		optbuf = mtod(mopt, u_char *);
1151		optbuf[1] = 0;	/* = ((JUMBOOPTLEN) >> 3) - 1 */
1152		exthdrs->ip6e_hbh = mopt;
1153	} else {
1154		struct ip6_hbh *hbh;
1155
1156		mopt = exthdrs->ip6e_hbh;
1157		if (M_TRAILINGSPACE(mopt) < JUMBOOPTLEN) {
1158			/*
1159			 * XXX assumption:
1160			 * - exthdrs->ip6e_hbh is not referenced from places
1161			 *   other than exthdrs.
1162			 * - exthdrs->ip6e_hbh is not an mbuf chain.
1163			 */
1164			int oldoptlen = mopt->m_len;
1165			struct mbuf *n;
1166
1167			/*
1168			 * XXX: give up if the whole (new) hbh header does
1169			 * not fit even in an mbuf cluster.
1170			 */
1171			if (oldoptlen + JUMBOOPTLEN > MCLBYTES)
1172				return (ENOBUFS);
1173
1174			/*
1175			 * As a consequence, we must always prepare a cluster
1176			 * at this point.
1177			 */
1178			n = m_getcl(M_NOWAIT, MT_DATA, 0);
1179			if (n == NULL)
1180				return (ENOBUFS);
1181			n->m_len = oldoptlen + JUMBOOPTLEN;
1182			bcopy(mtod(mopt, caddr_t), mtod(n, caddr_t),
1183			    oldoptlen);
1184			optbuf = mtod(n, caddr_t) + oldoptlen;
1185			m_freem(mopt);
1186			mopt = exthdrs->ip6e_hbh = n;
1187		} else {
1188			optbuf = mtod(mopt, u_char *) + mopt->m_len;
1189			mopt->m_len += JUMBOOPTLEN;
1190		}
1191		optbuf[0] = IP6OPT_PADN;
1192		optbuf[1] = 1;
1193
1194		/*
1195		 * Adjust the header length according to the pad and
1196		 * the jumbo payload option.
1197		 */
1198		hbh = mtod(mopt, struct ip6_hbh *);
1199		hbh->ip6h_len += (JUMBOOPTLEN >> 3);
1200	}
1201
1202	/* fill in the option. */
1203	optbuf[2] = IP6OPT_JUMBO;
1204	optbuf[3] = 4;
1205	v = (u_int32_t)htonl(plen + JUMBOOPTLEN);
1206	bcopy(&v, &optbuf[4], sizeof(u_int32_t));
1207
1208	/* finally, adjust the packet header length */
1209	exthdrs->ip6e_ip6->m_pkthdr.len += JUMBOOPTLEN;
1210
1211	return (0);
1212#undef JUMBOOPTLEN
1213}
1214
1215/*
1216 * Insert fragment header and copy unfragmentable header portions.
1217 */
1218static int
1219ip6_insertfraghdr(struct mbuf *m0, struct mbuf *m, int hlen,
1220    struct ip6_frag **frghdrp)
1221{
1222	struct mbuf *n, *mlast;
1223
1224	if (hlen > sizeof(struct ip6_hdr)) {
1225		n = m_copym(m0, sizeof(struct ip6_hdr),
1226		    hlen - sizeof(struct ip6_hdr), M_NOWAIT);
1227		if (n == NULL)
1228			return (ENOBUFS);
1229		m->m_next = n;
1230	} else
1231		n = m;
1232
1233	/* Search for the last mbuf of unfragmentable part. */
1234	for (mlast = n; mlast->m_next; mlast = mlast->m_next)
1235		;
1236
1237	if (M_WRITABLE(mlast) &&
1238	    M_TRAILINGSPACE(mlast) >= sizeof(struct ip6_frag)) {
1239		/* use the trailing space of the last mbuf for the fragment hdr */
1240		*frghdrp = (struct ip6_frag *)(mtod(mlast, caddr_t) +
1241		    mlast->m_len);
1242		mlast->m_len += sizeof(struct ip6_frag);
1243		m->m_pkthdr.len += sizeof(struct ip6_frag);
1244	} else {
1245		/* allocate a new mbuf for the fragment header */
1246		struct mbuf *mfrg;
1247
1248		mfrg = m_get(M_NOWAIT, MT_DATA);
1249		if (mfrg == NULL)
1250			return (ENOBUFS);
1251		mfrg->m_len = sizeof(struct ip6_frag);
1252		*frghdrp = mtod(mfrg, struct ip6_frag *);
1253		mlast->m_next = mfrg;
1254	}
1255
1256	return (0);
1257}
1258
1259/*
1260 * Calculates IPv6 path mtu for destination @dst.
1261 * Resulting MTU is stored in @mtup.
1262 *
1263 * Returns 0 on success.
1264 */
1265static int
1266ip6_getpmtu_ctl(u_int fibnum, const struct in6_addr *dst, u_long *mtup)
1267{
1268	struct nhop6_extended nh6;
1269	struct in6_addr kdst;
1270	uint32_t scopeid;
1271	struct ifnet *ifp;
1272	u_long mtu;
1273	int error;
1274
1275	in6_splitscope(dst, &kdst, &scopeid);
1276	if (fib6_lookup_nh_ext(fibnum, &kdst, scopeid, NHR_REF, 0, &nh6) != 0)
1277		return (EHOSTUNREACH);
1278
1279	ifp = nh6.nh_ifp;
1280	mtu = nh6.nh_mtu;
1281
1282	error = ip6_calcmtu(ifp, dst, mtu, mtup, NULL, 0);
1283	fib6_free_nh_ext(fibnum, &nh6);
1284
1285	return (error);
1286}
1287
1288/*
1289 * Calculates IPv6 path MTU for @dst based on transmit @ifp,
1290 * and cached data in @ro_pmtu.
1291 * MTU from (successful) route lookup is saved (along with dst)
1292 * inside @ro_pmtu to avoid subsequent route lookups after packet
1293 * filter processing.
1294 *
1295 * Stores mtu and always-frag value into @mtup and @alwaysfragp.
1296 * Returns 0 on success.
1297 */
1298static int
1299ip6_getpmtu(struct route_in6 *ro_pmtu, int do_lookup,
1300    struct ifnet *ifp, const struct in6_addr *dst, u_long *mtup,
1301    int *alwaysfragp, u_int fibnum, u_int proto)
1302{
1303	struct nhop6_basic nh6;
1304	struct in6_addr kdst;
1305	uint32_t scopeid;
1306	struct sockaddr_in6 *sa6_dst;
1307	u_long mtu;
1308
1309	mtu = 0;
1310	if (do_lookup) {
1311
1312		/*
1313		 * Here ro_pmtu has final destination address, while
1314		 * ro might represent immediate destination.
1315		 * Use ro_pmtu destination since mtu might differ.
1316		 */
1317		sa6_dst = (struct sockaddr_in6 *)&ro_pmtu->ro_dst;
1318		if (!IN6_ARE_ADDR_EQUAL(&sa6_dst->sin6_addr, dst))
1319			ro_pmtu->ro_mtu = 0;
1320
1321		if (ro_pmtu->ro_mtu == 0) {
1322			bzero(sa6_dst, sizeof(*sa6_dst));
1323			sa6_dst->sin6_family = AF_INET6;
1324			sa6_dst->sin6_len = sizeof(struct sockaddr_in6);
1325			sa6_dst->sin6_addr = *dst;
1326
1327			in6_splitscope(dst, &kdst, &scopeid);
1328			if (fib6_lookup_nh_basic(fibnum, &kdst, scopeid, 0, 0,
1329			    &nh6) == 0)
1330				ro_pmtu->ro_mtu = nh6.nh_mtu;
1331		}
1332
1333		mtu = ro_pmtu->ro_mtu;
1334	}
1335
1336	if (ro_pmtu->ro_rt)
1337		mtu = ro_pmtu->ro_rt->rt_mtu;
1338
1339	return (ip6_calcmtu(ifp, dst, mtu, mtup, alwaysfragp, proto));
1340}
1341
1342/*
1343 * Calculate MTU based on transmit @ifp, route mtu @rt_mtu and
1344 * hostcache data for @dst.
1345 * Stores mtu and always-frag value into @mtup and @alwaysfragp.
1346 *
1347 * Returns 0 on success.
1348 */
1349static int
1350ip6_calcmtu(struct ifnet *ifp, const struct in6_addr *dst, u_long rt_mtu,
1351    u_long *mtup, int *alwaysfragp, u_int proto)
1352{
1353	u_long mtu = 0;
1354	int alwaysfrag = 0;
1355	int error = 0;
1356
1357	if (rt_mtu > 0) {
1358		u_int32_t ifmtu;
1359		struct in_conninfo inc;
1360
1361		bzero(&inc, sizeof(inc));
1362		inc.inc_flags |= INC_ISIPV6;
1363		inc.inc6_faddr = *dst;
1364
1365		ifmtu = IN6_LINKMTU(ifp);
1366
1367		/* TCP is known to react to pmtu changes so skip hc */
1368		if (proto != IPPROTO_TCP)
1369			mtu = tcp_hc_getmtu(&inc);
1370
1371		if (mtu)
1372			mtu = min(mtu, rt_mtu);
1373		else
1374			mtu = rt_mtu;
1375		if (mtu == 0)
1376			mtu = ifmtu;
1377		else if (mtu < IPV6_MMTU) {
1378			/*
1379			 * RFC2460 section 5, last paragraph:
1380			 * if we record ICMPv6 too big message with
1381			 * mtu < IPV6_MMTU, transmit packets sized IPV6_MMTU
1382			 * or smaller, with framgent header attached.
1383			 * (fragment header is needed regardless from the
1384			 * packet size, for translators to identify packets)
1385			 */
1386			alwaysfrag = 1;
1387			mtu = IPV6_MMTU;
1388		}
1389	} else if (ifp) {
1390		mtu = IN6_LINKMTU(ifp);
1391	} else
1392		error = EHOSTUNREACH; /* XXX */
1393
1394	*mtup = mtu;
1395	if (alwaysfragp)
1396		*alwaysfragp = alwaysfrag;
1397	return (error);
1398}
1399
1400/*
1401 * IP6 socket option processing.
1402 */
1403int
1404ip6_ctloutput(struct socket *so, struct sockopt *sopt)
1405{
1406	int optdatalen, uproto;
1407	void *optdata;
1408	struct inpcb *in6p = sotoinpcb(so);
1409	int error, optval;
1410	int level, op, optname;
1411	int optlen;
1412	struct thread *td;
1413#ifdef	RSS
1414	uint32_t rss_bucket;
1415	int retval;
1416#endif
1417
1418/*
1419 * Don't use more than a quarter of mbuf clusters.  N.B.:
1420 * nmbclusters is an int, but nmbclusters * MCLBYTES may overflow
1421 * on LP64 architectures, so cast to u_long to avoid undefined
1422 * behavior.  ILP32 architectures cannot have nmbclusters
1423 * large enough to overflow for other reasons.
1424 */
1425#define IPV6_PKTOPTIONS_MBUF_LIMIT	((u_long)nmbclusters * MCLBYTES / 4)
1426
1427	level = sopt->sopt_level;
1428	op = sopt->sopt_dir;
1429	optname = sopt->sopt_name;
1430	optlen = sopt->sopt_valsize;
1431	td = sopt->sopt_td;
1432	error = 0;
1433	optval = 0;
1434	uproto = (int)so->so_proto->pr_protocol;
1435
1436	if (level != IPPROTO_IPV6) {
1437		error = EINVAL;
1438
1439		if (sopt->sopt_level == SOL_SOCKET &&
1440		    sopt->sopt_dir == SOPT_SET) {
1441			switch (sopt->sopt_name) {
1442			case SO_REUSEADDR:
1443				INP_WLOCK(in6p);
1444				if ((so->so_options & SO_REUSEADDR) != 0)
1445					in6p->inp_flags2 |= INP_REUSEADDR;
1446				else
1447					in6p->inp_flags2 &= ~INP_REUSEADDR;
1448				INP_WUNLOCK(in6p);
1449				error = 0;
1450				break;
1451			case SO_REUSEPORT:
1452				INP_WLOCK(in6p);
1453				if ((so->so_options & SO_REUSEPORT) != 0)
1454					in6p->inp_flags2 |= INP_REUSEPORT;
1455				else
1456					in6p->inp_flags2 &= ~INP_REUSEPORT;
1457				INP_WUNLOCK(in6p);
1458				error = 0;
1459				break;
1460			case SO_SETFIB:
1461				INP_WLOCK(in6p);
1462				in6p->inp_inc.inc_fibnum = so->so_fibnum;
1463				INP_WUNLOCK(in6p);
1464				error = 0;
1465				break;
1466			default:
1467				break;
1468			}
1469		}
1470	} else {		/* level == IPPROTO_IPV6 */
1471		switch (op) {
1472
1473		case SOPT_SET:
1474			switch (optname) {
1475			case IPV6_2292PKTOPTIONS:
1476#ifdef IPV6_PKTOPTIONS
1477			case IPV6_PKTOPTIONS:
1478#endif
1479			{
1480				struct mbuf *m;
1481
1482				if (optlen > IPV6_PKTOPTIONS_MBUF_LIMIT) {
1483					printf("ip6_ctloutput: mbuf limit hit\n");
1484					error = ENOBUFS;
1485					break;
1486				}
1487
1488				error = soopt_getm(sopt, &m); /* XXX */
1489				if (error != 0)
1490					break;
1491				error = soopt_mcopyin(sopt, m); /* XXX */
1492				if (error != 0)
1493					break;
1494				error = ip6_pcbopts(&in6p->in6p_outputopts,
1495						    m, so, sopt);
1496				m_freem(m); /* XXX */
1497				break;
1498			}
1499
1500			/*
1501			 * Use of some Hop-by-Hop options or some
1502			 * Destination options, might require special
1503			 * privilege.  That is, normal applications
1504			 * (without special privilege) might be forbidden
1505			 * from setting certain options in outgoing packets,
1506			 * and might never see certain options in received
1507			 * packets. [RFC 2292 Section 6]
1508			 * KAME specific note:
1509			 *  KAME prevents non-privileged users from sending or
1510			 *  receiving ANY hbh/dst options in order to avoid
1511			 *  overhead of parsing options in the kernel.
1512			 */
1513			case IPV6_RECVHOPOPTS:
1514			case IPV6_RECVDSTOPTS:
1515			case IPV6_RECVRTHDRDSTOPTS:
1516				if (td != NULL) {
1517					error = priv_check(td,
1518					    PRIV_NETINET_SETHDROPTS);
1519					if (error)
1520						break;
1521				}
1522				/* FALLTHROUGH */
1523			case IPV6_UNICAST_HOPS:
1524			case IPV6_HOPLIMIT:
1525
1526			case IPV6_RECVPKTINFO:
1527			case IPV6_RECVHOPLIMIT:
1528			case IPV6_RECVRTHDR:
1529			case IPV6_RECVPATHMTU:
1530			case IPV6_RECVTCLASS:
1531			case IPV6_RECVFLOWID:
1532#ifdef	RSS
1533			case IPV6_RECVRSSBUCKETID:
1534#endif
1535			case IPV6_V6ONLY:
1536			case IPV6_AUTOFLOWLABEL:
1537			case IPV6_BINDANY:
1538			case IPV6_BINDMULTI:
1539#ifdef	RSS
1540			case IPV6_RSS_LISTEN_BUCKET:
1541#endif
1542				if (optname == IPV6_BINDANY && td != NULL) {
1543					error = priv_check(td,
1544					    PRIV_NETINET_BINDANY);
1545					if (error)
1546						break;
1547				}
1548
1549				if (optlen != sizeof(int)) {
1550					error = EINVAL;
1551					break;
1552				}
1553				error = sooptcopyin(sopt, &optval,
1554					sizeof optval, sizeof optval);
1555				if (error)
1556					break;
1557				switch (optname) {
1558
1559				case IPV6_UNICAST_HOPS:
1560					if (optval < -1 || optval >= 256)
1561						error = EINVAL;
1562					else {
1563						/* -1 = kernel default */
1564						in6p->in6p_hops = optval;
1565						if ((in6p->inp_vflag &
1566						     INP_IPV4) != 0)
1567							in6p->inp_ip_ttl = optval;
1568					}
1569					break;
1570#define OPTSET(bit) \
1571do { \
1572	INP_WLOCK(in6p); \
1573	if (optval) \
1574		in6p->inp_flags |= (bit); \
1575	else \
1576		in6p->inp_flags &= ~(bit); \
1577	INP_WUNLOCK(in6p); \
1578} while (/*CONSTCOND*/ 0)
1579#define OPTSET2292(bit) \
1580do { \
1581	INP_WLOCK(in6p); \
1582	in6p->inp_flags |= IN6P_RFC2292; \
1583	if (optval) \
1584		in6p->inp_flags |= (bit); \
1585	else \
1586		in6p->inp_flags &= ~(bit); \
1587	INP_WUNLOCK(in6p); \
1588} while (/*CONSTCOND*/ 0)
1589#define OPTBIT(bit) (in6p->inp_flags & (bit) ? 1 : 0)
1590
1591#define OPTSET2(bit, val) do {						\
1592	INP_WLOCK(in6p);						\
1593	if (val)							\
1594		in6p->inp_flags2 |= bit;				\
1595	else								\
1596		in6p->inp_flags2 &= ~bit;				\
1597	INP_WUNLOCK(in6p);						\
1598} while (0)
1599#define OPTBIT2(bit) (in6p->inp_flags2 & (bit) ? 1 : 0)
1600
1601				case IPV6_RECVPKTINFO:
1602					/* cannot mix with RFC2292 */
1603					if (OPTBIT(IN6P_RFC2292)) {
1604						error = EINVAL;
1605						break;
1606					}
1607					OPTSET(IN6P_PKTINFO);
1608					break;
1609
1610				case IPV6_HOPLIMIT:
1611				{
1612					struct ip6_pktopts **optp;
1613
1614					/* cannot mix with RFC2292 */
1615					if (OPTBIT(IN6P_RFC2292)) {
1616						error = EINVAL;
1617						break;
1618					}
1619					optp = &in6p->in6p_outputopts;
1620					error = ip6_pcbopt(IPV6_HOPLIMIT,
1621					    (u_char *)&optval, sizeof(optval),
1622					    optp, (td != NULL) ? td->td_ucred :
1623					    NULL, uproto);
1624					break;
1625				}
1626
1627				case IPV6_RECVHOPLIMIT:
1628					/* cannot mix with RFC2292 */
1629					if (OPTBIT(IN6P_RFC2292)) {
1630						error = EINVAL;
1631						break;
1632					}
1633					OPTSET(IN6P_HOPLIMIT);
1634					break;
1635
1636				case IPV6_RECVHOPOPTS:
1637					/* cannot mix with RFC2292 */
1638					if (OPTBIT(IN6P_RFC2292)) {
1639						error = EINVAL;
1640						break;
1641					}
1642					OPTSET(IN6P_HOPOPTS);
1643					break;
1644
1645				case IPV6_RECVDSTOPTS:
1646					/* cannot mix with RFC2292 */
1647					if (OPTBIT(IN6P_RFC2292)) {
1648						error = EINVAL;
1649						break;
1650					}
1651					OPTSET(IN6P_DSTOPTS);
1652					break;
1653
1654				case IPV6_RECVRTHDRDSTOPTS:
1655					/* cannot mix with RFC2292 */
1656					if (OPTBIT(IN6P_RFC2292)) {
1657						error = EINVAL;
1658						break;
1659					}
1660					OPTSET(IN6P_RTHDRDSTOPTS);
1661					break;
1662
1663				case IPV6_RECVRTHDR:
1664					/* cannot mix with RFC2292 */
1665					if (OPTBIT(IN6P_RFC2292)) {
1666						error = EINVAL;
1667						break;
1668					}
1669					OPTSET(IN6P_RTHDR);
1670					break;
1671
1672				case IPV6_RECVPATHMTU:
1673					/*
1674					 * We ignore this option for TCP
1675					 * sockets.
1676					 * (RFC3542 leaves this case
1677					 * unspecified.)
1678					 */
1679					if (uproto != IPPROTO_TCP)
1680						OPTSET(IN6P_MTU);
1681					break;
1682
1683				case IPV6_RECVFLOWID:
1684					OPTSET2(INP_RECVFLOWID, optval);
1685					break;
1686
1687#ifdef	RSS
1688				case IPV6_RECVRSSBUCKETID:
1689					OPTSET2(INP_RECVRSSBUCKETID, optval);
1690					break;
1691#endif
1692
1693				case IPV6_V6ONLY:
1694					/*
1695					 * make setsockopt(IPV6_V6ONLY)
1696					 * available only prior to bind(2).
1697					 * see ipng mailing list, Jun 22 2001.
1698					 */
1699					if (in6p->inp_lport ||
1700					    !IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_laddr)) {
1701						error = EINVAL;
1702						break;
1703					}
1704					OPTSET(IN6P_IPV6_V6ONLY);
1705					if (optval)
1706						in6p->inp_vflag &= ~INP_IPV4;
1707					else
1708						in6p->inp_vflag |= INP_IPV4;
1709					break;
1710				case IPV6_RECVTCLASS:
1711					/* cannot mix with RFC2292 XXX */
1712					if (OPTBIT(IN6P_RFC2292)) {
1713						error = EINVAL;
1714						break;
1715					}
1716					OPTSET(IN6P_TCLASS);
1717					break;
1718				case IPV6_AUTOFLOWLABEL:
1719					OPTSET(IN6P_AUTOFLOWLABEL);
1720					break;
1721
1722				case IPV6_BINDANY:
1723					OPTSET(INP_BINDANY);
1724					break;
1725
1726				case IPV6_BINDMULTI:
1727					OPTSET2(INP_BINDMULTI, optval);
1728					break;
1729#ifdef	RSS
1730				case IPV6_RSS_LISTEN_BUCKET:
1731					if ((optval >= 0) &&
1732					    (optval < rss_getnumbuckets())) {
1733						in6p->inp_rss_listen_bucket = optval;
1734						OPTSET2(INP_RSS_BUCKET_SET, 1);
1735					} else {
1736						error = EINVAL;
1737					}
1738					break;
1739#endif
1740				}
1741				break;
1742
1743			case IPV6_TCLASS:
1744			case IPV6_DONTFRAG:
1745			case IPV6_USE_MIN_MTU:
1746			case IPV6_PREFER_TEMPADDR:
1747				if (optlen != sizeof(optval)) {
1748					error = EINVAL;
1749					break;
1750				}
1751				error = sooptcopyin(sopt, &optval,
1752					sizeof optval, sizeof optval);
1753				if (error)
1754					break;
1755				{
1756					struct ip6_pktopts **optp;
1757					optp = &in6p->in6p_outputopts;
1758					error = ip6_pcbopt(optname,
1759					    (u_char *)&optval, sizeof(optval),
1760					    optp, (td != NULL) ? td->td_ucred :
1761					    NULL, uproto);
1762					break;
1763				}
1764
1765			case IPV6_2292PKTINFO:
1766			case IPV6_2292HOPLIMIT:
1767			case IPV6_2292HOPOPTS:
1768			case IPV6_2292DSTOPTS:
1769			case IPV6_2292RTHDR:
1770				/* RFC 2292 */
1771				if (optlen != sizeof(int)) {
1772					error = EINVAL;
1773					break;
1774				}
1775				error = sooptcopyin(sopt, &optval,
1776					sizeof optval, sizeof optval);
1777				if (error)
1778					break;
1779				switch (optname) {
1780				case IPV6_2292PKTINFO:
1781					OPTSET2292(IN6P_PKTINFO);
1782					break;
1783				case IPV6_2292HOPLIMIT:
1784					OPTSET2292(IN6P_HOPLIMIT);
1785					break;
1786				case IPV6_2292HOPOPTS:
1787					/*
1788					 * Check super-user privilege.
1789					 * See comments for IPV6_RECVHOPOPTS.
1790					 */
1791					if (td != NULL) {
1792						error = priv_check(td,
1793						    PRIV_NETINET_SETHDROPTS);
1794						if (error)
1795							return (error);
1796					}
1797					OPTSET2292(IN6P_HOPOPTS);
1798					break;
1799				case IPV6_2292DSTOPTS:
1800					if (td != NULL) {
1801						error = priv_check(td,
1802						    PRIV_NETINET_SETHDROPTS);
1803						if (error)
1804							return (error);
1805					}
1806					OPTSET2292(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS); /* XXX */
1807					break;
1808				case IPV6_2292RTHDR:
1809					OPTSET2292(IN6P_RTHDR);
1810					break;
1811				}
1812				break;
1813			case IPV6_PKTINFO:
1814			case IPV6_HOPOPTS:
1815			case IPV6_RTHDR:
1816			case IPV6_DSTOPTS:
1817			case IPV6_RTHDRDSTOPTS:
1818			case IPV6_NEXTHOP:
1819			{
1820				/* new advanced API (RFC3542) */
1821				u_char *optbuf;
1822				u_char optbuf_storage[MCLBYTES];
1823				int optlen;
1824				struct ip6_pktopts **optp;
1825
1826				/* cannot mix with RFC2292 */
1827				if (OPTBIT(IN6P_RFC2292)) {
1828					error = EINVAL;
1829					break;
1830				}
1831
1832				/*
1833				 * We only ensure valsize is not too large
1834				 * here.  Further validation will be done
1835				 * later.
1836				 */
1837				error = sooptcopyin(sopt, optbuf_storage,
1838				    sizeof(optbuf_storage), 0);
1839				if (error)
1840					break;
1841				optlen = sopt->sopt_valsize;
1842				optbuf = optbuf_storage;
1843				optp = &in6p->in6p_outputopts;
1844				error = ip6_pcbopt(optname, optbuf, optlen,
1845				    optp, (td != NULL) ? td->td_ucred : NULL,
1846				    uproto);
1847				break;
1848			}
1849#undef OPTSET
1850
1851			case IPV6_MULTICAST_IF:
1852			case IPV6_MULTICAST_HOPS:
1853			case IPV6_MULTICAST_LOOP:
1854			case IPV6_JOIN_GROUP:
1855			case IPV6_LEAVE_GROUP:
1856			case IPV6_MSFILTER:
1857			case MCAST_BLOCK_SOURCE:
1858			case MCAST_UNBLOCK_SOURCE:
1859			case MCAST_JOIN_GROUP:
1860			case MCAST_LEAVE_GROUP:
1861			case MCAST_JOIN_SOURCE_GROUP:
1862			case MCAST_LEAVE_SOURCE_GROUP:
1863				error = ip6_setmoptions(in6p, sopt);
1864				break;
1865
1866			case IPV6_PORTRANGE:
1867				error = sooptcopyin(sopt, &optval,
1868				    sizeof optval, sizeof optval);
1869				if (error)
1870					break;
1871
1872				INP_WLOCK(in6p);
1873				switch (optval) {
1874				case IPV6_PORTRANGE_DEFAULT:
1875					in6p->inp_flags &= ~(INP_LOWPORT);
1876					in6p->inp_flags &= ~(INP_HIGHPORT);
1877					break;
1878
1879				case IPV6_PORTRANGE_HIGH:
1880					in6p->inp_flags &= ~(INP_LOWPORT);
1881					in6p->inp_flags |= INP_HIGHPORT;
1882					break;
1883
1884				case IPV6_PORTRANGE_LOW:
1885					in6p->inp_flags &= ~(INP_HIGHPORT);
1886					in6p->inp_flags |= INP_LOWPORT;
1887					break;
1888
1889				default:
1890					error = EINVAL;
1891					break;
1892				}
1893				INP_WUNLOCK(in6p);
1894				break;
1895
1896#if defined(IPSEC) || defined(IPSEC_SUPPORT)
1897			case IPV6_IPSEC_POLICY:
1898				if (IPSEC_ENABLED(ipv6)) {
1899					error = IPSEC_PCBCTL(ipv6, in6p, sopt);
1900					break;
1901				}
1902				/* FALLTHROUGH */
1903#endif /* IPSEC */
1904
1905			default:
1906				error = ENOPROTOOPT;
1907				break;
1908			}
1909			break;
1910
1911		case SOPT_GET:
1912			switch (optname) {
1913
1914			case IPV6_2292PKTOPTIONS:
1915#ifdef IPV6_PKTOPTIONS
1916			case IPV6_PKTOPTIONS:
1917#endif
1918				/*
1919				 * RFC3542 (effectively) deprecated the
1920				 * semantics of the 2292-style pktoptions.
1921				 * Since it was not reliable in nature (i.e.,
1922				 * applications had to expect the lack of some
1923				 * information after all), it would make sense
1924				 * to simplify this part by always returning
1925				 * empty data.
1926				 */
1927				sopt->sopt_valsize = 0;
1928				break;
1929
1930			case IPV6_RECVHOPOPTS:
1931			case IPV6_RECVDSTOPTS:
1932			case IPV6_RECVRTHDRDSTOPTS:
1933			case IPV6_UNICAST_HOPS:
1934			case IPV6_RECVPKTINFO:
1935			case IPV6_RECVHOPLIMIT:
1936			case IPV6_RECVRTHDR:
1937			case IPV6_RECVPATHMTU:
1938
1939			case IPV6_V6ONLY:
1940			case IPV6_PORTRANGE:
1941			case IPV6_RECVTCLASS:
1942			case IPV6_AUTOFLOWLABEL:
1943			case IPV6_BINDANY:
1944			case IPV6_FLOWID:
1945			case IPV6_FLOWTYPE:
1946			case IPV6_RECVFLOWID:
1947#ifdef	RSS
1948			case IPV6_RSSBUCKETID:
1949			case IPV6_RECVRSSBUCKETID:
1950#endif
1951			case IPV6_BINDMULTI:
1952				switch (optname) {
1953
1954				case IPV6_RECVHOPOPTS:
1955					optval = OPTBIT(IN6P_HOPOPTS);
1956					break;
1957
1958				case IPV6_RECVDSTOPTS:
1959					optval = OPTBIT(IN6P_DSTOPTS);
1960					break;
1961
1962				case IPV6_RECVRTHDRDSTOPTS:
1963					optval = OPTBIT(IN6P_RTHDRDSTOPTS);
1964					break;
1965
1966				case IPV6_UNICAST_HOPS:
1967					optval = in6p->in6p_hops;
1968					break;
1969
1970				case IPV6_RECVPKTINFO:
1971					optval = OPTBIT(IN6P_PKTINFO);
1972					break;
1973
1974				case IPV6_RECVHOPLIMIT:
1975					optval = OPTBIT(IN6P_HOPLIMIT);
1976					break;
1977
1978				case IPV6_RECVRTHDR:
1979					optval = OPTBIT(IN6P_RTHDR);
1980					break;
1981
1982				case IPV6_RECVPATHMTU:
1983					optval = OPTBIT(IN6P_MTU);
1984					break;
1985
1986				case IPV6_V6ONLY:
1987					optval = OPTBIT(IN6P_IPV6_V6ONLY);
1988					break;
1989
1990				case IPV6_PORTRANGE:
1991				    {
1992					int flags;
1993					flags = in6p->inp_flags;
1994					if (flags & INP_HIGHPORT)
1995						optval = IPV6_PORTRANGE_HIGH;
1996					else if (flags & INP_LOWPORT)
1997						optval = IPV6_PORTRANGE_LOW;
1998					else
1999						optval = 0;
2000					break;
2001				    }
2002				case IPV6_RECVTCLASS:
2003					optval = OPTBIT(IN6P_TCLASS);
2004					break;
2005
2006				case IPV6_AUTOFLOWLABEL:
2007					optval = OPTBIT(IN6P_AUTOFLOWLABEL);
2008					break;
2009
2010				case IPV6_BINDANY:
2011					optval = OPTBIT(INP_BINDANY);
2012					break;
2013
2014				case IPV6_FLOWID:
2015					optval = in6p->inp_flowid;
2016					break;
2017
2018				case IPV6_FLOWTYPE:
2019					optval = in6p->inp_flowtype;
2020					break;
2021
2022				case IPV6_RECVFLOWID:
2023					optval = OPTBIT2(INP_RECVFLOWID);
2024					break;
2025#ifdef	RSS
2026				case IPV6_RSSBUCKETID:
2027					retval =
2028					    rss_hash2bucket(in6p->inp_flowid,
2029					    in6p->inp_flowtype,
2030					    &rss_bucket);
2031					if (retval == 0)
2032						optval = rss_bucket;
2033					else
2034						error = EINVAL;
2035					break;
2036
2037				case IPV6_RECVRSSBUCKETID:
2038					optval = OPTBIT2(INP_RECVRSSBUCKETID);
2039					break;
2040#endif
2041
2042				case IPV6_BINDMULTI:
2043					optval = OPTBIT2(INP_BINDMULTI);
2044					break;
2045
2046				}
2047				if (error)
2048					break;
2049				error = sooptcopyout(sopt, &optval,
2050					sizeof optval);
2051				break;
2052
2053			case IPV6_PATHMTU:
2054			{
2055				u_long pmtu = 0;
2056				struct ip6_mtuinfo mtuinfo;
2057
2058				if (!(so->so_state & SS_ISCONNECTED))
2059					return (ENOTCONN);
2060				/*
2061				 * XXX: we dot not consider the case of source
2062				 * routing, or optional information to specify
2063				 * the outgoing interface.
2064				 */
2065				error = ip6_getpmtu_ctl(so->so_fibnum,
2066				    &in6p->in6p_faddr, &pmtu);
2067				if (error)
2068					break;
2069				if (pmtu > IPV6_MAXPACKET)
2070					pmtu = IPV6_MAXPACKET;
2071
2072				bzero(&mtuinfo, sizeof(mtuinfo));
2073				mtuinfo.ip6m_mtu = (u_int32_t)pmtu;
2074				optdata = (void *)&mtuinfo;
2075				optdatalen = sizeof(mtuinfo);
2076				error = sooptcopyout(sopt, optdata,
2077				    optdatalen);
2078				break;
2079			}
2080
2081			case IPV6_2292PKTINFO:
2082			case IPV6_2292HOPLIMIT:
2083			case IPV6_2292HOPOPTS:
2084			case IPV6_2292RTHDR:
2085			case IPV6_2292DSTOPTS:
2086				switch (optname) {
2087				case IPV6_2292PKTINFO:
2088					optval = OPTBIT(IN6P_PKTINFO);
2089					break;
2090				case IPV6_2292HOPLIMIT:
2091					optval = OPTBIT(IN6P_HOPLIMIT);
2092					break;
2093				case IPV6_2292HOPOPTS:
2094					optval = OPTBIT(IN6P_HOPOPTS);
2095					break;
2096				case IPV6_2292RTHDR:
2097					optval = OPTBIT(IN6P_RTHDR);
2098					break;
2099				case IPV6_2292DSTOPTS:
2100					optval = OPTBIT(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS);
2101					break;
2102				}
2103				error = sooptcopyout(sopt, &optval,
2104				    sizeof optval);
2105				break;
2106			case IPV6_PKTINFO:
2107			case IPV6_HOPOPTS:
2108			case IPV6_RTHDR:
2109			case IPV6_DSTOPTS:
2110			case IPV6_RTHDRDSTOPTS:
2111			case IPV6_NEXTHOP:
2112			case IPV6_TCLASS:
2113			case IPV6_DONTFRAG:
2114			case IPV6_USE_MIN_MTU:
2115			case IPV6_PREFER_TEMPADDR:
2116				error = ip6_getpcbopt(in6p->in6p_outputopts,
2117				    optname, sopt);
2118				break;
2119
2120			case IPV6_MULTICAST_IF:
2121			case IPV6_MULTICAST_HOPS:
2122			case IPV6_MULTICAST_LOOP:
2123			case IPV6_MSFILTER:
2124				error = ip6_getmoptions(in6p, sopt);
2125				break;
2126
2127#if defined(IPSEC) || defined(IPSEC_SUPPORT)
2128			case IPV6_IPSEC_POLICY:
2129				if (IPSEC_ENABLED(ipv6)) {
2130					error = IPSEC_PCBCTL(ipv6, in6p, sopt);
2131					break;
2132				}
2133				/* FALLTHROUGH */
2134#endif /* IPSEC */
2135			default:
2136				error = ENOPROTOOPT;
2137				break;
2138			}
2139			break;
2140		}
2141	}
2142	return (error);
2143}
2144
2145int
2146ip6_raw_ctloutput(struct socket *so, struct sockopt *sopt)
2147{
2148	int error = 0, optval, optlen;
2149	const int icmp6off = offsetof(struct icmp6_hdr, icmp6_cksum);
2150	struct inpcb *in6p = sotoinpcb(so);
2151	int level, op, optname;
2152
2153	level = sopt->sopt_level;
2154	op = sopt->sopt_dir;
2155	optname = sopt->sopt_name;
2156	optlen = sopt->sopt_valsize;
2157
2158	if (level != IPPROTO_IPV6) {
2159		return (EINVAL);
2160	}
2161
2162	switch (optname) {
2163	case IPV6_CHECKSUM:
2164		/*
2165		 * For ICMPv6 sockets, no modification allowed for checksum
2166		 * offset, permit "no change" values to help existing apps.
2167		 *
2168		 * RFC3542 says: "An attempt to set IPV6_CHECKSUM
2169		 * for an ICMPv6 socket will fail."
2170		 * The current behavior does not meet RFC3542.
2171		 */
2172		switch (op) {
2173		case SOPT_SET:
2174			if (optlen != sizeof(int)) {
2175				error = EINVAL;
2176				break;
2177			}
2178			error = sooptcopyin(sopt, &optval, sizeof(optval),
2179					    sizeof(optval));
2180			if (error)
2181				break;
2182			if (optval < -1 || (optval % 2) != 0) {
2183				/*
2184				 * The API assumes non-negative even offset
2185				 * values or -1 as a special value.
2186				 */
2187				error = EINVAL;
2188			} else if (so->so_proto->pr_protocol ==
2189			    IPPROTO_ICMPV6) {
2190				if (optval != icmp6off)
2191					error = EINVAL;
2192			} else
2193				in6p->in6p_cksum = optval;
2194			break;
2195
2196		case SOPT_GET:
2197			if (so->so_proto->pr_protocol == IPPROTO_ICMPV6)
2198				optval = icmp6off;
2199			else
2200				optval = in6p->in6p_cksum;
2201
2202			error = sooptcopyout(sopt, &optval, sizeof(optval));
2203			break;
2204
2205		default:
2206			error = EINVAL;
2207			break;
2208		}
2209		break;
2210
2211	default:
2212		error = ENOPROTOOPT;
2213		break;
2214	}
2215
2216	return (error);
2217}
2218
2219/*
2220 * Set up IP6 options in pcb for insertion in output packets or
2221 * specifying behavior of outgoing packets.
2222 */
2223static int
2224ip6_pcbopts(struct ip6_pktopts **pktopt, struct mbuf *m,
2225    struct socket *so, struct sockopt *sopt)
2226{
2227	struct ip6_pktopts *opt = *pktopt;
2228	int error = 0;
2229	struct thread *td = sopt->sopt_td;
2230
2231	/* turn off any old options. */
2232	if (opt) {
2233#ifdef DIAGNOSTIC
2234		if (opt->ip6po_pktinfo || opt->ip6po_nexthop ||
2235		    opt->ip6po_hbh || opt->ip6po_dest1 || opt->ip6po_dest2 ||
2236		    opt->ip6po_rhinfo.ip6po_rhi_rthdr)
2237			printf("ip6_pcbopts: all specified options are cleared.\n");
2238#endif
2239		ip6_clearpktopts(opt, -1);
2240	} else
2241		opt = malloc(sizeof(*opt), M_IP6OPT, M_WAITOK);
2242	*pktopt = NULL;
2243
2244	if (!m || m->m_len == 0) {
2245		/*
2246		 * Only turning off any previous options, regardless of
2247		 * whether the opt is just created or given.
2248		 */
2249		free(opt, M_IP6OPT);
2250		return (0);
2251	}
2252
2253	/*  set options specified by user. */
2254	if ((error = ip6_setpktopts(m, opt, NULL, (td != NULL) ?
2255	    td->td_ucred : NULL, so->so_proto->pr_protocol)) != 0) {
2256		ip6_clearpktopts(opt, -1); /* XXX: discard all options */
2257		free(opt, M_IP6OPT);
2258		return (error);
2259	}
2260	*pktopt = opt;
2261	return (0);
2262}
2263
2264/*
2265 * initialize ip6_pktopts.  beware that there are non-zero default values in
2266 * the struct.
2267 */
2268void
2269ip6_initpktopts(struct ip6_pktopts *opt)
2270{
2271
2272	bzero(opt, sizeof(*opt));
2273	opt->ip6po_hlim = -1;	/* -1 means default hop limit */
2274	opt->ip6po_tclass = -1;	/* -1 means default traffic class */
2275	opt->ip6po_minmtu = IP6PO_MINMTU_MCASTONLY;
2276	opt->ip6po_prefer_tempaddr = IP6PO_TEMPADDR_SYSTEM;
2277}
2278
2279static int
2280ip6_pcbopt(int optname, u_char *buf, int len, struct ip6_pktopts **pktopt,
2281    struct ucred *cred, int uproto)
2282{
2283	struct ip6_pktopts *opt;
2284
2285	if (*pktopt == NULL) {
2286		*pktopt = malloc(sizeof(struct ip6_pktopts), M_IP6OPT,
2287		    M_WAITOK);
2288		ip6_initpktopts(*pktopt);
2289	}
2290	opt = *pktopt;
2291
2292	return (ip6_setpktopt(optname, buf, len, opt, cred, 1, 0, uproto));
2293}
2294
2295static int
2296ip6_getpcbopt(struct ip6_pktopts *pktopt, int optname, struct sockopt *sopt)
2297{
2298	void *optdata = NULL;
2299	int optdatalen = 0;
2300	struct ip6_ext *ip6e;
2301	int error = 0;
2302	struct in6_pktinfo null_pktinfo;
2303	int deftclass = 0, on;
2304	int defminmtu = IP6PO_MINMTU_MCASTONLY;
2305	int defpreftemp = IP6PO_TEMPADDR_SYSTEM;
2306
2307	switch (optname) {
2308	case IPV6_PKTINFO:
2309		optdata = (void *)&null_pktinfo;
2310		if (pktopt && pktopt->ip6po_pktinfo) {
2311			bcopy(pktopt->ip6po_pktinfo, &null_pktinfo,
2312			    sizeof(null_pktinfo));
2313			in6_clearscope(&null_pktinfo.ipi6_addr);
2314		} else {
2315			/* XXX: we don't have to do this every time... */
2316			bzero(&null_pktinfo, sizeof(null_pktinfo));
2317		}
2318		optdatalen = sizeof(struct in6_pktinfo);
2319		break;
2320	case IPV6_TCLASS:
2321		if (pktopt && pktopt->ip6po_tclass >= 0)
2322			optdata = (void *)&pktopt->ip6po_tclass;
2323		else
2324			optdata = (void *)&deftclass;
2325		optdatalen = sizeof(int);
2326		break;
2327	case IPV6_HOPOPTS:
2328		if (pktopt && pktopt->ip6po_hbh) {
2329			optdata = (void *)pktopt->ip6po_hbh;
2330			ip6e = (struct ip6_ext *)pktopt->ip6po_hbh;
2331			optdatalen = (ip6e->ip6e_len + 1) << 3;
2332		}
2333		break;
2334	case IPV6_RTHDR:
2335		if (pktopt && pktopt->ip6po_rthdr) {
2336			optdata = (void *)pktopt->ip6po_rthdr;
2337			ip6e = (struct ip6_ext *)pktopt->ip6po_rthdr;
2338			optdatalen = (ip6e->ip6e_len + 1) << 3;
2339		}
2340		break;
2341	case IPV6_RTHDRDSTOPTS:
2342		if (pktopt && pktopt->ip6po_dest1) {
2343			optdata = (void *)pktopt->ip6po_dest1;
2344			ip6e = (struct ip6_ext *)pktopt->ip6po_dest1;
2345			optdatalen = (ip6e->ip6e_len + 1) << 3;
2346		}
2347		break;
2348	case IPV6_DSTOPTS:
2349		if (pktopt && pktopt->ip6po_dest2) {
2350			optdata = (void *)pktopt->ip6po_dest2;
2351			ip6e = (struct ip6_ext *)pktopt->ip6po_dest2;
2352			optdatalen = (ip6e->ip6e_len + 1) << 3;
2353		}
2354		break;
2355	case IPV6_NEXTHOP:
2356		if (pktopt && pktopt->ip6po_nexthop) {
2357			optdata = (void *)pktopt->ip6po_nexthop;
2358			optdatalen = pktopt->ip6po_nexthop->sa_len;
2359		}
2360		break;
2361	case IPV6_USE_MIN_MTU:
2362		if (pktopt)
2363			optdata = (void *)&pktopt->ip6po_minmtu;
2364		else
2365			optdata = (void *)&defminmtu;
2366		optdatalen = sizeof(int);
2367		break;
2368	case IPV6_DONTFRAG:
2369		if (pktopt && ((pktopt->ip6po_flags) & IP6PO_DONTFRAG))
2370			on = 1;
2371		else
2372			on = 0;
2373		optdata = (void *)&on;
2374		optdatalen = sizeof(on);
2375		break;
2376	case IPV6_PREFER_TEMPADDR:
2377		if (pktopt)
2378			optdata = (void *)&pktopt->ip6po_prefer_tempaddr;
2379		else
2380			optdata = (void *)&defpreftemp;
2381		optdatalen = sizeof(int);
2382		break;
2383	default:		/* should not happen */
2384#ifdef DIAGNOSTIC
2385		panic("ip6_getpcbopt: unexpected option\n");
2386#endif
2387		return (ENOPROTOOPT);
2388	}
2389
2390	error = sooptcopyout(sopt, optdata, optdatalen);
2391
2392	return (error);
2393}
2394
2395void
2396ip6_clearpktopts(struct ip6_pktopts *pktopt, int optname)
2397{
2398	if (pktopt == NULL)
2399		return;
2400
2401	if (optname == -1 || optname == IPV6_PKTINFO) {
2402		if (pktopt->ip6po_pktinfo)
2403			free(pktopt->ip6po_pktinfo, M_IP6OPT);
2404		pktopt->ip6po_pktinfo = NULL;
2405	}
2406	if (optname == -1 || optname == IPV6_HOPLIMIT)
2407		pktopt->ip6po_hlim = -1;
2408	if (optname == -1 || optname == IPV6_TCLASS)
2409		pktopt->ip6po_tclass = -1;
2410	if (optname == -1 || optname == IPV6_NEXTHOP) {
2411		if (pktopt->ip6po_nextroute.ro_rt) {
2412			RTFREE(pktopt->ip6po_nextroute.ro_rt);
2413			pktopt->ip6po_nextroute.ro_rt = NULL;
2414		}
2415		if (pktopt->ip6po_nexthop)
2416			free(pktopt->ip6po_nexthop, M_IP6OPT);
2417		pktopt->ip6po_nexthop = NULL;
2418	}
2419	if (optname == -1 || optname == IPV6_HOPOPTS) {
2420		if (pktopt->ip6po_hbh)
2421			free(pktopt->ip6po_hbh, M_IP6OPT);
2422		pktopt->ip6po_hbh = NULL;
2423	}
2424	if (optname == -1 || optname == IPV6_RTHDRDSTOPTS) {
2425		if (pktopt->ip6po_dest1)
2426			free(pktopt->ip6po_dest1, M_IP6OPT);
2427		pktopt->ip6po_dest1 = NULL;
2428	}
2429	if (optname == -1 || optname == IPV6_RTHDR) {
2430		if (pktopt->ip6po_rhinfo.ip6po_rhi_rthdr)
2431			free(pktopt->ip6po_rhinfo.ip6po_rhi_rthdr, M_IP6OPT);
2432		pktopt->ip6po_rhinfo.ip6po_rhi_rthdr = NULL;
2433		if (pktopt->ip6po_route.ro_rt) {
2434			RTFREE(pktopt->ip6po_route.ro_rt);
2435			pktopt->ip6po_route.ro_rt = NULL;
2436		}
2437	}
2438	if (optname == -1 || optname == IPV6_DSTOPTS) {
2439		if (pktopt->ip6po_dest2)
2440			free(pktopt->ip6po_dest2, M_IP6OPT);
2441		pktopt->ip6po_dest2 = NULL;
2442	}
2443}
2444
2445#define PKTOPT_EXTHDRCPY(type) \
2446do {\
2447	if (src->type) {\
2448		int hlen = (((struct ip6_ext *)src->type)->ip6e_len + 1) << 3;\
2449		dst->type = malloc(hlen, M_IP6OPT, canwait);\
2450		if (dst->type == NULL)\
2451			goto bad;\
2452		bcopy(src->type, dst->type, hlen);\
2453	}\
2454} while (/*CONSTCOND*/ 0)
2455
2456static int
2457copypktopts(struct ip6_pktopts *dst, struct ip6_pktopts *src, int canwait)
2458{
2459	if (dst == NULL || src == NULL)  {
2460		printf("ip6_clearpktopts: invalid argument\n");
2461		return (EINVAL);
2462	}
2463
2464	dst->ip6po_hlim = src->ip6po_hlim;
2465	dst->ip6po_tclass = src->ip6po_tclass;
2466	dst->ip6po_flags = src->ip6po_flags;
2467	dst->ip6po_minmtu = src->ip6po_minmtu;
2468	dst->ip6po_prefer_tempaddr = src->ip6po_prefer_tempaddr;
2469	if (src->ip6po_pktinfo) {
2470		dst->ip6po_pktinfo = malloc(sizeof(*dst->ip6po_pktinfo),
2471		    M_IP6OPT, canwait);
2472		if (dst->ip6po_pktinfo == NULL)
2473			goto bad;
2474		*dst->ip6po_pktinfo = *src->ip6po_pktinfo;
2475	}
2476	if (src->ip6po_nexthop) {
2477		dst->ip6po_nexthop = malloc(src->ip6po_nexthop->sa_len,
2478		    M_IP6OPT, canwait);
2479		if (dst->ip6po_nexthop == NULL)
2480			goto bad;
2481		bcopy(src->ip6po_nexthop, dst->ip6po_nexthop,
2482		    src->ip6po_nexthop->sa_len);
2483	}
2484	PKTOPT_EXTHDRCPY(ip6po_hbh);
2485	PKTOPT_EXTHDRCPY(ip6po_dest1);
2486	PKTOPT_EXTHDRCPY(ip6po_dest2);
2487	PKTOPT_EXTHDRCPY(ip6po_rthdr); /* not copy the cached route */
2488	return (0);
2489
2490  bad:
2491	ip6_clearpktopts(dst, -1);
2492	return (ENOBUFS);
2493}
2494#undef PKTOPT_EXTHDRCPY
2495
2496struct ip6_pktopts *
2497ip6_copypktopts(struct ip6_pktopts *src, int canwait)
2498{
2499	int error;
2500	struct ip6_pktopts *dst;
2501
2502	dst = malloc(sizeof(*dst), M_IP6OPT, canwait);
2503	if (dst == NULL)
2504		return (NULL);
2505	ip6_initpktopts(dst);
2506
2507	if ((error = copypktopts(dst, src, canwait)) != 0) {
2508		free(dst, M_IP6OPT);
2509		return (NULL);
2510	}
2511
2512	return (dst);
2513}
2514
2515void
2516ip6_freepcbopts(struct ip6_pktopts *pktopt)
2517{
2518	if (pktopt == NULL)
2519		return;
2520
2521	ip6_clearpktopts(pktopt, -1);
2522
2523	free(pktopt, M_IP6OPT);
2524}
2525
2526/*
2527 * Set IPv6 outgoing packet options based on advanced API.
2528 */
2529int
2530ip6_setpktopts(struct mbuf *control, struct ip6_pktopts *opt,
2531    struct ip6_pktopts *stickyopt, struct ucred *cred, int uproto)
2532{
2533	struct cmsghdr *cm = NULL;
2534
2535	if (control == NULL || opt == NULL)
2536		return (EINVAL);
2537
2538	ip6_initpktopts(opt);
2539	if (stickyopt) {
2540		int error;
2541
2542		/*
2543		 * If stickyopt is provided, make a local copy of the options
2544		 * for this particular packet, then override them by ancillary
2545		 * objects.
2546		 * XXX: copypktopts() does not copy the cached route to a next
2547		 * hop (if any).  This is not very good in terms of efficiency,
2548		 * but we can allow this since this option should be rarely
2549		 * used.
2550		 */
2551		if ((error = copypktopts(opt, stickyopt, M_NOWAIT)) != 0)
2552			return (error);
2553	}
2554
2555	/*
2556	 * XXX: Currently, we assume all the optional information is stored
2557	 * in a single mbuf.
2558	 */
2559	if (control->m_next)
2560		return (EINVAL);
2561
2562	for (; control->m_len > 0; control->m_data += CMSG_ALIGN(cm->cmsg_len),
2563	    control->m_len -= CMSG_ALIGN(cm->cmsg_len)) {
2564		int error;
2565
2566		if (control->m_len < CMSG_LEN(0))
2567			return (EINVAL);
2568
2569		cm = mtod(control, struct cmsghdr *);
2570		if (cm->cmsg_len == 0 || cm->cmsg_len > control->m_len)
2571			return (EINVAL);
2572		if (cm->cmsg_level != IPPROTO_IPV6)
2573			continue;
2574
2575		error = ip6_setpktopt(cm->cmsg_type, CMSG_DATA(cm),
2576		    cm->cmsg_len - CMSG_LEN(0), opt, cred, 0, 1, uproto);
2577		if (error)
2578			return (error);
2579	}
2580
2581	return (0);
2582}
2583
2584/*
2585 * Set a particular packet option, as a sticky option or an ancillary data
2586 * item.  "len" can be 0 only when it's a sticky option.
2587 * We have 4 cases of combination of "sticky" and "cmsg":
2588 * "sticky=0, cmsg=0": impossible
2589 * "sticky=0, cmsg=1": RFC2292 or RFC3542 ancillary data
2590 * "sticky=1, cmsg=0": RFC3542 socket option
2591 * "sticky=1, cmsg=1": RFC2292 socket option
2592 */
2593static int
2594ip6_setpktopt(int optname, u_char *buf, int len, struct ip6_pktopts *opt,
2595    struct ucred *cred, int sticky, int cmsg, int uproto)
2596{
2597	int minmtupolicy, preftemp;
2598	int error;
2599
2600	if (!sticky && !cmsg) {
2601#ifdef DIAGNOSTIC
2602		printf("ip6_setpktopt: impossible case\n");
2603#endif
2604		return (EINVAL);
2605	}
2606
2607	/*
2608	 * IPV6_2292xxx is for backward compatibility to RFC2292, and should
2609	 * not be specified in the context of RFC3542.  Conversely,
2610	 * RFC3542 types should not be specified in the context of RFC2292.
2611	 */
2612	if (!cmsg) {
2613		switch (optname) {
2614		case IPV6_2292PKTINFO:
2615		case IPV6_2292HOPLIMIT:
2616		case IPV6_2292NEXTHOP:
2617		case IPV6_2292HOPOPTS:
2618		case IPV6_2292DSTOPTS:
2619		case IPV6_2292RTHDR:
2620		case IPV6_2292PKTOPTIONS:
2621			return (ENOPROTOOPT);
2622		}
2623	}
2624	if (sticky && cmsg) {
2625		switch (optname) {
2626		case IPV6_PKTINFO:
2627		case IPV6_HOPLIMIT:
2628		case IPV6_NEXTHOP:
2629		case IPV6_HOPOPTS:
2630		case IPV6_DSTOPTS:
2631		case IPV6_RTHDRDSTOPTS:
2632		case IPV6_RTHDR:
2633		case IPV6_USE_MIN_MTU:
2634		case IPV6_DONTFRAG:
2635		case IPV6_TCLASS:
2636		case IPV6_PREFER_TEMPADDR: /* XXX: not an RFC3542 option */
2637			return (ENOPROTOOPT);
2638		}
2639	}
2640
2641	switch (optname) {
2642	case IPV6_2292PKTINFO:
2643	case IPV6_PKTINFO:
2644	{
2645		struct ifnet *ifp = NULL;
2646		struct in6_pktinfo *pktinfo;
2647
2648		if (len != sizeof(struct in6_pktinfo))
2649			return (EINVAL);
2650
2651		pktinfo = (struct in6_pktinfo *)buf;
2652
2653		/*
2654		 * An application can clear any sticky IPV6_PKTINFO option by
2655		 * doing a "regular" setsockopt with ipi6_addr being
2656		 * in6addr_any and ipi6_ifindex being zero.
2657		 * [RFC 3542, Section 6]
2658		 */
2659		if (optname == IPV6_PKTINFO && opt->ip6po_pktinfo &&
2660		    pktinfo->ipi6_ifindex == 0 &&
2661		    IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) {
2662			ip6_clearpktopts(opt, optname);
2663			break;
2664		}
2665
2666		if (uproto == IPPROTO_TCP && optname == IPV6_PKTINFO &&
2667		    sticky && !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) {
2668			return (EINVAL);
2669		}
2670		if (IN6_IS_ADDR_MULTICAST(&pktinfo->ipi6_addr))
2671			return (EINVAL);
2672		/* validate the interface index if specified. */
2673		if (pktinfo->ipi6_ifindex > V_if_index)
2674			 return (ENXIO);
2675		if (pktinfo->ipi6_ifindex) {
2676			ifp = ifnet_byindex(pktinfo->ipi6_ifindex);
2677			if (ifp == NULL)
2678				return (ENXIO);
2679		}
2680		if (ifp != NULL && (ifp->if_afdata[AF_INET6] == NULL ||
2681		    (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) != 0))
2682			return (ENETDOWN);
2683
2684		if (ifp != NULL &&
2685		    !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) {
2686			struct in6_ifaddr *ia;
2687
2688			in6_setscope(&pktinfo->ipi6_addr, ifp, NULL);
2689			ia = in6ifa_ifpwithaddr(ifp, &pktinfo->ipi6_addr);
2690			if (ia == NULL)
2691				return (EADDRNOTAVAIL);
2692			ifa_free(&ia->ia_ifa);
2693		}
2694		/*
2695		 * We store the address anyway, and let in6_selectsrc()
2696		 * validate the specified address.  This is because ipi6_addr
2697		 * may not have enough information about its scope zone, and
2698		 * we may need additional information (such as outgoing
2699		 * interface or the scope zone of a destination address) to
2700		 * disambiguate the scope.
2701		 * XXX: the delay of the validation may confuse the
2702		 * application when it is used as a sticky option.
2703		 */
2704		if (opt->ip6po_pktinfo == NULL) {
2705			opt->ip6po_pktinfo = malloc(sizeof(*pktinfo),
2706			    M_IP6OPT, M_NOWAIT);
2707			if (opt->ip6po_pktinfo == NULL)
2708				return (ENOBUFS);
2709		}
2710		bcopy(pktinfo, opt->ip6po_pktinfo, sizeof(*pktinfo));
2711		break;
2712	}
2713
2714	case IPV6_2292HOPLIMIT:
2715	case IPV6_HOPLIMIT:
2716	{
2717		int *hlimp;
2718
2719		/*
2720		 * RFC 3542 deprecated the usage of sticky IPV6_HOPLIMIT
2721		 * to simplify the ordering among hoplimit options.
2722		 */
2723		if (optname == IPV6_HOPLIMIT && sticky)
2724			return (ENOPROTOOPT);
2725
2726		if (len != sizeof(int))
2727			return (EINVAL);
2728		hlimp = (int *)buf;
2729		if (*hlimp < -1 || *hlimp > 255)
2730			return (EINVAL);
2731
2732		opt->ip6po_hlim = *hlimp;
2733		break;
2734	}
2735
2736	case IPV6_TCLASS:
2737	{
2738		int tclass;
2739
2740		if (len != sizeof(int))
2741			return (EINVAL);
2742		tclass = *(int *)buf;
2743		if (tclass < -1 || tclass > 255)
2744			return (EINVAL);
2745
2746		opt->ip6po_tclass = tclass;
2747		break;
2748	}
2749
2750	case IPV6_2292NEXTHOP:
2751	case IPV6_NEXTHOP:
2752		if (cred != NULL) {
2753			error = priv_check_cred(cred,
2754			    PRIV_NETINET_SETHDROPTS, 0);
2755			if (error)
2756				return (error);
2757		}
2758
2759		if (len == 0) {	/* just remove the option */
2760			ip6_clearpktopts(opt, IPV6_NEXTHOP);
2761			break;
2762		}
2763
2764		/* check if cmsg_len is large enough for sa_len */
2765		if (len < sizeof(struct sockaddr) || len < *buf)
2766			return (EINVAL);
2767
2768		switch (((struct sockaddr *)buf)->sa_family) {
2769		case AF_INET6:
2770		{
2771			struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)buf;
2772			int error;
2773
2774			if (sa6->sin6_len != sizeof(struct sockaddr_in6))
2775				return (EINVAL);
2776
2777			if (IN6_IS_ADDR_UNSPECIFIED(&sa6->sin6_addr) ||
2778			    IN6_IS_ADDR_MULTICAST(&sa6->sin6_addr)) {
2779				return (EINVAL);
2780			}
2781			if ((error = sa6_embedscope(sa6, V_ip6_use_defzone))
2782			    != 0) {
2783				return (error);
2784			}
2785			break;
2786		}
2787		case AF_LINK:	/* should eventually be supported */
2788		default:
2789			return (EAFNOSUPPORT);
2790		}
2791
2792		/* turn off the previous option, then set the new option. */
2793		ip6_clearpktopts(opt, IPV6_NEXTHOP);
2794		opt->ip6po_nexthop = malloc(*buf, M_IP6OPT, M_NOWAIT);
2795		if (opt->ip6po_nexthop == NULL)
2796			return (ENOBUFS);
2797		bcopy(buf, opt->ip6po_nexthop, *buf);
2798		break;
2799
2800	case IPV6_2292HOPOPTS:
2801	case IPV6_HOPOPTS:
2802	{
2803		struct ip6_hbh *hbh;
2804		int hbhlen;
2805
2806		/*
2807		 * XXX: We don't allow a non-privileged user to set ANY HbH
2808		 * options, since per-option restriction has too much
2809		 * overhead.
2810		 */
2811		if (cred != NULL) {
2812			error = priv_check_cred(cred,
2813			    PRIV_NETINET_SETHDROPTS, 0);
2814			if (error)
2815				return (error);
2816		}
2817
2818		if (len == 0) {
2819			ip6_clearpktopts(opt, IPV6_HOPOPTS);
2820			break;	/* just remove the option */
2821		}
2822
2823		/* message length validation */
2824		if (len < sizeof(struct ip6_hbh))
2825			return (EINVAL);
2826		hbh = (struct ip6_hbh *)buf;
2827		hbhlen = (hbh->ip6h_len + 1) << 3;
2828		if (len != hbhlen)
2829			return (EINVAL);
2830
2831		/* turn off the previous option, then set the new option. */
2832		ip6_clearpktopts(opt, IPV6_HOPOPTS);
2833		opt->ip6po_hbh = malloc(hbhlen, M_IP6OPT, M_NOWAIT);
2834		if (opt->ip6po_hbh == NULL)
2835			return (ENOBUFS);
2836		bcopy(hbh, opt->ip6po_hbh, hbhlen);
2837
2838		break;
2839	}
2840
2841	case IPV6_2292DSTOPTS:
2842	case IPV6_DSTOPTS:
2843	case IPV6_RTHDRDSTOPTS:
2844	{
2845		struct ip6_dest *dest, **newdest = NULL;
2846		int destlen;
2847
2848		if (cred != NULL) { /* XXX: see the comment for IPV6_HOPOPTS */
2849			error = priv_check_cred(cred,
2850			    PRIV_NETINET_SETHDROPTS, 0);
2851			if (error)
2852				return (error);
2853		}
2854
2855		if (len == 0) {
2856			ip6_clearpktopts(opt, optname);
2857			break;	/* just remove the option */
2858		}
2859
2860		/* message length validation */
2861		if (len < sizeof(struct ip6_dest))
2862			return (EINVAL);
2863		dest = (struct ip6_dest *)buf;
2864		destlen = (dest->ip6d_len + 1) << 3;
2865		if (len != destlen)
2866			return (EINVAL);
2867
2868		/*
2869		 * Determine the position that the destination options header
2870		 * should be inserted; before or after the routing header.
2871		 */
2872		switch (optname) {
2873		case IPV6_2292DSTOPTS:
2874			/*
2875			 * The old advacned API is ambiguous on this point.
2876			 * Our approach is to determine the position based
2877			 * according to the existence of a routing header.
2878			 * Note, however, that this depends on the order of the
2879			 * extension headers in the ancillary data; the 1st
2880			 * part of the destination options header must appear
2881			 * before the routing header in the ancillary data,
2882			 * too.
2883			 * RFC3542 solved the ambiguity by introducing
2884			 * separate ancillary data or option types.
2885			 */
2886			if (opt->ip6po_rthdr == NULL)
2887				newdest = &opt->ip6po_dest1;
2888			else
2889				newdest = &opt->ip6po_dest2;
2890			break;
2891		case IPV6_RTHDRDSTOPTS:
2892			newdest = &opt->ip6po_dest1;
2893			break;
2894		case IPV6_DSTOPTS:
2895			newdest = &opt->ip6po_dest2;
2896			break;
2897		}
2898
2899		/* turn off the previous option, then set the new option. */
2900		ip6_clearpktopts(opt, optname);
2901		*newdest = malloc(destlen, M_IP6OPT, M_NOWAIT);
2902		if (*newdest == NULL)
2903			return (ENOBUFS);
2904		bcopy(dest, *newdest, destlen);
2905
2906		break;
2907	}
2908
2909	case IPV6_2292RTHDR:
2910	case IPV6_RTHDR:
2911	{
2912		struct ip6_rthdr *rth;
2913		int rthlen;
2914
2915		if (len == 0) {
2916			ip6_clearpktopts(opt, IPV6_RTHDR);
2917			break;	/* just remove the option */
2918		}
2919
2920		/* message length validation */
2921		if (len < sizeof(struct ip6_rthdr))
2922			return (EINVAL);
2923		rth = (struct ip6_rthdr *)buf;
2924		rthlen = (rth->ip6r_len + 1) << 3;
2925		if (len != rthlen)
2926			return (EINVAL);
2927
2928		switch (rth->ip6r_type) {
2929		case IPV6_RTHDR_TYPE_0:
2930			if (rth->ip6r_len == 0)	/* must contain one addr */
2931				return (EINVAL);
2932			if (rth->ip6r_len % 2) /* length must be even */
2933				return (EINVAL);
2934			if (rth->ip6r_len / 2 != rth->ip6r_segleft)
2935				return (EINVAL);
2936			break;
2937		default:
2938			return (EINVAL);	/* not supported */
2939		}
2940
2941		/* turn off the previous option */
2942		ip6_clearpktopts(opt, IPV6_RTHDR);
2943		opt->ip6po_rthdr = malloc(rthlen, M_IP6OPT, M_NOWAIT);
2944		if (opt->ip6po_rthdr == NULL)
2945			return (ENOBUFS);
2946		bcopy(rth, opt->ip6po_rthdr, rthlen);
2947
2948		break;
2949	}
2950
2951	case IPV6_USE_MIN_MTU:
2952		if (len != sizeof(int))
2953			return (EINVAL);
2954		minmtupolicy = *(int *)buf;
2955		if (minmtupolicy != IP6PO_MINMTU_MCASTONLY &&
2956		    minmtupolicy != IP6PO_MINMTU_DISABLE &&
2957		    minmtupolicy != IP6PO_MINMTU_ALL) {
2958			return (EINVAL);
2959		}
2960		opt->ip6po_minmtu = minmtupolicy;
2961		break;
2962
2963	case IPV6_DONTFRAG:
2964		if (len != sizeof(int))
2965			return (EINVAL);
2966
2967		if (uproto == IPPROTO_TCP || *(int *)buf == 0) {
2968			/*
2969			 * we ignore this option for TCP sockets.
2970			 * (RFC3542 leaves this case unspecified.)
2971			 */
2972			opt->ip6po_flags &= ~IP6PO_DONTFRAG;
2973		} else
2974			opt->ip6po_flags |= IP6PO_DONTFRAG;
2975		break;
2976
2977	case IPV6_PREFER_TEMPADDR:
2978		if (len != sizeof(int))
2979			return (EINVAL);
2980		preftemp = *(int *)buf;
2981		if (preftemp != IP6PO_TEMPADDR_SYSTEM &&
2982		    preftemp != IP6PO_TEMPADDR_NOTPREFER &&
2983		    preftemp != IP6PO_TEMPADDR_PREFER) {
2984			return (EINVAL);
2985		}
2986		opt->ip6po_prefer_tempaddr = preftemp;
2987		break;
2988
2989	default:
2990		return (ENOPROTOOPT);
2991	} /* end of switch */
2992
2993	return (0);
2994}
2995
2996/*
2997 * Routine called from ip6_output() to loop back a copy of an IP6 multicast
2998 * packet to the input queue of a specified interface.  Note that this
2999 * calls the output routine of the loopback "driver", but with an interface
3000 * pointer that might NOT be &loif -- easier than replicating that code here.
3001 */
3002void
3003ip6_mloopback(struct ifnet *ifp, struct mbuf *m)
3004{
3005	struct mbuf *copym;
3006	struct ip6_hdr *ip6;
3007
3008	copym = m_copy(m, 0, M_COPYALL);
3009	if (copym == NULL)
3010		return;
3011
3012	/*
3013	 * Make sure to deep-copy IPv6 header portion in case the data
3014	 * is in an mbuf cluster, so that we can safely override the IPv6
3015	 * header portion later.
3016	 */
3017	if (!M_WRITABLE(copym) ||
3018	    copym->m_len < sizeof(struct ip6_hdr)) {
3019		copym = m_pullup(copym, sizeof(struct ip6_hdr));
3020		if (copym == NULL)
3021			return;
3022	}
3023	ip6 = mtod(copym, struct ip6_hdr *);
3024	/*
3025	 * clear embedded scope identifiers if necessary.
3026	 * in6_clearscope will touch the addresses only when necessary.
3027	 */
3028	in6_clearscope(&ip6->ip6_src);
3029	in6_clearscope(&ip6->ip6_dst);
3030	if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) {
3031		copym->m_pkthdr.csum_flags |= CSUM_DATA_VALID_IPV6 |
3032		    CSUM_PSEUDO_HDR;
3033		copym->m_pkthdr.csum_data = 0xffff;
3034	}
3035	if_simloop(ifp, copym, AF_INET6, 0);
3036}
3037
3038/*
3039 * Chop IPv6 header off from the payload.
3040 */
3041static int
3042ip6_splithdr(struct mbuf *m, struct ip6_exthdrs *exthdrs)
3043{
3044	struct mbuf *mh;
3045	struct ip6_hdr *ip6;
3046
3047	ip6 = mtod(m, struct ip6_hdr *);
3048	if (m->m_len > sizeof(*ip6)) {
3049		mh = m_gethdr(M_NOWAIT, MT_DATA);
3050		if (mh == NULL) {
3051			m_freem(m);
3052			return ENOBUFS;
3053		}
3054		m_move_pkthdr(mh, m);
3055		M_ALIGN(mh, sizeof(*ip6));
3056		m->m_len -= sizeof(*ip6);
3057		m->m_data += sizeof(*ip6);
3058		mh->m_next = m;
3059		m = mh;
3060		m->m_len = sizeof(*ip6);
3061		bcopy((caddr_t)ip6, mtod(m, caddr_t), sizeof(*ip6));
3062	}
3063	exthdrs->ip6e_ip6 = m;
3064	return 0;
3065}
3066
3067/*
3068 * Compute IPv6 extension header length.
3069 */
3070int
3071ip6_optlen(struct inpcb *in6p)
3072{
3073	int len;
3074
3075	if (!in6p->in6p_outputopts)
3076		return 0;
3077
3078	len = 0;
3079#define elen(x) \
3080    (((struct ip6_ext *)(x)) ? (((struct ip6_ext *)(x))->ip6e_len + 1) << 3 : 0)
3081
3082	len += elen(in6p->in6p_outputopts->ip6po_hbh);
3083	if (in6p->in6p_outputopts->ip6po_rthdr)
3084		/* dest1 is valid with rthdr only */
3085		len += elen(in6p->in6p_outputopts->ip6po_dest1);
3086	len += elen(in6p->in6p_outputopts->ip6po_rthdr);
3087	len += elen(in6p->in6p_outputopts->ip6po_dest2);
3088	return len;
3089#undef elen
3090}
3091