icmp.c revision 11042:2d6e217af1b4
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25/* Copyright (c) 1990 Mentat Inc. */
26
27#include <sys/types.h>
28#include <sys/stream.h>
29#include <sys/stropts.h>
30#include <sys/strlog.h>
31#include <sys/strsun.h>
32#define	_SUN_TPI_VERSION 2
33#include <sys/tihdr.h>
34#include <sys/timod.h>
35#include <sys/ddi.h>
36#include <sys/sunddi.h>
37#include <sys/strsubr.h>
38#include <sys/suntpi.h>
39#include <sys/xti_inet.h>
40#include <sys/cmn_err.h>
41#include <sys/kmem.h>
42#include <sys/cred_impl.h>
43#include <sys/policy.h>
44#include <sys/priv.h>
45#include <sys/ucred.h>
46#include <sys/zone.h>
47
48#include <sys/sockio.h>
49#include <sys/socket.h>
50#include <sys/socketvar.h>
51#include <sys/vtrace.h>
52#include <sys/sdt.h>
53#include <sys/debug.h>
54#include <sys/isa_defs.h>
55#include <sys/random.h>
56#include <netinet/in.h>
57#include <netinet/ip6.h>
58#include <netinet/icmp6.h>
59#include <netinet/udp.h>
60
61#include <inet/common.h>
62#include <inet/ip.h>
63#include <inet/ip_impl.h>
64#include <inet/ipsec_impl.h>
65#include <inet/ip6.h>
66#include <inet/ip_ire.h>
67#include <inet/ip_if.h>
68#include <inet/ip_multi.h>
69#include <inet/ip_ndp.h>
70#include <inet/proto_set.h>
71#include <inet/mib2.h>
72#include <inet/nd.h>
73#include <inet/optcom.h>
74#include <inet/snmpcom.h>
75#include <inet/kstatcom.h>
76#include <inet/ipclassifier.h>
77
78#include <sys/tsol/label.h>
79#include <sys/tsol/tnet.h>
80
81#include <inet/rawip_impl.h>
82
83#include <sys/disp.h>
84
85/*
86 * Synchronization notes:
87 *
88 * RAWIP is MT and uses the usual kernel synchronization primitives. We use
89 * conn_lock to protect the icmp_t.
90 *
91 * Plumbing notes:
92 * ICMP is always a device driver. For compatibility with mibopen() code
93 * it is possible to I_PUSH "icmp", but that results in pushing a passthrough
94 * dummy module.
95 */
96
97static void	icmp_addr_req(queue_t *q, mblk_t *mp);
98static void	icmp_tpi_bind(queue_t *q, mblk_t *mp);
99static void	icmp_bind_proto(icmp_t *icmp);
100static int	icmp_build_hdr_template(conn_t *, const in6_addr_t *,
101    const in6_addr_t *, uint32_t);
102static void	icmp_capability_req(queue_t *q, mblk_t *mp);
103static int	icmp_close(queue_t *q, int flags);
104static void	icmp_close_free(conn_t *);
105static void	icmp_tpi_connect(queue_t *q, mblk_t *mp);
106static void	icmp_tpi_disconnect(queue_t *q, mblk_t *mp);
107static void	icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error,
108    int sys_error);
109static void	icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive,
110    t_scalar_t tlierr, int sys_error);
111static void	icmp_icmp_input(void *arg1, mblk_t *mp, void *arg2,
112    ip_recv_attr_t *);
113static void	icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp,
114    ip_recv_attr_t *);
115static void	icmp_info_req(queue_t *q, mblk_t *mp);
116static void	icmp_input(void *, mblk_t *, void *, ip_recv_attr_t *);
117static conn_t 	*icmp_open(int family, cred_t *credp, int *err, int flags);
118static int	icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag,
119		    cred_t *credp);
120static int	icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag,
121		    cred_t *credp);
122static boolean_t icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name);
123int		icmp_opt_set(conn_t *connp, uint_t optset_context,
124		    int level, int name, uint_t inlen,
125		    uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
126		    void *thisdg_attrs, cred_t *cr);
127int		icmp_opt_get(conn_t *connp, int level, int name,
128		    uchar_t *ptr);
129static int	icmp_output_newdst(conn_t *connp, mblk_t *data_mp, sin_t *sin,
130		    sin6_t *sin6, cred_t *cr, pid_t pid, ip_xmit_attr_t *ixa);
131static int	icmp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr);
132static boolean_t icmp_param_register(IDP *ndp, icmpparam_t *icmppa, int cnt);
133static int	icmp_param_set(queue_t *q, mblk_t *mp, char *value,
134		    caddr_t cp, cred_t *cr);
135static mblk_t	*icmp_prepend_hdr(conn_t *, ip_xmit_attr_t *, const ip_pkt_t *,
136    const in6_addr_t *, const in6_addr_t *, uint32_t, mblk_t *, int *);
137static mblk_t	*icmp_prepend_header_template(conn_t *, ip_xmit_attr_t *,
138    mblk_t *, const in6_addr_t *, uint32_t, int *);
139static int	icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name,
140		    uchar_t *ptr, int len);
141static void	icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err);
142static void	icmp_tpi_unbind(queue_t *q, mblk_t *mp);
143static void	icmp_wput(queue_t *q, mblk_t *mp);
144static void	icmp_wput_fallback(queue_t *q, mblk_t *mp);
145static void	icmp_wput_other(queue_t *q, mblk_t *mp);
146static void	icmp_wput_iocdata(queue_t *q, mblk_t *mp);
147static void	icmp_wput_restricted(queue_t *q, mblk_t *mp);
148static void	icmp_ulp_recv(conn_t *, mblk_t *, uint_t);
149
150static void	*rawip_stack_init(netstackid_t stackid, netstack_t *ns);
151static void	rawip_stack_fini(netstackid_t stackid, void *arg);
152
153static void	*rawip_kstat_init(netstackid_t stackid);
154static void	rawip_kstat_fini(netstackid_t stackid, kstat_t *ksp);
155static int	rawip_kstat_update(kstat_t *kp, int rw);
156static void	rawip_stack_shutdown(netstackid_t stackid, void *arg);
157
158/* Common routines for TPI and socket module */
159static conn_t	*rawip_do_open(int, cred_t *, int *, int);
160static void	rawip_do_close(conn_t *);
161static int	rawip_do_bind(conn_t *, struct sockaddr *, socklen_t);
162static int	rawip_do_unbind(conn_t *);
163static int	rawip_do_connect(conn_t *, const struct sockaddr *, socklen_t,
164    cred_t *, pid_t);
165
166int		rawip_getsockname(sock_lower_handle_t, struct sockaddr *,
167		    socklen_t *, cred_t *);
168int		rawip_getpeername(sock_lower_handle_t, struct sockaddr *,
169		    socklen_t *, cred_t *);
170
/*
 * STREAMS module information shared by every ICMP qinit structure in this
 * file.  Following the struct module_info member order the initializers
 * are: module id (5707), module name ("icmp"), minimum packet size (1),
 * maximum packet size (INFPSZ), high water mark (512) and low water mark
 * (128).
 */
static struct module_info icmp_mod_info =  {
	5707, "icmp", 1, INFPSZ, 512, 128
};
174
/*
 * Entry points for ICMP as a device.
 * We have separate open functions for the /dev/icmp and /dev/icmp6 devices.
 *
 * Following the struct qinit member order (put, service, open, close,
 * admin, module_info), the two read-side tables provide only open and
 * close routines and differ solely in which open routine they name.
 */
static struct qinit icmprinitv4 = {
	NULL, NULL, icmp_openv4, icmp_close, NULL, &icmp_mod_info
};

static struct qinit icmprinitv6 = {
	NULL, NULL, icmp_openv6, icmp_close, NULL, &icmp_mod_info
};

/*
 * Write side, shared by both devices: icmp_wput is the put procedure and
 * ip_wsrv the service procedure.
 */
static struct qinit icmpwinit = {
	(pfi_t)icmp_wput, (pfi_t)ip_wsrv, NULL, NULL, NULL, &icmp_mod_info
};

/* ICMP entry point during fallback; only a put procedure is provided. */
static struct qinit icmp_fallback_sock_winit = {
	(pfi_t)icmp_wput_fallback, NULL, NULL, NULL, NULL, &icmp_mod_info
};

/* For AF_INET aka /dev/icmp: read side, then write side. */
struct streamtab icmpinfov4 = {
	&icmprinitv4, &icmpwinit
};

/* For AF_INET6 aka /dev/icmp6: read side, then write side. */
struct streamtab icmpinfov6 = {
	&icmprinitv6, &icmpwinit
};
205
206static sin_t	sin_null;	/* Zero address for quick clears */
207static sin6_t	sin6_null;	/* Zero address for quick clears */
208
/*
 * Default structure copied into T_INFO_ACK messages.
 * ADDR_size, OPT_size and CURRENT_state hold placeholder values here; per
 * the field comments below they are not final in this template.
 */
static struct T_info_ack icmp_g_t_info_ack = {
	T_INFO_ACK,
	IP_MAXPACKET,	 /* TSDU_size.  icmp allows maximum size messages. */
	T_INVALID,	/* ETSDU_size.  icmp does not support expedited data. */
	T_INVALID,	/* CDATA_size. icmp does not support connect data. */
	T_INVALID,	/* DDATA_size. icmp does not support disconnect data. */
	0,		/* ADDR_size - filled in later. */
	0,		/* OPT_size - not initialized here */
	IP_MAXPACKET,	/* TIDU_size.  icmp allows maximum size messages. */
	T_CLTS,		/* SERV_type.  icmp supports connection-less. */
	TS_UNBND,	/* CURRENT_state.  This is set from icmp_state. */
	(XPG4_1|SENDZERO) /* PROVIDER_flag */
};
223
/*
 * Table of ND variables supported by icmp.  These are loaded into is_nd
 * when the stack instance is created.
 * All of these are alterable, within the min/max values given, at run time.
 *
 * Each row is { minimum, maximum, default value, ND name }.
 */
static icmpparam_t	icmp_param_arr[] = {
	/* min	max	value	name */
	{ 0,	128,	32,	"icmp_wroff_extra" },
	{ 1,	255,	255,	"icmp_ipv4_ttl" },
	{ 0, IPV6_MAX_HOPS, IPV6_DEFAULT_HOPS,	"icmp_ipv6_hoplimit"},
	{ 0,	1,	1,	"icmp_bsd_compat" },
	{ 4096,	65536,	8192,	"icmp_xmit_hiwat"},
	{ 0,	65536,	1024,	"icmp_xmit_lowat"},
	{ 4096,	65536,	8192,	"icmp_recv_hiwat"},
	{ 65536, 1024*1024*1024, 256*1024,	"icmp_max_buf"},
	{ 0,	1,	0,	"icmp_pmtu_discovery" },
	{ 0,	1,	0,	"icmp_sendto_ignerr" },
};
/*
 * Accessors for the current value of each tunable.  Every macro selects a
 * slot of is_param_arr by position, and the positions follow the row order
 * of icmp_param_arr above, so the two lists must be kept in step when a
 * row is added, removed or moved.
 * NOTE(review): is_param_arr is presumably the per-stack copy of the table
 * above; its declaration is not visible here - confirm.
 */
#define	is_wroff_extra			is_param_arr[0].icmp_param_value
#define	is_ipv4_ttl			is_param_arr[1].icmp_param_value
#define	is_ipv6_hoplimit		is_param_arr[2].icmp_param_value
#define	is_bsd_compat			is_param_arr[3].icmp_param_value
#define	is_xmit_hiwat			is_param_arr[4].icmp_param_value
#define	is_xmit_lowat			is_param_arr[5].icmp_param_value
#define	is_recv_hiwat			is_param_arr[6].icmp_param_value
#define	is_max_buf			is_param_arr[7].icmp_param_value
#define	is_pmtu_discovery		is_param_arr[8].icmp_param_value
#define	is_sendto_ignerr		is_param_arr[9].icmp_param_value
252
253typedef union T_primitives *t_primp_t;
254
255/*
256 * This routine is called to handle each O_T_BIND_REQ/T_BIND_REQ message
257 * passed to icmp_wput.
258 * It calls IP to verify the local IP address, and calls IP to insert
259 * the conn_t in the fanout table.
260 * If everything is ok it then sends the T_BIND_ACK back up.
261 */
262static void
263icmp_tpi_bind(queue_t *q, mblk_t *mp)
264{
265	int	error;
266	struct sockaddr *sa;
267	struct T_bind_req *tbr;
268	socklen_t	len;
269	sin_t	*sin;
270	sin6_t	*sin6;
271	icmp_t		*icmp;
272	conn_t	*connp = Q_TO_CONN(q);
273	mblk_t *mp1;
274	cred_t *cr;
275
276	/*
277	 * All Solaris components should pass a db_credp
278	 * for this TPI message, hence we ASSERT.
279	 * But in case there is some other M_PROTO that looks
280	 * like a TPI message sent by some other kernel
281	 * component, we check and return an error.
282	 */
283	cr = msg_getcred(mp, NULL);
284	ASSERT(cr != NULL);
285	if (cr == NULL) {
286		icmp_err_ack(q, mp, TSYSERR, EINVAL);
287		return;
288	}
289
290	icmp = connp->conn_icmp;
291	if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) {
292		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
293		    "icmp_bind: bad req, len %u",
294		    (uint_t)(mp->b_wptr - mp->b_rptr));
295		icmp_err_ack(q, mp, TPROTO, 0);
296		return;
297	}
298
299	if (icmp->icmp_state != TS_UNBND) {
300		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
301		    "icmp_bind: bad state, %u", icmp->icmp_state);
302		icmp_err_ack(q, mp, TOUTSTATE, 0);
303		return;
304	}
305
306	/*
307	 * Reallocate the message to make sure we have enough room for an
308	 * address.
309	 */
310	mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t), 1);
311	if (mp1 == NULL) {
312		icmp_err_ack(q, mp, TSYSERR, ENOMEM);
313		return;
314	}
315	mp = mp1;
316
317	/* Reset the message type in preparation for shipping it back. */
318	DB_TYPE(mp) = M_PCPROTO;
319	tbr = (struct T_bind_req *)mp->b_rptr;
320	len = tbr->ADDR_length;
321	switch (len) {
322	case 0:	/* request for a generic port */
323		tbr->ADDR_offset = sizeof (struct T_bind_req);
324		if (connp->conn_family == AF_INET) {
325			tbr->ADDR_length = sizeof (sin_t);
326			sin = (sin_t *)&tbr[1];
327			*sin = sin_null;
328			sin->sin_family = AF_INET;
329			mp->b_wptr = (uchar_t *)&sin[1];
330			sa = (struct sockaddr *)sin;
331			len = sizeof (sin_t);
332		} else {
333			ASSERT(connp->conn_family == AF_INET6);
334			tbr->ADDR_length = sizeof (sin6_t);
335			sin6 = (sin6_t *)&tbr[1];
336			*sin6 = sin6_null;
337			sin6->sin6_family = AF_INET6;
338			mp->b_wptr = (uchar_t *)&sin6[1];
339			sa = (struct sockaddr *)sin6;
340			len = sizeof (sin6_t);
341		}
342		break;
343
344	case sizeof (sin_t):	/* Complete IPv4 address */
345		sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset,
346		    sizeof (sin_t));
347		break;
348
349	case sizeof (sin6_t):	/* Complete IPv6 address */
350		sa = (struct sockaddr *)mi_offset_param(mp,
351		    tbr->ADDR_offset, sizeof (sin6_t));
352		break;
353
354	default:
355		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
356		    "icmp_bind: bad ADDR_length %u", tbr->ADDR_length);
357		icmp_err_ack(q, mp, TBADADDR, 0);
358		return;
359	}
360
361	error = rawip_do_bind(connp, sa, len);
362	if (error != 0) {
363		if (error > 0) {
364			icmp_err_ack(q, mp, TSYSERR, error);
365		} else {
366			icmp_err_ack(q, mp, -error, 0);
367		}
368	} else {
369		tbr->PRIM_type = T_BIND_ACK;
370		qreply(q, mp);
371	}
372}
373
374static int
375rawip_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len)
376{
377	sin_t		*sin;
378	sin6_t		*sin6;
379	icmp_t		*icmp = connp->conn_icmp;
380	int		error = 0;
381	ip_laddr_t	laddr_type = IPVL_UNICAST_UP;	/* INADDR_ANY */
382	in_port_t	lport;		/* Network byte order */
383	ipaddr_t	v4src;		/* Set if AF_INET */
384	in6_addr_t	v6src;
385	uint_t		scopeid = 0;
386	zoneid_t	zoneid = IPCL_ZONEID(connp);
387	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
388
389	if (sa == NULL || !OK_32PTR((char *)sa)) {
390		return (EINVAL);
391	}
392
393	switch (len) {
394	case sizeof (sin_t):    /* Complete IPv4 address */
395		sin = (sin_t *)sa;
396		if (sin->sin_family != AF_INET ||
397		    connp->conn_family != AF_INET) {
398			/* TSYSERR, EAFNOSUPPORT */
399			return (EAFNOSUPPORT);
400		}
401		v4src = sin->sin_addr.s_addr;
402		IN6_IPADDR_TO_V4MAPPED(v4src, &v6src);
403		if (v4src != INADDR_ANY) {
404			laddr_type = ip_laddr_verify_v4(v4src, zoneid, ipst,
405			    B_TRUE);
406		}
407		lport = sin->sin_port;
408		break;
409	case sizeof (sin6_t): /* Complete IPv6 address */
410		sin6 = (sin6_t *)sa;
411		if (sin6->sin6_family != AF_INET6 ||
412		    connp->conn_family != AF_INET6) {
413			/* TSYSERR, EAFNOSUPPORT */
414			return (EAFNOSUPPORT);
415		}
416		/* No support for mapped addresses on raw sockets */
417		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
418			/* TSYSERR, EADDRNOTAVAIL */
419			return (EADDRNOTAVAIL);
420		}
421		v6src = sin6->sin6_addr;
422		if (!IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
423			if (IN6_IS_ADDR_LINKSCOPE(&v6src))
424				scopeid = sin6->sin6_scope_id;
425			laddr_type = ip_laddr_verify_v6(&v6src, zoneid, ipst,
426			    B_TRUE, scopeid);
427		}
428		lport = sin6->sin6_port;
429		break;
430
431	default:
432		/* TBADADDR */
433		return (EADDRNOTAVAIL);
434	}
435
436	/* Is the local address a valid unicast, multicast, or broadcast? */
437	if (laddr_type == IPVL_BAD)
438		return (EADDRNOTAVAIL);
439
440	/*
441	 * The state must be TS_UNBND.
442	 */
443	mutex_enter(&connp->conn_lock);
444	if (icmp->icmp_state != TS_UNBND) {
445		mutex_exit(&connp->conn_lock);
446		return (-TOUTSTATE);
447	}
448
449	/*
450	 * Copy the source address into our icmp structure.  This address
451	 * may still be zero; if so, ip will fill in the correct address
452	 * each time an outbound packet is passed to it.
453	 * If we are binding to a broadcast or multicast address then
454	 * we just set the conn_bound_addr since we don't want to use
455	 * that as the source address when sending.
456	 */
457	connp->conn_bound_addr_v6 = v6src;
458	connp->conn_laddr_v6 = v6src;
459	if (scopeid != 0) {
460		connp->conn_ixa->ixa_flags |= IXAF_SCOPEID_SET;
461		connp->conn_ixa->ixa_scopeid = scopeid;
462		connp->conn_incoming_ifindex = scopeid;
463	} else {
464		connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
465		connp->conn_incoming_ifindex = connp->conn_bound_if;
466	}
467
468	switch (laddr_type) {
469	case IPVL_UNICAST_UP:
470	case IPVL_UNICAST_DOWN:
471		connp->conn_saddr_v6 = v6src;
472		connp->conn_mcbc_bind = B_FALSE;
473		break;
474	case IPVL_MCAST:
475	case IPVL_BCAST:
476		/* ip_set_destination will pick a source address later */
477		connp->conn_saddr_v6 = ipv6_all_zeros;
478		connp->conn_mcbc_bind = B_TRUE;
479		break;
480	}
481
482	/* Any errors after this point should use late_error */
483
484	/*
485	 * Use sin_port/sin6_port since applications like psh use SOCK_RAW
486	 * with IPPROTO_TCP.
487	 */
488	connp->conn_lport = lport;
489	connp->conn_fport = 0;
490
491	if (connp->conn_family == AF_INET) {
492		ASSERT(connp->conn_ipversion == IPV4_VERSION);
493	} else {
494		ASSERT(connp->conn_ipversion == IPV6_VERSION);
495	}
496
497	icmp->icmp_state = TS_IDLE;
498
499	/*
500	 * We create an initial header template here to make a subsequent
501	 * sendto have a starting point. Since conn_last_dst is zero the
502	 * first sendto will always follow the 'dst changed' code path.
503	 * Note that we defer massaging options and the related checksum
504	 * adjustment until we have a destination address.
505	 */
506	error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
507	    &connp->conn_faddr_v6, connp->conn_flowinfo);
508	if (error != 0) {
509		mutex_exit(&connp->conn_lock);
510		goto late_error;
511	}
512	/* Just in case */
513	connp->conn_faddr_v6 = ipv6_all_zeros;
514	connp->conn_v6lastdst = ipv6_all_zeros;
515	mutex_exit(&connp->conn_lock);
516
517	error = ip_laddr_fanout_insert(connp);
518	if (error != 0)
519		goto late_error;
520
521	/* Bind succeeded */
522	return (0);
523
524late_error:
525	mutex_enter(&connp->conn_lock);
526	connp->conn_saddr_v6 = ipv6_all_zeros;
527	connp->conn_bound_addr_v6 = ipv6_all_zeros;
528	connp->conn_laddr_v6 = ipv6_all_zeros;
529	if (scopeid != 0) {
530		connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
531		connp->conn_incoming_ifindex = connp->conn_bound_if;
532	}
533	icmp->icmp_state = TS_UNBND;
534	connp->conn_v6lastdst = ipv6_all_zeros;
535	connp->conn_lport = 0;
536
537	/* Restore the header that was built above - different source address */
538	(void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
539	    &connp->conn_faddr_v6, connp->conn_flowinfo);
540	mutex_exit(&connp->conn_lock);
541	return (error);
542}
543
544/*
545 * Tell IP to just bind to the protocol.
546 */
547static void
548icmp_bind_proto(icmp_t *icmp)
549{
550	conn_t	*connp = icmp->icmp_connp;
551
552	mutex_enter(&connp->conn_lock);
553	connp->conn_saddr_v6 = ipv6_all_zeros;
554	connp->conn_laddr_v6 = ipv6_all_zeros;
555	connp->conn_faddr_v6 = ipv6_all_zeros;
556	connp->conn_v6lastdst = ipv6_all_zeros;
557	mutex_exit(&connp->conn_lock);
558
559	(void) ip_laddr_fanout_insert(connp);
560}
561
562/*
563 * This routine handles each T_CONN_REQ message passed to icmp.  It
564 * associates a default destination address with the stream.
565 *
566 * After various error checks are completed, icmp_connect() lays
567 * the target address and port into the composite header template.
568 * Then we ask IP for information, including a source address if we didn't
569 * already have one. Finally we send up the T_OK_ACK reply message.
570 */
571static void
572icmp_tpi_connect(queue_t *q, mblk_t *mp)
573{
574	conn_t	*connp = Q_TO_CONN(q);
575	struct T_conn_req	*tcr;
576	struct sockaddr *sa;
577	socklen_t len;
578	int error;
579	cred_t *cr;
580	pid_t pid;
581	/*
582	 * All Solaris components should pass a db_credp
583	 * for this TPI message, hence we ASSERT.
584	 * But in case there is some other M_PROTO that looks
585	 * like a TPI message sent by some other kernel
586	 * component, we check and return an error.
587	 */
588	cr = msg_getcred(mp, &pid);
589	ASSERT(cr != NULL);
590	if (cr == NULL) {
591		icmp_err_ack(q, mp, TSYSERR, EINVAL);
592		return;
593	}
594
595	tcr = (struct T_conn_req *)mp->b_rptr;
596	/* Sanity checks */
597	if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_conn_req)) {
598		icmp_err_ack(q, mp, TPROTO, 0);
599		return;
600	}
601
602	if (tcr->OPT_length != 0) {
603		icmp_err_ack(q, mp, TBADOPT, 0);
604		return;
605	}
606
607	len = tcr->DEST_length;
608
609	switch (len) {
610	default:
611		icmp_err_ack(q, mp, TBADADDR, 0);
612		return;
613	case sizeof (sin_t):
614		sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset,
615		    sizeof (sin_t));
616		break;
617	case sizeof (sin6_t):
618		sa = (struct sockaddr *)mi_offset_param(mp,
619		    tcr->DEST_offset, sizeof (sin6_t));
620		break;
621	}
622
623	error = proto_verify_ip_addr(connp->conn_family, sa, len);
624	if (error != 0) {
625		icmp_err_ack(q, mp, TSYSERR, error);
626		return;
627	}
628
629	error = rawip_do_connect(connp, sa, len, cr, pid);
630	if (error != 0) {
631		if (error < 0) {
632			icmp_err_ack(q, mp, -error, 0);
633		} else {
634			icmp_err_ack(q, mp, 0, error);
635		}
636	} else {
637		mblk_t *mp1;
638
639		/*
640		 * We have to send a connection confirmation to
641		 * keep TLI happy.
642		 */
643		if (connp->conn_family == AF_INET) {
644			mp1 = mi_tpi_conn_con(NULL, (char *)sa,
645			    sizeof (sin_t), NULL, 0);
646		} else {
647			ASSERT(connp->conn_family == AF_INET6);
648			mp1 = mi_tpi_conn_con(NULL, (char *)sa,
649			    sizeof (sin6_t), NULL, 0);
650		}
651		if (mp1 == NULL) {
652			icmp_err_ack(q, mp, TSYSERR, ENOMEM);
653			return;
654		}
655
656		/*
657		 * Send ok_ack for T_CONN_REQ
658		 */
659		mp = mi_tpi_ok_ack_alloc(mp);
660		if (mp == NULL) {
661			/* Unable to reuse the T_CONN_REQ for the ack. */
662			icmp_err_ack_prim(q, mp1, T_CONN_REQ, TSYSERR, ENOMEM);
663			return;
664		}
665		putnext(connp->conn_rq, mp);
666		putnext(connp->conn_rq, mp1);
667	}
668}
669
670static int
671rawip_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len,
672    cred_t *cr, pid_t pid)
673{
674	icmp_t		*icmp;
675	sin_t		*sin;
676	sin6_t		*sin6;
677	int		error;
678	uint16_t 	dstport;
679	ipaddr_t	v4dst;
680	in6_addr_t	v6dst;
681	uint32_t	flowinfo;
682	ip_xmit_attr_t	*ixa;
683	uint_t		scopeid = 0;
684	uint_t		srcid = 0;
685	in6_addr_t	v6src = connp->conn_saddr_v6;
686
687	icmp = connp->conn_icmp;
688
689	if (sa == NULL || !OK_32PTR((char *)sa)) {
690		return (EINVAL);
691	}
692
693	ASSERT(sa != NULL && len != 0);
694
695	/*
696	 * Determine packet type based on type of address passed in
697	 * the request should contain an IPv4 or IPv6 address.
698	 * Make sure that address family matches the type of
699	 * family of the address passed down.
700	 */
701	switch (len) {
702	case sizeof (sin_t):
703		sin = (sin_t *)sa;
704
705		v4dst = sin->sin_addr.s_addr;
706		dstport = sin->sin_port;
707		IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst);
708		ASSERT(connp->conn_ipversion == IPV4_VERSION);
709		break;
710
711	case sizeof (sin6_t):
712		sin6 = (sin6_t *)sa;
713
714		/* No support for mapped addresses on raw sockets */
715		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
716			return (EADDRNOTAVAIL);
717		}
718		v6dst = sin6->sin6_addr;
719		dstport = sin6->sin6_port;
720		ASSERT(connp->conn_ipversion == IPV6_VERSION);
721		flowinfo = sin6->sin6_flowinfo;
722		if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr))
723			scopeid = sin6->sin6_scope_id;
724		srcid = sin6->__sin6_src_id;
725		if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
726			ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp),
727			    connp->conn_netstack);
728		}
729		break;
730	}
731
732	/*
733	 * If there is a different thread using conn_ixa then we get a new
734	 * copy and cut the old one loose from conn_ixa. Otherwise we use
735	 * conn_ixa and prevent any other thread from using/changing it.
736	 * Once connect() is done other threads can use conn_ixa since the
737	 * refcnt will be back at one.
738	 */
739	ixa = conn_get_ixa(connp, B_TRUE);
740	if (ixa == NULL)
741		return (ENOMEM);
742
743	ASSERT(ixa->ixa_refcnt >= 2);
744	ASSERT(ixa == connp->conn_ixa);
745
746	mutex_enter(&connp->conn_lock);
747	/*
748	 * This icmp_t must have bound already before doing a connect.
749	 * Reject if a connect is in progress (we drop conn_lock during
750	 * rawip_do_connect).
751	 */
752	if (icmp->icmp_state == TS_UNBND || icmp->icmp_state == TS_WCON_CREQ) {
753		mutex_exit(&connp->conn_lock);
754		ixa_refrele(ixa);
755		return (-TOUTSTATE);
756	}
757
758	if (icmp->icmp_state == TS_DATA_XFER) {
759		/* Already connected - clear out state */
760		if (connp->conn_mcbc_bind)
761			connp->conn_saddr_v6 = ipv6_all_zeros;
762		else
763			connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
764		connp->conn_laddr_v6 = connp->conn_bound_addr_v6;
765		connp->conn_faddr_v6 = ipv6_all_zeros;
766		icmp->icmp_state = TS_IDLE;
767	}
768
769	/*
770	 * Use sin_port/sin6_port since applications like psh use SOCK_RAW
771	 * with IPPROTO_TCP.
772	 */
773	connp->conn_fport = dstport;
774	if (connp->conn_ipversion == IPV4_VERSION) {
775		/*
776		 * Interpret a zero destination to mean loopback.
777		 * Update the T_CONN_REQ (sin/sin6) since it is used to
778		 * generate the T_CONN_CON.
779		 */
780		if (v4dst == INADDR_ANY) {
781			v4dst = htonl(INADDR_LOOPBACK);
782			IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst);
783			ASSERT(connp->conn_family == AF_INET);
784			sin->sin_addr.s_addr = v4dst;
785		}
786		connp->conn_faddr_v6 = v6dst;
787		connp->conn_flowinfo = 0;
788	} else {
789		ASSERT(connp->conn_ipversion == IPV6_VERSION);
790		/*
791		 * Interpret a zero destination to mean loopback.
792		 * Update the T_CONN_REQ (sin/sin6) since it is used to
793		 * generate the T_CONN_CON.
794		 */
795		if (IN6_IS_ADDR_UNSPECIFIED(&v6dst)) {
796			v6dst = ipv6_loopback;
797			sin6->sin6_addr = v6dst;
798		}
799		connp->conn_faddr_v6 = v6dst;
800		connp->conn_flowinfo = flowinfo;
801	}
802
803	ixa->ixa_cred = cr;
804	ixa->ixa_cpid = pid;
805	if (is_system_labeled()) {
806		/* We need to restart with a label based on the cred */
807		ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred);
808	}
809
810	if (scopeid != 0) {
811		ixa->ixa_flags |= IXAF_SCOPEID_SET;
812		ixa->ixa_scopeid = scopeid;
813		connp->conn_incoming_ifindex = scopeid;
814	} else {
815		ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
816		connp->conn_incoming_ifindex = connp->conn_bound_if;
817	}
818
819	/*
820	 * conn_connect will drop conn_lock and reacquire it.
821	 * To prevent a send* from messing with this icmp_t while the lock
822	 * is dropped we set icmp_state and clear conn_v6lastdst.
823	 * That will make all send* fail with EISCONN.
824	 */
825	connp->conn_v6lastdst = ipv6_all_zeros;
826	icmp->icmp_state = TS_WCON_CREQ;
827
828	error = conn_connect(connp, NULL, IPDF_ALLOW_MCBC);
829	mutex_exit(&connp->conn_lock);
830	if (error != 0)
831		goto connect_failed;
832
833	/*
834	 * The addresses have been verified. Time to insert in
835	 * the correct fanout list.
836	 */
837	error = ipcl_conn_insert(connp);
838	if (error != 0)
839		goto connect_failed;
840
841	mutex_enter(&connp->conn_lock);
842	error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
843	    &connp->conn_faddr_v6, connp->conn_flowinfo);
844	if (error != 0) {
845		mutex_exit(&connp->conn_lock);
846		goto connect_failed;
847	}
848
849	icmp->icmp_state = TS_DATA_XFER;
850	/* Record this as the "last" send even though we haven't sent any */
851	connp->conn_v6lastdst = connp->conn_faddr_v6;
852	connp->conn_lastipversion = connp->conn_ipversion;
853	connp->conn_lastdstport = connp->conn_fport;
854	connp->conn_lastflowinfo = connp->conn_flowinfo;
855	connp->conn_lastscopeid = scopeid;
856	connp->conn_lastsrcid = srcid;
857	/* Also remember a source to use together with lastdst */
858	connp->conn_v6lastsrc = v6src;
859	mutex_exit(&connp->conn_lock);
860
861	ixa_refrele(ixa);
862	return (0);
863
864connect_failed:
865	if (ixa != NULL)
866		ixa_refrele(ixa);
867	mutex_enter(&connp->conn_lock);
868	icmp->icmp_state = TS_IDLE;
869	/* In case the source address was set above */
870	if (connp->conn_mcbc_bind)
871		connp->conn_saddr_v6 = ipv6_all_zeros;
872	else
873		connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
874	connp->conn_laddr_v6 = connp->conn_bound_addr_v6;
875	connp->conn_faddr_v6 = ipv6_all_zeros;
876	connp->conn_v6lastdst = ipv6_all_zeros;
877	connp->conn_flowinfo = 0;
878
879	(void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
880	    &connp->conn_faddr_v6, connp->conn_flowinfo);
881	mutex_exit(&connp->conn_lock);
882	return (error);
883}
884
885static void
886rawip_do_close(conn_t *connp)
887{
888	ASSERT(connp != NULL && IPCL_IS_RAWIP(connp));
889
890	ip_quiesce_conn(connp);
891
892	if (!IPCL_IS_NONSTR(connp)) {
893		qprocsoff(connp->conn_rq);
894	}
895
896	icmp_close_free(connp);
897
898	/*
899	 * Now we are truly single threaded on this stream, and can
900	 * delete the things hanging off the connp, and finally the connp.
901	 * We removed this connp from the fanout list, it cannot be
902	 * accessed thru the fanouts, and we already waited for the
903	 * conn_ref to drop to 0. We are already in close, so
904	 * there cannot be any other thread from the top. qprocsoff
905	 * has completed, and service has completed or won't run in
906	 * future.
907	 */
908	ASSERT(connp->conn_ref == 1);
909
910	if (!IPCL_IS_NONSTR(connp)) {
911		inet_minor_free(connp->conn_minor_arena, connp->conn_dev);
912	} else {
913		ip_free_helper_stream(connp);
914	}
915
916	connp->conn_ref--;
917	ipcl_conn_destroy(connp);
918}
919
920static int
921icmp_close(queue_t *q, int flags)
922{
923	conn_t  *connp;
924
925	if (flags & SO_FALLBACK) {
926		/*
927		 * stream is being closed while in fallback
928		 * simply free the resources that were allocated
929		 */
930		inet_minor_free(WR(q)->q_ptr, (dev_t)(RD(q)->q_ptr));
931		qprocsoff(q);
932		goto done;
933	}
934
935	connp = Q_TO_CONN(q);
936	(void) rawip_do_close(connp);
937done:
938	q->q_ptr = WR(q)->q_ptr = NULL;
939	return (0);
940}
941
942static void
943icmp_close_free(conn_t *connp)
944{
945	icmp_t *icmp = connp->conn_icmp;
946
947	if (icmp->icmp_filter != NULL) {
948		kmem_free(icmp->icmp_filter, sizeof (icmp6_filter_t));
949		icmp->icmp_filter = NULL;
950	}
951
952	/*
953	 * Clear any fields which the kmem_cache constructor clears.
954	 * Only icmp_connp needs to be preserved.
955	 * TBD: We should make this more efficient to avoid clearing
956	 * everything.
957	 */
958	ASSERT(icmp->icmp_connp == connp);
959	bzero(icmp, sizeof (icmp_t));
960	icmp->icmp_connp = connp;
961}
962
963/*
964 * This routine handles each T_DISCON_REQ message passed to icmp
965 * as an indicating that ICMP is no longer connected. This results
966 * in telling IP to restore the binding to just the local address.
967 */
968static int
969icmp_do_disconnect(conn_t *connp)
970{
971	icmp_t	*icmp = connp->conn_icmp;
972	int	error;
973
974	mutex_enter(&connp->conn_lock);
975	if (icmp->icmp_state != TS_DATA_XFER) {
976		mutex_exit(&connp->conn_lock);
977		return (-TOUTSTATE);
978	}
979	if (connp->conn_mcbc_bind)
980		connp->conn_saddr_v6 = ipv6_all_zeros;
981	else
982		connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
983	connp->conn_laddr_v6 = connp->conn_bound_addr_v6;
984	connp->conn_faddr_v6 = ipv6_all_zeros;
985	icmp->icmp_state = TS_IDLE;
986
987	connp->conn_v6lastdst = ipv6_all_zeros;
988	error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
989	    &connp->conn_faddr_v6, connp->conn_flowinfo);
990	mutex_exit(&connp->conn_lock);
991	if (error != 0)
992		return (error);
993
994	/*
995	 * Tell IP to remove the full binding and revert
996	 * to the local address binding.
997	 */
998	return (ip_laddr_fanout_insert(connp));
999}
1000
1001static void
1002icmp_tpi_disconnect(queue_t *q, mblk_t *mp)
1003{
1004	conn_t	*connp = Q_TO_CONN(q);
1005	int	error;
1006
1007	/*
1008	 * Allocate the largest primitive we need to send back
1009	 * T_error_ack is > than T_ok_ack
1010	 */
1011	mp = reallocb(mp, sizeof (struct T_error_ack), 1);
1012	if (mp == NULL) {
1013		/* Unable to reuse the T_DISCON_REQ for the ack. */
1014		icmp_err_ack_prim(q, mp, T_DISCON_REQ, TSYSERR, ENOMEM);
1015		return;
1016	}
1017
1018	error = icmp_do_disconnect(connp);
1019
1020	if (error != 0) {
1021		if (error > 0) {
1022			icmp_err_ack(q, mp, 0, error);
1023		} else {
1024			icmp_err_ack(q, mp, -error, 0);
1025		}
1026	} else {
1027		mp = mi_tpi_ok_ack_alloc(mp);
1028		ASSERT(mp != NULL);
1029		qreply(q, mp);
1030	}
1031}
1032
1033static int
1034icmp_disconnect(conn_t *connp)
1035{
1036	int	error;
1037
1038	connp->conn_dgram_errind = B_FALSE;
1039
1040	error = icmp_do_disconnect(connp);
1041
1042	if (error < 0)
1043		error = proto_tlitosyserr(-error);
1044	return (error);
1045}
1046
1047/* This routine creates a T_ERROR_ACK message and passes it upstream. */
1048static void
1049icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error)
1050{
1051	if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL)
1052		qreply(q, mp);
1053}
1054
1055/* Shorthand to generate and send TPI error acks to our client */
1056static void
1057icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive,
1058    t_scalar_t t_error, int sys_error)
1059{
1060	struct T_error_ack	*teackp;
1061
1062	if ((mp = tpi_ack_alloc(mp, sizeof (struct T_error_ack),
1063	    M_PCPROTO, T_ERROR_ACK)) != NULL) {
1064		teackp = (struct T_error_ack *)mp->b_rptr;
1065		teackp->ERROR_prim = primitive;
1066		teackp->TLI_error = t_error;
1067		teackp->UNIX_error = sys_error;
1068		qreply(q, mp);
1069	}
1070}
1071
1072/*
1073 * icmp_icmp_input is called as conn_recvicmp to process ICMP messages.
1074 * Generates the appropriate T_UDERROR_IND for permanent (non-transient) errors.
1075 * Assumes that IP has pulled up everything up to and including the ICMP header.
1076 */
1077/* ARGSUSED2 */
1078static void
1079icmp_icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
1080{
1081	conn_t		*connp = (conn_t *)arg1;
1082	icmp_t		*icmp = connp->conn_icmp;
1083	icmph_t		*icmph;
1084	ipha_t		*ipha;
1085	int		iph_hdr_length;
1086	sin_t		sin;
1087	mblk_t		*mp1;
1088	int		error = 0;
1089
1090	ipha = (ipha_t *)mp->b_rptr;
1091
1092	ASSERT(OK_32PTR(mp->b_rptr));
1093
1094	if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) {
1095		ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION);
1096		icmp_icmp_error_ipv6(connp, mp, ira);
1097		return;
1098	}
1099	ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION);
1100
1101	/* Skip past the outer IP and ICMP headers */
1102	ASSERT(IPH_HDR_LENGTH(ipha) == ira->ira_ip_hdr_length);
1103	iph_hdr_length = ira->ira_ip_hdr_length;
1104	icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
1105	ipha = (ipha_t *)&icmph[1];	/* Inner IP header */
1106
1107	iph_hdr_length = IPH_HDR_LENGTH(ipha);
1108
1109	switch (icmph->icmph_type) {
1110	case ICMP_DEST_UNREACHABLE:
1111		switch (icmph->icmph_code) {
1112		case ICMP_FRAGMENTATION_NEEDED: {
1113			ipha_t		*ipha;
1114			ip_xmit_attr_t	*ixa;
1115			/*
1116			 * IP has already adjusted the path MTU.
1117			 * But we need to adjust DF for IPv4.
1118			 */
1119			if (connp->conn_ipversion != IPV4_VERSION)
1120				break;
1121
1122			ixa = conn_get_ixa(connp, B_FALSE);
1123			if (ixa == NULL || ixa->ixa_ire == NULL) {
1124				/*
1125				 * Some other thread holds conn_ixa. We will
1126				 * redo this on the next ICMP too big.
1127				 */
1128				if (ixa != NULL)
1129					ixa_refrele(ixa);
1130				break;
1131			}
1132			(void) ip_get_pmtu(ixa);
1133
1134			mutex_enter(&connp->conn_lock);
1135			ipha = (ipha_t *)connp->conn_ht_iphc;
1136			if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) {
1137				ipha->ipha_fragment_offset_and_flags |=
1138				    IPH_DF_HTONS;
1139			} else {
1140				ipha->ipha_fragment_offset_and_flags &=
1141				    ~IPH_DF_HTONS;
1142			}
1143			mutex_exit(&connp->conn_lock);
1144			ixa_refrele(ixa);
1145			break;
1146		}
1147		case ICMP_PORT_UNREACHABLE:
1148		case ICMP_PROTOCOL_UNREACHABLE:
1149			error = ECONNREFUSED;
1150			break;
1151		default:
1152			/* Transient errors */
1153			break;
1154		}
1155		break;
1156	default:
1157		/* Transient errors */
1158		break;
1159	}
1160	if (error == 0) {
1161		freemsg(mp);
1162		return;
1163	}
1164
1165	/*
1166	 * Deliver T_UDERROR_IND when the application has asked for it.
1167	 * The socket layer enables this automatically when connected.
1168	 */
1169	if (!connp->conn_dgram_errind) {
1170		freemsg(mp);
1171		return;
1172	}
1173
1174	sin = sin_null;
1175	sin.sin_family = AF_INET;
1176	sin.sin_addr.s_addr = ipha->ipha_dst;
1177
1178	if (IPCL_IS_NONSTR(connp)) {
1179		mutex_enter(&connp->conn_lock);
1180		if (icmp->icmp_state == TS_DATA_XFER) {
1181			if (sin.sin_addr.s_addr == connp->conn_faddr_v4) {
1182				mutex_exit(&connp->conn_lock);
1183				(*connp->conn_upcalls->su_set_error)
1184				    (connp->conn_upper_handle, error);
1185				goto done;
1186			}
1187		} else {
1188			icmp->icmp_delayed_error = error;
1189			*((sin_t *)&icmp->icmp_delayed_addr) = sin;
1190		}
1191		mutex_exit(&connp->conn_lock);
1192	} else {
1193		mp1 = mi_tpi_uderror_ind((char *)&sin, sizeof (sin_t), NULL, 0,
1194		    error);
1195		if (mp1 != NULL)
1196			putnext(connp->conn_rq, mp1);
1197	}
1198done:
1199	freemsg(mp);
1200}
1201
1202/*
1203 * icmp_icmp_error_ipv6 is called by icmp_icmp_error to process ICMP for IPv6.
1204 * Generates the appropriate T_UDERROR_IND for permanent (non-transient) errors.
1205 * Assumes that IP has pulled up all the extension headers as well as the
1206 * ICMPv6 header.
1207 */
1208static void
1209icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp, ip_recv_attr_t *ira)
1210{
1211	icmp6_t		*icmp6;
1212	ip6_t		*ip6h, *outer_ip6h;
1213	uint16_t	iph_hdr_length;
1214	uint8_t		*nexthdrp;
1215	sin6_t		sin6;
1216	mblk_t		*mp1;
1217	int		error = 0;
1218	icmp_t		*icmp = connp->conn_icmp;
1219
1220	outer_ip6h = (ip6_t *)mp->b_rptr;
1221#ifdef DEBUG
1222	if (outer_ip6h->ip6_nxt != IPPROTO_ICMPV6)
1223		iph_hdr_length = ip_hdr_length_v6(mp, outer_ip6h);
1224	else
1225		iph_hdr_length = IPV6_HDR_LEN;
1226	ASSERT(iph_hdr_length == ira->ira_ip_hdr_length);
1227#endif
1228	/* Skip past the outer IP and ICMP headers */
1229	iph_hdr_length = ira->ira_ip_hdr_length;
1230	icmp6 = (icmp6_t *)&mp->b_rptr[iph_hdr_length];
1231
1232	ip6h = (ip6_t *)&icmp6[1];	/* Inner IP header */
1233	if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, &nexthdrp)) {
1234		freemsg(mp);
1235		return;
1236	}
1237
1238	switch (icmp6->icmp6_type) {
1239	case ICMP6_DST_UNREACH:
1240		switch (icmp6->icmp6_code) {
1241		case ICMP6_DST_UNREACH_NOPORT:
1242			error = ECONNREFUSED;
1243			break;
1244		case ICMP6_DST_UNREACH_ADMIN:
1245		case ICMP6_DST_UNREACH_NOROUTE:
1246		case ICMP6_DST_UNREACH_BEYONDSCOPE:
1247		case ICMP6_DST_UNREACH_ADDR:
1248			/* Transient errors */
1249			break;
1250		default:
1251			break;
1252		}
1253		break;
1254	case ICMP6_PACKET_TOO_BIG: {
1255		struct T_unitdata_ind	*tudi;
1256		struct T_opthdr		*toh;
1257		size_t			udi_size;
1258		mblk_t			*newmp;
1259		t_scalar_t		opt_length = sizeof (struct T_opthdr) +
1260		    sizeof (struct ip6_mtuinfo);
1261		sin6_t			*sin6;
1262		struct ip6_mtuinfo	*mtuinfo;
1263
1264		/*
1265		 * If the application has requested to receive path mtu
1266		 * information, send up an empty message containing an
1267		 * IPV6_PATHMTU ancillary data item.
1268		 */
1269		if (!connp->conn_ipv6_recvpathmtu)
1270			break;
1271
1272		udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t) +
1273		    opt_length;
1274		if ((newmp = allocb(udi_size, BPRI_MED)) == NULL) {
1275			BUMP_MIB(&icmp->icmp_is->is_rawip_mib, rawipInErrors);
1276			break;
1277		}
1278
1279		/*
1280		 * newmp->b_cont is left to NULL on purpose.  This is an
1281		 * empty message containing only ancillary data.
1282		 */
1283		newmp->b_datap->db_type = M_PROTO;
1284		tudi = (struct T_unitdata_ind *)newmp->b_rptr;
1285		newmp->b_wptr = (uchar_t *)tudi + udi_size;
1286		tudi->PRIM_type = T_UNITDATA_IND;
1287		tudi->SRC_length = sizeof (sin6_t);
1288		tudi->SRC_offset = sizeof (struct T_unitdata_ind);
1289		tudi->OPT_offset = tudi->SRC_offset + sizeof (sin6_t);
1290		tudi->OPT_length = opt_length;
1291
1292		sin6 = (sin6_t *)&tudi[1];
1293		bzero(sin6, sizeof (sin6_t));
1294		sin6->sin6_family = AF_INET6;
1295		sin6->sin6_addr = connp->conn_faddr_v6;
1296
1297		toh = (struct T_opthdr *)&sin6[1];
1298		toh->level = IPPROTO_IPV6;
1299		toh->name = IPV6_PATHMTU;
1300		toh->len = opt_length;
1301		toh->status = 0;
1302
1303		mtuinfo = (struct ip6_mtuinfo *)&toh[1];
1304		bzero(mtuinfo, sizeof (struct ip6_mtuinfo));
1305		mtuinfo->ip6m_addr.sin6_family = AF_INET6;
1306		mtuinfo->ip6m_addr.sin6_addr = ip6h->ip6_dst;
1307		mtuinfo->ip6m_mtu = icmp6->icmp6_mtu;
1308		/*
1309		 * We've consumed everything we need from the original
1310		 * message.  Free it, then send our empty message.
1311		 */
1312		freemsg(mp);
1313		icmp_ulp_recv(connp, newmp, msgdsize(newmp));
1314		return;
1315	}
1316	case ICMP6_TIME_EXCEEDED:
1317		/* Transient errors */
1318		break;
1319	case ICMP6_PARAM_PROB:
1320		/* If this corresponds to an ICMP_PROTOCOL_UNREACHABLE */
1321		if (icmp6->icmp6_code == ICMP6_PARAMPROB_NEXTHEADER &&
1322		    (uchar_t *)ip6h + icmp6->icmp6_pptr ==
1323		    (uchar_t *)nexthdrp) {
1324			error = ECONNREFUSED;
1325			break;
1326		}
1327		break;
1328	}
1329	if (error == 0) {
1330		freemsg(mp);
1331		return;
1332	}
1333
1334	/*
1335	 * Deliver T_UDERROR_IND when the application has asked for it.
1336	 * The socket layer enables this automatically when connected.
1337	 */
1338	if (!connp->conn_dgram_errind) {
1339		freemsg(mp);
1340		return;
1341	}
1342
1343	sin6 = sin6_null;
1344	sin6.sin6_family = AF_INET6;
1345	sin6.sin6_addr = ip6h->ip6_dst;
1346	sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK;
1347	if (IPCL_IS_NONSTR(connp)) {
1348		mutex_enter(&connp->conn_lock);
1349		if (icmp->icmp_state == TS_DATA_XFER) {
1350			if (IN6_ARE_ADDR_EQUAL(&sin6.sin6_addr,
1351			    &connp->conn_faddr_v6)) {
1352				mutex_exit(&connp->conn_lock);
1353				(*connp->conn_upcalls->su_set_error)
1354				    (connp->conn_upper_handle, error);
1355				goto done;
1356			}
1357		} else {
1358			icmp->icmp_delayed_error = error;
1359			*((sin6_t *)&icmp->icmp_delayed_addr) = sin6;
1360		}
1361		mutex_exit(&connp->conn_lock);
1362	} else {
1363		mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t),
1364		    NULL, 0, error);
1365		if (mp1 != NULL)
1366			putnext(connp->conn_rq, mp1);
1367	}
1368done:
1369	freemsg(mp);
1370}
1371
1372/*
1373 * This routine responds to T_ADDR_REQ messages.  It is called by icmp_wput.
1374 * The local address is filled in if endpoint is bound. The remote address
1375 * is filled in if remote address has been precified ("connected endpoint")
1376 * (The concept of connected CLTS sockets is alien to published TPI
1377 *  but we support it anyway).
1378 */
1379static void
1380icmp_addr_req(queue_t *q, mblk_t *mp)
1381{
1382	struct sockaddr *sa;
1383	mblk_t	*ackmp;
1384	struct T_addr_ack *taa;
1385	icmp_t	*icmp = Q_TO_ICMP(q);
1386	conn_t	*connp = icmp->icmp_connp;
1387	uint_t	addrlen;
1388
1389	/* Make it large enough for worst case */
1390	ackmp = reallocb(mp, sizeof (struct T_addr_ack) +
1391	    2 * sizeof (sin6_t), 1);
1392	if (ackmp == NULL) {
1393		icmp_err_ack(q, mp, TSYSERR, ENOMEM);
1394		return;
1395	}
1396	taa = (struct T_addr_ack *)ackmp->b_rptr;
1397
1398	bzero(taa, sizeof (struct T_addr_ack));
1399	ackmp->b_wptr = (uchar_t *)&taa[1];
1400
1401	taa->PRIM_type = T_ADDR_ACK;
1402	ackmp->b_datap->db_type = M_PCPROTO;
1403
1404	if (connp->conn_family == AF_INET)
1405		addrlen = sizeof (sin_t);
1406	else
1407		addrlen = sizeof (sin6_t);
1408
1409	mutex_enter(&connp->conn_lock);
1410	/*
1411	 * Note: Following code assumes 32 bit alignment of basic
1412	 * data structures like sin_t and struct T_addr_ack.
1413	 */
1414	if (icmp->icmp_state != TS_UNBND) {
1415		/*
1416		 * Fill in local address first
1417		 */
1418		taa->LOCADDR_offset = sizeof (*taa);
1419		taa->LOCADDR_length = addrlen;
1420		sa = (struct sockaddr *)&taa[1];
1421		(void) conn_getsockname(connp, sa, &addrlen);
1422		ackmp->b_wptr += addrlen;
1423	}
1424	if (icmp->icmp_state == TS_DATA_XFER) {
1425		/*
1426		 * connected, fill remote address too
1427		 */
1428		taa->REMADDR_length = addrlen;
1429		/* assumed 32-bit alignment */
1430		taa->REMADDR_offset = taa->LOCADDR_offset + taa->LOCADDR_length;
1431		sa = (struct sockaddr *)(ackmp->b_rptr + taa->REMADDR_offset);
1432		(void) conn_getpeername(connp, sa, &addrlen);
1433		ackmp->b_wptr += addrlen;
1434	}
1435	mutex_exit(&connp->conn_lock);
1436	ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim);
1437	qreply(q, ackmp);
1438}
1439
1440static void
1441icmp_copy_info(struct T_info_ack *tap, icmp_t *icmp)
1442{
1443	conn_t		*connp = icmp->icmp_connp;
1444
1445	*tap = icmp_g_t_info_ack;
1446
1447	if (connp->conn_family == AF_INET6)
1448		tap->ADDR_size = sizeof (sin6_t);
1449	else
1450		tap->ADDR_size = sizeof (sin_t);
1451	tap->CURRENT_state = icmp->icmp_state;
1452	tap->OPT_size = icmp_max_optsize;
1453}
1454
1455static void
1456icmp_do_capability_ack(icmp_t *icmp, struct T_capability_ack *tcap,
1457    t_uscalar_t cap_bits1)
1458{
1459	tcap->CAP_bits1 = 0;
1460
1461	if (cap_bits1 & TC1_INFO) {
1462		icmp_copy_info(&tcap->INFO_ack, icmp);
1463		tcap->CAP_bits1 |= TC1_INFO;
1464	}
1465}
1466
1467/*
1468 * This routine responds to T_CAPABILITY_REQ messages.  It is called by
1469 * icmp_wput.  Much of the T_CAPABILITY_ACK information is copied from
1470 * icmp_g_t_info_ack.  The current state of the stream is copied from
1471 * icmp_state.
1472 */
1473static void
1474icmp_capability_req(queue_t *q, mblk_t *mp)
1475{
1476	icmp_t			*icmp = Q_TO_ICMP(q);
1477	t_uscalar_t		cap_bits1;
1478	struct T_capability_ack	*tcap;
1479
1480	cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1;
1481
1482	mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack),
1483	    mp->b_datap->db_type, T_CAPABILITY_ACK);
1484	if (!mp)
1485		return;
1486
1487	tcap = (struct T_capability_ack *)mp->b_rptr;
1488
1489	icmp_do_capability_ack(icmp, tcap, cap_bits1);
1490
1491	qreply(q, mp);
1492}
1493
1494/*
1495 * This routine responds to T_INFO_REQ messages.  It is called by icmp_wput.
1496 * Most of the T_INFO_ACK information is copied from icmp_g_t_info_ack.
1497 * The current state of the stream is copied from icmp_state.
1498 */
1499static void
1500icmp_info_req(queue_t *q, mblk_t *mp)
1501{
1502	icmp_t	*icmp = Q_TO_ICMP(q);
1503
1504	/* Create a T_INFO_ACK message. */
1505	mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO,
1506	    T_INFO_ACK);
1507	if (!mp)
1508		return;
1509	icmp_copy_info((struct T_info_ack *)mp->b_rptr, icmp);
1510	qreply(q, mp);
1511}
1512
1513static int
1514icmp_tpi_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp,
1515    int family)
1516{
1517	conn_t *connp;
1518	dev_t	conn_dev;
1519	int	error;
1520
1521	/* If the stream is already open, return immediately. */
1522	if (q->q_ptr != NULL)
1523		return (0);
1524
1525	if (sflag == MODOPEN)
1526		return (EINVAL);
1527
1528	/*
1529	 * Since ICMP is not used so heavily, allocating from the small
1530	 * arena should be sufficient.
1531	 */
1532	if ((conn_dev = inet_minor_alloc(ip_minor_arena_sa)) == 0) {
1533		return (EBUSY);
1534	}
1535
1536	if (flag & SO_FALLBACK) {
1537		/*
1538		 * Non streams socket needs a stream to fallback to
1539		 */
1540		RD(q)->q_ptr = (void *)conn_dev;
1541		WR(q)->q_qinfo = &icmp_fallback_sock_winit;
1542		WR(q)->q_ptr = (void *)ip_minor_arena_sa;
1543		qprocson(q);
1544		return (0);
1545	}
1546
1547	connp = rawip_do_open(family, credp, &error, KM_SLEEP);
1548	if (connp == NULL) {
1549		ASSERT(error != 0);
1550		inet_minor_free(ip_minor_arena_sa, connp->conn_dev);
1551		return (error);
1552	}
1553
1554	*devp = makedevice(getemajor(*devp), (minor_t)conn_dev);
1555	connp->conn_dev = conn_dev;
1556	connp->conn_minor_arena = ip_minor_arena_sa;
1557
1558	/*
1559	 * Initialize the icmp_t structure for this stream.
1560	 */
1561	q->q_ptr = connp;
1562	WR(q)->q_ptr = connp;
1563	connp->conn_rq = q;
1564	connp->conn_wq = WR(q);
1565
1566	WR(q)->q_hiwat = connp->conn_sndbuf;
1567	WR(q)->q_lowat = connp->conn_sndlowat;
1568
1569	qprocson(q);
1570
1571	/* Set the Stream head write offset. */
1572	(void) proto_set_tx_wroff(q, connp, connp->conn_wroff);
1573	(void) proto_set_rx_hiwat(connp->conn_rq, connp, connp->conn_rcvbuf);
1574
1575	mutex_enter(&connp->conn_lock);
1576	connp->conn_state_flags &= ~CONN_INCIPIENT;
1577	mutex_exit(&connp->conn_lock);
1578
1579	icmp_bind_proto(connp->conn_icmp);
1580
1581	return (0);
1582}
1583
1584/* For /dev/icmp aka AF_INET open */
1585static int
1586icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
1587{
1588	return (icmp_tpi_open(q, devp, flag, sflag, credp, AF_INET));
1589}
1590
1591/* For /dev/icmp6 aka AF_INET6 open */
1592static int
1593icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
1594{
1595	return (icmp_tpi_open(q, devp, flag, sflag, credp, AF_INET6));
1596}
1597
1598/*
1599 * This is the open routine for icmp.  It allocates a icmp_t structure for
1600 * the stream and, on the first open of the module, creates an ND table.
1601 */
1602static conn_t *
1603rawip_do_open(int family, cred_t *credp, int *err, int flags)
1604{
1605	icmp_t	*icmp;
1606	conn_t *connp;
1607	zoneid_t zoneid;
1608	netstack_t *ns;
1609	icmp_stack_t *is;
1610	int len;
1611	boolean_t isv6 = B_FALSE;
1612
1613	*err = secpolicy_net_icmpaccess(credp);
1614	if (*err != 0)
1615		return (NULL);
1616
1617	if (family == AF_INET6)
1618		isv6 = B_TRUE;
1619
1620	ns = netstack_find_by_cred(credp);
1621	ASSERT(ns != NULL);
1622	is = ns->netstack_icmp;
1623	ASSERT(is != NULL);
1624
1625	/*
1626	 * For exclusive stacks we set the zoneid to zero
1627	 * to make ICMP operate as if in the global zone.
1628	 */
1629	if (ns->netstack_stackid != GLOBAL_NETSTACKID)
1630		zoneid = GLOBAL_ZONEID;
1631	else
1632		zoneid = crgetzoneid(credp);
1633
1634	ASSERT(flags == KM_SLEEP || flags == KM_NOSLEEP);
1635
1636	connp = ipcl_conn_create(IPCL_RAWIPCONN, flags, ns);
1637	icmp = connp->conn_icmp;
1638
1639	/*
1640	 * ipcl_conn_create did a netstack_hold. Undo the hold that was
1641	 * done by netstack_find_by_cred()
1642	 */
1643	netstack_rele(ns);
1644
1645	/*
1646	 * Since this conn_t/icmp_t is not yet visible to anybody else we don't
1647	 * need to lock anything.
1648	 */
1649	ASSERT(connp->conn_proto == IPPROTO_ICMP);
1650	ASSERT(connp->conn_icmp == icmp);
1651	ASSERT(icmp->icmp_connp == connp);
1652
1653	/* Set the initial state of the stream and the privilege status. */
1654	icmp->icmp_state = TS_UNBND;
1655	connp->conn_ixa->ixa_flags |= IXAF_VERIFY_SOURCE;
1656	if (isv6) {
1657		connp->conn_family = AF_INET6;
1658		connp->conn_ipversion = IPV6_VERSION;
1659		connp->conn_ixa->ixa_flags &= ~IXAF_IS_IPV4;
1660		connp->conn_proto = IPPROTO_ICMPV6;
1661		/* May be changed by a SO_PROTOTYPE socket option. */
1662		connp->conn_proto = IPPROTO_ICMPV6;
1663		connp->conn_ixa->ixa_protocol = connp->conn_proto;
1664		connp->conn_ixa->ixa_raw_cksum_offset = 2;
1665		connp->conn_default_ttl = is->is_ipv6_hoplimit;
1666		len = sizeof (ip6_t);
1667	} else {
1668		connp->conn_family = AF_INET;
1669		connp->conn_ipversion = IPV4_VERSION;
1670		connp->conn_ixa->ixa_flags |= IXAF_IS_IPV4;
1671		/* May be changed by a SO_PROTOTYPE socket option. */
1672		connp->conn_proto = IPPROTO_ICMP;
1673		connp->conn_ixa->ixa_protocol = connp->conn_proto;
1674		connp->conn_default_ttl = is->is_ipv4_ttl;
1675		len = sizeof (ipha_t);
1676	}
1677	connp->conn_xmit_ipp.ipp_unicast_hops = connp->conn_default_ttl;
1678
1679	connp->conn_ixa->ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
1680
1681	/*
1682	 * For the socket of protocol IPPROTO_RAW or when IP_HDRINCL is set,
1683	 * the checksum is provided in the pre-built packet. We clear
1684	 * IXAF_SET_ULP_CKSUM to tell IP that the application has sent a
1685	 * complete IP header and not to compute the transport checksum.
1686	 */
1687	connp->conn_ixa->ixa_flags |= IXAF_MULTICAST_LOOP | IXAF_SET_ULP_CKSUM;
1688	/* conn_allzones can not be set this early, hence no IPCL_ZONEID */
1689	connp->conn_ixa->ixa_zoneid = zoneid;
1690
1691	connp->conn_zoneid = zoneid;
1692
1693	/*
1694	 * If the caller has the process-wide flag set, then default to MAC
1695	 * exempt mode.  This allows read-down to unlabeled hosts.
1696	 */
1697	if (getpflags(NET_MAC_AWARE, credp) != 0)
1698		connp->conn_mac_mode = CONN_MAC_AWARE;
1699
1700	connp->conn_zone_is_global = (crgetzoneid(credp) == GLOBAL_ZONEID);
1701
1702	icmp->icmp_is = is;
1703
1704	connp->conn_rcvbuf = is->is_recv_hiwat;
1705	connp->conn_sndbuf = is->is_xmit_hiwat;
1706	connp->conn_sndlowat = is->is_xmit_lowat;
1707	connp->conn_rcvlowat = icmp_mod_info.mi_lowat;
1708
1709	connp->conn_wroff = len + is->is_wroff_extra;
1710	connp->conn_so_type = SOCK_RAW;
1711
1712	connp->conn_recv = icmp_input;
1713	connp->conn_recvicmp = icmp_icmp_input;
1714	crhold(credp);
1715	connp->conn_cred = credp;
1716	connp->conn_cpid = curproc->p_pid;
1717	connp->conn_open_time = lbolt64;
1718	/* Cache things in ixa without an extra refhold */
1719	connp->conn_ixa->ixa_cred = connp->conn_cred;
1720	connp->conn_ixa->ixa_cpid = connp->conn_cpid;
1721	if (is_system_labeled())
1722		connp->conn_ixa->ixa_tsl = crgetlabel(connp->conn_cred);
1723
1724	connp->conn_flow_cntrld = B_FALSE;
1725
1726	if (is->is_pmtu_discovery)
1727		connp->conn_ixa->ixa_flags |= IXAF_PMTU_DISCOVERY;
1728
1729	return (connp);
1730}
1731
1732/*
1733 * Which ICMP options OK to set through T_UNITDATA_REQ...
1734 */
1735/* ARGSUSED */
1736static boolean_t
1737icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name)
1738{
1739	return (B_TRUE);
1740}
1741
1742/*
1743 * This routine gets default values of certain options whose default
1744 * values are maintained by protcol specific code
1745 */
1746int
1747icmp_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr)
1748{
1749	icmp_t *icmp = Q_TO_ICMP(q);
1750	icmp_stack_t *is = icmp->icmp_is;
1751	int *i1 = (int *)ptr;
1752
1753	switch (level) {
1754	case IPPROTO_IP:
1755		switch (name) {
1756		case IP_MULTICAST_TTL:
1757			*ptr = (uchar_t)IP_DEFAULT_MULTICAST_TTL;
1758			return (sizeof (uchar_t));
1759		case IP_MULTICAST_LOOP:
1760			*ptr = (uchar_t)IP_DEFAULT_MULTICAST_LOOP;
1761			return (sizeof (uchar_t));
1762		}
1763		break;
1764	case IPPROTO_IPV6:
1765		switch (name) {
1766		case IPV6_MULTICAST_HOPS:
1767			*i1 = IP_DEFAULT_MULTICAST_TTL;
1768			return (sizeof (int));
1769		case IPV6_MULTICAST_LOOP:
1770			*i1 = IP_DEFAULT_MULTICAST_LOOP;
1771			return (sizeof (int));
1772		case IPV6_UNICAST_HOPS:
1773			*i1 = is->is_ipv6_hoplimit;
1774			return (sizeof (int));
1775		}
1776		break;
1777	case IPPROTO_ICMPV6:
1778		switch (name) {
1779		case ICMP6_FILTER:
1780			/* Make it look like "pass all" */
1781			ICMP6_FILTER_SETPASSALL((icmp6_filter_t *)ptr);
1782			return (sizeof (icmp6_filter_t));
1783		}
1784		break;
1785	}
1786	return (-1);
1787}
1788
1789/*
1790 * This routine retrieves the current status of socket options.
1791 * It returns the size of the option retrieved, or -1.
1792 */
1793int
1794icmp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
1795{
1796	icmp_t		*icmp = connp->conn_icmp;
1797	int		*i1 = (int *)ptr;
1798	conn_opt_arg_t	coas;
1799	int		retval;
1800
1801	coas.coa_connp = connp;
1802	coas.coa_ixa = connp->conn_ixa;
1803	coas.coa_ipp = &connp->conn_xmit_ipp;
1804	coas.coa_ancillary = B_FALSE;
1805	coas.coa_changed = 0;
1806
1807	/*
1808	 * We assume that the optcom framework has checked for the set
1809	 * of levels and names that are supported, hence we don't worry
1810	 * about rejecting based on that.
1811	 * First check for ICMP specific handling, then pass to common routine.
1812	 */
1813	switch (level) {
1814	case IPPROTO_IP:
1815		/*
1816		 * Only allow IPv4 option processing on IPv4 sockets.
1817		 */
1818		if (connp->conn_family != AF_INET)
1819			return (-1);
1820
1821		switch (name) {
1822		case IP_OPTIONS:
1823		case T_IP_OPTIONS:
1824			/* Options are passed up with each packet */
1825			return (0);
1826		case IP_HDRINCL:
1827			mutex_enter(&connp->conn_lock);
1828			*i1 = (int)icmp->icmp_hdrincl;
1829			mutex_exit(&connp->conn_lock);
1830			return (sizeof (int));
1831		}
1832		break;
1833
1834	case IPPROTO_IPV6:
1835		/*
1836		 * Only allow IPv6 option processing on native IPv6 sockets.
1837		 */
1838		if (connp->conn_family != AF_INET6)
1839			return (-1);
1840
1841		switch (name) {
1842		case IPV6_CHECKSUM:
1843			/*
1844			 * Return offset or -1 if no checksum offset.
1845			 * Does not apply to IPPROTO_ICMPV6
1846			 */
1847			if (connp->conn_proto == IPPROTO_ICMPV6)
1848				return (-1);
1849
1850			mutex_enter(&connp->conn_lock);
1851			if (connp->conn_ixa->ixa_flags & IXAF_SET_RAW_CKSUM)
1852				*i1 = connp->conn_ixa->ixa_raw_cksum_offset;
1853			else
1854				*i1 = -1;
1855			mutex_exit(&connp->conn_lock);
1856			return (sizeof (int));
1857		}
1858		break;
1859
1860	case IPPROTO_ICMPV6:
1861		/*
1862		 * Only allow IPv6 option processing on native IPv6 sockets.
1863		 */
1864		if (connp->conn_family != AF_INET6)
1865			return (-1);
1866
1867		if (connp->conn_proto != IPPROTO_ICMPV6)
1868			return (-1);
1869
1870		switch (name) {
1871		case ICMP6_FILTER:
1872			mutex_enter(&connp->conn_lock);
1873			if (icmp->icmp_filter == NULL) {
1874				/* Make it look like "pass all" */
1875				ICMP6_FILTER_SETPASSALL((icmp6_filter_t *)ptr);
1876			} else {
1877				(void) bcopy(icmp->icmp_filter, ptr,
1878				    sizeof (icmp6_filter_t));
1879			}
1880			mutex_exit(&connp->conn_lock);
1881			return (sizeof (icmp6_filter_t));
1882		}
1883	}
1884	mutex_enter(&connp->conn_lock);
1885	retval = conn_opt_get(&coas, level, name, ptr);
1886	mutex_exit(&connp->conn_lock);
1887	return (retval);
1888}
1889
1890/*
1891 * This routine retrieves the current status of socket options.
1892 * It returns the size of the option retrieved, or -1.
1893 */
1894int
1895icmp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr)
1896{
1897	conn_t		*connp = Q_TO_CONN(q);
1898	int 		err;
1899
1900	err = icmp_opt_get(connp, level, name, ptr);
1901	return (err);
1902}
1903
1904/*
1905 * This routine sets socket options.
1906 */
1907int
1908icmp_do_opt_set(conn_opt_arg_t *coa, int level, int name,
1909    uint_t inlen, uchar_t *invalp, cred_t *cr, boolean_t checkonly)
1910{
1911	conn_t		*connp = coa->coa_connp;
1912	ip_xmit_attr_t	*ixa = coa->coa_ixa;
1913	icmp_t		*icmp = connp->conn_icmp;
1914	icmp_stack_t	*is = icmp->icmp_is;
1915	int		*i1 = (int *)invalp;
1916	boolean_t	onoff = (*i1 == 0) ? 0 : 1;
1917	int		error;
1918
1919	ASSERT(MUTEX_NOT_HELD(&coa->coa_connp->conn_lock));
1920
1921	/*
1922	 * For fixed length options, no sanity check
1923	 * of passed in length is done. It is assumed *_optcom_req()
1924	 * routines do the right thing.
1925	 */
1926
1927	switch (level) {
1928	case SOL_SOCKET:
1929		switch (name) {
1930		case SO_PROTOTYPE:
1931			if ((*i1 & 0xFF) != IPPROTO_ICMP &&
1932			    (*i1 & 0xFF) != IPPROTO_ICMPV6 &&
1933			    secpolicy_net_rawaccess(cr) != 0) {
1934				return (EACCES);
1935			}
1936			if (checkonly)
1937				break;
1938
1939			mutex_enter(&connp->conn_lock);
1940			connp->conn_proto = *i1 & 0xFF;
1941			ixa->ixa_protocol = connp->conn_proto;
1942			if ((connp->conn_proto == IPPROTO_RAW ||
1943			    connp->conn_proto == IPPROTO_IGMP) &&
1944			    connp->conn_family == AF_INET) {
1945				icmp->icmp_hdrincl = 1;
1946				ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
1947			} else if (connp->conn_proto == IPPROTO_UDP ||
1948			    connp->conn_proto == IPPROTO_TCP ||
1949			    connp->conn_proto == IPPROTO_SCTP) {
1950				/* Used by test applications like psh */
1951				icmp->icmp_hdrincl = 0;
1952				ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
1953			} else {
1954				icmp->icmp_hdrincl = 0;
1955				ixa->ixa_flags |= IXAF_SET_ULP_CKSUM;
1956			}
1957
1958			if (connp->conn_family == AF_INET6 &&
1959			    connp->conn_proto == IPPROTO_ICMPV6) {
1960				/* Set offset for icmp6_cksum */
1961				ixa->ixa_flags &= ~IXAF_SET_RAW_CKSUM;
1962				ixa->ixa_raw_cksum_offset = 2;
1963			}
1964			if (icmp->icmp_filter != NULL &&
1965			    connp->conn_proto != IPPROTO_ICMPV6) {
1966				kmem_free(icmp->icmp_filter,
1967				    sizeof (icmp6_filter_t));
1968				icmp->icmp_filter = NULL;
1969			}
1970			mutex_exit(&connp->conn_lock);
1971
1972			coa->coa_changed |= COA_HEADER_CHANGED;
1973			/*
1974			 * For SCTP, we don't use icmp_bind_proto() for
1975			 * raw socket binding.
1976			 */
1977			if (connp->conn_proto == IPPROTO_SCTP)
1978				return (0);
1979
1980			coa->coa_changed |= COA_ICMP_BIND_NEEDED;
1981			return (0);
1982
1983		case SO_SNDBUF:
1984			if (*i1 > is->is_max_buf) {
1985				return (ENOBUFS);
1986			}
1987			break;
1988		case SO_RCVBUF:
1989			if (*i1 > is->is_max_buf) {
1990				return (ENOBUFS);
1991			}
1992			break;
1993		}
1994		break;
1995
1996	case IPPROTO_IP:
1997		/*
1998		 * Only allow IPv4 option processing on IPv4 sockets.
1999		 */
2000		if (connp->conn_family != AF_INET)
2001			return (EINVAL);
2002
2003		switch (name) {
2004		case IP_HDRINCL:
2005			if (!checkonly) {
2006				mutex_enter(&connp->conn_lock);
2007				icmp->icmp_hdrincl = onoff;
2008				if (onoff)
2009					ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
2010				else
2011					ixa->ixa_flags |= IXAF_SET_ULP_CKSUM;
2012				mutex_exit(&connp->conn_lock);
2013			}
2014			break;
2015		}
2016		break;
2017
2018	case IPPROTO_IPV6:
2019		if (connp->conn_family != AF_INET6)
2020			return (EINVAL);
2021
2022		switch (name) {
2023		case IPV6_CHECKSUM:
2024			/*
2025			 * Integer offset into the user data of where the
2026			 * checksum is located.
2027			 * Offset of -1 disables option.
2028			 * Does not apply to IPPROTO_ICMPV6.
2029			 */
2030			if (connp->conn_proto == IPPROTO_ICMPV6 ||
2031			    coa->coa_ancillary) {
2032				return (EINVAL);
2033			}
2034			if ((*i1 != -1) && ((*i1 < 0) || (*i1 & 0x1) != 0)) {
2035				/* Negative or not 16 bit aligned offset */
2036				return (EINVAL);
2037			}
2038			if (checkonly)
2039				break;
2040
2041			mutex_enter(&connp->conn_lock);
2042			if (*i1 == -1) {
2043				ixa->ixa_flags &= ~IXAF_SET_RAW_CKSUM;
2044				ixa->ixa_raw_cksum_offset = 0;
2045				ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
2046			} else {
2047				ixa->ixa_flags |= IXAF_SET_RAW_CKSUM;
2048				ixa->ixa_raw_cksum_offset = *i1;
2049				ixa->ixa_flags |= IXAF_SET_ULP_CKSUM;
2050			}
2051			mutex_exit(&connp->conn_lock);
2052			break;
2053		}
2054		break;
2055
2056	case IPPROTO_ICMPV6:
2057		/*
2058		 * Only allow IPv6 option processing on IPv6 sockets.
2059		 */
2060		if (connp->conn_family != AF_INET6)
2061			return (EINVAL);
2062		if (connp->conn_proto != IPPROTO_ICMPV6)
2063			return (EINVAL);
2064
2065		switch (name) {
2066		case ICMP6_FILTER:
2067			if (checkonly)
2068				break;
2069
2070			if ((inlen != 0) &&
2071			    (inlen != sizeof (icmp6_filter_t)))
2072				return (EINVAL);
2073
2074			mutex_enter(&connp->conn_lock);
2075			if (inlen == 0) {
2076				if (icmp->icmp_filter != NULL) {
2077					kmem_free(icmp->icmp_filter,
2078					    sizeof (icmp6_filter_t));
2079					icmp->icmp_filter = NULL;
2080				}
2081			} else {
2082				if (icmp->icmp_filter == NULL) {
2083					icmp->icmp_filter = kmem_alloc(
2084					    sizeof (icmp6_filter_t),
2085					    KM_NOSLEEP);
2086					if (icmp->icmp_filter == NULL) {
2087						mutex_exit(&connp->conn_lock);
2088						return (ENOBUFS);
2089					}
2090				}
2091				(void) bcopy(invalp, icmp->icmp_filter, inlen);
2092			}
2093			mutex_exit(&connp->conn_lock);
2094			break;
2095		}
2096		break;
2097	}
2098	error = conn_opt_set(coa, level, name, inlen, invalp,
2099	    checkonly, cr);
2100	return (error);
2101}
2102
2103/*
2104 * This routine sets socket options.
2105 */
2106int
2107icmp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
2108    uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
2109    void *thisdg_attrs, cred_t *cr)
2110{
2111	icmp_t		*icmp = connp->conn_icmp;
2112	int		err;
2113	conn_opt_arg_t	coas, *coa;
2114	boolean_t	checkonly;
2115	icmp_stack_t	*is = icmp->icmp_is;
2116
2117	switch (optset_context) {
2118	case SETFN_OPTCOM_CHECKONLY:
2119		checkonly = B_TRUE;
2120		/*
2121		 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
2122		 * inlen != 0 implies value supplied and
2123		 * 	we have to "pretend" to set it.
2124		 * inlen == 0 implies that there is no
2125		 * 	value part in T_CHECK request and just validation
2126		 * done elsewhere should be enough, we just return here.
2127		 */
2128		if (inlen == 0) {
2129			*outlenp = 0;
2130			return (0);
2131		}
2132		break;
2133	case SETFN_OPTCOM_NEGOTIATE:
2134		checkonly = B_FALSE;
2135		break;
2136	case SETFN_UD_NEGOTIATE:
2137	case SETFN_CONN_NEGOTIATE:
2138		checkonly = B_FALSE;
2139		/*
2140		 * Negotiating local and "association-related" options
2141		 * through T_UNITDATA_REQ.
2142		 *
2143		 * Following routine can filter out ones we do not
2144		 * want to be "set" this way.
2145		 */
2146		if (!icmp_opt_allow_udr_set(level, name)) {
2147			*outlenp = 0;
2148			return (EINVAL);
2149		}
2150		break;
2151	default:
2152		/*
2153		 * We should never get here
2154		 */
2155		*outlenp = 0;
2156		return (EINVAL);
2157	}
2158
2159	ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
2160	    (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));
2161
2162	if (thisdg_attrs != NULL) {
2163		/* Options from T_UNITDATA_REQ */
2164		coa = (conn_opt_arg_t *)thisdg_attrs;
2165		ASSERT(coa->coa_connp == connp);
2166		ASSERT(coa->coa_ixa != NULL);
2167		ASSERT(coa->coa_ipp != NULL);
2168		ASSERT(coa->coa_ancillary);
2169	} else {
2170		coa = &coas;
2171		coas.coa_connp = connp;
2172		/* Get a reference on conn_ixa to prevent concurrent mods */
2173		coas.coa_ixa = conn_get_ixa(connp, B_TRUE);
2174		if (coas.coa_ixa == NULL) {
2175			*outlenp = 0;
2176			return (ENOMEM);
2177		}
2178		coas.coa_ipp = &connp->conn_xmit_ipp;
2179		coas.coa_ancillary = B_FALSE;
2180		coas.coa_changed = 0;
2181	}
2182
2183	err = icmp_do_opt_set(coa, level, name, inlen, invalp,
2184	    cr, checkonly);
2185	if (err != 0) {
2186errout:
2187		if (!coa->coa_ancillary)
2188			ixa_refrele(coa->coa_ixa);
2189		*outlenp = 0;
2190		return (err);
2191	}
2192
2193	/*
2194	 * Common case of OK return with outval same as inval.
2195	 */
2196	if (invalp != outvalp) {
2197		/* don't trust bcopy for identical src/dst */
2198		(void) bcopy(invalp, outvalp, inlen);
2199	}
2200	*outlenp = inlen;
2201
2202	/*
2203	 * If this was not ancillary data, then we rebuild the headers,
2204	 * update the IRE/NCE, and IPsec as needed.
2205	 * Since the label depends on the destination we go through
2206	 * ip_set_destination first.
2207	 */
2208	if (coa->coa_ancillary) {
2209		return (0);
2210	}
2211
2212	if (coa->coa_changed & COA_ROUTE_CHANGED) {
2213		in6_addr_t saddr, faddr, nexthop;
2214		in_port_t fport;
2215
2216		/*
2217		 * We clear lastdst to make sure we pick up the change
2218		 * next time sending.
2219		 * If we are connected we re-cache the information.
2220		 * We ignore errors to preserve BSD behavior.
2221		 * Note that we don't redo IPsec policy lookup here
2222		 * since the final destination (or source) didn't change.
2223		 */
2224		mutex_enter(&connp->conn_lock);
2225		connp->conn_v6lastdst = ipv6_all_zeros;
2226
2227		ip_attr_nexthop(coa->coa_ipp, coa->coa_ixa,
2228		    &connp->conn_faddr_v6, &nexthop);
2229		saddr = connp->conn_saddr_v6;
2230		faddr = connp->conn_faddr_v6;
2231		fport = connp->conn_fport;
2232		mutex_exit(&connp->conn_lock);
2233
2234		if (!IN6_IS_ADDR_UNSPECIFIED(&faddr) &&
2235		    !IN6_IS_ADDR_V4MAPPED_ANY(&faddr)) {
2236			(void) ip_attr_connect(connp, coa->coa_ixa,
2237			    &saddr, &faddr, &nexthop, fport, NULL, NULL,
2238			    IPDF_ALLOW_MCBC | IPDF_VERIFY_DST);
2239		}
2240	}
2241
2242	ixa_refrele(coa->coa_ixa);
2243
2244	if (coa->coa_changed & COA_HEADER_CHANGED) {
2245		/*
2246		 * Rebuild the header template if we are connected.
2247		 * Otherwise clear conn_v6lastdst so we rebuild the header
2248		 * in the data path.
2249		 */
2250		mutex_enter(&connp->conn_lock);
2251		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
2252		    !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
2253			err = icmp_build_hdr_template(connp,
2254			    &connp->conn_saddr_v6, &connp->conn_faddr_v6,
2255			    connp->conn_flowinfo);
2256			if (err != 0) {
2257				mutex_exit(&connp->conn_lock);
2258				return (err);
2259			}
2260		} else {
2261			connp->conn_v6lastdst = ipv6_all_zeros;
2262		}
2263		mutex_exit(&connp->conn_lock);
2264	}
2265	if (coa->coa_changed & COA_RCVBUF_CHANGED) {
2266		(void) proto_set_rx_hiwat(connp->conn_rq, connp,
2267		    connp->conn_rcvbuf);
2268	}
2269	if ((coa->coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) {
2270		connp->conn_wq->q_hiwat = connp->conn_sndbuf;
2271	}
2272	if (coa->coa_changed & COA_WROFF_CHANGED) {
2273		/* Increase wroff if needed */
2274		uint_t wroff;
2275
2276		mutex_enter(&connp->conn_lock);
2277		wroff = connp->conn_ht_iphc_allocated + is->is_wroff_extra;
2278		if (wroff > connp->conn_wroff) {
2279			connp->conn_wroff = wroff;
2280			mutex_exit(&connp->conn_lock);
2281			(void) proto_set_tx_wroff(connp->conn_rq, connp, wroff);
2282		} else {
2283			mutex_exit(&connp->conn_lock);
2284		}
2285	}
2286	if (coa->coa_changed & COA_ICMP_BIND_NEEDED) {
2287		icmp_bind_proto(icmp);
2288	}
2289	return (err);
2290}
2291
2292/* This routine sets socket options. */
2293int
2294icmp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name,
2295    uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
2296    void *thisdg_attrs, cred_t *cr)
2297{
2298	conn_t	*connp = Q_TO_CONN(q);
2299	int error;
2300
2301	error = icmp_opt_set(connp, optset_context, level, name, inlen, invalp,
2302	    outlenp, outvalp, thisdg_attrs, cr);
2303	return (error);
2304}
2305
2306/*
2307 * Setup IP headers.
2308 *
2309 * Note that IP_HDRINCL has ipha_protocol that is different than conn_proto,
2310 * but icmp_output_hdrincl restores ipha_protocol once we return.
2311 */
2312mblk_t *
2313icmp_prepend_hdr(conn_t *connp, ip_xmit_attr_t *ixa, const ip_pkt_t *ipp,
2314    const in6_addr_t *v6src, const in6_addr_t *v6dst, uint32_t flowinfo,
2315    mblk_t *data_mp, int *errorp)
2316{
2317	mblk_t		*mp;
2318	icmp_stack_t	*is = connp->conn_netstack->netstack_icmp;
2319	uint_t		data_len;
2320	uint32_t	cksum;
2321
2322	data_len = msgdsize(data_mp);
2323	mp = conn_prepend_hdr(ixa, ipp, v6src, v6dst, connp->conn_proto,
2324	    flowinfo, 0, data_mp, data_len, is->is_wroff_extra, &cksum, errorp);
2325	if (mp == NULL) {
2326		ASSERT(*errorp != 0);
2327		return (NULL);
2328	}
2329
2330	ixa->ixa_pktlen = data_len + ixa->ixa_ip_hdr_length;
2331
2332	/*
2333	 * If there was a routing option/header then conn_prepend_hdr
2334	 * has massaged it and placed the pseudo-header checksum difference
2335	 * in the cksum argument.
2336	 *
2337	 * Prepare for ICMPv6 checksum done in IP.
2338	 *
2339	 * We make it easy for IP to include our pseudo header
2340	 * by putting our length (and any routing header adjustment)
2341	 * in the ICMPv6 checksum field.
2342	 * The IP source, destination, and length have already been set by
2343	 * conn_prepend_hdr.
2344	 */
2345	cksum += data_len;
2346	cksum = (cksum >> 16) + (cksum & 0xFFFF);
2347	ASSERT(cksum < 0x10000);
2348
2349	if (ixa->ixa_flags & IXAF_IS_IPV4) {
2350		ipha_t	*ipha = (ipha_t *)mp->b_rptr;
2351
2352		ASSERT(ntohs(ipha->ipha_length) == ixa->ixa_pktlen);
2353	} else {
2354		ip6_t	*ip6h = (ip6_t *)mp->b_rptr;
2355		uint_t	cksum_offset = 0;
2356
2357		ASSERT(ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN == ixa->ixa_pktlen);
2358
2359		if (ixa->ixa_flags & IXAF_SET_ULP_CKSUM) {
2360			if (connp->conn_proto == IPPROTO_ICMPV6) {
2361				cksum_offset = ixa->ixa_ip_hdr_length +
2362				    offsetof(icmp6_t, icmp6_cksum);
2363			} else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) {
2364				cksum_offset = ixa->ixa_ip_hdr_length +
2365				    ixa->ixa_raw_cksum_offset;
2366			}
2367		}
2368		if (cksum_offset != 0) {
2369			uint16_t *ptr;
2370
2371			/* Make sure the checksum fits in the first mblk */
2372			if (cksum_offset + sizeof (short) > MBLKL(mp)) {
2373				mblk_t *mp1;
2374
2375				mp1 = msgpullup(mp,
2376				    cksum_offset + sizeof (short));
2377				freemsg(mp);
2378				if (mp1 == NULL) {
2379					*errorp = ENOMEM;
2380					return (NULL);
2381				}
2382				mp = mp1;
2383				ip6h = (ip6_t *)mp->b_rptr;
2384			}
2385			ptr = (uint16_t *)(mp->b_rptr + cksum_offset);
2386			*ptr = htons(cksum);
2387		}
2388	}
2389
2390	/* Note that we don't try to update wroff due to ancillary data */
2391	return (mp);
2392}
2393
2394static int
2395icmp_build_hdr_template(conn_t *connp, const in6_addr_t *v6src,
2396    const in6_addr_t *v6dst, uint32_t flowinfo)
2397{
2398	int		error;
2399
2400	ASSERT(MUTEX_HELD(&connp->conn_lock));
2401	/*
2402	 * We clear lastdst to make sure we don't use the lastdst path
2403	 * next time sending since we might not have set v6dst yet.
2404	 */
2405	connp->conn_v6lastdst = ipv6_all_zeros;
2406
2407	error = conn_build_hdr_template(connp, 0, 0, v6src, v6dst, flowinfo);
2408	if (error != 0)
2409		return (error);
2410
2411	/*
2412	 * Any routing header/option has been massaged. The checksum difference
2413	 * is stored in conn_sum.
2414	 */
2415	return (0);
2416}
2417
2418/*
2419 * This routine retrieves the value of an ND variable in a icmpparam_t
2420 * structure.  It is called through nd_getset when a user reads the
2421 * variable.
2422 */
2423/* ARGSUSED */
2424static int
2425icmp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
2426{
2427	icmpparam_t	*icmppa = (icmpparam_t *)cp;
2428
2429	(void) mi_mpprintf(mp, "%d", icmppa->icmp_param_value);
2430	return (0);
2431}
2432
2433/*
2434 * Walk through the param array specified registering each element with the
2435 * named dispatch (ND) handler.
2436 */
2437static boolean_t
2438icmp_param_register(IDP *ndp, icmpparam_t *icmppa, int cnt)
2439{
2440	for (; cnt-- > 0; icmppa++) {
2441		if (icmppa->icmp_param_name && icmppa->icmp_param_name[0]) {
2442			if (!nd_load(ndp, icmppa->icmp_param_name,
2443			    icmp_param_get, icmp_param_set,
2444			    (caddr_t)icmppa)) {
2445				nd_free(ndp);
2446				return (B_FALSE);
2447			}
2448		}
2449	}
2450	return (B_TRUE);
2451}
2452
2453/* This routine sets an ND variable in a icmpparam_t structure. */
2454/* ARGSUSED */
2455static int
2456icmp_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr)
2457{
2458	long		new_value;
2459	icmpparam_t	*icmppa = (icmpparam_t *)cp;
2460
2461	/*
2462	 * Fail the request if the new value does not lie within the
2463	 * required bounds.
2464	 */
2465	if (ddi_strtol(value, NULL, 10, &new_value) != 0 ||
2466	    new_value < icmppa->icmp_param_min ||
2467	    new_value > icmppa->icmp_param_max) {
2468		return (EINVAL);
2469	}
2470	/* Set the new value */
2471	icmppa->icmp_param_value = new_value;
2472	return (0);
2473}
2474
2475static mblk_t *
2476icmp_queue_fallback(icmp_t *icmp, mblk_t *mp)
2477{
2478	ASSERT(MUTEX_HELD(&icmp->icmp_recv_lock));
2479	if (IPCL_IS_NONSTR(icmp->icmp_connp)) {
2480		/*
2481		 * fallback has started but messages have not been moved yet
2482		 */
2483		if (icmp->icmp_fallback_queue_head == NULL) {
2484			ASSERT(icmp->icmp_fallback_queue_tail == NULL);
2485			icmp->icmp_fallback_queue_head = mp;
2486			icmp->icmp_fallback_queue_tail = mp;
2487		} else {
2488			ASSERT(icmp->icmp_fallback_queue_tail != NULL);
2489			icmp->icmp_fallback_queue_tail->b_next = mp;
2490			icmp->icmp_fallback_queue_tail = mp;
2491		}
2492		return (NULL);
2493	} else {
2494		/*
2495		 * Fallback completed, let the caller putnext() the mblk.
2496		 */
2497		return (mp);
2498	}
2499}
2500
2501/*
2502 * Deliver data to ULP. In case we have a socket, and it's falling back to
2503 * TPI, then we'll queue the mp for later processing.
2504 */
2505static void
2506icmp_ulp_recv(conn_t *connp, mblk_t *mp, uint_t len)
2507{
2508	if (IPCL_IS_NONSTR(connp)) {
2509		icmp_t *icmp = connp->conn_icmp;
2510		int error;
2511
2512		ASSERT(len == msgdsize(mp));
2513		if ((*connp->conn_upcalls->su_recv)
2514		    (connp->conn_upper_handle, mp, len, 0, &error, NULL) < 0) {
2515			mutex_enter(&icmp->icmp_recv_lock);
2516			if (error == ENOSPC) {
2517				/*
2518				 * let's confirm while holding the lock
2519				 */
2520				if ((*connp->conn_upcalls->su_recv)
2521				    (connp->conn_upper_handle, NULL, 0, 0,
2522				    &error, NULL) < 0) {
2523					ASSERT(error == ENOSPC);
2524					if (error == ENOSPC) {
2525						connp->conn_flow_cntrld =
2526						    B_TRUE;
2527					}
2528				}
2529				mutex_exit(&icmp->icmp_recv_lock);
2530			} else {
2531				ASSERT(error == EOPNOTSUPP);
2532				mp = icmp_queue_fallback(icmp, mp);
2533				mutex_exit(&icmp->icmp_recv_lock);
2534				if (mp != NULL)
2535					putnext(connp->conn_rq, mp);
2536			}
2537		}
2538		ASSERT(MUTEX_NOT_HELD(&icmp->icmp_recv_lock));
2539	} else {
2540		putnext(connp->conn_rq, mp);
2541	}
2542}
2543
2544/*
2545 * This is the inbound data path.
2546 * IP has already pulled up the IP headers and verified alignment
2547 * etc.
2548 */
2549/* ARGSUSED2 */
2550static void
2551icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
2552{
2553	conn_t			*connp = (conn_t *)arg1;
2554	struct T_unitdata_ind	*tudi;
2555	uchar_t			*rptr;		/* Pointer to IP header */
2556	int			ip_hdr_length;
2557	int			udi_size;	/* Size of T_unitdata_ind */
2558	int			pkt_len;
2559	icmp_t			*icmp;
2560	ip_pkt_t		ipps;
2561	ip6_t			*ip6h;
2562	mblk_t			*mp1;
2563	crb_t			recv_ancillary;
2564	icmp_stack_t		*is;
2565	sin_t			*sin;
2566	sin6_t			*sin6;
2567	ipha_t			*ipha;
2568
2569	ASSERT(connp->conn_flags & IPCL_RAWIPCONN);
2570
2571	icmp = connp->conn_icmp;
2572	is = icmp->icmp_is;
2573	rptr = mp->b_rptr;
2574
2575	ASSERT(DB_TYPE(mp) == M_DATA);
2576	ASSERT(OK_32PTR(rptr));
2577	ASSERT(ira->ira_pktlen == msgdsize(mp));
2578	pkt_len = ira->ira_pktlen;
2579
2580	/*
2581	 * Get a snapshot of these and allow other threads to change
2582	 * them after that. We need the same recv_ancillary when determining
2583	 * the size as when adding the ancillary data items.
2584	 */
2585	mutex_enter(&connp->conn_lock);
2586	recv_ancillary = connp->conn_recv_ancillary;
2587	mutex_exit(&connp->conn_lock);
2588
2589	ip_hdr_length = ira->ira_ip_hdr_length;
2590	ASSERT(MBLKL(mp) >= ip_hdr_length);	/* IP did a pullup */
2591
2592	/* Initialize regardless of IP version */
2593	ipps.ipp_fields = 0;
2594
2595	if (ira->ira_flags & IRAF_IS_IPV4) {
2596		ASSERT(IPH_HDR_VERSION(rptr) == IPV4_VERSION);
2597		ASSERT(MBLKL(mp) >= sizeof (ipha_t));
2598		ASSERT(ira->ira_ip_hdr_length == IPH_HDR_LENGTH(rptr));
2599
2600		ipha = (ipha_t *)mp->b_rptr;
2601		if (recv_ancillary.crb_all != 0)
2602			(void) ip_find_hdr_v4(ipha, &ipps, B_FALSE);
2603
2604		/*
2605		 * BSD for some reason adjusts ipha_length to exclude the
2606		 * IP header length. We do the same.
2607		 */
2608		if (is->is_bsd_compat) {
2609			ushort_t len;
2610
2611			len = ntohs(ipha->ipha_length);
2612			if (mp->b_datap->db_ref > 1) {
2613				/*
2614				 * Allocate a new IP header so that we can
2615				 * modify ipha_length.
2616				 */
2617				mblk_t	*mp1;
2618
2619				mp1 = allocb(ip_hdr_length, BPRI_MED);
2620				if (mp1 == NULL) {
2621					freemsg(mp);
2622					BUMP_MIB(&is->is_rawip_mib,
2623					    rawipInErrors);
2624					return;
2625				}
2626				bcopy(rptr, mp1->b_rptr, ip_hdr_length);
2627				mp->b_rptr = rptr + ip_hdr_length;
2628				rptr = mp1->b_rptr;
2629				ipha = (ipha_t *)rptr;
2630				mp1->b_cont = mp;
2631				mp1->b_wptr = rptr + ip_hdr_length;
2632				mp = mp1;
2633			}
2634			len -= ip_hdr_length;
2635			ipha->ipha_length = htons(len);
2636		}
2637
2638		/*
2639		 * For RAW sockets we not pass ICMP/IPv4 packets to AF_INET6
2640		 * sockets. This is ensured by icmp_bind and the IP fanout code.
2641		 */
2642		ASSERT(connp->conn_family == AF_INET);
2643
2644		/*
2645		 * This is the inbound data path.  Packets are passed upstream
2646		 * as T_UNITDATA_IND messages with full IPv4 headers still
2647		 * attached.
2648		 */
2649
2650		/*
2651		 * Normally only send up the source address.
2652		 * If any ancillary data items are wanted we add those.
2653		 */
2654		udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin_t);
2655		if (recv_ancillary.crb_all != 0) {
2656			udi_size += conn_recvancillary_size(connp,
2657			    recv_ancillary, ira, mp, &ipps);
2658		}
2659
2660		/* Allocate a message block for the T_UNITDATA_IND structure. */
2661		mp1 = allocb(udi_size, BPRI_MED);
2662		if (mp1 == NULL) {
2663			freemsg(mp);
2664			BUMP_MIB(&is->is_rawip_mib, rawipInErrors);
2665			return;
2666		}
2667		mp1->b_cont = mp;
2668		tudi = (struct T_unitdata_ind *)mp1->b_rptr;
2669		mp1->b_datap->db_type = M_PROTO;
2670		mp1->b_wptr = (uchar_t *)tudi + udi_size;
2671		tudi->PRIM_type = T_UNITDATA_IND;
2672		tudi->SRC_length = sizeof (sin_t);
2673		tudi->SRC_offset = sizeof (struct T_unitdata_ind);
2674		sin = (sin_t *)&tudi[1];
2675		*sin = sin_null;
2676		sin->sin_family = AF_INET;
2677		sin->sin_addr.s_addr = ipha->ipha_src;
2678		*(uint32_t *)&sin->sin_zero[0] = 0;
2679		*(uint32_t *)&sin->sin_zero[4] = 0;
2680		tudi->OPT_offset =  sizeof (struct T_unitdata_ind) +
2681		    sizeof (sin_t);
2682		udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin_t));
2683		tudi->OPT_length = udi_size;
2684
2685		/*
2686		 * Add options if IP_RECVIF etc is set
2687		 */
2688		if (udi_size != 0) {
2689			conn_recvancillary_add(connp, recv_ancillary, ira,
2690			    &ipps, (uchar_t *)&sin[1], udi_size);
2691		}
2692		goto deliver;
2693	}
2694
2695	ASSERT(IPH_HDR_VERSION(rptr) == IPV6_VERSION);
2696	/*
2697	 * IPv6 packets can only be received by applications
2698	 * that are prepared to receive IPv6 addresses.
2699	 * The IP fanout must ensure this.
2700	 */
2701	ASSERT(connp->conn_family == AF_INET6);
2702
2703	/*
2704	 * Handle IPv6 packets. We don't pass up the IP headers with the
2705	 * payload for IPv6.
2706	 */
2707
2708	ip6h = (ip6_t *)rptr;
2709	if (recv_ancillary.crb_all != 0) {
2710		/*
2711		 * Call on ip_find_hdr_v6 which gets individual lenghts of
2712		 * extension headers (and pointers to them).
2713		 */
2714		uint8_t		nexthdr;
2715
2716		/* We don't care about the length or nextheader. */
2717		(void) ip_find_hdr_v6(mp, ip6h, B_TRUE, &ipps, &nexthdr);
2718
2719		/*
2720		 * We do not pass up hop-by-hop options or any other
2721		 * extension header as part of the packet. Applications
2722		 * that want to see them have to specify IPV6_RECV* socket
2723		 * options. And conn_recvancillary_size/add explicitly
2724		 * drops the TX option from IPV6_HOPOPTS as it does for UDP.
2725		 *
2726		 * If we had multilevel ICMP sockets, then we'd want to
2727		 * modify conn_recvancillary_size/add to
2728		 * allow the user to see the label.
2729		 */
2730	}
2731
2732	/*
2733	 * Check a filter for ICMPv6 types if needed.
2734	 * Verify raw checksums if needed.
2735	 */
2736	mutex_enter(&connp->conn_lock);
2737	if (icmp->icmp_filter != NULL) {
2738		int type;
2739
2740		/* Assumes that IP has done the pullupmsg */
2741		type = mp->b_rptr[ip_hdr_length];
2742
2743		ASSERT(mp->b_rptr + ip_hdr_length <= mp->b_wptr);
2744		if (ICMP6_FILTER_WILLBLOCK(type, icmp->icmp_filter)) {
2745			mutex_exit(&connp->conn_lock);
2746			freemsg(mp);
2747			return;
2748		}
2749	}
2750	if (connp->conn_ixa->ixa_flags & IXAF_SET_RAW_CKSUM) {
2751		/* Checksum */
2752		uint16_t	*up;
2753		uint32_t	sum;
2754		int		remlen;
2755
2756		up = (uint16_t *)&ip6h->ip6_src;
2757
2758		remlen = msgdsize(mp) - ip_hdr_length;
2759		sum = htons(connp->conn_proto + remlen)
2760		    + up[0] + up[1] + up[2] + up[3]
2761		    + up[4] + up[5] + up[6] + up[7]
2762		    + up[8] + up[9] + up[10] + up[11]
2763		    + up[12] + up[13] + up[14] + up[15];
2764		sum = (sum & 0xffff) + (sum >> 16);
2765		sum = IP_CSUM(mp, ip_hdr_length, sum);
2766		if (sum != 0) {
2767			/* IPv6 RAW checksum failed */
2768			ip0dbg(("icmp_rput: RAW checksum failed %x\n", sum));
2769			mutex_exit(&connp->conn_lock);
2770			freemsg(mp);
2771			BUMP_MIB(&is->is_rawip_mib, rawipInCksumErrs);
2772			return;
2773		}
2774	}
2775	mutex_exit(&connp->conn_lock);
2776
2777	udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t);
2778
2779	if (recv_ancillary.crb_all != 0) {
2780		udi_size += conn_recvancillary_size(connp,
2781		    recv_ancillary, ira, mp, &ipps);
2782	}
2783
2784	mp1 = allocb(udi_size, BPRI_MED);
2785	if (mp1 == NULL) {
2786		freemsg(mp);
2787		BUMP_MIB(&is->is_rawip_mib, rawipInErrors);
2788		return;
2789	}
2790	mp1->b_cont = mp;
2791	mp1->b_datap->db_type = M_PROTO;
2792	tudi = (struct T_unitdata_ind *)mp1->b_rptr;
2793	mp1->b_wptr = (uchar_t *)tudi + udi_size;
2794	tudi->PRIM_type = T_UNITDATA_IND;
2795	tudi->SRC_length = sizeof (sin6_t);
2796	tudi->SRC_offset = sizeof (struct T_unitdata_ind);
2797	tudi->OPT_offset = sizeof (struct T_unitdata_ind) + sizeof (sin6_t);
2798	udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin6_t));
2799	tudi->OPT_length = udi_size;
2800	sin6 = (sin6_t *)&tudi[1];
2801	*sin6 = sin6_null;
2802	sin6->sin6_port = 0;
2803	sin6->sin6_family = AF_INET6;
2804
2805	sin6->sin6_addr = ip6h->ip6_src;
2806	/* No sin6_flowinfo per API */
2807	sin6->sin6_flowinfo = 0;
2808	/* For link-scope pass up scope id */
2809	if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src))
2810		sin6->sin6_scope_id = ira->ira_ruifindex;
2811	else
2812		sin6->sin6_scope_id = 0;
2813	sin6->__sin6_src_id = ip_srcid_find_addr(&ip6h->ip6_dst,
2814	    IPCL_ZONEID(connp), is->is_netstack);
2815
2816	if (udi_size != 0) {
2817		conn_recvancillary_add(connp, recv_ancillary, ira,
2818		    &ipps, (uchar_t *)&sin6[1], udi_size);
2819	}
2820
2821	/* Skip all the IPv6 headers per API */
2822	mp->b_rptr += ip_hdr_length;
2823	pkt_len -= ip_hdr_length;
2824
2825deliver:
2826	BUMP_MIB(&is->is_rawip_mib, rawipInDatagrams);
2827	icmp_ulp_recv(connp, mp1, pkt_len);
2828}
2829
2830/*
2831 * return SNMP stuff in buffer in mpdata. We don't hold any lock and report
2832 * information that can be changing beneath us.
2833 */
2834mblk_t *
2835icmp_snmp_get(queue_t *q, mblk_t *mpctl)
2836{
2837	mblk_t			*mpdata;
2838	struct opthdr		*optp;
2839	conn_t			*connp = Q_TO_CONN(q);
2840	icmp_stack_t		*is = connp->conn_netstack->netstack_icmp;
2841	mblk_t			*mp2ctl;
2842
2843	/*
2844	 * make a copy of the original message
2845	 */
2846	mp2ctl = copymsg(mpctl);
2847
2848	if (mpctl == NULL ||
2849	    (mpdata = mpctl->b_cont) == NULL) {
2850		freemsg(mpctl);
2851		freemsg(mp2ctl);
2852		return (0);
2853	}
2854
2855	/* fixed length structure for IPv4 and IPv6 counters */
2856	optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
2857	optp->level = EXPER_RAWIP;
2858	optp->name = 0;
2859	(void) snmp_append_data(mpdata, (char *)&is->is_rawip_mib,
2860	    sizeof (is->is_rawip_mib));
2861	optp->len = msgdsize(mpdata);
2862	qreply(q, mpctl);
2863
2864	return (mp2ctl);
2865}
2866
2867/*
2868 * Return 0 if invalid set request, 1 otherwise, including non-rawip requests.
2869 * TODO:  If this ever actually tries to set anything, it needs to be
2870 * to do the appropriate locking.
2871 */
2872/* ARGSUSED */
2873int
2874icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name,
2875    uchar_t *ptr, int len)
2876{
2877	switch (level) {
2878	case EXPER_RAWIP:
2879		return (0);
2880	default:
2881		return (1);
2882	}
2883}
2884
2885/*
2886 * This routine creates a T_UDERROR_IND message and passes it upstream.
2887 * The address and options are copied from the T_UNITDATA_REQ message
2888 * passed in mp.  This message is freed.
2889 */
2890static void
2891icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err)
2892{
2893	struct T_unitdata_req *tudr;
2894	mblk_t	*mp1;
2895	uchar_t *destaddr;
2896	t_scalar_t destlen;
2897	uchar_t	*optaddr;
2898	t_scalar_t optlen;
2899
2900	if ((mp->b_wptr < mp->b_rptr) ||
2901	    (MBLKL(mp)) < sizeof (struct T_unitdata_req)) {
2902		goto done;
2903	}
2904	tudr = (struct T_unitdata_req *)mp->b_rptr;
2905	destaddr = mp->b_rptr + tudr->DEST_offset;
2906	if (destaddr < mp->b_rptr || destaddr >= mp->b_wptr ||
2907	    destaddr + tudr->DEST_length < mp->b_rptr ||
2908	    destaddr + tudr->DEST_length > mp->b_wptr) {
2909		goto done;
2910	}
2911	optaddr = mp->b_rptr + tudr->OPT_offset;
2912	if (optaddr < mp->b_rptr || optaddr >= mp->b_wptr ||
2913	    optaddr + tudr->OPT_length < mp->b_rptr ||
2914	    optaddr + tudr->OPT_length > mp->b_wptr) {
2915		goto done;
2916	}
2917	destlen = tudr->DEST_length;
2918	optlen = tudr->OPT_length;
2919
2920	mp1 = mi_tpi_uderror_ind((char *)destaddr, destlen,
2921	    (char *)optaddr, optlen, err);
2922	if (mp1 != NULL)
2923		qreply(q, mp1);
2924
2925done:
2926	freemsg(mp);
2927}
2928
2929static int
2930rawip_do_unbind(conn_t *connp)
2931{
2932	icmp_t	*icmp = connp->conn_icmp;
2933
2934	mutex_enter(&connp->conn_lock);
2935	/* If a bind has not been done, we can't unbind. */
2936	if (icmp->icmp_state == TS_UNBND) {
2937		mutex_exit(&connp->conn_lock);
2938		return (-TOUTSTATE);
2939	}
2940	connp->conn_saddr_v6 = ipv6_all_zeros;
2941	connp->conn_bound_addr_v6 = ipv6_all_zeros;
2942	connp->conn_laddr_v6 = ipv6_all_zeros;
2943	connp->conn_mcbc_bind = B_FALSE;
2944	connp->conn_lport = 0;
2945	connp->conn_fport = 0;
2946	/* In case we were also connected */
2947	connp->conn_faddr_v6 = ipv6_all_zeros;
2948	connp->conn_v6lastdst = ipv6_all_zeros;
2949
2950	icmp->icmp_state = TS_UNBND;
2951
2952	(void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
2953	    &connp->conn_faddr_v6, connp->conn_flowinfo);
2954	mutex_exit(&connp->conn_lock);
2955
2956	ip_unbind(connp);
2957	return (0);
2958}
2959
2960/*
2961 * This routine is called by icmp_wput to handle T_UNBIND_REQ messages.
2962 * After some error checking, the message is passed downstream to ip.
2963 */
2964static void
2965icmp_tpi_unbind(queue_t *q, mblk_t *mp)
2966{
2967	conn_t	*connp = Q_TO_CONN(q);
2968	int	error;
2969
2970	ASSERT(mp->b_cont == NULL);
2971	error = rawip_do_unbind(connp);
2972	if (error) {
2973		if (error < 0) {
2974			icmp_err_ack(q, mp, -error, 0);
2975		} else {
2976			icmp_err_ack(q, mp, 0, error);
2977		}
2978		return;
2979	}
2980
2981	/*
2982	 * Convert mp into a T_OK_ACK
2983	 */
2984
2985	mp = mi_tpi_ok_ack_alloc(mp);
2986
2987	/*
2988	 * should not happen in practice... T_OK_ACK is smaller than the
2989	 * original message.
2990	 */
2991	ASSERT(mp != NULL);
2992	ASSERT(((struct T_ok_ack *)mp->b_rptr)->PRIM_type == T_OK_ACK);
2993	qreply(q, mp);
2994}
2995
2996/*
2997 * Process IPv4 packets that already include an IP header.
2998 * Used when IP_HDRINCL has been set (implicit for IPPROTO_RAW and
2999 * IPPROTO_IGMP).
3000 * In this case we ignore the address and any options in the T_UNITDATA_REQ.
3001 *
3002 * The packet is assumed to have a base (20 byte) IP header followed
3003 * by the upper-layer protocol. We include any IP_OPTIONS including a
3004 * CIPSO label but otherwise preserve the base IP header.
3005 */
3006static int
3007icmp_output_hdrincl(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid)
3008{
3009	icmp_t		*icmp = connp->conn_icmp;
3010	icmp_stack_t	*is = icmp->icmp_is;
3011	ipha_t		iphas;
3012	ipha_t		*ipha;
3013	int		ip_hdr_length;
3014	int		tp_hdr_len;
3015	ip_xmit_attr_t	*ixa;
3016	ip_pkt_t	*ipp;
3017	in6_addr_t	v6src;
3018	in6_addr_t	v6dst;
3019	in6_addr_t	v6nexthop;
3020	int		error;
3021	boolean_t	do_ipsec;
3022
3023	/*
3024	 * We need an exclusive copy of conn_ixa since the included IP
3025	 * header could have any destination.
3026	 * That copy has no pointers hence we
3027	 * need to set them up once we've parsed the ancillary data.
3028	 */
3029	ixa = conn_get_ixa_exclusive(connp);
3030	if (ixa == NULL) {
3031		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3032		freemsg(mp);
3033		return (ENOMEM);
3034	}
3035	ASSERT(cr != NULL);
3036	/*
3037	 * Caller has a reference on cr; from db_credp or because we
3038	 * are running in process context.
3039	 */
3040	ixa->ixa_cred = cr;
3041	ixa->ixa_cpid = pid;
3042	if (is_system_labeled()) {
3043		/* We need to restart with a label based on the cred */
3044		ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred);
3045	}
3046
3047	/* In case previous destination was multicast or multirt */
3048	ip_attr_newdst(ixa);
3049
3050	/* Get a copy of conn_xmit_ipp since the TX label might change it */
3051	ipp = kmem_zalloc(sizeof (*ipp), KM_NOSLEEP);
3052	if (ipp == NULL) {
3053		ixa_refrele(ixa);
3054		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3055		freemsg(mp);
3056		return (ENOMEM);
3057	}
3058	mutex_enter(&connp->conn_lock);
3059	error = ip_pkt_copy(&connp->conn_xmit_ipp, ipp, KM_NOSLEEP);
3060	mutex_exit(&connp->conn_lock);
3061	if (error != 0) {
3062		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3063		freemsg(mp);
3064		goto done;
3065	}
3066
3067	/* Sanity check length of packet */
3068	ipha = (ipha_t *)mp->b_rptr;
3069
3070	ip_hdr_length = IP_SIMPLE_HDR_LENGTH;
3071	if ((mp->b_wptr - mp->b_rptr) < IP_SIMPLE_HDR_LENGTH) {
3072		if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) {
3073			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3074			freemsg(mp);
3075			goto done;
3076		}
3077		ipha = (ipha_t *)mp->b_rptr;
3078	}
3079	ipha->ipha_version_and_hdr_length =
3080	    (IP_VERSION<<4) | (ip_hdr_length>>2);
3081
3082	/*
3083	 * We set IXAF_DONTFRAG if the application set DF which makes
3084	 * IP not fragment.
3085	 */
3086	ipha->ipha_fragment_offset_and_flags &= htons(IPH_DF);
3087	if (ipha->ipha_fragment_offset_and_flags & htons(IPH_DF))
3088		ixa->ixa_flags |= (IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF);
3089	else
3090		ixa->ixa_flags &= ~(IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF);
3091
3092	/* Even for multicast and broadcast we honor the apps ttl */
3093	ixa->ixa_flags |= IXAF_NO_TTL_CHANGE;
3094
3095	if (ipha->ipha_dst == INADDR_ANY)
3096		ipha->ipha_dst = htonl(INADDR_LOOPBACK);
3097
3098	IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6src);
3099	IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst);
3100
3101	/* Defer IPsec if it might need to look at ICMP type/code */
3102	do_ipsec = ipha->ipha_protocol != IPPROTO_ICMP;
3103	ixa->ixa_flags |= IXAF_IS_IPV4;
3104
3105	ip_attr_nexthop(ipp, ixa, &v6dst, &v6nexthop);
3106	error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop,
3107	    connp->conn_fport, &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST |
3108	    (do_ipsec ? IPDF_IPSEC : 0));
3109	switch (error) {
3110	case 0:
3111		break;
3112	case EADDRNOTAVAIL:
3113		/*
3114		 * IXAF_VERIFY_SOURCE tells us to pick a better source.
3115		 * Don't have the application see that errno
3116		 */
3117		error = ENETUNREACH;
3118		goto failed;
3119	case ENETDOWN:
3120		/*
3121		 * Have !ipif_addr_ready address; drop packet silently
3122		 * until we can get applications to not send until we
3123		 * are ready.
3124		 */
3125		error = 0;
3126		goto failed;
3127	case EHOSTUNREACH:
3128	case ENETUNREACH:
3129		if (ixa->ixa_ire != NULL) {
3130			/*
3131			 * Let conn_ip_output/ire_send_noroute return
3132			 * the error and send any local ICMP error.
3133			 */
3134			error = 0;
3135			break;
3136		}
3137		/* FALLTHRU */
3138	default:
3139	failed:
3140		freemsg(mp);
3141		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3142		goto done;
3143	}
3144	if (ipha->ipha_src == INADDR_ANY)
3145		IN6_V4MAPPED_TO_IPADDR(&v6src, ipha->ipha_src);
3146
3147	/*
3148	 * We might be going to a different destination than last time,
3149	 * thus check that TX allows the communication and compute any
3150	 * needed label.
3151	 *
3152	 * TSOL Note: We have an exclusive ipp and ixa for this thread so we
3153	 * don't have to worry about concurrent threads.
3154	 */
3155	if (is_system_labeled()) {
3156		/*
3157		 * Check whether Trusted Solaris policy allows communication
3158		 * with this host, and pretend that the destination is
3159		 * unreachable if not.
3160		 * Compute any needed label and place it in ipp_label_v4/v6.
3161		 *
3162		 * Later conn_build_hdr_template/conn_prepend_hdr takes
3163		 * ipp_label_v4/v6 to form the packet.
3164		 *
3165		 * Tsol note: We have ipp structure local to this thread so
3166		 * no locking is needed.
3167		 */
3168		error = conn_update_label(connp, ixa, &v6dst, ipp);
3169		if (error != 0) {
3170			freemsg(mp);
3171			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3172			goto done;
3173		}
3174	}
3175
3176	/*
3177	 * Save away a copy of the IPv4 header the application passed down
3178	 * and then prepend an IPv4 header complete with any IP options
3179	 * including label.
3180	 * We need a struct copy since icmp_prepend_hdr will reuse the available
3181	 * space in the mblk.
3182	 */
3183	iphas = *ipha;
3184	mp->b_rptr += IP_SIMPLE_HDR_LENGTH;
3185
3186	mp = icmp_prepend_hdr(connp, ixa, ipp, &v6src, &v6dst, 0, mp, &error);
3187	if (mp == NULL) {
3188		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3189		ASSERT(error != 0);
3190		goto done;
3191	}
3192	if (ixa->ixa_pktlen > IP_MAXPACKET) {
3193		error = EMSGSIZE;
3194		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3195		freemsg(mp);
3196		goto done;
3197	}
3198	/* Restore key parts of the header that the application passed down */
3199	ipha = (ipha_t *)mp->b_rptr;
3200	ipha->ipha_type_of_service = iphas.ipha_type_of_service;
3201	ipha->ipha_ident = iphas.ipha_ident;
3202	ipha->ipha_fragment_offset_and_flags =
3203	    iphas.ipha_fragment_offset_and_flags;
3204	ipha->ipha_ttl = iphas.ipha_ttl;
3205	ipha->ipha_protocol = iphas.ipha_protocol;
3206	ipha->ipha_src = iphas.ipha_src;
3207	ipha->ipha_dst = iphas.ipha_dst;
3208
3209	ixa->ixa_protocol = ipha->ipha_protocol;
3210
3211	/*
3212	 * Make sure that the IP header plus any transport header that is
3213	 * checksumed by ip_output is in the first mblk. (ip_output assumes
3214	 * that at least the checksum field is in the first mblk.)
3215	 */
3216	switch (ipha->ipha_protocol) {
3217	case IPPROTO_UDP:
3218		tp_hdr_len = 8;
3219		break;
3220	case IPPROTO_TCP:
3221		tp_hdr_len = 20;
3222		break;
3223	default:
3224		tp_hdr_len = 0;
3225		break;
3226	}
3227	ip_hdr_length = IPH_HDR_LENGTH(ipha);
3228	if (mp->b_wptr - mp->b_rptr < ip_hdr_length + tp_hdr_len) {
3229		if (!pullupmsg(mp, ip_hdr_length + tp_hdr_len)) {
3230			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3231			if (mp->b_cont == NULL)
3232				error = EINVAL;
3233			else
3234				error = ENOMEM;
3235			freemsg(mp);
3236			goto done;
3237		}
3238	}
3239
3240	if (!do_ipsec) {
3241		/* Policy might differ for different ICMP type/code */
3242		if (ixa->ixa_ipsec_policy != NULL) {
3243			IPPOL_REFRELE(ixa->ixa_ipsec_policy);
3244			ixa->ixa_ipsec_policy = NULL;
3245			ixa->ixa_flags &= ~IXAF_IPSEC_SECURE;
3246		}
3247		mp = ip_output_attach_policy(mp, ipha, NULL, connp, ixa);
3248		if (mp == NULL) {
3249			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3250			error = EHOSTUNREACH;	/* IPsec policy failure */
3251			goto done;
3252		}
3253	}
3254
3255	/* We're done.  Pass the packet to ip. */
3256	BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
3257
3258	error = conn_ip_output(mp, ixa);
3259	/* No rawipOutErrors if an error since IP increases its error counter */
3260	switch (error) {
3261	case 0:
3262		break;
3263	case EWOULDBLOCK:
3264		(void) ixa_check_drain_insert(connp, ixa);
3265		error = 0;
3266		break;
3267	case EADDRNOTAVAIL:
3268		/*
3269		 * IXAF_VERIFY_SOURCE tells us to pick a better source.
3270		 * Don't have the application see that errno
3271		 */
3272		error = ENETUNREACH;
3273		break;
3274	}
3275done:
3276	ixa_refrele(ixa);
3277	ip_pkt_free(ipp);
3278	kmem_free(ipp, sizeof (*ipp));
3279	return (error);
3280}
3281
3282static mblk_t *
3283icmp_output_attach_policy(mblk_t *mp, conn_t *connp, ip_xmit_attr_t *ixa)
3284{
3285	ipha_t	*ipha = NULL;
3286	ip6_t	*ip6h = NULL;
3287
3288	if (ixa->ixa_flags & IXAF_IS_IPV4)
3289		ipha = (ipha_t *)mp->b_rptr;
3290	else
3291		ip6h = (ip6_t *)mp->b_rptr;
3292
3293	if (ixa->ixa_ipsec_policy != NULL) {
3294		IPPOL_REFRELE(ixa->ixa_ipsec_policy);
3295		ixa->ixa_ipsec_policy = NULL;
3296		ixa->ixa_flags &= ~IXAF_IPSEC_SECURE;
3297	}
3298	return (ip_output_attach_policy(mp, ipha, ip6h, connp, ixa));
3299}
3300
3301/*
3302 * Handle T_UNITDATA_REQ with options. Both IPv4 and IPv6
3303 * Either tudr_mp or msg is set. If tudr_mp we take ancillary data from
3304 * the TPI options, otherwise we take them from msg_control.
3305 * If both sin and sin6 is set it is a connected socket and we use conn_faddr.
3306 * Always consumes mp; never consumes tudr_mp.
3307 */
3308static int
3309icmp_output_ancillary(conn_t *connp, sin_t *sin, sin6_t *sin6, mblk_t *mp,
3310    mblk_t *tudr_mp, struct nmsghdr *msg, cred_t *cr, pid_t pid)
3311{
3312	icmp_t		*icmp = connp->conn_icmp;
3313	icmp_stack_t	*is = icmp->icmp_is;
3314	int		error;
3315	ip_xmit_attr_t	*ixa;
3316	ip_pkt_t	*ipp;
3317	in6_addr_t	v6src;
3318	in6_addr_t	v6dst;
3319	in6_addr_t	v6nexthop;
3320	in_port_t	dstport;
3321	uint32_t	flowinfo;
3322	uint_t		srcid;
3323	int		is_absreq_failure = 0;
3324	conn_opt_arg_t	coas, *coa;
3325
3326	ASSERT(tudr_mp != NULL || msg != NULL);
3327
3328	/*
3329	 * Get ixa before checking state to handle a disconnect race.
3330	 *
3331	 * We need an exclusive copy of conn_ixa since the ancillary data
3332	 * options might modify it. That copy has no pointers hence we
3333	 * need to set them up once we've parsed the ancillary data.
3334	 */
3335	ixa = conn_get_ixa_exclusive(connp);
3336	if (ixa == NULL) {
3337		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3338		freemsg(mp);
3339		return (ENOMEM);
3340	}
3341	ASSERT(cr != NULL);
3342	ixa->ixa_cred = cr;
3343	ixa->ixa_cpid = pid;
3344	if (is_system_labeled()) {
3345		/* We need to restart with a label based on the cred */
3346		ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred);
3347	}
3348
3349	/* In case previous destination was multicast or multirt */
3350	ip_attr_newdst(ixa);
3351
3352	/* Get a copy of conn_xmit_ipp since the options might change it */
3353	ipp = kmem_zalloc(sizeof (*ipp), KM_NOSLEEP);
3354	if (ipp == NULL) {
3355		ixa_refrele(ixa);
3356		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3357		freemsg(mp);
3358		return (ENOMEM);
3359	}
3360	mutex_enter(&connp->conn_lock);
3361	error = ip_pkt_copy(&connp->conn_xmit_ipp, ipp, KM_NOSLEEP);
3362	mutex_exit(&connp->conn_lock);
3363	if (error != 0) {
3364		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3365		freemsg(mp);
3366		goto done;
3367	}
3368
3369	/*
3370	 * Parse the options and update ixa and ipp as a result.
3371	 */
3372
3373	coa = &coas;
3374	coa->coa_connp = connp;
3375	coa->coa_ixa = ixa;
3376	coa->coa_ipp = ipp;
3377	coa->coa_ancillary = B_TRUE;
3378	coa->coa_changed = 0;
3379
3380	if (msg != NULL) {
3381		error = process_auxiliary_options(connp, msg->msg_control,
3382		    msg->msg_controllen, coa, &icmp_opt_obj, icmp_opt_set, cr);
3383	} else {
3384		struct T_unitdata_req *tudr;
3385
3386		tudr = (struct T_unitdata_req *)tudr_mp->b_rptr;
3387		ASSERT(tudr->PRIM_type == T_UNITDATA_REQ);
3388		error = tpi_optcom_buf(connp->conn_wq, tudr_mp,
3389		    &tudr->OPT_length, tudr->OPT_offset, cr, &icmp_opt_obj,
3390		    coa, &is_absreq_failure);
3391	}
3392	if (error != 0) {
3393		/*
3394		 * Note: No special action needed in this
3395		 * module for "is_absreq_failure"
3396		 */
3397		freemsg(mp);
3398		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3399		goto done;
3400	}
3401	ASSERT(is_absreq_failure == 0);
3402
3403	mutex_enter(&connp->conn_lock);
3404	/*
3405	 * If laddr is unspecified then we look at sin6_src_id.
3406	 * We will give precedence to a source address set with IPV6_PKTINFO
3407	 * (aka IPPF_ADDR) but that is handled in build_hdrs. However, we don't
3408	 * want ip_attr_connect to select a source (since it can fail) when
3409	 * IPV6_PKTINFO is specified.
3410	 * If this doesn't result in a source address then we get a source
3411	 * from ip_attr_connect() below.
3412	 */
3413	v6src = connp->conn_saddr_v6;
3414	if (sin != NULL) {
3415		IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6dst);
3416		dstport = sin->sin_port;
3417		flowinfo = 0;
3418		ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
3419		ixa->ixa_flags |= IXAF_IS_IPV4;
3420	} else if (sin6 != NULL) {
3421		v6dst = sin6->sin6_addr;
3422		dstport = sin6->sin6_port;
3423		flowinfo = sin6->sin6_flowinfo;
3424		srcid = sin6->__sin6_src_id;
3425		if (IN6_IS_ADDR_LINKSCOPE(&v6dst) && sin6->sin6_scope_id != 0) {
3426			ixa->ixa_scopeid = sin6->sin6_scope_id;
3427			ixa->ixa_flags |= IXAF_SCOPEID_SET;
3428		} else {
3429			ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
3430		}
3431		if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
3432			ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp),
3433			    connp->conn_netstack);
3434		}
3435		if (IN6_IS_ADDR_V4MAPPED(&v6dst))
3436			ixa->ixa_flags |= IXAF_IS_IPV4;
3437		else
3438			ixa->ixa_flags &= ~IXAF_IS_IPV4;
3439	} else {
3440		/* Connected case */
3441		v6dst = connp->conn_faddr_v6;
3442		flowinfo = connp->conn_flowinfo;
3443	}
3444	mutex_exit(&connp->conn_lock);
3445	/* Handle IPV6_PKTINFO setting source address. */
3446	if (IN6_IS_ADDR_UNSPECIFIED(&v6src) &&
3447	    (ipp->ipp_fields & IPPF_ADDR)) {
3448		if (ixa->ixa_flags & IXAF_IS_IPV4) {
3449			if (IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
3450				v6src = ipp->ipp_addr;
3451		} else {
3452			if (!IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
3453				v6src = ipp->ipp_addr;
3454		}
3455	}
3456
3457	ip_attr_nexthop(ipp, ixa, &v6dst, &v6nexthop);
3458	error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, dstport,
3459	    &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST);
3460
3461	switch (error) {
3462	case 0:
3463		break;
3464	case EADDRNOTAVAIL:
3465		/*
3466		 * IXAF_VERIFY_SOURCE tells us to pick a better source.
3467		 * Don't have the application see that errno
3468		 */
3469		error = ENETUNREACH;
3470		goto failed;
3471	case ENETDOWN:
3472		/*
3473		 * Have !ipif_addr_ready address; drop packet silently
3474		 * until we can get applications to not send until we
3475		 * are ready.
3476		 */
3477		error = 0;
3478		goto failed;
3479	case EHOSTUNREACH:
3480	case ENETUNREACH:
3481		if (ixa->ixa_ire != NULL) {
3482			/*
3483			 * Let conn_ip_output/ire_send_noroute return
3484			 * the error and send any local ICMP error.
3485			 */
3486			error = 0;
3487			break;
3488		}
3489		/* FALLTHRU */
3490	default:
3491	failed:
3492		freemsg(mp);
3493		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3494		goto done;
3495	}
3496
3497	/*
3498	 * We might be going to a different destination than last time,
3499	 * thus check that TX allows the communication and compute any
3500	 * needed label.
3501	 *
3502	 * TSOL Note: We have an exclusive ipp and ixa for this thread so we
3503	 * don't have to worry about concurrent threads.
3504	 */
3505	if (is_system_labeled()) {
3506		/*
3507		 * Check whether Trusted Solaris policy allows communication
3508		 * with this host, and pretend that the destination is
3509		 * unreachable if not.
3510		 * Compute any needed label and place it in ipp_label_v4/v6.
3511		 *
3512		 * Later conn_build_hdr_template/conn_prepend_hdr takes
3513		 * ipp_label_v4/v6 to form the packet.
3514		 *
3515		 * Tsol note: We have ipp structure local to this thread so
3516		 * no locking is needed.
3517		 */
3518		error = conn_update_label(connp, ixa, &v6dst, ipp);
3519		if (error != 0) {
3520			freemsg(mp);
3521			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3522			goto done;
3523		}
3524	}
3525	mp = icmp_prepend_hdr(connp, ixa, ipp, &v6src, &v6dst, flowinfo, mp,
3526	    &error);
3527	if (mp == NULL) {
3528		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3529		ASSERT(error != 0);
3530		goto done;
3531	}
3532	if (ixa->ixa_pktlen > IP_MAXPACKET) {
3533		error = EMSGSIZE;
3534		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3535		freemsg(mp);
3536		goto done;
3537	}
3538
3539	/* Policy might differ for different ICMP type/code */
3540	mp = icmp_output_attach_policy(mp, connp, ixa);
3541	if (mp == NULL) {
3542		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3543		error = EHOSTUNREACH;	/* IPsec policy failure */
3544		goto done;
3545	}
3546
3547	/* We're done.  Pass the packet to ip. */
3548	BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
3549
3550	/* Allow source not assigned to the system? */
3551	ixa->ixa_flags &= ~IXAF_VERIFY_SOURCE;
3552	error = conn_ip_output(mp, ixa);
3553	if (!connp->conn_unspec_src)
3554		ixa->ixa_flags |= IXAF_VERIFY_SOURCE;
3555	/* No rawipOutErrors if an error since IP increases its error counter */
3556	switch (error) {
3557	case 0:
3558		break;
3559	case EWOULDBLOCK:
3560		(void) ixa_check_drain_insert(connp, ixa);
3561		error = 0;
3562		break;
3563	case EADDRNOTAVAIL:
3564		/*
3565		 * IXAF_VERIFY_SOURCE tells us to pick a better source.
3566		 * Don't have the application see that errno
3567		 */
3568		error = ENETUNREACH;
3569		/* FALLTHRU */
3570	default:
3571		mutex_enter(&connp->conn_lock);
3572		/*
3573		 * Clear the source and v6lastdst so we call ip_attr_connect
3574		 * for the next packet and try to pick a better source.
3575		 */
3576		if (connp->conn_mcbc_bind)
3577			connp->conn_saddr_v6 = ipv6_all_zeros;
3578		else
3579			connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
3580		connp->conn_v6lastdst = ipv6_all_zeros;
3581		mutex_exit(&connp->conn_lock);
3582		break;
3583	}
3584done:
3585	ixa_refrele(ixa);
3586	ip_pkt_free(ipp);
3587	kmem_free(ipp, sizeof (*ipp));
3588	return (error);
3589}
3590
3591/*
3592 * Handle sending an M_DATA for a connected socket.
3593 * Handles both IPv4 and IPv6.
3594 */
3595int
3596icmp_output_connected(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid)
3597{
3598	icmp_t		*icmp = connp->conn_icmp;
3599	icmp_stack_t	*is = icmp->icmp_is;
3600	int		error;
3601	ip_xmit_attr_t	*ixa;
3602	boolean_t	do_ipsec;
3603
3604	/*
3605	 * If no other thread is using conn_ixa this just gets a reference to
3606	 * conn_ixa. Otherwise we get a safe copy of conn_ixa.
3607	 */
3608	ixa = conn_get_ixa(connp, B_FALSE);
3609	if (ixa == NULL) {
3610		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3611		freemsg(mp);
3612		return (ENOMEM);
3613	}
3614
3615	ASSERT(cr != NULL);
3616	ixa->ixa_cred = cr;
3617	ixa->ixa_cpid = pid;
3618
3619	/* Defer IPsec if it might need to look at ICMP type/code */
3620	switch (ixa->ixa_protocol) {
3621	case IPPROTO_ICMP:
3622	case IPPROTO_ICMPV6:
3623		do_ipsec = B_FALSE;
3624		break;
3625	default:
3626		do_ipsec = B_TRUE;
3627	}
3628
3629	mutex_enter(&connp->conn_lock);
3630	mp = icmp_prepend_header_template(connp, ixa, mp,
3631	    &connp->conn_saddr_v6, connp->conn_flowinfo, &error);
3632
3633	if (mp == NULL) {
3634		ASSERT(error != 0);
3635		mutex_exit(&connp->conn_lock);
3636		ixa_refrele(ixa);
3637		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3638		freemsg(mp);
3639		return (error);
3640	}
3641
3642	if (!do_ipsec) {
3643		/* Policy might differ for different ICMP type/code */
3644		mp = icmp_output_attach_policy(mp, connp, ixa);
3645		if (mp == NULL) {
3646			mutex_exit(&connp->conn_lock);
3647			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3648			ixa_refrele(ixa);
3649			return (EHOSTUNREACH);	/* IPsec policy failure */
3650		}
3651	}
3652
3653	/*
3654	 * In case we got a safe copy of conn_ixa, or if opt_set made us a new
3655	 * safe copy, then we need to fill in any pointers in it.
3656	 */
3657	if (ixa->ixa_ire == NULL) {
3658		in6_addr_t	faddr, saddr;
3659		in6_addr_t	nexthop;
3660		in_port_t	fport;
3661
3662		saddr = connp->conn_saddr_v6;
3663		faddr = connp->conn_faddr_v6;
3664		fport = connp->conn_fport;
3665		ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &faddr, &nexthop);
3666		mutex_exit(&connp->conn_lock);
3667
3668		error = ip_attr_connect(connp, ixa, &saddr, &faddr, &nexthop,
3669		    fport, NULL, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST |
3670		    (do_ipsec ? IPDF_IPSEC : 0));
3671		switch (error) {
3672		case 0:
3673			break;
3674		case EADDRNOTAVAIL:
3675			/*
3676			 * IXAF_VERIFY_SOURCE tells us to pick a better source.
3677			 * Don't have the application see that errno
3678			 */
3679			error = ENETUNREACH;
3680			goto failed;
3681		case ENETDOWN:
3682			/*
3683			 * Have !ipif_addr_ready address; drop packet silently
3684			 * until we can get applications to not send until we
3685			 * are ready.
3686			 */
3687			error = 0;
3688			goto failed;
3689		case EHOSTUNREACH:
3690		case ENETUNREACH:
3691			if (ixa->ixa_ire != NULL) {
3692				/*
3693				 * Let conn_ip_output/ire_send_noroute return
3694				 * the error and send any local ICMP error.
3695				 */
3696				error = 0;
3697				break;
3698			}
3699			/* FALLTHRU */
3700		default:
3701		failed:
3702			ixa_refrele(ixa);
3703			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3704			freemsg(mp);
3705			return (error);
3706		}
3707	} else {
3708		/* Done with conn_t */
3709		mutex_exit(&connp->conn_lock);
3710	}
3711
3712	/* We're done.  Pass the packet to ip. */
3713	BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
3714
3715	error = conn_ip_output(mp, ixa);
3716	/* No rawipOutErrors if an error since IP increases its error counter */
3717	switch (error) {
3718	case 0:
3719		break;
3720	case EWOULDBLOCK:
3721		(void) ixa_check_drain_insert(connp, ixa);
3722		error = 0;
3723		break;
3724	case EADDRNOTAVAIL:
3725		/*
3726		 * IXAF_VERIFY_SOURCE tells us to pick a better source.
3727		 * Don't have the application see that errno
3728		 */
3729		error = ENETUNREACH;
3730		break;
3731	}
3732	ixa_refrele(ixa);
3733	return (error);
3734}
3735
3736/*
3737 * Handle sending an M_DATA to the last destination.
3738 * Handles both IPv4 and IPv6.
3739 *
3740 * NOTE: The caller must hold conn_lock and we drop it here.
3741 */
3742int
3743icmp_output_lastdst(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid,
3744    ip_xmit_attr_t *ixa)
3745{
3746	icmp_t		*icmp = connp->conn_icmp;
3747	icmp_stack_t	*is = icmp->icmp_is;
3748	int		error;
3749	boolean_t	do_ipsec;
3750
3751	ASSERT(MUTEX_HELD(&connp->conn_lock));
3752	ASSERT(ixa != NULL);
3753
3754	ASSERT(cr != NULL);
3755	ixa->ixa_cred = cr;
3756	ixa->ixa_cpid = pid;
3757
3758	/* Defer IPsec if it might need to look at ICMP type/code */
3759	switch (ixa->ixa_protocol) {
3760	case IPPROTO_ICMP:
3761	case IPPROTO_ICMPV6:
3762		do_ipsec = B_FALSE;
3763		break;
3764	default:
3765		do_ipsec = B_TRUE;
3766	}
3767
3768
3769	mp = icmp_prepend_header_template(connp, ixa, mp,
3770	    &connp->conn_v6lastsrc, connp->conn_lastflowinfo, &error);
3771
3772	if (mp == NULL) {
3773		ASSERT(error != 0);
3774		mutex_exit(&connp->conn_lock);
3775		ixa_refrele(ixa);
3776		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3777		freemsg(mp);
3778		return (error);
3779	}
3780
3781	if (!do_ipsec) {
3782		/* Policy might differ for different ICMP type/code */
3783		mp = icmp_output_attach_policy(mp, connp, ixa);
3784		if (mp == NULL) {
3785			mutex_exit(&connp->conn_lock);
3786			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3787			ixa_refrele(ixa);
3788			return (EHOSTUNREACH);	/* IPsec policy failure */
3789		}
3790	}
3791
3792	/*
3793	 * In case we got a safe copy of conn_ixa, or if opt_set made us a new
3794	 * safe copy, then we need to fill in any pointers in it.
3795	 */
3796	if (ixa->ixa_ire == NULL) {
3797		in6_addr_t	lastdst, lastsrc;
3798		in6_addr_t	nexthop;
3799		in_port_t	lastport;
3800
3801		lastsrc = connp->conn_v6lastsrc;
3802		lastdst = connp->conn_v6lastdst;
3803		lastport = connp->conn_lastdstport;
3804		ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &lastdst, &nexthop);
3805		mutex_exit(&connp->conn_lock);
3806
3807		error = ip_attr_connect(connp, ixa, &lastsrc, &lastdst,
3808		    &nexthop, lastport, NULL, NULL, IPDF_ALLOW_MCBC |
3809		    IPDF_VERIFY_DST | (do_ipsec ? IPDF_IPSEC : 0));
3810		switch (error) {
3811		case 0:
3812			break;
3813		case EADDRNOTAVAIL:
3814			/*
3815			 * IXAF_VERIFY_SOURCE tells us to pick a better source.
3816			 * Don't have the application see that errno
3817			 */
3818			error = ENETUNREACH;
3819			goto failed;
3820		case ENETDOWN:
3821			/*
3822			 * Have !ipif_addr_ready address; drop packet silently
3823			 * until we can get applications to not send until we
3824			 * are ready.
3825			 */
3826			error = 0;
3827			goto failed;
3828		case EHOSTUNREACH:
3829		case ENETUNREACH:
3830			if (ixa->ixa_ire != NULL) {
3831				/*
3832				 * Let conn_ip_output/ire_send_noroute return
3833				 * the error and send any local ICMP error.
3834				 */
3835				error = 0;
3836				break;
3837			}
3838			/* FALLTHRU */
3839		default:
3840		failed:
3841			ixa_refrele(ixa);
3842			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3843			freemsg(mp);
3844			return (error);
3845		}
3846	} else {
3847		/* Done with conn_t */
3848		mutex_exit(&connp->conn_lock);
3849	}
3850
3851	/* We're done.  Pass the packet to ip. */
3852	BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
3853	error = conn_ip_output(mp, ixa);
3854	/* No rawipOutErrors if an error since IP increases its error counter */
3855	switch (error) {
3856	case 0:
3857		break;
3858	case EWOULDBLOCK:
3859		(void) ixa_check_drain_insert(connp, ixa);
3860		error = 0;
3861		break;
3862	case EADDRNOTAVAIL:
3863		/*
3864		 * IXAF_VERIFY_SOURCE tells us to pick a better source.
3865		 * Don't have the application see that errno
3866		 */
3867		error = ENETUNREACH;
3868		/* FALLTHRU */
3869	default:
3870		mutex_enter(&connp->conn_lock);
3871		/*
3872		 * Clear the source and v6lastdst so we call ip_attr_connect
3873		 * for the next packet and try to pick a better source.
3874		 */
3875		if (connp->conn_mcbc_bind)
3876			connp->conn_saddr_v6 = ipv6_all_zeros;
3877		else
3878			connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
3879		connp->conn_v6lastdst = ipv6_all_zeros;
3880		mutex_exit(&connp->conn_lock);
3881		break;
3882	}
3883	ixa_refrele(ixa);
3884	return (error);
3885}
3886
3887
3888/*
3889 * Prepend the header template and then fill in the source and
3890 * flowinfo. The caller needs to handle the destination address since
3891 * it's setting is different if rthdr or source route.
3892 *
3893 * Returns NULL is allocation failed or if the packet would exceed IP_MAXPACKET.
3894 * When it returns NULL it sets errorp.
3895 */
3896static mblk_t *
3897icmp_prepend_header_template(conn_t *connp, ip_xmit_attr_t *ixa, mblk_t *mp,
3898    const in6_addr_t *v6src, uint32_t flowinfo, int *errorp)
3899{
3900	icmp_t		*icmp = connp->conn_icmp;
3901	icmp_stack_t	*is = icmp->icmp_is;
3902	uint_t		pktlen;
3903	uint_t		copylen;
3904	uint8_t		*iph;
3905	uint_t		ip_hdr_length;
3906	uint32_t	cksum;
3907	ip_pkt_t	*ipp;
3908
3909	ASSERT(MUTEX_HELD(&connp->conn_lock));
3910
3911	/*
3912	 * Copy the header template.
3913	 */
3914	copylen = connp->conn_ht_iphc_len;
3915	pktlen = copylen + msgdsize(mp);
3916	if (pktlen > IP_MAXPACKET) {
3917		freemsg(mp);
3918		*errorp = EMSGSIZE;
3919		return (NULL);
3920	}
3921	ixa->ixa_pktlen = pktlen;
3922
3923	/* check/fix buffer config, setup pointers into it */
3924	iph = mp->b_rptr - copylen;
3925	if (DB_REF(mp) != 1 || iph < DB_BASE(mp) || !OK_32PTR(iph)) {
3926		mblk_t *mp1;
3927
3928		mp1 = allocb(copylen + is->is_wroff_extra, BPRI_MED);
3929		if (mp1 == NULL) {
3930			freemsg(mp);
3931			*errorp = ENOMEM;
3932			return (NULL);
3933		}
3934		mp1->b_wptr = DB_LIM(mp1);
3935		mp1->b_cont = mp;
3936		mp = mp1;
3937		iph = (mp->b_wptr - copylen);
3938	}
3939	mp->b_rptr = iph;
3940	bcopy(connp->conn_ht_iphc, iph, copylen);
3941	ip_hdr_length = (uint_t)(connp->conn_ht_ulp - connp->conn_ht_iphc);
3942
3943	ixa->ixa_ip_hdr_length = ip_hdr_length;
3944
3945	/*
3946	 * Prepare for ICMPv6 checksum done in IP.
3947	 *
3948	 * icmp_build_hdr_template has already massaged any routing header
3949	 * and placed the result in conn_sum.
3950	 *
3951	 * We make it easy for IP to include our pseudo header
3952	 * by putting our length (and any routing header adjustment)
3953	 * in the ICMPv6 checksum field.
3954	 */
3955	cksum = pktlen - ip_hdr_length;
3956
3957	cksum += connp->conn_sum;
3958	cksum = (cksum >> 16) + (cksum & 0xFFFF);
3959	ASSERT(cksum < 0x10000);
3960
3961	ipp = &connp->conn_xmit_ipp;
3962	if (ixa->ixa_flags & IXAF_IS_IPV4) {
3963		ipha_t	*ipha = (ipha_t *)iph;
3964
3965		ipha->ipha_length = htons((uint16_t)pktlen);
3966
3967		/* if IP_PKTINFO specified an addres it wins over bind() */
3968		if ((ipp->ipp_fields & IPPF_ADDR) &&
3969		    IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) {
3970			ASSERT(ipp->ipp_addr_v4 != INADDR_ANY);
3971			ipha->ipha_src = ipp->ipp_addr_v4;
3972		} else {
3973			IN6_V4MAPPED_TO_IPADDR(v6src, ipha->ipha_src);
3974		}
3975	} else {
3976		ip6_t *ip6h = (ip6_t *)iph;
3977		uint_t	cksum_offset = 0;
3978
3979		ip6h->ip6_plen =  htons((uint16_t)(pktlen - IPV6_HDR_LEN));
3980
3981		/* if IP_PKTINFO specified an addres it wins over bind() */
3982		if ((ipp->ipp_fields & IPPF_ADDR) &&
3983		    !IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) {
3984			ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&ipp->ipp_addr));
3985			ip6h->ip6_src = ipp->ipp_addr;
3986		} else {
3987			ip6h->ip6_src = *v6src;
3988		}
3989		ip6h->ip6_vcf =
3990		    (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) |
3991		    (flowinfo & ~IPV6_VERS_AND_FLOW_MASK);
3992		if (ipp->ipp_fields & IPPF_TCLASS) {
3993			/* Overrides the class part of flowinfo */
3994			ip6h->ip6_vcf = IPV6_TCLASS_FLOW(ip6h->ip6_vcf,
3995			    ipp->ipp_tclass);
3996		}
3997
3998		if (ixa->ixa_flags & IXAF_SET_ULP_CKSUM) {
3999			if (connp->conn_proto == IPPROTO_ICMPV6) {
4000				cksum_offset = ixa->ixa_ip_hdr_length +
4001				    offsetof(icmp6_t, icmp6_cksum);
4002			} else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) {
4003				cksum_offset = ixa->ixa_ip_hdr_length +
4004				    ixa->ixa_raw_cksum_offset;
4005			}
4006		}
4007		if (cksum_offset != 0) {
4008			uint16_t *ptr;
4009
4010			/* Make sure the checksum fits in the first mblk */
4011			if (cksum_offset + sizeof (short) > MBLKL(mp)) {
4012				mblk_t *mp1;
4013
4014				mp1 = msgpullup(mp,
4015				    cksum_offset + sizeof (short));
4016				freemsg(mp);
4017				if (mp1 == NULL) {
4018					*errorp = ENOMEM;
4019					return (NULL);
4020				}
4021				mp = mp1;
4022				iph = mp->b_rptr;
4023				ip6h = (ip6_t *)iph;
4024			}
4025			ptr = (uint16_t *)(mp->b_rptr + cksum_offset);
4026			*ptr = htons(cksum);
4027		}
4028	}
4029
4030	return (mp);
4031}
4032
4033/*
4034 * This routine handles all messages passed downstream.  It either
4035 * consumes the message or passes it downstream; it never queues a
4036 * a message.
4037 */
4038void
4039icmp_wput(queue_t *q, mblk_t *mp)
4040{
4041	sin6_t		*sin6;
4042	sin_t		*sin = NULL;
4043	uint_t		srcid;
4044	conn_t		*connp = Q_TO_CONN(q);
4045	icmp_t		*icmp = connp->conn_icmp;
4046	int		error = 0;
4047	struct sockaddr	*addr = NULL;
4048	socklen_t	addrlen;
4049	icmp_stack_t	*is = icmp->icmp_is;
4050	struct T_unitdata_req *tudr;
4051	mblk_t		*data_mp;
4052	cred_t		*cr;
4053	pid_t		pid;
4054
4055	/*
4056	 * We directly handle several cases here: T_UNITDATA_REQ message
4057	 * coming down as M_PROTO/M_PCPROTO and M_DATA messages for connected
4058	 * socket.
4059	 */
4060	switch (DB_TYPE(mp)) {
4061	case M_DATA:
4062		/* sockfs never sends down M_DATA */
4063		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4064		freemsg(mp);
4065		return;
4066
4067	case M_PROTO:
4068	case M_PCPROTO:
4069		tudr = (struct T_unitdata_req *)mp->b_rptr;
4070		if (MBLKL(mp) < sizeof (*tudr) ||
4071		    ((t_primp_t)mp->b_rptr)->type != T_UNITDATA_REQ) {
4072			icmp_wput_other(q, mp);
4073			return;
4074		}
4075		break;
4076
4077	default:
4078		icmp_wput_other(q, mp);
4079		return;
4080	}
4081
4082	/* Handle valid T_UNITDATA_REQ here */
4083	data_mp = mp->b_cont;
4084	if (data_mp == NULL) {
4085		error = EPROTO;
4086		goto ud_error2;
4087	}
4088	mp->b_cont = NULL;
4089
4090	if (!MBLKIN(mp, 0, tudr->DEST_offset + tudr->DEST_length)) {
4091		error = EADDRNOTAVAIL;
4092		goto ud_error2;
4093	}
4094
4095	/*
4096	 * All Solaris components should pass a db_credp
4097	 * for this message, hence we ASSERT.
4098	 * On production kernels we return an error to be robust against
4099	 * random streams modules sitting on top of us.
4100	 */
4101	cr = msg_getcred(mp, &pid);
4102	ASSERT(cr != NULL);
4103	if (cr == NULL) {
4104		error = EINVAL;
4105		goto ud_error2;
4106	}
4107
4108	/*
4109	 * If a port has not been bound to the stream, fail.
4110	 * This is not a problem when sockfs is directly
4111	 * above us, because it will ensure that the socket
4112	 * is first bound before allowing data to be sent.
4113	 */
4114	if (icmp->icmp_state == TS_UNBND) {
4115		error = EPROTO;
4116		goto ud_error2;
4117	}
4118	addr = (struct sockaddr *)&mp->b_rptr[tudr->DEST_offset];
4119	addrlen = tudr->DEST_length;
4120
4121	switch (connp->conn_family) {
4122	case AF_INET6:
4123		sin6 = (sin6_t *)addr;
4124		if (!OK_32PTR((char *)sin6) || (addrlen != sizeof (sin6_t)) ||
4125		    (sin6->sin6_family != AF_INET6)) {
4126			error = EADDRNOTAVAIL;
4127			goto ud_error2;
4128		}
4129
4130		/* No support for mapped addresses on raw sockets */
4131		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
4132			error = EADDRNOTAVAIL;
4133			goto ud_error2;
4134		}
4135		srcid = sin6->__sin6_src_id;
4136
4137		/*
4138		 * If the local address is a mapped address return
4139		 * an error.
4140		 * It would be possible to send an IPv6 packet but the
4141		 * response would never make it back to the application
4142		 * since it is bound to a mapped address.
4143		 */
4144		if (IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6)) {
4145			error = EADDRNOTAVAIL;
4146			goto ud_error2;
4147		}
4148
4149		if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
4150			sin6->sin6_addr = ipv6_loopback;
4151
4152		if (tudr->OPT_length != 0) {
4153			/*
4154			 * If we are connected then the destination needs to be
4155			 * the same as the connected one.
4156			 */
4157			if (icmp->icmp_state == TS_DATA_XFER &&
4158			    !conn_same_as_last_v6(connp, sin6)) {
4159				error = EISCONN;
4160				goto ud_error2;
4161			}
4162			error = icmp_output_ancillary(connp, NULL, sin6,
4163			    data_mp, mp, NULL, cr, pid);
4164		} else {
4165			ip_xmit_attr_t *ixa;
4166
4167			/*
4168			 * We have to allocate an ip_xmit_attr_t before we grab
4169			 * conn_lock and we need to hold conn_lock once we've
4170			 * checked conn_same_as_last_v6 to handle concurrent
4171			 * send* calls on a socket.
4172			 */
4173			ixa = conn_get_ixa(connp, B_FALSE);
4174			if (ixa == NULL) {
4175				error = ENOMEM;
4176				goto ud_error2;
4177			}
4178			mutex_enter(&connp->conn_lock);
4179
4180			if (conn_same_as_last_v6(connp, sin6) &&
4181			    connp->conn_lastsrcid == srcid &&
4182			    ipsec_outbound_policy_current(ixa)) {
4183				/* icmp_output_lastdst drops conn_lock */
4184				error = icmp_output_lastdst(connp, data_mp, cr,
4185				    pid, ixa);
4186			} else {
4187				/* icmp_output_newdst drops conn_lock */
4188				error = icmp_output_newdst(connp, data_mp, NULL,
4189				    sin6, cr, pid, ixa);
4190			}
4191			ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
4192		}
4193		if (error == 0) {
4194			freeb(mp);
4195			return;
4196		}
4197		break;
4198
4199	case AF_INET:
4200		sin = (sin_t *)addr;
4201		if ((!OK_32PTR((char *)sin) || addrlen != sizeof (sin_t)) ||
4202		    (sin->sin_family != AF_INET)) {
4203			error = EADDRNOTAVAIL;
4204			goto ud_error2;
4205		}
4206		if (sin->sin_addr.s_addr == INADDR_ANY)
4207			sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
4208
4209		/* Protocol 255 contains full IP headers */
4210		/* Read without holding lock */
4211		if (icmp->icmp_hdrincl) {
4212			if (MBLKL(data_mp) < IP_SIMPLE_HDR_LENGTH) {
4213				if (!pullupmsg(data_mp, IP_SIMPLE_HDR_LENGTH)) {
4214					error = EINVAL;
4215					goto ud_error2;
4216				}
4217			}
4218			error = icmp_output_hdrincl(connp, data_mp, cr, pid);
4219			if (error == 0) {
4220				freeb(mp);
4221				return;
4222			}
4223			/* data_mp consumed above */
4224			data_mp = NULL;
4225			goto ud_error2;
4226		}
4227
4228		if (tudr->OPT_length != 0) {
4229			/*
4230			 * If we are connected then the destination needs to be
4231			 * the same as the connected one.
4232			 */
4233			if (icmp->icmp_state == TS_DATA_XFER &&
4234			    !conn_same_as_last_v4(connp, sin)) {
4235				error = EISCONN;
4236				goto ud_error2;
4237			}
4238			error = icmp_output_ancillary(connp, sin, NULL,
4239			    data_mp, mp, NULL, cr, pid);
4240		} else {
4241			ip_xmit_attr_t *ixa;
4242
4243			/*
4244			 * We have to allocate an ip_xmit_attr_t before we grab
4245			 * conn_lock and we need to hold conn_lock once we've
4246			 * checked conn_same_as_last_v4 to handle concurrent
4247			 * send* calls on a socket.
4248			 */
4249			ixa = conn_get_ixa(connp, B_FALSE);
4250			if (ixa == NULL) {
4251				error = ENOMEM;
4252				goto ud_error2;
4253			}
4254			mutex_enter(&connp->conn_lock);
4255
4256			if (conn_same_as_last_v4(connp, sin) &&
4257			    ipsec_outbound_policy_current(ixa)) {
4258				/* icmp_output_lastdst drops conn_lock */
4259				error = icmp_output_lastdst(connp, data_mp, cr,
4260				    pid, ixa);
4261			} else {
4262				/* icmp_output_newdst drops conn_lock */
4263				error = icmp_output_newdst(connp, data_mp, sin,
4264				    NULL, cr, pid, ixa);
4265			}
4266			ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
4267		}
4268		if (error == 0) {
4269			freeb(mp);
4270			return;
4271		}
4272		break;
4273	}
4274	ASSERT(mp != NULL);
4275	/* mp is freed by the following routine */
4276	icmp_ud_err(q, mp, (t_scalar_t)error);
4277	return;
4278
4279ud_error2:
4280	BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4281	freemsg(data_mp);
4282	ASSERT(mp != NULL);
4283	/* mp is freed by the following routine */
4284	icmp_ud_err(q, mp, (t_scalar_t)error);
4285}
4286
4287/*
4288 * Handle the case of the IP address or flow label being different
4289 * for both IPv4 and IPv6.
4290 *
4291 * NOTE: The caller must hold conn_lock and we drop it here.
4292 */
4293static int
4294icmp_output_newdst(conn_t *connp, mblk_t *data_mp, sin_t *sin, sin6_t *sin6,
4295    cred_t *cr, pid_t pid, ip_xmit_attr_t *ixa)
4296{
4297	icmp_t		*icmp = connp->conn_icmp;
4298	icmp_stack_t	*is = icmp->icmp_is;
4299	int		error;
4300	ip_xmit_attr_t	*oldixa;
4301	boolean_t	do_ipsec;
4302	uint_t		srcid;
4303	uint32_t	flowinfo;
4304	in6_addr_t	v6src;
4305	in6_addr_t	v6dst;
4306	in6_addr_t	v6nexthop;
4307	in_port_t	dstport;
4308
4309	ASSERT(MUTEX_HELD(&connp->conn_lock));
4310	ASSERT(ixa != NULL);
4311
4312	/*
4313	 * We hold conn_lock across all the use and modifications of
4314	 * the conn_lastdst, conn_ixa, and conn_xmit_ipp to ensure that they
4315	 * stay consistent.
4316	 */
4317
4318	ASSERT(cr != NULL);
4319	ixa->ixa_cred = cr;
4320	ixa->ixa_cpid = pid;
4321	if (is_system_labeled()) {
4322		/* We need to restart with a label based on the cred */
4323		ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred);
4324	}
4325	/*
4326	 * If we are connected then the destination needs to be the
4327	 * same as the connected one, which is not the case here since we
4328	 * checked for that above.
4329	 */
4330	if (icmp->icmp_state == TS_DATA_XFER) {
4331		mutex_exit(&connp->conn_lock);
4332		error = EISCONN;
4333		goto ud_error;
4334	}
4335
4336	/* In case previous destination was multicast or multirt */
4337	ip_attr_newdst(ixa);
4338
4339	/*
4340	 * If laddr is unspecified then we look at sin6_src_id.
4341	 * We will give precedence to a source address set with IPV6_PKTINFO
4342	 * (aka IPPF_ADDR) but that is handled in build_hdrs. However, we don't
4343	 * want ip_attr_connect to select a source (since it can fail) when
4344	 * IPV6_PKTINFO is specified.
4345	 * If this doesn't result in a source address then we get a source
4346	 * from ip_attr_connect() below.
4347	 */
4348	v6src = connp->conn_saddr_v6;
4349	if (sin != NULL) {
4350		IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6dst);
4351		dstport = sin->sin_port;
4352		flowinfo = 0;
4353		srcid = 0;
4354		ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
4355		if (srcid != 0 && V4_PART_OF_V6(&v6src) == INADDR_ANY) {
4356			ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp),
4357			    connp->conn_netstack);
4358		}
4359		ixa->ixa_flags |= IXAF_IS_IPV4;
4360	} else {
4361		v6dst = sin6->sin6_addr;
4362		dstport = sin6->sin6_port;
4363		flowinfo = sin6->sin6_flowinfo;
4364		srcid = sin6->__sin6_src_id;
4365		if (IN6_IS_ADDR_LINKSCOPE(&v6dst) && sin6->sin6_scope_id != 0) {
4366			ixa->ixa_scopeid = sin6->sin6_scope_id;
4367			ixa->ixa_flags |= IXAF_SCOPEID_SET;
4368		} else {
4369			ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
4370		}
4371		if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
4372			ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp),
4373			    connp->conn_netstack);
4374		}
4375		if (IN6_IS_ADDR_V4MAPPED(&v6dst))
4376			ixa->ixa_flags |= IXAF_IS_IPV4;
4377		else
4378			ixa->ixa_flags &= ~IXAF_IS_IPV4;
4379	}
4380	/* Handle IPV6_PKTINFO setting source address. */
4381	if (IN6_IS_ADDR_UNSPECIFIED(&v6src) &&
4382	    (connp->conn_xmit_ipp.ipp_fields & IPPF_ADDR)) {
4383		ip_pkt_t *ipp = &connp->conn_xmit_ipp;
4384
4385		if (ixa->ixa_flags & IXAF_IS_IPV4) {
4386			if (IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
4387				v6src = ipp->ipp_addr;
4388		} else {
4389			if (!IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
4390				v6src = ipp->ipp_addr;
4391		}
4392	}
4393
4394	/* Defer IPsec if it might need to look at ICMP type/code */
4395	switch (ixa->ixa_protocol) {
4396	case IPPROTO_ICMP:
4397	case IPPROTO_ICMPV6:
4398		do_ipsec = B_FALSE;
4399		break;
4400	default:
4401		do_ipsec = B_TRUE;
4402	}
4403
4404	ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &v6dst, &v6nexthop);
4405	mutex_exit(&connp->conn_lock);
4406
4407	error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, dstport,
4408	    &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST |
4409	    (do_ipsec ? IPDF_IPSEC : 0));
4410	switch (error) {
4411	case 0:
4412		break;
4413	case EADDRNOTAVAIL:
4414		/*
4415		 * IXAF_VERIFY_SOURCE tells us to pick a better source.
4416		 * Don't have the application see that errno
4417		 */
4418		error = ENETUNREACH;
4419		goto failed;
4420	case ENETDOWN:
4421		/*
4422		 * Have !ipif_addr_ready address; drop packet silently
4423		 * until we can get applications to not send until we
4424		 * are ready.
4425		 */
4426		error = 0;
4427		goto failed;
4428	case EHOSTUNREACH:
4429	case ENETUNREACH:
4430		if (ixa->ixa_ire != NULL) {
4431			/*
4432			 * Let conn_ip_output/ire_send_noroute return
4433			 * the error and send any local ICMP error.
4434			 */
4435			error = 0;
4436			break;
4437		}
4438		/* FALLTHRU */
4439	default:
4440	failed:
4441		goto ud_error;
4442	}
4443
4444	mutex_enter(&connp->conn_lock);
4445	/*
4446	 * While we dropped the lock some other thread might have connected
4447	 * this socket. If so we bail out with EISCONN to ensure that the
4448	 * connecting thread is the one that updates conn_ixa, conn_ht_*
4449	 * and conn_*last*.
4450	 */
4451	if (icmp->icmp_state == TS_DATA_XFER) {
4452		mutex_exit(&connp->conn_lock);
4453		error = EISCONN;
4454		goto ud_error;
4455	}
4456
4457	/*
4458	 * We need to rebuild the headers if
4459	 *  - we are labeling packets (could be different for different
4460	 *    destinations)
4461	 *  - we have a source route (or routing header) since we need to
4462	 *    massage that to get the pseudo-header checksum
4463	 *  - a socket option with COA_HEADER_CHANGED has been set which
4464	 *    set conn_v6lastdst to zero.
4465	 *
4466	 * Otherwise the prepend function will just update the src, dst,
4467	 * and flow label.
4468	 */
4469	if (is_system_labeled()) {
4470		/* TX MLP requires SCM_UCRED and don't have that here */
4471		if (connp->conn_mlp_type != mlptSingle) {
4472			mutex_exit(&connp->conn_lock);
4473			error = ECONNREFUSED;
4474			goto ud_error;
4475		}
4476		/*
4477		 * Check whether Trusted Solaris policy allows communication
4478		 * with this host, and pretend that the destination is
4479		 * unreachable if not.
4480		 * Compute any needed label and place it in ipp_label_v4/v6.
4481		 *
4482		 * Later conn_build_hdr_template/conn_prepend_hdr takes
4483		 * ipp_label_v4/v6 to form the packet.
4484		 *
4485		 * Tsol note: Since we hold conn_lock we know no other
4486		 * thread manipulates conn_xmit_ipp.
4487		 */
4488		error = conn_update_label(connp, ixa, &v6dst,
4489		    &connp->conn_xmit_ipp);
4490		if (error != 0) {
4491			mutex_exit(&connp->conn_lock);
4492			goto ud_error;
4493		}
4494		/* Rebuild the header template */
4495		error = icmp_build_hdr_template(connp, &v6src, &v6dst,
4496		    flowinfo);
4497		if (error != 0) {
4498			mutex_exit(&connp->conn_lock);
4499			goto ud_error;
4500		}
4501	} else if (connp->conn_xmit_ipp.ipp_fields &
4502	    (IPPF_IPV4_OPTIONS|IPPF_RTHDR) ||
4503	    IN6_IS_ADDR_UNSPECIFIED(&connp->conn_v6lastdst)) {
4504		/* Rebuild the header template */
4505		error = icmp_build_hdr_template(connp, &v6src, &v6dst,
4506		    flowinfo);
4507		if (error != 0) {
4508			mutex_exit(&connp->conn_lock);
4509			goto ud_error;
4510		}
4511	} else {
4512		/* Simply update the destination address if no source route */
4513		if (ixa->ixa_flags & IXAF_IS_IPV4) {
4514			ipha_t	*ipha = (ipha_t *)connp->conn_ht_iphc;
4515
4516			IN6_V4MAPPED_TO_IPADDR(&v6dst, ipha->ipha_dst);
4517			if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) {
4518				ipha->ipha_fragment_offset_and_flags |=
4519				    IPH_DF_HTONS;
4520			} else {
4521				ipha->ipha_fragment_offset_and_flags &=
4522				    ~IPH_DF_HTONS;
4523			}
4524		} else {
4525			ip6_t *ip6h = (ip6_t *)connp->conn_ht_iphc;
4526			ip6h->ip6_dst = v6dst;
4527		}
4528	}
4529
4530	/*
4531	 * Remember the dst etc which corresponds to the built header
4532	 * template and conn_ixa.
4533	 */
4534	oldixa = conn_replace_ixa(connp, ixa);
4535	connp->conn_v6lastdst = v6dst;
4536	connp->conn_lastflowinfo = flowinfo;
4537	connp->conn_lastscopeid = ixa->ixa_scopeid;
4538	connp->conn_lastsrcid = srcid;
4539	/* Also remember a source to use together with lastdst */
4540	connp->conn_v6lastsrc = v6src;
4541
4542	data_mp = icmp_prepend_header_template(connp, ixa, data_mp, &v6src,
4543	    flowinfo, &error);
4544
4545	/* Done with conn_t */
4546	mutex_exit(&connp->conn_lock);
4547	ixa_refrele(oldixa);
4548
4549	if (data_mp == NULL) {
4550		ASSERT(error != 0);
4551		goto ud_error;
4552	}
4553
4554	if (!do_ipsec) {
4555		/* Policy might differ for different ICMP type/code */
4556		data_mp = icmp_output_attach_policy(data_mp, connp, ixa);
4557		if (data_mp == NULL) {
4558			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4559			error = EHOSTUNREACH;	/* IPsec policy failure */
4560			goto done;
4561		}
4562	}
4563
4564	/* We're done.  Pass the packet to ip. */
4565	BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
4566
4567	error = conn_ip_output(data_mp, ixa);
4568	/* No rawipOutErrors if an error since IP increases its error counter */
4569	switch (error) {
4570	case 0:
4571		break;
4572	case EWOULDBLOCK:
4573		(void) ixa_check_drain_insert(connp, ixa);
4574		error = 0;
4575		break;
4576	case EADDRNOTAVAIL:
4577		/*
4578		 * IXAF_VERIFY_SOURCE tells us to pick a better source.
4579		 * Don't have the application see that errno
4580		 */
4581		error = ENETUNREACH;
4582		/* FALLTHRU */
4583	default:
4584		mutex_enter(&connp->conn_lock);
4585		/*
4586		 * Clear the source and v6lastdst so we call ip_attr_connect
4587		 * for the next packet and try to pick a better source.
4588		 */
4589		if (connp->conn_mcbc_bind)
4590			connp->conn_saddr_v6 = ipv6_all_zeros;
4591		else
4592			connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
4593		connp->conn_v6lastdst = ipv6_all_zeros;
4594		mutex_exit(&connp->conn_lock);
4595		break;
4596	}
4597done:
4598	ixa_refrele(ixa);
4599	return (error);
4600
4601ud_error:
4602	if (ixa != NULL)
4603		ixa_refrele(ixa);
4604
4605	BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4606	freemsg(data_mp);
4607	return (error);
4608}
4609
4610/* ARGSUSED */
4611static void
4612icmp_wput_fallback(queue_t *q, mblk_t *mp)
4613{
4614#ifdef DEBUG
4615	cmn_err(CE_CONT, "icmp_wput_fallback: Message during fallback \n");
4616#endif
4617	freemsg(mp);
4618}
4619
4620static void
4621icmp_wput_other(queue_t *q, mblk_t *mp)
4622{
4623	uchar_t	*rptr = mp->b_rptr;
4624	struct iocblk *iocp;
4625	conn_t	*connp = Q_TO_CONN(q);
4626	icmp_t	*icmp = connp->conn_icmp;
4627	icmp_stack_t *is = icmp->icmp_is;
4628	cred_t *cr;
4629
4630	switch (mp->b_datap->db_type) {
4631	case M_PROTO:
4632	case M_PCPROTO:
4633		if (mp->b_wptr - rptr < sizeof (t_scalar_t)) {
4634			/*
4635			 * If the message does not contain a PRIM_type,
4636			 * throw it away.
4637			 */
4638			freemsg(mp);
4639			return;
4640		}
4641		switch (((t_primp_t)rptr)->type) {
4642		case T_ADDR_REQ:
4643			icmp_addr_req(q, mp);
4644			return;
4645		case O_T_BIND_REQ:
4646		case T_BIND_REQ:
4647			icmp_tpi_bind(q, mp);
4648			return;
4649		case T_CONN_REQ:
4650			icmp_tpi_connect(q, mp);
4651			return;
4652		case T_CAPABILITY_REQ:
4653			icmp_capability_req(q, mp);
4654			return;
4655		case T_INFO_REQ:
4656			icmp_info_req(q, mp);
4657			return;
4658		case T_UNITDATA_REQ:
4659			/*
4660			 * If a T_UNITDATA_REQ gets here, the address must
4661			 * be bad.  Valid T_UNITDATA_REQs are handled
4662			 * in icmp_wput.
4663			 */
4664			icmp_ud_err(q, mp, EADDRNOTAVAIL);
4665			return;
4666		case T_UNBIND_REQ:
4667			icmp_tpi_unbind(q, mp);
4668			return;
4669		case T_SVR4_OPTMGMT_REQ:
4670			/*
4671			 * All Solaris components should pass a db_credp
4672			 * for this TPI message, hence we ASSERT.
4673			 * But in case there is some other M_PROTO that looks
4674			 * like a TPI message sent by some other kernel
4675			 * component, we check and return an error.
4676			 */
4677			cr = msg_getcred(mp, NULL);
4678			ASSERT(cr != NULL);
4679			if (cr == NULL) {
4680				icmp_err_ack(q, mp, TSYSERR, EINVAL);
4681				return;
4682			}
4683
4684			if (!snmpcom_req(q, mp, icmp_snmp_set, ip_snmp_get,
4685			    cr)) {
4686				svr4_optcom_req(q, mp, cr, &icmp_opt_obj);
4687			}
4688			return;
4689
4690		case T_OPTMGMT_REQ:
4691			/*
4692			 * All Solaris components should pass a db_credp
4693			 * for this TPI message, hence we ASSERT.
4694			 * But in case there is some other M_PROTO that looks
4695			 * like a TPI message sent by some other kernel
4696			 * component, we check and return an error.
4697			 */
4698			cr = msg_getcred(mp, NULL);
4699			ASSERT(cr != NULL);
4700			if (cr == NULL) {
4701				icmp_err_ack(q, mp, TSYSERR, EINVAL);
4702				return;
4703			}
4704			tpi_optcom_req(q, mp, cr, &icmp_opt_obj);
4705			return;
4706
4707		case T_DISCON_REQ:
4708			icmp_tpi_disconnect(q, mp);
4709			return;
4710
4711		/* The following TPI message is not supported by icmp. */
4712		case O_T_CONN_RES:
4713		case T_CONN_RES:
4714			icmp_err_ack(q, mp, TNOTSUPPORT, 0);
4715			return;
4716
4717		/* The following 3 TPI requests are illegal for icmp. */
4718		case T_DATA_REQ:
4719		case T_EXDATA_REQ:
4720		case T_ORDREL_REQ:
4721			icmp_err_ack(q, mp, TNOTSUPPORT, 0);
4722			return;
4723		default:
4724			break;
4725		}
4726		break;
4727	case M_FLUSH:
4728		if (*rptr & FLUSHW)
4729			flushq(q, FLUSHDATA);
4730		break;
4731	case M_IOCTL:
4732		iocp = (struct iocblk *)mp->b_rptr;
4733		switch (iocp->ioc_cmd) {
4734		case TI_GETPEERNAME:
4735			if (icmp->icmp_state != TS_DATA_XFER) {
4736				/*
4737				 * If a default destination address has not
4738				 * been associated with the stream, then we
4739				 * don't know the peer's name.
4740				 */
4741				iocp->ioc_error = ENOTCONN;
4742				iocp->ioc_count = 0;
4743				mp->b_datap->db_type = M_IOCACK;
4744				qreply(q, mp);
4745				return;
4746			}
4747			/* FALLTHRU */
4748		case TI_GETMYNAME:
4749			/*
4750			 * For TI_GETPEERNAME and TI_GETMYNAME, we first
4751			 * need to copyin the user's strbuf structure.
4752			 * Processing will continue in the M_IOCDATA case
4753			 * below.
4754			 */
4755			mi_copyin(q, mp, NULL,
4756			    SIZEOF_STRUCT(strbuf, iocp->ioc_flag));
4757			return;
4758		case ND_SET:
4759			/* nd_getset performs the necessary checking */
4760		case ND_GET:
4761			if (nd_getset(q, is->is_nd, mp)) {
4762				qreply(q, mp);
4763				return;
4764			}
4765			break;
4766		default:
4767			break;
4768		}
4769		break;
4770	case M_IOCDATA:
4771		icmp_wput_iocdata(q, mp);
4772		return;
4773	default:
4774		/* Unrecognized messages are passed through without change. */
4775		break;
4776	}
4777	ip_wput_nondata(q, mp);
4778}
4779
4780/*
4781 * icmp_wput_iocdata is called by icmp_wput_other to handle all M_IOCDATA
4782 * messages.
4783 */
4784static void
4785icmp_wput_iocdata(queue_t *q, mblk_t *mp)
4786{
4787	mblk_t		*mp1;
4788	STRUCT_HANDLE(strbuf, sb);
4789	uint_t		addrlen;
4790	conn_t		*connp = Q_TO_CONN(q);
4791	icmp_t		*icmp = connp->conn_icmp;
4792
4793	/* Make sure it is one of ours. */
4794	switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
4795	case TI_GETMYNAME:
4796	case TI_GETPEERNAME:
4797		break;
4798	default:
4799		ip_wput_nondata(q, mp);
4800		return;
4801	}
4802
4803	switch (mi_copy_state(q, mp, &mp1)) {
4804	case -1:
4805		return;
4806	case MI_COPY_CASE(MI_COPY_IN, 1):
4807		break;
4808	case MI_COPY_CASE(MI_COPY_OUT, 1):
4809		/*
4810		 * The address has been copied out, so now
4811		 * copyout the strbuf.
4812		 */
4813		mi_copyout(q, mp);
4814		return;
4815	case MI_COPY_CASE(MI_COPY_OUT, 2):
4816		/*
4817		 * The address and strbuf have been copied out.
4818		 * We're done, so just acknowledge the original
4819		 * M_IOCTL.
4820		 */
4821		mi_copy_done(q, mp, 0);
4822		return;
4823	default:
4824		/*
4825		 * Something strange has happened, so acknowledge
4826		 * the original M_IOCTL with an EPROTO error.
4827		 */
4828		mi_copy_done(q, mp, EPROTO);
4829		return;
4830	}
4831
4832	/*
4833	 * Now we have the strbuf structure for TI_GETMYNAME
4834	 * and TI_GETPEERNAME.  Next we copyout the requested
4835	 * address and then we'll copyout the strbuf.
4836	 */
4837	STRUCT_SET_HANDLE(sb, ((struct iocblk *)mp->b_rptr)->ioc_flag,
4838	    (void *)mp1->b_rptr);
4839
4840	if (connp->conn_family == AF_INET)
4841		addrlen = sizeof (sin_t);
4842	else
4843		addrlen = sizeof (sin6_t);
4844
4845	if (STRUCT_FGET(sb, maxlen) < addrlen) {
4846		mi_copy_done(q, mp, EINVAL);
4847		return;
4848	}
4849	switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
4850	case TI_GETMYNAME:
4851		break;
4852	case TI_GETPEERNAME:
4853		if (icmp->icmp_state != TS_DATA_XFER) {
4854			mi_copy_done(q, mp, ENOTCONN);
4855			return;
4856		}
4857		break;
4858	default:
4859		mi_copy_done(q, mp, EPROTO);
4860		return;
4861	}
4862	mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE);
4863	if (!mp1)
4864		return;
4865
4866	STRUCT_FSET(sb, len, addrlen);
4867	switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
4868	case TI_GETMYNAME:
4869		(void) conn_getsockname(connp, (struct sockaddr *)mp1->b_wptr,
4870		    &addrlen);
4871		break;
4872	case TI_GETPEERNAME:
4873		(void) conn_getpeername(connp, (struct sockaddr *)mp1->b_wptr,
4874		    &addrlen);
4875		break;
4876	}
4877	mp1->b_wptr += addrlen;
4878	/* Copy out the address */
4879	mi_copyout(q, mp);
4880}
4881
4882void
4883icmp_ddi_g_init(void)
4884{
4885	icmp_max_optsize = optcom_max_optsize(icmp_opt_obj.odb_opt_des_arr,
4886	    icmp_opt_obj.odb_opt_arr_cnt);
4887
4888	/*
4889	 * We want to be informed each time a stack is created or
4890	 * destroyed in the kernel, so we can maintain the
4891	 * set of icmp_stack_t's.
4892	 */
4893	netstack_register(NS_ICMP, rawip_stack_init, NULL, rawip_stack_fini);
4894}
4895
4896void
4897icmp_ddi_g_destroy(void)
4898{
4899	netstack_unregister(NS_ICMP);
4900}
4901
4902#define	INET_NAME	"ip"
4903
4904/*
4905 * Initialize the ICMP stack instance.
4906 */
4907static void *
4908rawip_stack_init(netstackid_t stackid, netstack_t *ns)
4909{
4910	icmp_stack_t	*is;
4911	icmpparam_t	*pa;
4912	int		error = 0;
4913	major_t		major;
4914
4915	is = (icmp_stack_t *)kmem_zalloc(sizeof (*is), KM_SLEEP);
4916	is->is_netstack = ns;
4917
4918	pa = (icmpparam_t *)kmem_alloc(sizeof (icmp_param_arr), KM_SLEEP);
4919	is->is_param_arr = pa;
4920	bcopy(icmp_param_arr, is->is_param_arr, sizeof (icmp_param_arr));
4921
4922	(void) icmp_param_register(&is->is_nd,
4923	    is->is_param_arr, A_CNT(icmp_param_arr));
4924	is->is_ksp = rawip_kstat_init(stackid);
4925
4926	major = mod_name_to_major(INET_NAME);
4927	error = ldi_ident_from_major(major, &is->is_ldi_ident);
4928	ASSERT(error == 0);
4929	return (is);
4930}
4931
4932/*
4933 * Free the ICMP stack instance.
4934 */
4935static void
4936rawip_stack_fini(netstackid_t stackid, void *arg)
4937{
4938	icmp_stack_t *is = (icmp_stack_t *)arg;
4939
4940	nd_free(&is->is_nd);
4941	kmem_free(is->is_param_arr, sizeof (icmp_param_arr));
4942	is->is_param_arr = NULL;
4943
4944	rawip_kstat_fini(stackid, is->is_ksp);
4945	is->is_ksp = NULL;
4946	ldi_ident_release(is->is_ldi_ident);
4947	kmem_free(is, sizeof (*is));
4948}
4949
4950static void *
4951rawip_kstat_init(netstackid_t stackid) {
4952	kstat_t	*ksp;
4953
4954	rawip_named_kstat_t template = {
4955		{ "inDatagrams",	KSTAT_DATA_UINT32, 0 },
4956		{ "inCksumErrs",	KSTAT_DATA_UINT32, 0 },
4957		{ "inErrors",		KSTAT_DATA_UINT32, 0 },
4958		{ "outDatagrams",	KSTAT_DATA_UINT32, 0 },
4959		{ "outErrors",		KSTAT_DATA_UINT32, 0 },
4960	};
4961
4962	ksp = kstat_create_netstack("icmp", 0, "rawip", "mib2",
4963					KSTAT_TYPE_NAMED,
4964					NUM_OF_FIELDS(rawip_named_kstat_t),
4965					0, stackid);
4966	if (ksp == NULL || ksp->ks_data == NULL)
4967		return (NULL);
4968
4969	bcopy(&template, ksp->ks_data, sizeof (template));
4970	ksp->ks_update = rawip_kstat_update;
4971	ksp->ks_private = (void *)(uintptr_t)stackid;
4972
4973	kstat_install(ksp);
4974	return (ksp);
4975}
4976
4977static void
4978rawip_kstat_fini(netstackid_t stackid, kstat_t *ksp)
4979{
4980	if (ksp != NULL) {
4981		ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private);
4982		kstat_delete_netstack(ksp, stackid);
4983	}
4984}
4985
4986static int
4987rawip_kstat_update(kstat_t *ksp, int rw)
4988{
4989	rawip_named_kstat_t *rawipkp;
4990	netstackid_t	stackid = (netstackid_t)(uintptr_t)ksp->ks_private;
4991	netstack_t	*ns;
4992	icmp_stack_t	*is;
4993
4994	if ((ksp == NULL) || (ksp->ks_data == NULL))
4995		return (EIO);
4996
4997	if (rw == KSTAT_WRITE)
4998		return (EACCES);
4999
5000	rawipkp = (rawip_named_kstat_t *)ksp->ks_data;
5001
5002	ns = netstack_find_by_stackid(stackid);
5003	if (ns == NULL)
5004		return (-1);
5005	is = ns->netstack_icmp;
5006	if (is == NULL) {
5007		netstack_rele(ns);
5008		return (-1);
5009	}
5010	rawipkp->inDatagrams.value.ui32 =  is->is_rawip_mib.rawipInDatagrams;
5011	rawipkp->inCksumErrs.value.ui32 =  is->is_rawip_mib.rawipInCksumErrs;
5012	rawipkp->inErrors.value.ui32 =	   is->is_rawip_mib.rawipInErrors;
5013	rawipkp->outDatagrams.value.ui32 = is->is_rawip_mib.rawipOutDatagrams;
5014	rawipkp->outErrors.value.ui32 =	   is->is_rawip_mib.rawipOutErrors;
5015	netstack_rele(ns);
5016	return (0);
5017}
5018
5019/* ARGSUSED */
5020int
5021rawip_accept(sock_lower_handle_t lproto_handle,
5022    sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle,
5023    cred_t *cr)
5024{
5025	return (EOPNOTSUPP);
5026}
5027
5028/* ARGSUSED */
5029int
5030rawip_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa,
5031    socklen_t len, cred_t *cr)
5032{
5033	conn_t  *connp = (conn_t *)proto_handle;
5034	int	error;
5035
5036	/* All Solaris components should pass a cred for this operation. */
5037	ASSERT(cr != NULL);
5038
5039	/* Binding to a NULL address really means unbind */
5040	if (sa == NULL)
5041		error = rawip_do_unbind(connp);
5042	else
5043		error = rawip_do_bind(connp, sa, len);
5044
5045	if (error < 0) {
5046		if (error == -TOUTSTATE)
5047			error = EINVAL;
5048		else
5049			error = proto_tlitosyserr(-error);
5050	}
5051	return (error);
5052}
5053
5054static int
5055rawip_implicit_bind(conn_t *connp)
5056{
5057	sin6_t sin6addr;
5058	sin_t *sin;
5059	sin6_t *sin6;
5060	socklen_t len;
5061	int error;
5062
5063	if (connp->conn_family == AF_INET) {
5064		len = sizeof (struct sockaddr_in);
5065		sin = (sin_t *)&sin6addr;
5066		*sin = sin_null;
5067		sin->sin_family = AF_INET;
5068		sin->sin_addr.s_addr = INADDR_ANY;
5069	} else {
5070		ASSERT(connp->conn_family == AF_INET6);
5071		len = sizeof (sin6_t);
5072		sin6 = (sin6_t *)&sin6addr;
5073		*sin6 = sin6_null;
5074		sin6->sin6_family = AF_INET6;
5075		V6_SET_ZERO(sin6->sin6_addr);
5076	}
5077
5078	error = rawip_do_bind(connp, (struct sockaddr *)&sin6addr, len);
5079
5080	return ((error < 0) ? proto_tlitosyserr(-error) : error);
5081}
5082
5083static int
5084rawip_unbind(conn_t *connp)
5085{
5086	int error;
5087
5088	error = rawip_do_unbind(connp);
5089	if (error < 0) {
5090		error = proto_tlitosyserr(-error);
5091	}
5092	return (error);
5093}
5094
5095/* ARGSUSED */
5096int
5097rawip_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr)
5098{
5099	return (EOPNOTSUPP);
5100}
5101
5102int
5103rawip_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa,
5104    socklen_t len, sock_connid_t *id, cred_t *cr)
5105{
5106	conn_t	*connp = (conn_t *)proto_handle;
5107	icmp_t *icmp = connp->conn_icmp;
5108	int	error;
5109	boolean_t did_bind = B_FALSE;
5110	pid_t	pid = curproc->p_pid;
5111
5112	/* All Solaris components should pass a cred for this operation. */
5113	ASSERT(cr != NULL);
5114
5115	if (sa == NULL) {
5116		/*
5117		 * Disconnect
5118		 * Make sure we are connected
5119		 */
5120		if (icmp->icmp_state != TS_DATA_XFER)
5121			return (EINVAL);
5122
5123		error = icmp_disconnect(connp);
5124		return (error);
5125	}
5126
5127	error = proto_verify_ip_addr(connp->conn_family, sa, len);
5128	if (error != 0)
5129		return (error);
5130
5131	/* do an implicit bind if necessary */
5132	if (icmp->icmp_state == TS_UNBND) {
5133		error = rawip_implicit_bind(connp);
5134		/*
5135		 * We could be racing with an actual bind, in which case
5136		 * we would see EPROTO. We cross our fingers and try
5137		 * to connect.
5138		 */
5139		if (!(error == 0 || error == EPROTO))
5140			return (error);
5141		did_bind = B_TRUE;
5142	}
5143
5144	/*
5145	 * set SO_DGRAM_ERRIND
5146	 */
5147	connp->conn_dgram_errind = B_TRUE;
5148
5149	error = rawip_do_connect(connp, sa, len, cr, pid);
5150	if (error != 0 && did_bind) {
5151		int unbind_err;
5152
5153		unbind_err = rawip_unbind(connp);
5154		ASSERT(unbind_err == 0);
5155	}
5156
5157	if (error == 0) {
5158		*id = 0;
5159		(*connp->conn_upcalls->su_connected)(connp->conn_upper_handle,
5160		    0, NULL, -1);
5161	} else if (error < 0) {
5162		error = proto_tlitosyserr(-error);
5163	}
5164	return (error);
5165}
5166
5167/* ARGSUSED2 */
5168int
5169rawip_fallback(sock_lower_handle_t proto_handle, queue_t *q,
5170    boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb)
5171{
5172	conn_t  *connp = (conn_t *)proto_handle;
5173	icmp_t	*icmp;
5174	struct T_capability_ack tca;
5175	struct sockaddr_in6 laddr, faddr;
5176	socklen_t laddrlen, faddrlen;
5177	short opts;
5178	struct stroptions *stropt;
5179	mblk_t *stropt_mp;
5180	int error;
5181
5182	icmp = connp->conn_icmp;
5183
5184	stropt_mp = allocb_wait(sizeof (*stropt), BPRI_HI, STR_NOSIG, NULL);
5185
5186	/*
5187	 * setup the fallback stream that was allocated
5188	 */
5189	connp->conn_dev = (dev_t)RD(q)->q_ptr;
5190	connp->conn_minor_arena = WR(q)->q_ptr;
5191
5192	RD(q)->q_ptr = WR(q)->q_ptr = connp;
5193
5194	WR(q)->q_qinfo = &icmpwinit;
5195
5196	connp->conn_rq = RD(q);
5197	connp->conn_wq = WR(q);
5198
5199	/* Notify stream head about options before sending up data */
5200	stropt_mp->b_datap->db_type = M_SETOPTS;
5201	stropt_mp->b_wptr += sizeof (*stropt);
5202	stropt = (struct stroptions *)stropt_mp->b_rptr;
5203	stropt->so_flags = SO_WROFF | SO_HIWAT;
5204	stropt->so_wroff = connp->conn_wroff;
5205	stropt->so_hiwat = connp->conn_rcvbuf;
5206	putnext(RD(q), stropt_mp);
5207
5208	/*
5209	 * free helper stream
5210	 */
5211	ip_free_helper_stream(connp);
5212
5213	/*
5214	 * Collect the information needed to sync with the sonode
5215	 */
5216	icmp_do_capability_ack(icmp, &tca, TC1_INFO);
5217
5218	laddrlen = faddrlen = sizeof (sin6_t);
5219	(void) rawip_getsockname((sock_lower_handle_t)connp,
5220	    (struct sockaddr *)&laddr, &laddrlen, CRED());
5221	error = rawip_getpeername((sock_lower_handle_t)connp,
5222	    (struct sockaddr *)&faddr, &faddrlen, CRED());
5223	if (error != 0)
5224		faddrlen = 0;
5225	opts = 0;
5226	if (connp->conn_dgram_errind)
5227		opts |= SO_DGRAM_ERRIND;
5228	if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE)
5229		opts |= SO_DONTROUTE;
5230
5231	(*quiesced_cb)(connp->conn_upper_handle, q, &tca,
5232	    (struct sockaddr *)&laddr, laddrlen,
5233	    (struct sockaddr *)&faddr, faddrlen, opts);
5234
5235	/*
5236	 * Attempts to send data up during fallback will result in it being
5237	 * queued in icmp_t. Now we push up any queued packets.
5238	 */
5239	mutex_enter(&icmp->icmp_recv_lock);
5240	while (icmp->icmp_fallback_queue_head != NULL) {
5241		mblk_t	*mp;
5242
5243		mp = icmp->icmp_fallback_queue_head;
5244		icmp->icmp_fallback_queue_head = mp->b_next;
5245		mp->b_next = NULL;
5246		mutex_exit(&icmp->icmp_recv_lock);
5247		putnext(RD(q), mp);
5248		mutex_enter(&icmp->icmp_recv_lock);
5249	}
5250	icmp->icmp_fallback_queue_tail = icmp->icmp_fallback_queue_head;
5251
5252	/*
5253	 * No longer a streams less socket
5254	 */
5255	mutex_enter(&connp->conn_lock);
5256	connp->conn_flags &= ~IPCL_NONSTR;
5257	mutex_exit(&connp->conn_lock);
5258
5259	mutex_exit(&icmp->icmp_recv_lock);
5260
5261	ASSERT(icmp->icmp_fallback_queue_head == NULL &&
5262	    icmp->icmp_fallback_queue_tail == NULL);
5263
5264	ASSERT(connp->conn_ref >= 1);
5265
5266	return (0);
5267}
5268
5269/* ARGSUSED2 */
5270sock_lower_handle_t
5271rawip_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls,
5272    uint_t *smodep, int *errorp, int flags, cred_t *credp)
5273{
5274	conn_t *connp;
5275
5276	if (type != SOCK_RAW || (family != AF_INET && family != AF_INET6)) {
5277		*errorp = EPROTONOSUPPORT;
5278		return (NULL);
5279	}
5280
5281	connp = rawip_do_open(family, credp, errorp, flags);
5282	if (connp != NULL) {
5283		connp->conn_flags |= IPCL_NONSTR;
5284
5285		mutex_enter(&connp->conn_lock);
5286		connp->conn_state_flags &= ~CONN_INCIPIENT;
5287		mutex_exit(&connp->conn_lock);
5288		*sock_downcalls = &sock_rawip_downcalls;
5289		*smodep = SM_ATOMIC;
5290	} else {
5291		ASSERT(*errorp != 0);
5292	}
5293
5294	return ((sock_lower_handle_t)connp);
5295}
5296
5297/* ARGSUSED3 */
5298void
5299rawip_activate(sock_lower_handle_t proto_handle,
5300    sock_upper_handle_t sock_handle, sock_upcalls_t *sock_upcalls, int flags,
5301    cred_t *cr)
5302{
5303	conn_t 			*connp = (conn_t *)proto_handle;
5304	struct sock_proto_props sopp;
5305
5306	/* All Solaris components should pass a cred for this operation. */
5307	ASSERT(cr != NULL);
5308
5309	connp->conn_upcalls = sock_upcalls;
5310	connp->conn_upper_handle = sock_handle;
5311
5312	sopp.sopp_flags = SOCKOPT_WROFF | SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT |
5313	    SOCKOPT_MAXBLK | SOCKOPT_MAXPSZ | SOCKOPT_MINPSZ;
5314	sopp.sopp_wroff = connp->conn_wroff;
5315	sopp.sopp_rxhiwat = connp->conn_rcvbuf;
5316	sopp.sopp_rxlowat = connp->conn_rcvlowat;
5317	sopp.sopp_maxblk = INFPSZ;
5318	sopp.sopp_maxpsz = IP_MAXPACKET;
5319	sopp.sopp_minpsz = (icmp_mod_info.mi_minpsz == 1) ? 0 :
5320	    icmp_mod_info.mi_minpsz;
5321
5322	(*connp->conn_upcalls->su_set_proto_props)
5323	    (connp->conn_upper_handle, &sopp);
5324
5325	icmp_bind_proto(connp->conn_icmp);
5326}
5327
5328/* ARGSUSED3 */
5329int
5330rawip_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *sa,
5331    socklen_t *salenp, cred_t *cr)
5332{
5333	conn_t  *connp = (conn_t *)proto_handle;
5334	icmp_t  *icmp = connp->conn_icmp;
5335	int	error;
5336
5337	/* All Solaris components should pass a cred for this operation. */
5338	ASSERT(cr != NULL);
5339
5340	mutex_enter(&connp->conn_lock);
5341	if (icmp->icmp_state != TS_DATA_XFER)
5342		error = ENOTCONN;
5343	else
5344		error = conn_getpeername(connp, sa, salenp);
5345	mutex_exit(&connp->conn_lock);
5346	return (error);
5347}
5348
5349/* ARGSUSED3 */
5350int
5351rawip_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *sa,
5352    socklen_t *salenp, cred_t *cr)
5353{
5354	conn_t  *connp = (conn_t *)proto_handle;
5355	int	error;
5356
5357	/* All Solaris components should pass a cred for this operation. */
5358	ASSERT(cr != NULL);
5359
5360	mutex_enter(&connp->conn_lock);
5361	error = conn_getsockname(connp, sa, salenp);
5362	mutex_exit(&connp->conn_lock);
5363	return (error);
5364}
5365
5366int
5367rawip_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
5368    const void *optvalp, socklen_t optlen, cred_t *cr)
5369{
5370	conn_t	*connp = (conn_t *)proto_handle;
5371	int error;
5372
5373	/* All Solaris components should pass a cred for this operation. */
5374	ASSERT(cr != NULL);
5375
5376	error = proto_opt_check(level, option_name, optlen, NULL,
5377	    icmp_opt_obj.odb_opt_des_arr,
5378	    icmp_opt_obj.odb_opt_arr_cnt,
5379	    B_TRUE, B_FALSE, cr);
5380
5381	if (error != 0) {
5382		/*
5383		 * option not recognized
5384		 */
5385		if (error < 0) {
5386			error = proto_tlitosyserr(-error);
5387		}
5388		return (error);
5389	}
5390
5391	error = icmp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level,
5392	    option_name, optlen, (uchar_t *)optvalp, (uint_t *)&optlen,
5393	    (uchar_t *)optvalp, NULL, cr);
5394
5395	ASSERT(error >= 0);
5396
5397	return (error);
5398}
5399
5400int
5401rawip_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
5402    void *optvalp, socklen_t *optlen, cred_t *cr)
5403{
5404	int		error;
5405	conn_t		*connp = (conn_t *)proto_handle;
5406	t_uscalar_t	max_optbuf_len;
5407	void		*optvalp_buf;
5408	int		len;
5409
5410	/* All Solaris components should pass a cred for this operation. */
5411	ASSERT(cr != NULL);
5412
5413	error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len,
5414	    icmp_opt_obj.odb_opt_des_arr,
5415	    icmp_opt_obj.odb_opt_arr_cnt,
5416	    B_FALSE, B_TRUE, cr);
5417
5418	if (error != 0) {
5419		if (error < 0) {
5420			error = proto_tlitosyserr(-error);
5421		}
5422		return (error);
5423	}
5424
5425	optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP);
5426	len = icmp_opt_get(connp, level, option_name, optvalp_buf);
5427	if (len == -1) {
5428		kmem_free(optvalp_buf, max_optbuf_len);
5429		return (EINVAL);
5430	}
5431
5432	/*
5433	 * update optlen and copy option value
5434	 */
5435	t_uscalar_t size = MIN(len, *optlen);
5436
5437	bcopy(optvalp_buf, optvalp, size);
5438	bcopy(&size, optlen, sizeof (size));
5439
5440	kmem_free(optvalp_buf, max_optbuf_len);
5441	return (0);
5442}
5443
5444/* ARGSUSED1 */
5445int
5446rawip_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr)
5447{
5448	conn_t	*connp = (conn_t *)proto_handle;
5449
5450	/* All Solaris components should pass a cred for this operation. */
5451	ASSERT(cr != NULL);
5452
5453	(void) rawip_do_close(connp);
5454	return (0);
5455}
5456
5457/* ARGSUSED2 */
5458int
5459rawip_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr)
5460{
5461	conn_t  *connp = (conn_t *)proto_handle;
5462
5463	/* All Solaris components should pass a cred for this operation. */
5464	ASSERT(cr != NULL);
5465
5466	/* shut down the send side */
5467	if (how != SHUT_RD)
5468		(*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
5469		    SOCK_OPCTL_SHUT_SEND, 0);
5470	/* shut down the recv side */
5471	if (how != SHUT_WR)
5472		(*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
5473		    SOCK_OPCTL_SHUT_RECV, 0);
5474	return (0);
5475}
5476
5477void
5478rawip_clr_flowctrl(sock_lower_handle_t proto_handle)
5479{
5480	conn_t  *connp = (conn_t *)proto_handle;
5481	icmp_t	*icmp = connp->conn_icmp;
5482
5483	mutex_enter(&icmp->icmp_recv_lock);
5484	connp->conn_flow_cntrld = B_FALSE;
5485	mutex_exit(&icmp->icmp_recv_lock);
5486}
5487
5488int
5489rawip_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg,
5490    int mode, int32_t *rvalp, cred_t *cr)
5491{
5492	conn_t  	*connp = (conn_t *)proto_handle;
5493	int		error;
5494
5495	/* All Solaris components should pass a cred for this operation. */
5496	ASSERT(cr != NULL);
5497
5498	/*
5499	 * If we don't have a helper stream then create one.
5500	 * ip_create_helper_stream takes care of locking the conn_t,
5501	 * so this check for NULL is just a performance optimization.
5502	 */
5503	if (connp->conn_helper_info == NULL) {
5504		icmp_stack_t *is = connp->conn_icmp->icmp_is;
5505
5506		ASSERT(is->is_ldi_ident != NULL);
5507
5508		/*
5509		 * Create a helper stream for non-STREAMS socket.
5510		 */
5511		error = ip_create_helper_stream(connp, is->is_ldi_ident);
5512		if (error != 0) {
5513			ip0dbg(("rawip_ioctl: create of IP helper stream "
5514			    "failed %d\n", error));
5515			return (error);
5516		}
5517	}
5518
5519	switch (cmd) {
5520	case ND_SET:
5521	case ND_GET:
5522	case _SIOCSOCKFALLBACK:
5523	case TI_GETPEERNAME:
5524	case TI_GETMYNAME:
5525#ifdef DEBUG
5526		cmn_err(CE_CONT, "icmp_ioctl cmd 0x%x on non streams"
5527		    " socket", cmd);
5528#endif
5529		error = EINVAL;
5530		break;
5531	default:
5532		/*
5533		 * Pass on to IP using helper stream
5534		 */
5535		error = ldi_ioctl(connp->conn_helper_info->iphs_handle,
5536		    cmd, arg, mode, cr, rvalp);
5537		break;
5538	}
5539	return (error);
5540}
5541
5542int
5543rawip_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg,
5544    cred_t *cr)
5545{
5546	sin6_t		*sin6;
5547	sin_t		*sin = NULL;
5548	uint_t		srcid;
5549	conn_t		*connp = (conn_t *)proto_handle;
5550	icmp_t		*icmp = connp->conn_icmp;
5551	int		error = 0;
5552	icmp_stack_t	*is = icmp->icmp_is;
5553	pid_t		pid = curproc->p_pid;
5554	ip_xmit_attr_t	*ixa;
5555
5556	ASSERT(DB_TYPE(mp) == M_DATA);
5557
5558	/* All Solaris components should pass a cred for this operation. */
5559	ASSERT(cr != NULL);
5560
5561	/* do an implicit bind if necessary */
5562	if (icmp->icmp_state == TS_UNBND) {
5563		error = rawip_implicit_bind(connp);
5564		/*
5565		 * We could be racing with an actual bind, in which case
5566		 * we would see EPROTO. We cross our fingers and try
5567		 * to connect.
5568		 */
5569		if (!(error == 0 || error == EPROTO)) {
5570			freemsg(mp);
5571			return (error);
5572		}
5573	}
5574
5575	/* Protocol 255 contains full IP headers */
5576	/* Read without holding lock */
5577	if (icmp->icmp_hdrincl) {
5578		ASSERT(connp->conn_ipversion == IPV4_VERSION);
5579		if (mp->b_wptr - mp->b_rptr < IP_SIMPLE_HDR_LENGTH) {
5580			if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) {
5581				BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5582				freemsg(mp);
5583				return (EINVAL);
5584			}
5585		}
5586		error = icmp_output_hdrincl(connp, mp, cr, pid);
5587		if (is->is_sendto_ignerr)
5588			return (0);
5589		else
5590			return (error);
5591	}
5592
5593	/* Connected? */
5594	if (msg->msg_name == NULL) {
5595		if (icmp->icmp_state != TS_DATA_XFER) {
5596			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5597			return (EDESTADDRREQ);
5598		}
5599		if (msg->msg_controllen != 0) {
5600			error = icmp_output_ancillary(connp, NULL, NULL, mp,
5601			    NULL, msg, cr, pid);
5602		} else {
5603			error = icmp_output_connected(connp, mp, cr, pid);
5604		}
5605		if (is->is_sendto_ignerr)
5606			return (0);
5607		else
5608			return (error);
5609	}
5610	if (icmp->icmp_state == TS_DATA_XFER) {
5611		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5612		return (EISCONN);
5613	}
5614	error = proto_verify_ip_addr(connp->conn_family,
5615	    (struct sockaddr *)msg->msg_name, msg->msg_namelen);
5616	if (error != 0) {
5617		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5618		return (error);
5619	}
5620	switch (connp->conn_family) {
5621	case AF_INET6:
5622		sin6 = (sin6_t *)msg->msg_name;
5623
5624		/* No support for mapped addresses on raw sockets */
5625		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
5626			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5627			return (EADDRNOTAVAIL);
5628		}
5629		srcid = sin6->__sin6_src_id;
5630
5631		/*
5632		 * If the local address is a mapped address return
5633		 * an error.
5634		 * It would be possible to send an IPv6 packet but the
5635		 * response would never make it back to the application
5636		 * since it is bound to a mapped address.
5637		 */
5638		if (IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6)) {
5639			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5640			return (EADDRNOTAVAIL);
5641		}
5642
5643		if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
5644			sin6->sin6_addr = ipv6_loopback;
5645
5646		/*
5647		 * We have to allocate an ip_xmit_attr_t before we grab
5648		 * conn_lock and we need to hold conn_lock once we've check
5649		 * conn_same_as_last_v6 to handle concurrent send* calls on a
5650		 * socket.
5651		 */
5652		if (msg->msg_controllen == 0) {
5653			ixa = conn_get_ixa(connp, B_FALSE);
5654			if (ixa == NULL) {
5655				BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5656				return (ENOMEM);
5657			}
5658		} else {
5659			ixa = NULL;
5660		}
5661		mutex_enter(&connp->conn_lock);
5662		if (icmp->icmp_delayed_error != 0) {
5663			sin6_t  *sin2 = (sin6_t *)&icmp->icmp_delayed_addr;
5664
5665			error = icmp->icmp_delayed_error;
5666			icmp->icmp_delayed_error = 0;
5667
5668			/* Compare IP address and family */
5669
5670			if (IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr,
5671			    &sin2->sin6_addr) &&
5672			    sin6->sin6_family == sin2->sin6_family) {
5673				mutex_exit(&connp->conn_lock);
5674				BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5675				if (ixa != NULL)
5676					ixa_refrele(ixa);
5677				return (error);
5678			}
5679		}
5680		if (msg->msg_controllen != 0) {
5681			mutex_exit(&connp->conn_lock);
5682			ASSERT(ixa == NULL);
5683			error = icmp_output_ancillary(connp, NULL, sin6, mp,
5684			    NULL, msg, cr, pid);
5685		} else if (conn_same_as_last_v6(connp, sin6) &&
5686		    connp->conn_lastsrcid == srcid &&
5687		    ipsec_outbound_policy_current(ixa)) {
5688			/* icmp_output_lastdst drops conn_lock */
5689			error = icmp_output_lastdst(connp, mp, cr, pid, ixa);
5690		} else {
5691			/* icmp_output_newdst drops conn_lock */
5692			error = icmp_output_newdst(connp, mp, NULL, sin6, cr,
5693			    pid, ixa);
5694		}
5695		ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
5696		if (is->is_sendto_ignerr)
5697			return (0);
5698		else
5699			return (error);
5700	case AF_INET:
5701		sin = (sin_t *)msg->msg_name;
5702
5703		if (sin->sin_addr.s_addr == INADDR_ANY)
5704			sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
5705
5706		/*
5707		 * We have to allocate an ip_xmit_attr_t before we grab
5708		 * conn_lock and we need to hold conn_lock once we've check
5709		 * conn_same_as_last_v6 to handle concurrent send* on a socket.
5710		 */
5711		if (msg->msg_controllen == 0) {
5712			ixa = conn_get_ixa(connp, B_FALSE);
5713			if (ixa == NULL) {
5714				BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5715				return (ENOMEM);
5716			}
5717		} else {
5718			ixa = NULL;
5719		}
5720		mutex_enter(&connp->conn_lock);
5721		if (icmp->icmp_delayed_error != 0) {
5722			sin_t  *sin2 = (sin_t *)&icmp->icmp_delayed_addr;
5723
5724			error = icmp->icmp_delayed_error;
5725			icmp->icmp_delayed_error = 0;
5726
5727			/* Compare IP address */
5728
5729			if (sin->sin_addr.s_addr == sin2->sin_addr.s_addr) {
5730				mutex_exit(&connp->conn_lock);
5731				BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5732				if (ixa != NULL)
5733					ixa_refrele(ixa);
5734				return (error);
5735			}
5736		}
5737
5738		if (msg->msg_controllen != 0) {
5739			mutex_exit(&connp->conn_lock);
5740			ASSERT(ixa == NULL);
5741			error = icmp_output_ancillary(connp, sin, NULL, mp,
5742			    NULL, msg, cr, pid);
5743		} else if (conn_same_as_last_v4(connp, sin) &&
5744		    ipsec_outbound_policy_current(ixa)) {
5745			/* icmp_output_lastdst drops conn_lock */
5746			error = icmp_output_lastdst(connp, mp, cr, pid, ixa);
5747		} else {
5748			/* icmp_output_newdst drops conn_lock */
5749			error = icmp_output_newdst(connp, mp, sin, NULL, cr,
5750			    pid, ixa);
5751		}
5752		ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
5753		if (is->is_sendto_ignerr)
5754			return (0);
5755		else
5756			return (error);
5757	default:
5758		return (EINVAL);
5759	}
5760}
5761
/*
 * Downcall vector for raw IP sockets.
 *
 * The initializer is positional, so the order of the entries must match the
 * member order of sock_downcalls_t.  The three NULL entries are slots for
 * which no rawip routine is supplied.
 */
sock_downcalls_t sock_rawip_downcalls = {
	rawip_activate,
	rawip_accept,
	rawip_bind,
	rawip_listen,
	rawip_connect,
	rawip_getpeername,
	rawip_getsockname,
	rawip_getsockopt,
	rawip_setsockopt,
	rawip_send,
	NULL,
	NULL,
	NULL,
	rawip_shutdown,
	rawip_clr_flowctrl,
	rawip_ioctl,
	rawip_close
};
5781