socktpi.c revision 11042:2d6e217af1b4
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27#include <sys/types.h>
28#include <sys/t_lock.h>
29#include <sys/param.h>
30#include <sys/systm.h>
31#include <sys/buf.h>
32#include <sys/conf.h>
33#include <sys/cred.h>
34#include <sys/kmem.h>
35#include <sys/kmem_impl.h>
36#include <sys/sysmacros.h>
37#include <sys/vfs.h>
38#include <sys/vnode.h>
39#include <sys/debug.h>
40#include <sys/errno.h>
41#include <sys/time.h>
42#include <sys/file.h>
43#include <sys/open.h>
44#include <sys/user.h>
45#include <sys/termios.h>
46#include <sys/stream.h>
47#include <sys/strsubr.h>
48#include <sys/strsun.h>
49#include <sys/suntpi.h>
50#include <sys/ddi.h>
51#include <sys/esunddi.h>
52#include <sys/flock.h>
53#include <sys/modctl.h>
54#include <sys/vtrace.h>
55#include <sys/cmn_err.h>
56#include <sys/pathname.h>
57
58#include <sys/socket.h>
59#include <sys/socketvar.h>
60#include <sys/sockio.h>
61#include <netinet/in.h>
62#include <sys/un.h>
63#include <sys/strsun.h>
64
65#include <sys/tiuser.h>
66#define	_SUN_TPI_VERSION	2
67#include <sys/tihdr.h>
68#include <sys/timod.h>		/* TI_GETMYNAME, TI_GETPEERNAME */
69
70#include <c2/audit.h>
71
72#include <inet/common.h>
73#include <inet/ip.h>
74#include <inet/ip6.h>
75#include <inet/tcp.h>
76#include <inet/udp_impl.h>
77
78#include <sys/zone.h>
79
80#include <fs/sockfs/nl7c.h>
81#include <fs/sockfs/nl7curi.h>
82
83#include <inet/kssl/ksslapi.h>
84
85#include <fs/sockfs/sockcommon.h>
86#include <fs/sockfs/socktpi.h>
87#include <fs/sockfs/socktpi_impl.h>
88
89/*
90 * Possible failures when memory can't be allocated. The documented behavior:
91 *
92 * 		5.5:			4.X:		XNET:
93 * accept:	ENOMEM/ENOSR/EINTR	- (EINTR)	ENOMEM/ENOBUFS/ENOSR/
94 *							EINTR
95 *	(4.X does not document EINTR but returns it)
96 * bind:	ENOSR			-		ENOBUFS/ENOSR
97 * connect: 	EINTR			EINTR		ENOBUFS/ENOSR/EINTR
98 * getpeername:	ENOMEM/ENOSR		ENOBUFS (-)	ENOBUFS/ENOSR
99 * getsockname:	ENOMEM/ENOSR		ENOBUFS (-)	ENOBUFS/ENOSR
100 *	(4.X getpeername and getsockname do not fail in practice)
101 * getsockopt:	ENOMEM/ENOSR		-		ENOBUFS/ENOSR
102 * listen:	-			-		ENOBUFS
103 * recv:	ENOMEM/ENOSR/EINTR	EINTR		ENOBUFS/ENOMEM/ENOSR/
104 *							EINTR
105 * send:	ENOMEM/ENOSR/EINTR	ENOBUFS/EINTR	ENOBUFS/ENOMEM/ENOSR/
106 *							EINTR
107 * setsockopt:	ENOMEM/ENOSR		-		ENOBUFS/ENOMEM/ENOSR
108 * shutdown:	ENOMEM/ENOSR		-		ENOBUFS/ENOSR
109 * socket:	ENOMEM/ENOSR		ENOBUFS		ENOBUFS/ENOMEM/ENOSR
110 * socketpair:	ENOMEM/ENOSR		-		ENOBUFS/ENOMEM/ENOSR
111 *
112 * Resolution. When allocation fails:
113 *	recv: return EINTR
114 *	send: return EINTR
115 *	connect, accept: EINTR
116 *	bind, listen, shutdown (unbind, unix_close, disconnect): sleep
117 *	socket, socketpair: ENOBUFS
118 *	getpeername, getsockname: sleep
119 *	getsockopt, setsockopt: sleep
120 */
121
/*
 * Switches that make sockfs do something other than the standard TPI
 * for the AF_INET transports.  When SOCK_TEST is defined they are
 * ordinary global variables; otherwise they are compile-time constants
 * with the same values.
 *
 * solisten_tpi_tcp:
 *	TCP can handle a O_T_BIND_REQ with an increased backlog even though
 *	the transport is already bound. This is needed to avoid losing the
 *	port number should listen() do a T_UNBIND_REQ followed by a
 *	O_T_BIND_REQ.
 *
 * soconnect_tpi_udp:
 *	UDP and ICMP can handle a T_CONN_REQ.
 *	This is needed to make the sequence of connect(), getsockname()
 *	return the local IP address used to send packets to the connected to
 *	destination.
 *
 * soconnect_tpi_tcp:
 *	TCP can handle a T_CONN_REQ without seeing a O_T_BIND_REQ.
 *	Set this to non-zero to send TPI conformant messages to TCP in this
 *	respect. This is a performance optimization.
 *
 * soaccept_tpi_tcp:
 *	TCP can handle a T_CONN_REQ without the acceptor being bound.
 *	This is a performance optimization that has been picked up in XTI.
 *
 * soaccept_tpi_multioptions:
 *	When inheriting SOL_SOCKET options from the listener to the accepting
 *	socket send them as a single message for AF_INET{,6}.
 */
#ifdef SOCK_TEST

int solisten_tpi_tcp = 0;
int soconnect_tpi_udp = 0;
int soconnect_tpi_tcp = 0;
int soaccept_tpi_tcp = 0;
int soaccept_tpi_multioptions = 1;

/* Additional test hooks that are defined outside this file. */
extern int do_useracc;
extern clock_t sock_test_timelimit;

#else /* !SOCK_TEST */

#define	soconnect_tpi_tcp	0
#define	soconnect_tpi_udp	0
#define	solisten_tpi_tcp	0
#define	soaccept_tpi_tcp	0
#define	soaccept_tpi_multioptions	1

#endif /* SOCK_TEST */
169
170/*
171 * Some X/Open added checks might have to be backed out to keep SunOS 4.X
172 * applications working. Turn on this flag to disable these checks.
173 */
174int xnet_skip_checks = 0;
175int xnet_check_print = 0;
176int xnet_truncate_print = 0;
177
178static void sotpi_destroy(struct sonode *);
179static struct sonode *sotpi_create(struct sockparams *, int, int, int, int,
180    int, int *, cred_t *cr);
181
182static boolean_t	sotpi_info_create(struct sonode *, int);
183static void		sotpi_info_init(struct sonode *);
184static void 		sotpi_info_fini(struct sonode *);
185static void 		sotpi_info_destroy(struct sonode *);
186
187/*
188 * Do direct function call to the transport layer below; this would
189 * also allow the transport to utilize read-side synchronous stream
190 * interface if necessary.  This is a /etc/system tunable that must
191 * not be modified on a running system.  By default this is enabled
192 * for performance reasons and may be disabled for debugging purposes.
193 */
194boolean_t socktpi_direct = B_TRUE;
195
196static struct kmem_cache *socktpi_cache, *socktpi_unix_cache;
197
198extern	void sigintr(k_sigset_t *, int);
199extern	void sigunintr(k_sigset_t *);
200
201/* Sockets acting as an in-kernel SSL proxy */
202extern mblk_t	*strsock_kssl_input(vnode_t *, mblk_t *, strwakeup_t *,
203		    strsigset_t *, strsigset_t *, strpollset_t *);
204extern mblk_t	*strsock_kssl_output(vnode_t *, mblk_t *, strwakeup_t *,
205		    strsigset_t *, strsigset_t *, strpollset_t *);
206
207static int	sotpi_unbind(struct sonode *, int);
208
209/* TPI sockfs sonode operations */
210int 		sotpi_init(struct sonode *, struct sonode *, struct cred *,
211		    int);
212static int	sotpi_accept(struct sonode *, int, struct cred *,
213		    struct sonode **);
214static int	sotpi_bind(struct sonode *, struct sockaddr *, socklen_t,
215		    int, struct cred *);
216static int	sotpi_listen(struct sonode *, int, struct cred *);
217static int	sotpi_connect(struct sonode *, const struct sockaddr *,
218		    socklen_t, int, int, struct cred *);
219extern int	sotpi_recvmsg(struct sonode *, struct nmsghdr *,
220		    struct uio *, struct cred *);
221static int	sotpi_sendmsg(struct sonode *, struct nmsghdr *,
222		    struct uio *, struct cred *);
223static int	sotpi_sendmblk(struct sonode *, struct nmsghdr *, int,
224		    struct cred *, mblk_t **);
225static int	sosend_dgramcmsg(struct sonode *, struct sockaddr *, socklen_t,
226		    struct uio *, void *, t_uscalar_t, int);
227static int	sodgram_direct(struct sonode *, struct sockaddr *,
228		    socklen_t, struct uio *, int);
229extern int	sotpi_getpeername(struct sonode *, struct sockaddr *,
230		    socklen_t *, boolean_t, struct cred *);
231static int	sotpi_getsockname(struct sonode *, struct sockaddr *,
232		    socklen_t *, struct cred *);
233static int	sotpi_shutdown(struct sonode *, int, struct cred *);
234extern int	sotpi_getsockopt(struct sonode *, int, int, void *,
235		    socklen_t *, int, struct cred *);
236extern int	sotpi_setsockopt(struct sonode *, int, int, const void *,
237		    socklen_t, struct cred *);
238static int 	sotpi_ioctl(struct sonode *, int, intptr_t, int, struct cred *,
239		    int32_t *);
240static int 	socktpi_plumbioctl(struct vnode *, int, intptr_t, int,
241		    struct cred *, int32_t *);
242static int 	sotpi_poll(struct sonode *, short, int, short *,
243		    struct pollhead **);
244static int 	sotpi_close(struct sonode *, int, struct cred *);
245
246static int	i_sotpi_info_constructor(sotpi_info_t *);
247static void 	i_sotpi_info_destructor(sotpi_info_t *);
248
249sonodeops_t sotpi_sonodeops = {
250	sotpi_init,		/* sop_init		*/
251	sotpi_accept,		/* sop_accept		*/
252	sotpi_bind,		/* sop_bind		*/
253	sotpi_listen,		/* sop_listen		*/
254	sotpi_connect,		/* sop_connect		*/
255	sotpi_recvmsg,		/* sop_recvmsg		*/
256	sotpi_sendmsg,		/* sop_sendmsg		*/
257	sotpi_sendmblk,		/* sop_sendmblk		*/
258	sotpi_getpeername,	/* sop_getpeername	*/
259	sotpi_getsockname,	/* sop_getsockname	*/
260	sotpi_shutdown,		/* sop_shutdown		*/
261	sotpi_getsockopt,	/* sop_getsockopt	*/
262	sotpi_setsockopt,	/* sop_setsockopt	*/
263	sotpi_ioctl,		/* sop_ioctl		*/
264	sotpi_poll,		/* sop_poll		*/
265	sotpi_close,		/* sop_close		*/
266};
267
268/*
269 * Return a TPI socket vnode.
270 *
271 * Note that sockets assume that the driver will clone (either itself
272 * or by using the clone driver) i.e. a socket() call will always
273 * result in a new vnode being created.
274 */
275
/*
 * Common create code for socket and accept. If tso is set, the values
 * from that node are used instead of issuing a T_INFO_REQ.
 */
280
281/* ARGSUSED */
282static struct sonode *
283sotpi_create(struct sockparams *sp, int family, int type, int protocol,
284    int version, int sflags, int *errorp, cred_t *cr)
285{
286	struct sonode	*so;
287	kmem_cache_t 	*cp;
288	int		sfamily = family;
289
290	ASSERT(sp->sp_sdev_info.sd_vnode != NULL);
291
292	if (family == AF_NCA) {
293		/*
294		 * The request is for an NCA socket so for NL7C use the
295		 * INET domain instead and mark NL7C_AF_NCA below.
296		 */
297		family = AF_INET;
298		/*
299		 * NL7C is not supported in the non-global zone,
300		 * we enforce this restriction here.
301		 */
302		if (getzoneid() != GLOBAL_ZONEID) {
303			*errorp = ENOTSUP;
304			return (NULL);
305		}
306	}
307
308	/*
309	 * to be compatible with old tpi socket implementation ignore
310	 * sleep flag (sflags) passed in
311	 */
312	cp = (family == AF_UNIX) ? socktpi_unix_cache : socktpi_cache;
313	so = kmem_cache_alloc(cp, KM_SLEEP);
314	if (so == NULL) {
315		*errorp = ENOMEM;
316		return (NULL);
317	}
318
319	sonode_init(so, sp, family, type, protocol, &sotpi_sonodeops);
320	sotpi_info_init(so);
321
322	if (sfamily == AF_NCA) {
323		SOTOTPI(so)->sti_nl7c_flags = NL7C_AF_NCA;
324	}
325
326	if (version == SOV_DEFAULT)
327		version = so_default_version;
328
329	so->so_version = (short)version;
330	*errorp = 0;
331
332	return (so);
333}
334
335static void
336sotpi_destroy(struct sonode *so)
337{
338	kmem_cache_t *cp;
339	struct sockparams *origsp;
340
341	/*
342	 * If there is a new dealloc function (ie. smod_destroy_func),
343	 * then it should check the correctness of the ops.
344	 */
345
346	ASSERT(so->so_ops == &sotpi_sonodeops);
347
348	origsp = SOTOTPI(so)->sti_orig_sp;
349
350	sotpi_info_fini(so);
351
352	if (so->so_state & SS_FALLBACK_COMP) {
353		/*
354		 * A fallback happend, which means that a sotpi_info_t struct
355		 * was allocated (as opposed to being allocated from the TPI
356		 * sonode cache. Therefore we explicitly free the struct
357		 * here.
358		 */
359		sotpi_info_destroy(so);
360		ASSERT(origsp != NULL);
361
362		origsp->sp_smod_info->smod_sock_destroy_func(so);
363		SOCKPARAMS_DEC_REF(origsp);
364	} else {
365		sonode_fini(so);
366		cp = (so->so_family == AF_UNIX) ? socktpi_unix_cache :
367		    socktpi_cache;
368		kmem_cache_free(cp, so);
369	}
370}
371
372/* ARGSUSED1 */
373int
374sotpi_init(struct sonode *so, struct sonode *tso, struct cred *cr, int flags)
375{
376	major_t maj;
377	dev_t newdev;
378	struct vnode *vp;
379	int error = 0;
380	struct stdata *stp;
381
382	sotpi_info_t *sti = SOTOTPI(so);
383
384	dprint(1, ("sotpi_init()\n"));
385
386	/*
387	 * over write the sleep flag passed in but that is ok
388	 * as tpi socket does not honor sleep flag.
389	 */
390	flags |= FREAD|FWRITE;
391
392	/*
393	 * Record in so_flag that it is a clone.
394	 */
395	if (getmajor(sti->sti_dev) == clone_major)
396		so->so_flag |= SOCLONE;
397
398	if ((so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM) &&
399	    (so->so_family == AF_INET || so->so_family == AF_INET6) &&
400	    (so->so_protocol == IPPROTO_TCP || so->so_protocol == IPPROTO_UDP ||
401	    so->so_protocol == IPPROTO_IP)) {
402		/* Tell tcp or udp that it's talking to sockets */
403		flags |= SO_SOCKSTR;
404
405		/*
406		 * Here we indicate to socktpi_open() our attempt to
407		 * make direct calls between sockfs and transport.
408		 * The final decision is left to socktpi_open().
409		 */
410		sti->sti_direct = 1;
411
412		ASSERT(so->so_type != SOCK_DGRAM || tso == NULL);
413		if (so->so_type == SOCK_STREAM && tso != NULL) {
414			if (SOTOTPI(tso)->sti_direct) {
415				/*
416				 * Inherit sti_direct from listener and pass
417				 * SO_ACCEPTOR open flag to tcp, indicating
418				 * that this is an accept fast-path instance.
419				 */
420				flags |= SO_ACCEPTOR;
421			} else {
422				/*
423				 * sti_direct is not set on listener, meaning
424				 * that the listener has been converted from
425				 * a socket to a stream.  Ensure that the
426				 * acceptor inherits these settings.
427				 */
428				sti->sti_direct = 0;
429				flags &= ~SO_SOCKSTR;
430			}
431		}
432	}
433
434	/*
435	 * Tell local transport that it is talking to sockets.
436	 */
437	if (so->so_family == AF_UNIX) {
438		flags |= SO_SOCKSTR;
439	}
440
441	vp = SOTOV(so);
442	newdev = vp->v_rdev;
443	maj = getmajor(newdev);
444	ASSERT(STREAMSTAB(maj));
445
446	error = stropen(vp, &newdev, flags, cr);
447
448	stp = vp->v_stream;
449	if (error == 0) {
450		if (so->so_flag & SOCLONE)
451			ASSERT(newdev != vp->v_rdev);
452		mutex_enter(&so->so_lock);
453		sti->sti_dev = newdev;
454		vp->v_rdev = newdev;
455		mutex_exit(&so->so_lock);
456
457		if (stp->sd_flag & STRISTTY) {
458			/*
459			 * this is a post SVR4 tty driver - a socket can not
460			 * be a controlling terminal. Fail the open.
461			 */
462			(void) sotpi_close(so, flags, cr);
463			return (ENOTTY);	/* XXX */
464		}
465
466		ASSERT(stp->sd_wrq != NULL);
467		sti->sti_provinfo = tpi_findprov(stp->sd_wrq);
468
469		/*
470		 * If caller is interested in doing direct function call
471		 * interface to/from transport module, probe the module
472		 * directly beneath the streamhead to see if it qualifies.
473		 *
474		 * We turn off the direct interface when qualifications fail.
475		 * In the acceptor case, we simply turn off the sti_direct
476		 * flag on the socket. We do the fallback after the accept
477		 * has completed, before the new socket is returned to the
478		 * application.
479		 */
480		if (sti->sti_direct) {
481			queue_t *tq = stp->sd_wrq->q_next;
482
483			/*
484			 * sti_direct is currently supported and tested
485			 * only for tcp/udp; this is the main reason to
486			 * have the following assertions.
487			 */
488			ASSERT(so->so_family == AF_INET ||
489			    so->so_family == AF_INET6);
490			ASSERT(so->so_protocol == IPPROTO_UDP ||
491			    so->so_protocol == IPPROTO_TCP ||
492			    so->so_protocol == IPPROTO_IP);
493			ASSERT(so->so_type == SOCK_DGRAM ||
494			    so->so_type == SOCK_STREAM);
495
496			/*
497			 * Abort direct call interface if the module directly
498			 * underneath the stream head is not defined with the
499			 * _D_DIRECT flag.  This could happen in the tcp or
500			 * udp case, when some other module is autopushed
501			 * above it, or for some reasons the expected module
502			 * isn't purely D_MP (which is the main requirement).
503			 */
504			if (!socktpi_direct || !(tq->q_flag & _QDIRECT) ||
505			    !(_OTHERQ(tq)->q_flag & _QDIRECT)) {
506				int rval;
507
508				/* Continue on without direct calls */
509				sti->sti_direct = 0;
510
511				/*
512				 * Cannot issue ioctl on fallback socket since
513				 * there is no conn associated with the queue.
514				 * The fallback downcall will notify the proto
515				 * of the change.
516				 */
517				if (!(flags & SO_ACCEPTOR) &&
518				    !(flags & SO_FALLBACK)) {
519					if ((error = strioctl(vp,
520					    _SIOCSOCKFALLBACK, 0, 0, K_TO_K,
521					    cr, &rval)) != 0) {
522						(void) sotpi_close(so, flags,
523						    cr);
524						return (error);
525					}
526				}
527			}
528		}
529
530		if (flags & SO_FALLBACK) {
531			/*
532			 * The stream created does not have a conn.
533			 * do stream set up after conn has been assigned
534			 */
535			return (error);
536		}
537		if (error = so_strinit(so, tso)) {
538			(void) sotpi_close(so, flags, cr);
539			return (error);
540		}
541
542		/* Wildcard */
543		if (so->so_protocol != so->so_sockparams->sp_protocol) {
544			int protocol = so->so_protocol;
545			/*
546			 * Issue SO_PROTOTYPE setsockopt.
547			 */
548			error = sotpi_setsockopt(so, SOL_SOCKET, SO_PROTOTYPE,
549			    &protocol, (t_uscalar_t)sizeof (protocol), cr);
550			if (error != 0) {
551				(void) sotpi_close(so, flags, cr);
552				/*
553				 * Setsockopt often fails with ENOPROTOOPT but
554				 * socket() should fail with
555				 * EPROTONOSUPPORT/EPROTOTYPE.
556				 */
557				return (EPROTONOSUPPORT);
558			}
559		}
560
561	} else {
562		/*
563		 * While the same socket can not be reopened (unlike specfs)
564		 * the stream head sets STREOPENFAIL when the autopush fails.
565		 */
566		if ((stp != NULL) &&
567		    (stp->sd_flag & STREOPENFAIL)) {
568			/*
569			 * Open failed part way through.
570			 */
571			mutex_enter(&stp->sd_lock);
572			stp->sd_flag &= ~STREOPENFAIL;
573			mutex_exit(&stp->sd_lock);
574			(void) sotpi_close(so, flags, cr);
575			return (error);
576			/*NOTREACHED*/
577		}
578		ASSERT(stp == NULL);
579	}
580	TRACE_4(TR_FAC_SOCKFS, TR_SOCKFS_OPEN,
581	    "sockfs open:maj %d vp %p so %p error %d",
582	    maj, vp, so, error);
583	return (error);
584}
585
586/*
587 * Bind the socket to an unspecified address in sockfs only.
588 * Used for TCP/UDP transports where we know that the O_T_BIND_REQ isn't
589 * required in all cases.
590 */
591static void
592so_automatic_bind(struct sonode *so)
593{
594	sotpi_info_t *sti = SOTOTPI(so);
595	ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
596
597	ASSERT(MUTEX_HELD(&so->so_lock));
598	ASSERT(!(so->so_state & SS_ISBOUND));
599	ASSERT(sti->sti_unbind_mp);
600
601	ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
602	bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
603	sti->sti_laddr_sa->sa_family = so->so_family;
604	so->so_state |= SS_ISBOUND;
605}
606
607
/*
 * bind the socket.
 *
 * If the socket is already bound and none of _SOBIND_SOCKBSD or _SOBIND_XPG4_2
 * are passed in we allow rebinding. Note that for backwards compatibility
 * even "svr4" sockets pass in _SOBIND_SOCKBSD/SOV_SOCKBSD to sobind/bind.
 * Thus the rebinding code is currently not executed.
 *
 * The constraints for rebinding are:
 * - it is a SOCK_DGRAM, or
 * - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
 *   and no listen() has been done.
 * This rebinding code was added based on some language in the XNET book
 * about not returning EINVAL if the protocol allows rebinding. However,
 * this language is not present in the Posix socket draft. Thus maybe the
 * rebinding logic should be deleted from the source.
 *
 * A null "name" can be used to unbind the socket if:
 * - it is a SOCK_DGRAM, or
 * - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
 *   and no listen() has been done.
 */
630/* ARGSUSED */
631static int
632sotpi_bindlisten(struct sonode *so, struct sockaddr *name,
633    socklen_t namelen, int backlog, int flags, struct cred *cr)
634{
635	struct T_bind_req	bind_req;
636	struct T_bind_ack	*bind_ack;
637	int			error = 0;
638	mblk_t			*mp;
639	void			*addr;
640	t_uscalar_t		addrlen;
641	int			unbind_on_err = 1;
642	boolean_t		clear_acceptconn_on_err = B_FALSE;
643	boolean_t		restore_backlog_on_err = B_FALSE;
644	int			save_so_backlog;
645	t_scalar_t		PRIM_type = O_T_BIND_REQ;
646	boolean_t		tcp_udp_xport;
647	void			*nl7c = NULL;
648	sotpi_info_t		*sti = SOTOTPI(so);
649
650	dprintso(so, 1, ("sotpi_bindlisten(%p, %p, %d, %d, 0x%x) %s\n",
651	    (void *)so, (void *)name, namelen, backlog, flags,
652	    pr_state(so->so_state, so->so_mode)));
653
654	tcp_udp_xport = so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM;
655
656	if (!(flags & _SOBIND_LOCK_HELD)) {
657		mutex_enter(&so->so_lock);
658		so_lock_single(so);	/* Set SOLOCKED */
659	} else {
660		ASSERT(MUTEX_HELD(&so->so_lock));
661		ASSERT(so->so_flag & SOLOCKED);
662	}
663
	/*
	 * Make sure that there is a preallocated unbind_req message
	 * before binding. This message is allocated when the socket is
	 * created but it might have been consumed.
	 */
669	if (sti->sti_unbind_mp == NULL) {
670		dprintso(so, 1, ("sobind: allocating unbind_req\n"));
671		/* NOTE: holding so_lock while sleeping */
672		sti->sti_unbind_mp =
673		    soallocproto(sizeof (struct T_unbind_req), _ALLOC_SLEEP,
674		    cr);
675	}
676
677	if (flags & _SOBIND_REBIND) {
678		/*
679		 * Called from solisten after doing an sotpi_unbind() or
680		 * potentially without the unbind (latter for AF_INET{,6}).
681		 */
682		ASSERT(name == NULL && namelen == 0);
683
684		if (so->so_family == AF_UNIX) {
685			ASSERT(sti->sti_ux_bound_vp);
686			addr = &sti->sti_ux_laddr;
687			addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
688			dprintso(so, 1, ("sobind rebind UNIX: addrlen %d, "
689			    "addr 0x%p, vp %p\n",
690			    addrlen,
691			    (void *)((struct so_ux_addr *)addr)->soua_vp,
692			    (void *)sti->sti_ux_bound_vp));
693		} else {
694			addr = sti->sti_laddr_sa;
695			addrlen = (t_uscalar_t)sti->sti_laddr_len;
696		}
697	} else if (flags & _SOBIND_UNSPEC) {
698		ASSERT(name == NULL && namelen == 0);
699
700		/*
701		 * The caller checked SS_ISBOUND but not necessarily
702		 * under so_lock
703		 */
704		if (so->so_state & SS_ISBOUND) {
705			/* No error */
706			goto done;
707		}
708
709		/* Set an initial local address */
710		switch (so->so_family) {
711		case AF_UNIX:
712			/*
713			 * Use an address with same size as struct sockaddr
714			 * just like BSD.
715			 */
716			sti->sti_laddr_len =
717			    (socklen_t)sizeof (struct sockaddr);
718			ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
719			bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
720			sti->sti_laddr_sa->sa_family = so->so_family;
721
722			/*
723			 * Pass down an address with the implicit bind
724			 * magic number and the rest all zeros.
725			 * The transport will return a unique address.
726			 */
727			sti->sti_ux_laddr.soua_vp = NULL;
728			sti->sti_ux_laddr.soua_magic = SOU_MAGIC_IMPLICIT;
729			addr = &sti->sti_ux_laddr;
730			addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
731			break;
732
733		case AF_INET:
734		case AF_INET6:
735			/*
736			 * An unspecified bind in TPI has a NULL address.
737			 * Set the address in sockfs to have the sa_family.
738			 */
739			sti->sti_laddr_len = (so->so_family == AF_INET) ?
740			    (socklen_t)sizeof (sin_t) :
741			    (socklen_t)sizeof (sin6_t);
742			ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
743			bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
744			sti->sti_laddr_sa->sa_family = so->so_family;
745			addr = NULL;
746			addrlen = 0;
747			break;
748
749		default:
750			/*
751			 * An unspecified bind in TPI has a NULL address.
752			 * Set the address in sockfs to be zero length.
753			 *
754			 * Can not assume there is a sa_family for all
755			 * protocol families. For example, AF_X25 does not
756			 * have a family field.
757			 */
758			bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
759			sti->sti_laddr_len = 0;	/* XXX correct? */
760			addr = NULL;
761			addrlen = 0;
762			break;
763		}
764
765	} else {
766		if (so->so_state & SS_ISBOUND) {
767			/*
768			 * If it is ok to rebind the socket, first unbind
769			 * with the transport. A rebind to the NULL address
770			 * is interpreted as an unbind.
771			 * Note that a bind to NULL in BSD does unbind the
772			 * socket but it fails with EINVAL.
773			 * Note that regular sockets set SOV_SOCKBSD i.e.
774			 * _SOBIND_SOCKBSD gets set here hence no type of
775			 * socket does currently allow rebinding.
776			 *
777			 * If the name is NULL just do an unbind.
778			 */
779			if (flags & (_SOBIND_SOCKBSD|_SOBIND_XPG4_2) &&
780			    name != NULL) {
781				error = EINVAL;
782				unbind_on_err = 0;
783				eprintsoline(so, error);
784				goto done;
785			}
786			if ((so->so_mode & SM_CONNREQUIRED) &&
787			    (so->so_state & SS_CANTREBIND)) {
788				error = EINVAL;
789				unbind_on_err = 0;
790				eprintsoline(so, error);
791				goto done;
792			}
793			error = sotpi_unbind(so, 0);
794			if (error) {
795				eprintsoline(so, error);
796				goto done;
797			}
798			ASSERT(!(so->so_state & SS_ISBOUND));
799			if (name == NULL) {
800				so->so_state &=
801				    ~(SS_ISCONNECTED|SS_ISCONNECTING);
802				goto done;
803			}
804		}
805
806		/* X/Open requires this check */
807		if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
808			if (xnet_check_print) {
809				printf("sockfs: X/Open bind state check "
810				    "caused EINVAL\n");
811			}
812			error = EINVAL;
813			goto done;
814		}
815
816		switch (so->so_family) {
817		case AF_UNIX:
818			/*
819			 * All AF_UNIX addresses are nul terminated
820			 * when copied (copyin_name) in so the minimum
821			 * length is 3 bytes.
822			 */
823			if (name == NULL ||
824			    (ssize_t)namelen <= sizeof (short) + 1) {
825				error = EISDIR;
826				eprintsoline(so, error);
827				goto done;
828			}
829			/*
830			 * Verify so_family matches the bound family.
831			 * BSD does not check this for AF_UNIX resulting
832			 * in funny mknods.
833			 */
834			if (name->sa_family != so->so_family) {
835				error = EAFNOSUPPORT;
836				goto done;
837			}
838			break;
839		case AF_INET:
840			if (name == NULL) {
841				error = EINVAL;
842				eprintsoline(so, error);
843				goto done;
844			}
845			if ((size_t)namelen != sizeof (sin_t)) {
846				error = name->sa_family != so->so_family ?
847				    EAFNOSUPPORT : EINVAL;
848				eprintsoline(so, error);
849				goto done;
850			}
851			if ((flags & _SOBIND_XPG4_2) &&
852			    (name->sa_family != so->so_family)) {
853				/*
854				 * This check has to be made for X/Open
855				 * sockets however application failures have
856				 * been observed when it is applied to
857				 * all sockets.
858				 */
859				error = EAFNOSUPPORT;
860				eprintsoline(so, error);
861				goto done;
862			}
863			/*
864			 * Force a zero sa_family to match so_family.
865			 *
866			 * Some programs like inetd(1M) don't set the
867			 * family field. Other programs leave
868			 * sin_family set to garbage - SunOS 4.X does
869			 * not check the family field on a bind.
870			 * We use the family field that
871			 * was passed in to the socket() call.
872			 */
873			name->sa_family = so->so_family;
874			break;
875
876		case AF_INET6: {
877#ifdef DEBUG
878			sin6_t *sin6 = (sin6_t *)name;
879#endif /* DEBUG */
880
881			if (name == NULL) {
882				error = EINVAL;
883				eprintsoline(so, error);
884				goto done;
885			}
886			if ((size_t)namelen != sizeof (sin6_t)) {
887				error = name->sa_family != so->so_family ?
888				    EAFNOSUPPORT : EINVAL;
889				eprintsoline(so, error);
890				goto done;
891			}
892			if (name->sa_family != so->so_family) {
893				/*
894				 * With IPv6 we require the family to match
895				 * unlike in IPv4.
896				 */
897				error = EAFNOSUPPORT;
898				eprintsoline(so, error);
899				goto done;
900			}
901#ifdef DEBUG
902			/*
903			 * Verify that apps don't forget to clear
904			 * sin6_scope_id etc
905			 */
906			if (sin6->sin6_scope_id != 0 &&
907			    !IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) {
908				zcmn_err(getzoneid(), CE_WARN,
909				    "bind with uninitialized sin6_scope_id "
910				    "(%d) on socket. Pid = %d\n",
911				    (int)sin6->sin6_scope_id,
912				    (int)curproc->p_pid);
913			}
914			if (sin6->__sin6_src_id != 0) {
915				zcmn_err(getzoneid(), CE_WARN,
916				    "bind with uninitialized __sin6_src_id "
917				    "(%d) on socket. Pid = %d\n",
918				    (int)sin6->__sin6_src_id,
919				    (int)curproc->p_pid);
920			}
921#endif /* DEBUG */
922			break;
923		}
924		default:
925			/*
926			 * Don't do any length or sa_family check to allow
927			 * non-sockaddr style addresses.
928			 */
929			if (name == NULL) {
930				error = EINVAL;
931				eprintsoline(so, error);
932				goto done;
933			}
934			break;
935		}
936
937		if (namelen > (t_uscalar_t)sti->sti_laddr_maxlen) {
938			error = ENAMETOOLONG;
939			eprintsoline(so, error);
940			goto done;
941		}
942		/*
943		 * Save local address.
944		 */
945		sti->sti_laddr_len = (socklen_t)namelen;
946		ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
947		bcopy(name, sti->sti_laddr_sa, namelen);
948
949		addr = sti->sti_laddr_sa;
950		addrlen = (t_uscalar_t)sti->sti_laddr_len;
951		switch (so->so_family) {
952		case AF_INET6:
953		case AF_INET:
954			break;
955		case AF_UNIX: {
956			struct sockaddr_un *soun =
957			    (struct sockaddr_un *)sti->sti_laddr_sa;
958			struct vnode *vp, *rvp;
959			struct vattr vattr;
960
961			ASSERT(sti->sti_ux_bound_vp == NULL);
962			/*
963			 * Create vnode for the specified path name.
964			 * Keep vnode held with a reference in sti_ux_bound_vp.
965			 * Use the vnode pointer as the address used in the
966			 * bind with the transport.
967			 *
968			 * Use the same mode as in BSD. In particular this does
969			 * not observe the umask.
970			 */
971			/* MAXPATHLEN + soun_family + nul termination */
972			if (sti->sti_laddr_len >
973			    (socklen_t)(MAXPATHLEN + sizeof (short) + 1)) {
974				error = ENAMETOOLONG;
975				eprintsoline(so, error);
976				goto done;
977			}
978			vattr.va_type = VSOCK;
979			vattr.va_mode = 0777 & ~PTOU(curproc)->u_cmask;
980			vattr.va_mask = AT_TYPE|AT_MODE;
981			/* NOTE: holding so_lock */
982			error = vn_create(soun->sun_path, UIO_SYSSPACE, &vattr,
983			    EXCL, 0, &vp, CRMKNOD, 0, 0);
984			if (error) {
985				if (error == EEXIST)
986					error = EADDRINUSE;
987				eprintsoline(so, error);
988				goto done;
989			}
990			/*
991			 * Establish pointer from the underlying filesystem
992			 * vnode to the socket node.
993			 * sti_ux_bound_vp and v_stream->sd_vnode form the
994			 * cross-linkage between the underlying filesystem
995			 * node and the socket node.
996			 */
997
998			if ((VOP_REALVP(vp, &rvp, NULL) == 0) && (vp != rvp)) {
999				VN_HOLD(rvp);
1000				VN_RELE(vp);
1001				vp = rvp;
1002			}
1003
1004			ASSERT(SOTOV(so)->v_stream);
1005			mutex_enter(&vp->v_lock);
1006			vp->v_stream = SOTOV(so)->v_stream;
1007			sti->sti_ux_bound_vp = vp;
1008			mutex_exit(&vp->v_lock);
1009
1010			/*
1011			 * Use the vnode pointer value as a unique address
1012			 * (together with the magic number to avoid conflicts
1013			 * with implicit binds) in the transport provider.
1014			 */
1015			sti->sti_ux_laddr.soua_vp =
1016			    (void *)sti->sti_ux_bound_vp;
1017			sti->sti_ux_laddr.soua_magic = SOU_MAGIC_EXPLICIT;
1018			addr = &sti->sti_ux_laddr;
1019			addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
1020			dprintso(so, 1, ("sobind UNIX: addrlen %d, addr %p\n",
1021			    addrlen,
1022			    (void *)((struct so_ux_addr *)addr)->soua_vp));
1023			break;
1024		}
1025		} /* end switch (so->so_family) */
1026	}
1027
1028	/*
1029	 * set SS_ACCEPTCONN before sending down O_T_BIND_REQ since
1030	 * the transport can start passing up T_CONN_IND messages
1031	 * as soon as it receives the bind req and strsock_proto()
1032	 * insists that SS_ACCEPTCONN is set when processing T_CONN_INDs.
1033	 */
1034	if (flags & _SOBIND_LISTEN) {
1035		if ((so->so_state & SS_ACCEPTCONN) == 0)
1036			clear_acceptconn_on_err = B_TRUE;
1037		save_so_backlog = so->so_backlog;
1038		restore_backlog_on_err = B_TRUE;
1039		so->so_state |= SS_ACCEPTCONN;
1040		so->so_backlog = backlog;
1041	}
1042
1043	/*
1044	 * If NL7C addr(s) have been configured check for addr/port match,
1045	 * or if an implicit NL7C socket via AF_NCA mark socket as NL7C.
1046	 *
1047	 * NL7C supports the TCP transport only so check AF_INET and AF_INET6
1048	 * family sockets only. If match mark as such.
1049	 */
1050	if (nl7c_enabled && ((addr != NULL &&
1051	    (so->so_family == AF_INET || so->so_family == AF_INET6) &&
1052	    (nl7c = nl7c_lookup_addr(addr, addrlen))) ||
1053	    sti->sti_nl7c_flags == NL7C_AF_NCA)) {
1054		/*
1055		 * NL7C is not supported in non-global zones,
1056		 * we enforce this restriction here.
1057		 */
1058		if (so->so_zoneid == GLOBAL_ZONEID) {
1059			/* An NL7C socket, mark it */
1060			sti->sti_nl7c_flags |= NL7C_ENABLED;
1061			if (nl7c == NULL) {
1062				/*
1063				 * Was an AF_NCA bind() so add it to the
1064				 * addr list for reporting purposes.
1065				 */
1066				nl7c = nl7c_add_addr(addr, addrlen);
1067			}
1068		} else
1069			nl7c = NULL;
1070	}
1071
1072	/*
1073	 * We send a T_BIND_REQ for TCP/UDP since we know it supports it,
1074	 * for other transports we will send in a O_T_BIND_REQ.
1075	 */
1076	if (tcp_udp_xport &&
1077	    (so->so_family == AF_INET || so->so_family == AF_INET6))
1078		PRIM_type = T_BIND_REQ;
1079
1080	bind_req.PRIM_type = PRIM_type;
1081	bind_req.ADDR_length = addrlen;
1082	bind_req.ADDR_offset = (t_scalar_t)sizeof (bind_req);
1083	bind_req.CONIND_number = backlog;
1084	/* NOTE: holding so_lock while sleeping */
1085	mp = soallocproto2(&bind_req, sizeof (bind_req),
1086	    addr, addrlen, 0, _ALLOC_SLEEP, cr);
1087	sti->sti_laddr_valid = 0;
1088
1089	/* Done using sti_laddr_sa - can drop the lock */
1090	mutex_exit(&so->so_lock);
1091
1092	/*
1093	 * Intercept the bind_req message here to check if this <address/port>
1094	 * was configured as an SSL proxy server, or if another endpoint was
1095	 * already configured to act as a proxy for us.
1096	 *
1097	 * Note, only if NL7C not enabled for this socket.
1098	 */
1099	if (nl7c == NULL &&
1100	    (so->so_family == AF_INET || so->so_family == AF_INET6) &&
1101	    so->so_type == SOCK_STREAM) {
1102
1103		if (sti->sti_kssl_ent != NULL) {
1104			kssl_release_ent(sti->sti_kssl_ent, so,
1105			    sti->sti_kssl_type);
1106			sti->sti_kssl_ent = NULL;
1107		}
1108
1109		sti->sti_kssl_type = kssl_check_proxy(mp, so,
1110		    &sti->sti_kssl_ent);
1111		switch (sti->sti_kssl_type) {
1112		case KSSL_NO_PROXY:
1113			break;
1114
1115		case KSSL_HAS_PROXY:
1116			mutex_enter(&so->so_lock);
1117			goto skip_transport;
1118
1119		case KSSL_IS_PROXY:
1120			break;
1121		}
1122	}
1123
1124	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1125	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1126	if (error) {
1127		eprintsoline(so, error);
1128		mutex_enter(&so->so_lock);
1129		goto done;
1130	}
1131
1132	mutex_enter(&so->so_lock);
1133	error = sowaitprim(so, PRIM_type, T_BIND_ACK,
1134	    (t_uscalar_t)sizeof (*bind_ack), &mp, 0);
1135	if (error) {
1136		eprintsoline(so, error);
1137		goto done;
1138	}
1139skip_transport:
1140	ASSERT(mp);
1141	/*
1142	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1143	 * strsock_proto while the lock was dropped above, the bind
1144	 * is allowed to complete.
1145	 */
1146
1147	/* Mark as bound. This will be undone if we detect errors below. */
1148	if (flags & _SOBIND_NOXLATE) {
1149		ASSERT(so->so_family == AF_UNIX);
1150		sti->sti_faddr_noxlate = 1;
1151	}
1152	ASSERT(!(so->so_state & SS_ISBOUND) || (flags & _SOBIND_REBIND));
1153	so->so_state |= SS_ISBOUND;
1154	ASSERT(sti->sti_unbind_mp);
1155
1156	/* note that we've already set SS_ACCEPTCONN above */
1157
1158	/*
1159	 * Recompute addrlen - an unspecified bind sent down an
1160	 * address of length zero but we expect the appropriate length
1161	 * in return.
1162	 */
1163	addrlen = (t_uscalar_t)(so->so_family == AF_UNIX ?
1164	    sizeof (sti->sti_ux_laddr) : sti->sti_laddr_len);
1165
1166	bind_ack = (struct T_bind_ack *)mp->b_rptr;
1167	/*
1168	 * The alignment restriction is really too strict but
1169	 * we want enough alignment to inspect the fields of
1170	 * a sockaddr_in.
1171	 */
1172	addr = sogetoff(mp, bind_ack->ADDR_offset,
1173	    bind_ack->ADDR_length,
1174	    __TPI_ALIGN_SIZE);
1175	if (addr == NULL) {
1176		freemsg(mp);
1177		error = EPROTO;
1178		eprintsoline(so, error);
1179		goto done;
1180	}
1181	if (!(flags & _SOBIND_UNSPEC)) {
1182		/*
1183		 * Verify that the transport didn't return something we
1184		 * did not want e.g. an address other than what we asked for.
1185		 *
1186		 * NOTE: These checks would go away if/when we switch to
1187		 * using the new TPI (in which the transport would fail
1188		 * the request instead of assigning a different address).
1189		 *
1190		 * NOTE2: For protocols that we don't know (i.e. any
1191		 * other than AF_INET6, AF_INET and AF_UNIX), we
1192		 * cannot know if the transport should be expected to
1193		 * return the same address as that requested.
1194		 *
1195		 * NOTE3: For AF_INET and AF_INET6, TCP/UDP, we send
1196		 * down a T_BIND_REQ. We use O_T_BIND_REQ for others.
1197		 *
1198		 * For example, in the case of netatalk it may be
1199		 * inappropriate for the transport to return the
1200		 * requested address (as it may have allocated a local
1201		 * port number in behaviour similar to that of an
1202		 * AF_INET bind request with a port number of zero).
1203		 *
1204		 * Given the definition of O_T_BIND_REQ, where the
1205		 * transport may bind to an address other than the
1206		 * requested address, it's not possible to determine
1207		 * whether a returned address that differs from the
1208		 * requested address is a reason to fail (because the
1209		 * requested address was not available) or succeed
1210		 * (because the transport allocated an appropriate
1211		 * address and/or port).
1212		 *
1213		 * sockfs currently requires that the transport return
1214		 * the requested address in the T_BIND_ACK, unless
1215		 * there is code here to allow for any discrepancy.
1216		 * Such code exists for AF_INET and AF_INET6.
1217		 *
1218		 * Netatalk chooses to return the requested address
1219		 * rather than the (correct) allocated address.  This
1220		 * means that netatalk violates the TPI specification
1221		 * (and would not function correctly if used from a
1222		 * TLI application), but it does mean that it works
1223		 * with sockfs.
1224		 *
1225		 * As noted above, using the newer XTI bind primitive
1226		 * (T_BIND_REQ) in preference to O_T_BIND_REQ would
1227		 * allow sockfs to be more sure about whether or not
1228		 * the bind request had succeeded (as transports are
1229		 * not permitted to bind to a different address than
1230		 * that requested - they must return failure).
1231		 * Unfortunately, support for T_BIND_REQ may not be
1232		 * present in all transport implementations (netatalk,
1233		 * for example, doesn't have it), making the
1234		 * transition difficult.
1235		 */
1236		if (bind_ack->ADDR_length != addrlen) {
1237			/* Assumes that the requested address was in use */
1238			freemsg(mp);
1239			error = EADDRINUSE;
1240			eprintsoline(so, error);
1241			goto done;
1242		}
1243
1244		switch (so->so_family) {
1245		case AF_INET6:
1246		case AF_INET: {
1247			sin_t *rname, *aname;
1248
1249			rname = (sin_t *)addr;
1250			aname = (sin_t *)sti->sti_laddr_sa;
1251
1252			/*
1253			 * Take advantage of the alignment
1254			 * of sin_port and sin6_port which fall
1255			 * in the same place in their data structures.
1256			 * Just use sin_port for either address family.
1257			 *
1258			 * This may become a problem if (heaven forbid)
1259			 * there's a separate ipv6port_reserved... :-P
1260			 *
1261			 * Binding to port 0 has the semantics of letting
1262			 * the transport bind to any port.
1263			 *
1264			 * If the transport is TCP or UDP since we had sent
1265			 * a T_BIND_REQ we would not get a port other than
1266			 * what we asked for.
1267			 */
1268			if (tcp_udp_xport) {
1269				/*
1270				 * Pick up the new port number if we bound to
1271				 * port 0.
1272				 */
1273				if (aname->sin_port == 0)
1274					aname->sin_port = rname->sin_port;
1275				sti->sti_laddr_valid = 1;
1276				break;
1277			}
1278			if (aname->sin_port != 0 &&
1279			    aname->sin_port != rname->sin_port) {
1280				freemsg(mp);
1281				error = EADDRINUSE;
1282				eprintsoline(so, error);
1283				goto done;
1284			}
1285			/*
1286			 * Pick up the new port number if we bound to port 0.
1287			 */
1288			aname->sin_port = rname->sin_port;
1289
1290			/*
1291			 * Unfortunately, addresses aren't _quite_ the same.
1292			 */
1293			if (so->so_family == AF_INET) {
1294				if (aname->sin_addr.s_addr !=
1295				    rname->sin_addr.s_addr) {
1296					freemsg(mp);
1297					error = EADDRNOTAVAIL;
1298					eprintsoline(so, error);
1299					goto done;
1300				}
1301			} else {
1302				sin6_t *rname6 = (sin6_t *)rname;
1303				sin6_t *aname6 = (sin6_t *)aname;
1304
1305				if (!IN6_ARE_ADDR_EQUAL(&aname6->sin6_addr,
1306				    &rname6->sin6_addr)) {
1307					freemsg(mp);
1308					error = EADDRNOTAVAIL;
1309					eprintsoline(so, error);
1310					goto done;
1311				}
1312			}
1313			break;
1314		}
1315		case AF_UNIX:
1316			if (bcmp(addr, &sti->sti_ux_laddr, addrlen) != 0) {
1317				freemsg(mp);
1318				error = EADDRINUSE;
1319				eprintsoline(so, error);
1320				eprintso(so,
1321				    ("addrlen %d, addr 0x%x, vp %p\n",
1322				    addrlen, *((int *)addr),
1323				    (void *)sti->sti_ux_bound_vp));
1324				goto done;
1325			}
1326			sti->sti_laddr_valid = 1;
1327			break;
1328		default:
1329			/*
1330			 * NOTE: This assumes that addresses can be
1331			 * byte-compared for equivalence.
1332			 */
1333			if (bcmp(addr, sti->sti_laddr_sa, addrlen) != 0) {
1334				freemsg(mp);
1335				error = EADDRINUSE;
1336				eprintsoline(so, error);
1337				goto done;
1338			}
1339			/*
1340			 * Don't mark sti_laddr_valid, as we cannot be
1341			 * sure that the returned address is the real
1342			 * bound address when talking to an unknown
1343			 * transport.
1344			 */
1345			break;
1346		}
1347	} else {
1348		/*
1349		 * Save the returned address for getsockname.
1350		 * Needed for unspecific bind unless transport supports
1351		 * the TI_GETMYNAME ioctl.
1352		 * Do this for AF_INET{,6} even though they do, as
1353		 * caching info here is much better performance than
1354		 * a TPI/STREAMS trip to the transport for getsockname.
1355		 * Any which can't for some reason _must_ _not_ set
1356		 * sti_laddr_valid here for the caching version of
1357		 * getsockname to not break;
1358		 */
1359		switch (so->so_family) {
1360		case AF_UNIX:
1361			/*
1362			 * Record the address bound with the transport
1363			 * for use by socketpair.
1364			 */
1365			bcopy(addr, &sti->sti_ux_laddr, addrlen);
1366			sti->sti_laddr_valid = 1;
1367			break;
1368		case AF_INET:
1369		case AF_INET6:
1370			ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
1371			bcopy(addr, sti->sti_laddr_sa, sti->sti_laddr_len);
1372			sti->sti_laddr_valid = 1;
1373			break;
1374		default:
1375			/*
1376			 * Don't mark sti_laddr_valid, as we cannot be
1377			 * sure that the returned address is the real
1378			 * bound address when talking to an unknown
1379			 * transport.
1380			 */
1381			break;
1382		}
1383	}
1384
1385	if (nl7c != NULL) {
1386		/* Register listen()er sonode pointer with NL7C */
1387		nl7c_listener_addr(nl7c, so);
1388	}
1389
1390	freemsg(mp);
1391
1392done:
1393	if (error) {
1394		/* reset state & backlog to values held on entry */
1395		if (clear_acceptconn_on_err == B_TRUE)
1396			so->so_state &= ~SS_ACCEPTCONN;
1397		if (restore_backlog_on_err == B_TRUE)
1398			so->so_backlog = save_so_backlog;
1399
1400		if (unbind_on_err && so->so_state & SS_ISBOUND) {
1401			int err;
1402
1403			err = sotpi_unbind(so, 0);
1404			/* LINTED - statement has no consequent: if */
1405			if (err) {
1406				eprintsoline(so, error);
1407			} else {
1408				ASSERT(!(so->so_state & SS_ISBOUND));
1409			}
1410		}
1411	}
1412	if (!(flags & _SOBIND_LOCK_HELD)) {
1413		so_unlock_single(so, SOLOCKED);
1414		mutex_exit(&so->so_lock);
1415	} else {
1416		ASSERT(MUTEX_HELD(&so->so_lock));
1417		ASSERT(so->so_flag & SOLOCKED);
1418	}
1419	return (error);
1420}
1421
1422/* bind the socket */
1423static int
1424sotpi_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
1425    int flags, struct cred *cr)
1426{
1427	if ((flags & _SOBIND_SOCKETPAIR) == 0)
1428		return (sotpi_bindlisten(so, name, namelen, 0, flags, cr));
1429
1430	flags &= ~_SOBIND_SOCKETPAIR;
1431	return (sotpi_bindlisten(so, name, namelen, 1, flags, cr));
1432}
1433
1434/*
1435 * Unbind a socket - used when bind() fails, when bind() specifies a NULL
1436 * address, or when listen needs to unbind and bind.
1437 * If the _SOUNBIND_REBIND flag is specified the addresses are retained
1438 * so that a sobind can pick them up.
1439 */
1440static int
1441sotpi_unbind(struct sonode *so, int flags)
1442{
1443	struct T_unbind_req	unbind_req;
1444	int			error = 0;
1445	mblk_t			*mp;
1446	sotpi_info_t		*sti = SOTOTPI(so);
1447
1448	dprintso(so, 1, ("sotpi_unbind(%p, 0x%x) %s\n",
1449	    (void *)so, flags, pr_state(so->so_state, so->so_mode)));
1450
1451	ASSERT(MUTEX_HELD(&so->so_lock));
1452	ASSERT(so->so_flag & SOLOCKED);
1453
1454	if (!(so->so_state & SS_ISBOUND)) {
1455		error = EINVAL;
1456		eprintsoline(so, error);
1457		goto done;
1458	}
1459
1460	mutex_exit(&so->so_lock);
1461
1462	/*
1463	 * Flush the read and write side (except stream head read queue)
1464	 * and send down T_UNBIND_REQ.
1465	 */
1466	(void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHRW);
1467
1468	unbind_req.PRIM_type = T_UNBIND_REQ;
1469	mp = soallocproto1(&unbind_req, sizeof (unbind_req),
1470	    0, _ALLOC_SLEEP, CRED());
1471	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1472	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1473	mutex_enter(&so->so_lock);
1474	if (error) {
1475		eprintsoline(so, error);
1476		goto done;
1477	}
1478
1479	error = sowaitokack(so, T_UNBIND_REQ);
1480	if (error) {
1481		eprintsoline(so, error);
1482		goto done;
1483	}
1484
1485	/*
1486	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1487	 * strsock_proto while the lock was dropped above, the unbind
1488	 * is allowed to complete.
1489	 */
1490	if (!(flags & _SOUNBIND_REBIND)) {
1491		/*
1492		 * Clear out bound address.
1493		 */
1494		vnode_t *vp;
1495
1496		if ((vp = sti->sti_ux_bound_vp) != NULL) {
1497
1498			/* Undo any SSL proxy setup */
1499			if ((so->so_family == AF_INET ||
1500			    so->so_family == AF_INET6) &&
1501			    (so->so_type == SOCK_STREAM) &&
1502			    (sti->sti_kssl_ent != NULL)) {
1503				kssl_release_ent(sti->sti_kssl_ent, so,
1504				    sti->sti_kssl_type);
1505				sti->sti_kssl_ent = NULL;
1506				sti->sti_kssl_type = KSSL_NO_PROXY;
1507			}
1508			sti->sti_ux_bound_vp = NULL;
1509			vn_rele_stream(vp);
1510		}
1511		/* Clear out address */
1512		sti->sti_laddr_len = 0;
1513	}
1514	so->so_state &= ~(SS_ISBOUND|SS_ACCEPTCONN);
1515	sti->sti_laddr_valid = 0;
1516
1517done:
1518
1519	/* If the caller held the lock don't release it here */
1520	ASSERT(MUTEX_HELD(&so->so_lock));
1521	ASSERT(so->so_flag & SOLOCKED);
1522
1523	return (error);
1524}
1525
1526/*
1527 * listen on the socket.
1528 * For TPI conforming transports this has to first unbind with the transport
1529 * and then bind again using the new backlog.
1530 */
1531/* ARGSUSED */
1532int
1533sotpi_listen(struct sonode *so, int backlog, struct cred *cr)
1534{
1535	int		error = 0;
1536	sotpi_info_t	*sti = SOTOTPI(so);
1537
1538	dprintso(so, 1, ("sotpi_listen(%p, %d) %s\n",
1539	    (void *)so, backlog, pr_state(so->so_state, so->so_mode)));
1540
1541	if (sti->sti_serv_type == T_CLTS)
1542		return (EOPNOTSUPP);
1543
1544	/*
1545	 * If the socket is ready to accept connections already, then
1546	 * return without doing anything.  This avoids a problem where
1547	 * a second listen() call fails if a connection is pending and
1548	 * leaves the socket unbound. Only when we are not unbinding
1549	 * with the transport can we safely increase the backlog.
1550	 */
1551	if (so->so_state & SS_ACCEPTCONN &&
1552	    !((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1553	    /*CONSTCOND*/
1554	    !solisten_tpi_tcp))
1555		return (0);
1556
1557	if (so->so_state & SS_ISCONNECTED)
1558		return (EINVAL);
1559
1560	mutex_enter(&so->so_lock);
1561	so_lock_single(so);	/* Set SOLOCKED */
1562
1563	/*
1564	 * If the listen doesn't change the backlog we do nothing.
1565	 * This avoids an EPROTO error from the transport.
1566	 */
1567	if ((so->so_state & SS_ACCEPTCONN) &&
1568	    so->so_backlog == backlog)
1569		goto done;
1570
1571	if (!(so->so_state & SS_ISBOUND)) {
1572		/*
1573		 * Must have been explicitly bound in the UNIX domain.
1574		 */
1575		if (so->so_family == AF_UNIX) {
1576			error = EINVAL;
1577			goto done;
1578		}
1579		error = sotpi_bindlisten(so, NULL, 0, backlog,
1580		    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD|_SOBIND_LISTEN, cr);
1581	} else if (backlog > 0) {
1582		/*
1583		 * AF_INET{,6} hack to avoid losing the port.
1584		 * Assumes that all AF_INET{,6} transports can handle a
1585		 * O_T_BIND_REQ with a non-zero CONIND_number when the TPI
1586		 * has already bound thus it is possible to avoid the unbind.
1587		 */
1588		if (!((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1589		    /*CONSTCOND*/
1590		    !solisten_tpi_tcp)) {
1591			error = sotpi_unbind(so, _SOUNBIND_REBIND);
1592			if (error)
1593				goto done;
1594		}
1595		error = sotpi_bindlisten(so, NULL, 0, backlog,
1596		    _SOBIND_REBIND|_SOBIND_LOCK_HELD|_SOBIND_LISTEN, cr);
1597	} else {
1598		so->so_state |= SS_ACCEPTCONN;
1599		so->so_backlog = backlog;
1600	}
1601	if (error)
1602		goto done;
1603	ASSERT(so->so_state & SS_ACCEPTCONN);
1604done:
1605	so_unlock_single(so, SOLOCKED);
1606	mutex_exit(&so->so_lock);
1607	return (error);
1608}
1609
1610/*
1611 * Disconnect either a specified seqno or all (-1).
1612 * The former is used on listening sockets only.
1613 *
1614 * When seqno == -1 sodisconnect could call sotpi_unbind. However,
1615 * the current use of sodisconnect(seqno == -1) is only for shutdown
1616 * so there is no point (and potentially incorrect) to unbind.
1617 */
1618static int
1619sodisconnect(struct sonode *so, t_scalar_t seqno, int flags)
1620{
1621	struct T_discon_req	discon_req;
1622	int			error = 0;
1623	mblk_t			*mp;
1624
1625	dprintso(so, 1, ("sodisconnect(%p, %d, 0x%x) %s\n",
1626	    (void *)so, seqno, flags, pr_state(so->so_state, so->so_mode)));
1627
1628	if (!(flags & _SODISCONNECT_LOCK_HELD)) {
1629		mutex_enter(&so->so_lock);
1630		so_lock_single(so);	/* Set SOLOCKED */
1631	} else {
1632		ASSERT(MUTEX_HELD(&so->so_lock));
1633		ASSERT(so->so_flag & SOLOCKED);
1634	}
1635
1636	if (!(so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ACCEPTCONN))) {
1637		error = EINVAL;
1638		eprintsoline(so, error);
1639		goto done;
1640	}
1641
1642	mutex_exit(&so->so_lock);
1643	/*
1644	 * Flush the write side (unless this is a listener)
1645	 * and then send down a T_DISCON_REQ.
1646	 * (Don't flush on listener since it could flush {O_}T_CONN_RES
1647	 * and other messages.)
1648	 */
1649	if (!(so->so_state & SS_ACCEPTCONN))
1650		(void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHW);
1651
1652	discon_req.PRIM_type = T_DISCON_REQ;
1653	discon_req.SEQ_number = seqno;
1654	mp = soallocproto1(&discon_req, sizeof (discon_req),
1655	    0, _ALLOC_SLEEP, CRED());
1656	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1657	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1658	mutex_enter(&so->so_lock);
1659	if (error) {
1660		eprintsoline(so, error);
1661		goto done;
1662	}
1663
1664	error = sowaitokack(so, T_DISCON_REQ);
1665	if (error) {
1666		eprintsoline(so, error);
1667		goto done;
1668	}
1669	/*
1670	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1671	 * strsock_proto while the lock was dropped above, the disconnect
1672	 * is allowed to complete. However, it is not possible to
1673	 * assert that SS_ISCONNECTED|SS_ISCONNECTING are set.
1674	 */
1675	so->so_state &= ~(SS_ISCONNECTED|SS_ISCONNECTING);
1676	SOTOTPI(so)->sti_laddr_valid = 0;
1677	SOTOTPI(so)->sti_faddr_valid = 0;
1678done:
1679	if (!(flags & _SODISCONNECT_LOCK_HELD)) {
1680		so_unlock_single(so, SOLOCKED);
1681		mutex_exit(&so->so_lock);
1682	} else {
1683		/* If the caller held the lock don't release it here */
1684		ASSERT(MUTEX_HELD(&so->so_lock));
1685		ASSERT(so->so_flag & SOLOCKED);
1686	}
1687	return (error);
1688}
1689
1690/* ARGSUSED */
1691int
1692sotpi_accept(struct sonode *so, int fflag, struct cred *cr,
1693    struct sonode **nsop)
1694{
1695	struct T_conn_ind	*conn_ind;
1696	struct T_conn_res	*conn_res;
1697	int			error = 0;
1698	mblk_t			*mp, *ctxmp, *ack_mp;
1699	struct sonode		*nso;
1700	vnode_t			*nvp;
1701	void			*src;
1702	t_uscalar_t		srclen;
1703	void			*opt;
1704	t_uscalar_t		optlen;
1705	t_scalar_t		PRIM_type;
1706	t_scalar_t		SEQ_number;
1707	size_t			sinlen;
1708	sotpi_info_t		*sti = SOTOTPI(so);
1709	sotpi_info_t		*nsti;
1710
1711	dprintso(so, 1, ("sotpi_accept(%p, 0x%x, %p) %s\n",
1712	    (void *)so, fflag, (void *)nsop,
1713	    pr_state(so->so_state, so->so_mode)));
1714
1715	/*
1716	 * Defer single-threading the accepting socket until
1717	 * the T_CONN_IND has been received and parsed and the
1718	 * new sonode has been opened.
1719	 */
1720
1721	/* Check that we are not already connected */
1722	if ((so->so_state & SS_ACCEPTCONN) == 0)
1723		goto conn_bad;
1724again:
1725	if ((error = sowaitconnind(so, fflag, &mp)) != 0)
1726		goto e_bad;
1727
1728	ASSERT(mp != NULL);
1729	conn_ind = (struct T_conn_ind *)mp->b_rptr;
1730	ctxmp = mp->b_cont;
1731
1732	/*
1733	 * Save SEQ_number for error paths.
1734	 */
1735	SEQ_number = conn_ind->SEQ_number;
1736
1737	srclen = conn_ind->SRC_length;
1738	src = sogetoff(mp, conn_ind->SRC_offset, srclen, 1);
1739	if (src == NULL) {
1740		error = EPROTO;
1741		freemsg(mp);
1742		eprintsoline(so, error);
1743		goto disconnect_unlocked;
1744	}
1745	optlen = conn_ind->OPT_length;
1746	switch (so->so_family) {
1747	case AF_INET:
1748	case AF_INET6:
1749		if ((optlen == sizeof (intptr_t)) && (sti->sti_direct != 0)) {
1750			bcopy(mp->b_rptr + conn_ind->OPT_offset,
1751			    &opt, conn_ind->OPT_length);
1752		} else {
1753			/*
1754			 * The transport (in this case TCP) hasn't sent up
1755			 * a pointer to an instance for the accept fast-path.
1756			 * Disable fast-path completely because the call to
1757			 * sotpi_create() below would otherwise create an
1758			 * incomplete TCP instance, which would lead to
1759			 * problems when sockfs sends a normal T_CONN_RES
1760			 * message down the new stream.
1761			 */
1762			if (sti->sti_direct) {
1763				int rval;
1764				/*
1765				 * For consistency we inform tcp to disable
1766				 * direct interface on the listener, though
1767				 * we can certainly live without doing this
1768				 * because no data will ever travel upstream
1769				 * on the listening socket.
1770				 */
1771				sti->sti_direct = 0;
1772				(void) strioctl(SOTOV(so), _SIOCSOCKFALLBACK,
1773				    0, 0, K_TO_K, cr, &rval);
1774			}
1775			opt = NULL;
1776			optlen = 0;
1777		}
1778		break;
1779	case AF_UNIX:
1780	default:
1781		if (optlen != 0) {
1782			opt = sogetoff(mp, conn_ind->OPT_offset, optlen,
1783			    __TPI_ALIGN_SIZE);
1784			if (opt == NULL) {
1785				error = EPROTO;
1786				freemsg(mp);
1787				eprintsoline(so, error);
1788				goto disconnect_unlocked;
1789			}
1790		}
1791		if (so->so_family == AF_UNIX) {
1792			if (!sti->sti_faddr_noxlate) {
1793				src = NULL;
1794				srclen = 0;
1795			}
1796			/* Extract src address from options */
1797			if (optlen != 0)
1798				so_getopt_srcaddr(opt, optlen, &src, &srclen);
1799		}
1800		break;
1801	}
1802
1803	/*
1804	 * Create the new socket.
1805	 */
1806	nso = socket_newconn(so, NULL, NULL, SOCKET_SLEEP, &error);
1807	if (nso == NULL) {
1808		ASSERT(error != 0);
1809		/*
1810		 * Accept can not fail with ENOBUFS. sotpi_create
1811		 * sleeps waiting for memory until a signal is caught
1812		 * so return EINTR.
1813		 */
1814		freemsg(mp);
1815		if (error == ENOBUFS)
1816			error = EINTR;
1817		goto e_disc_unl;
1818	}
1819	nvp = SOTOV(nso);
1820	nsti = SOTOTPI(nso);
1821
1822	/*
1823	 * If the transport sent up an SSL connection context, then attach
1824	 * it to the new socket, and set the (sd_wputdatafunc)() and
1825	 * (sd_rputdatafunc)() stream head hooks to intercept and process
1826	 * SSL records.
1827	 */
1828	if (ctxmp != NULL) {
1829		/*
1830		 * This kssl_ctx_t is already held for us by the transport.
1831		 * So, we don't need to do a kssl_hold_ctx() here.
1832		 */
1833		nsti->sti_kssl_ctx = *((kssl_ctx_t *)ctxmp->b_rptr);
1834		freemsg(ctxmp);
1835		mp->b_cont = NULL;
1836		strsetrwputdatahooks(nvp, strsock_kssl_input,
1837		    strsock_kssl_output);
1838	}
1839#ifdef DEBUG
1840	/*
1841	 * SO_DEBUG is used to trigger the dprint* and eprint* macros thus
1842	 * it's inherited early to allow debugging of the accept code itself.
1843	 */
1844	nso->so_options |= so->so_options & SO_DEBUG;
1845#endif /* DEBUG */
1846
1847	/*
1848	 * Save the SRC address from the T_CONN_IND
1849	 * for getpeername to work on AF_UNIX and on transports that do not
1850	 * support TI_GETPEERNAME.
1851	 *
1852	 * NOTE: AF_UNIX NUL termination is ensured by the sender's
1853	 * copyin_name().
1854	 */
1855	if (srclen > (t_uscalar_t)nsti->sti_faddr_maxlen) {
1856		error = EINVAL;
1857		freemsg(mp);
1858		eprintsoline(so, error);
1859		goto disconnect_vp_unlocked;
1860	}
1861	nsti->sti_faddr_len = (socklen_t)srclen;
1862	ASSERT(sti->sti_faddr_len <= sti->sti_faddr_maxlen);
1863	bcopy(src, nsti->sti_faddr_sa, srclen);
1864	nsti->sti_faddr_valid = 1;
1865
1866	/*
1867	 * Record so_peercred and so_cpid from a cred in the T_CONN_IND.
1868	 */
1869	if ((DB_REF(mp) > 1) || MBLKSIZE(mp) <
1870	    (sizeof (struct T_conn_res) + sizeof (intptr_t))) {
1871		cred_t	*cr;
1872		pid_t	cpid;
1873
1874		cr = msg_getcred(mp, &cpid);
1875		if (cr != NULL) {
1876			crhold(cr);
1877			nso->so_peercred = cr;
1878			nso->so_cpid = cpid;
1879		}
1880		freemsg(mp);
1881
1882		mp = soallocproto1(NULL, sizeof (struct T_conn_res) +
1883		    sizeof (intptr_t), 0, _ALLOC_INTR, cr);
1884		if (mp == NULL) {
1885			/*
1886			 * Accept can not fail with ENOBUFS.
1887			 * A signal was caught so return EINTR.
1888			 */
1889			error = EINTR;
1890			eprintsoline(so, error);
1891			goto disconnect_vp_unlocked;
1892		}
1893		conn_res = (struct T_conn_res *)mp->b_rptr;
1894	} else {
1895		/*
1896		 * For efficiency reasons we use msg_extractcred; no crhold
1897		 * needed since db_credp is cleared (i.e., we move the cred
1898		 * from the message to so_peercred).
1899		 */
1900		nso->so_peercred = msg_extractcred(mp, &nso->so_cpid);
1901
1902		mp->b_rptr = DB_BASE(mp);
1903		conn_res = (struct T_conn_res *)mp->b_rptr;
1904		mp->b_wptr = mp->b_rptr + sizeof (struct T_conn_res);
1905
1906		mblk_setcred(mp, cr, curproc->p_pid);
1907	}
1908
1909	/*
1910	 * New socket must be bound at least in sockfs and, except for AF_INET,
1911	 * (or AF_INET6) it also has to be bound in the transport provider.
1912	 * We set the local address in the sonode from the T_OK_ACK of the
1913	 * T_CONN_RES. For this reason the address we bind to here isn't
1914	 * important.
1915	 */
1916	if ((nso->so_family == AF_INET || nso->so_family == AF_INET6) &&
1917	    /*CONSTCOND*/
1918	    nso->so_type == SOCK_STREAM && !soaccept_tpi_tcp) {
1919		/*
1920		 * Optimization for AF_INET{,6} transports
1921		 * that can handle a T_CONN_RES without being bound.
1922		 */
1923		mutex_enter(&nso->so_lock);
1924		so_automatic_bind(nso);
1925		mutex_exit(&nso->so_lock);
1926	} else {
1927		/* Perform NULL bind with the transport provider. */
1928		if ((error = sotpi_bind(nso, NULL, 0, _SOBIND_UNSPEC,
1929		    cr)) != 0) {
1930			ASSERT(error != ENOBUFS);
1931			freemsg(mp);
1932			eprintsoline(nso, error);
1933			goto disconnect_vp_unlocked;
1934		}
1935	}
1936
1937	/*
1938	 * Inherit SIOCSPGRP, SS_ASYNC before we send the {O_}T_CONN_RES
1939	 * so that any data arriving on the new socket will cause the
1940	 * appropriate signals to be delivered for the new socket.
1941	 *
1942	 * No other thread (except strsock_proto and strsock_misc)
1943	 * can access the new socket thus we relax the locking.
1944	 */
1945	nso->so_pgrp = so->so_pgrp;
1946	nso->so_state |= so->so_state & SS_ASYNC;
1947	nsti->sti_faddr_noxlate = sti->sti_faddr_noxlate;
1948
1949	if (nso->so_pgrp != 0) {
1950		if ((error = so_set_events(nso, nvp, cr)) != 0) {
1951			eprintsoline(nso, error);
1952			error = 0;
1953			nso->so_pgrp = 0;
1954		}
1955	}
1956
1957	/*
1958	 * Make note of the socket level options. TCP and IP level options
1959	 * are already inherited. We could do all this after accept is
1960	 * successful but doing it here simplifies code and no harm done
1961	 * for error case.
1962	 */
1963	nso->so_options = so->so_options & (SO_DEBUG|SO_REUSEADDR|SO_KEEPALIVE|
1964	    SO_DONTROUTE|SO_BROADCAST|SO_USELOOPBACK|
1965	    SO_OOBINLINE|SO_DGRAM_ERRIND|SO_LINGER);
1966	nso->so_sndbuf = so->so_sndbuf;
1967	nso->so_rcvbuf = so->so_rcvbuf;
1968	if (nso->so_options & SO_LINGER)
1969		nso->so_linger = so->so_linger;
1970
1971	/*
1972	 * Note that the following sti_direct code path should be
1973	 * removed once we are confident that the direct sockets
1974	 * do not result in any degradation.
1975	 */
1976	if (sti->sti_direct) {
1977
1978		ASSERT(opt != NULL);
1979
1980		conn_res->OPT_length = optlen;
1981		conn_res->OPT_offset = MBLKL(mp);
1982		bcopy(&opt, mp->b_wptr, optlen);
1983		mp->b_wptr += optlen;
1984		conn_res->PRIM_type = T_CONN_RES;
1985		conn_res->ACCEPTOR_id = 0;
1986		PRIM_type = T_CONN_RES;
1987
1988		/* Send down the T_CONN_RES on acceptor STREAM */
1989		error = kstrputmsg(SOTOV(nso), mp, NULL,
1990		    0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1991		if (error) {
1992			mutex_enter(&so->so_lock);
1993			so_lock_single(so);
1994			eprintsoline(so, error);
1995			goto disconnect_vp;
1996		}
1997		mutex_enter(&nso->so_lock);
1998		error = sowaitprim(nso, T_CONN_RES, T_OK_ACK,
1999		    (t_uscalar_t)sizeof (struct T_ok_ack), &ack_mp, 0);
2000		if (error) {
2001			mutex_exit(&nso->so_lock);
2002			mutex_enter(&so->so_lock);
2003			so_lock_single(so);
2004			eprintsoline(so, error);
2005			goto disconnect_vp;
2006		}
2007		if (nso->so_family == AF_INET) {
2008			sin_t *sin;
2009
2010			sin = (sin_t *)(ack_mp->b_rptr +
2011			    sizeof (struct T_ok_ack));
2012			bcopy(sin, nsti->sti_laddr_sa, sizeof (sin_t));
2013			nsti->sti_laddr_len = sizeof (sin_t);
2014		} else {
2015			sin6_t *sin6;
2016
2017			sin6 = (sin6_t *)(ack_mp->b_rptr +
2018			    sizeof (struct T_ok_ack));
2019			bcopy(sin6, nsti->sti_laddr_sa, sizeof (sin6_t));
2020			nsti->sti_laddr_len = sizeof (sin6_t);
2021		}
2022		freemsg(ack_mp);
2023
2024		nso->so_state |= SS_ISCONNECTED;
2025		nso->so_proto_handle = (sock_lower_handle_t)opt;
2026		nsti->sti_laddr_valid = 1;
2027
2028		if (sti->sti_nl7c_flags & NL7C_ENABLED) {
2029			/*
2030			 * A NL7C marked listen()er so the new socket
2031			 * inherits the listen()er's NL7C state, except
2032			 * for NL7C_POLLIN.
2033			 *
2034			 * Only call NL7C to process the new socket if
2035			 * the listen socket allows blocking i/o.
2036			 */
2037			nsti->sti_nl7c_flags =
2038			    sti->sti_nl7c_flags & (~NL7C_POLLIN);
2039			if (so->so_state & (SS_NONBLOCK|SS_NDELAY)) {
2040				/*
2041				 * Nonblocking accept() just make it
2042				 * persist to defer processing to the
2043				 * read-side syscall (e.g. read).
2044				 */
2045				nsti->sti_nl7c_flags |= NL7C_SOPERSIST;
2046			} else if (nl7c_process(nso, B_FALSE)) {
2047				/*
2048				 * NL7C has completed processing on the
2049				 * socket, close the socket and back to
2050				 * the top to await the next T_CONN_IND.
2051				 */
2052				mutex_exit(&nso->so_lock);
2053				(void) VOP_CLOSE(nvp, 0, 1, (offset_t)0,
2054				    cr, NULL);
2055				VN_RELE(nvp);
2056				goto again;
2057			}
2058			/* Pass the new socket out */
2059		}
2060
2061		mutex_exit(&nso->so_lock);
2062
2063		/*
2064		 * It's possible, through the use of autopush for example,
2065		 * that the acceptor stream may not support sti_direct
2066		 * semantics. If the new socket does not support sti_direct
2067		 * we issue a _SIOCSOCKFALLBACK to inform the transport
2068		 * as we would in the I_PUSH case.
2069		 */
2070		if (nsti->sti_direct == 0) {
2071			int	rval;
2072
2073			if ((error = strioctl(SOTOV(nso), _SIOCSOCKFALLBACK,
2074			    0, 0, K_TO_K, cr, &rval)) != 0) {
2075				mutex_enter(&so->so_lock);
2076				so_lock_single(so);
2077				eprintsoline(so, error);
2078				goto disconnect_vp;
2079			}
2080		}
2081
2082		/*
2083		 * Pass out new socket.
2084		 */
2085		if (nsop != NULL)
2086			*nsop = nso;
2087
2088		return (0);
2089	}
2090
2091	/*
2092	 * This is the non-performance case for sockets (e.g. AF_UNIX sockets)
2093	 * which don't support the FireEngine accept fast-path. It is also
2094	 * used when the virtual "sockmod" has been I_POP'd and I_PUSH'd
2095	 * again. Neither sockfs nor TCP attempt to find out if some other
2096	 * random module has been inserted in between (in which case we
2097	 * should follow TLI accept behaviour). We blindly assume the worst
2098	 * case and revert back to old behaviour i.e. TCP will not send us
2099	 * any option (eager) and the accept should happen on the listener
2100	 * queue. Any queued T_conn_ind have already got their options removed
2101	 * by so_sock2_stream() when "sockmod" was I_POP'd.
2102	 */
2103	/*
2104	 * Fill in the {O_}T_CONN_RES before getting SOLOCKED.
2105	 */
2106	if ((nso->so_mode & SM_ACCEPTOR_ID) == 0) {
2107#ifdef	_ILP32
2108		queue_t	*q;
2109
2110		/*
2111		 * Find read queue in driver
2112		 * Can safely do this since we "own" nso/nvp.
2113		 */
2114		q = strvp2wq(nvp)->q_next;
2115		while (SAMESTR(q))
2116			q = q->q_next;
2117		q = RD(q);
2118		conn_res->ACCEPTOR_id = (t_uscalar_t)q;
2119#else
2120		conn_res->ACCEPTOR_id = (t_uscalar_t)getminor(nvp->v_rdev);
2121#endif	/* _ILP32 */
2122		conn_res->PRIM_type = O_T_CONN_RES;
2123		PRIM_type = O_T_CONN_RES;
2124	} else {
2125		conn_res->ACCEPTOR_id = nsti->sti_acceptor_id;
2126		conn_res->PRIM_type = T_CONN_RES;
2127		PRIM_type = T_CONN_RES;
2128	}
2129	conn_res->SEQ_number = SEQ_number;
2130	conn_res->OPT_length = 0;
2131	conn_res->OPT_offset = 0;
2132
2133	mutex_enter(&so->so_lock);
2134	so_lock_single(so);	/* Set SOLOCKED */
2135	mutex_exit(&so->so_lock);
2136
2137	error = kstrputmsg(SOTOV(so), mp, NULL,
2138	    0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
2139	mutex_enter(&so->so_lock);
2140	if (error) {
2141		eprintsoline(so, error);
2142		goto disconnect_vp;
2143	}
2144	error = sowaitprim(so, PRIM_type, T_OK_ACK,
2145	    (t_uscalar_t)sizeof (struct T_ok_ack), &ack_mp, 0);
2146	if (error) {
2147		eprintsoline(so, error);
2148		goto disconnect_vp;
2149	}
2150	/*
2151	 * If there is a sin/sin6 appended onto the T_OK_ACK use
2152	 * that to set the local address. If this is not present
2153	 * then we zero out the address and don't set the
2154	 * sti_laddr_valid bit. For AF_UNIX endpoints we copy over
2155	 * the pathname from the listening socket.
2156	 */
2157	sinlen = (nso->so_family == AF_INET) ? sizeof (sin_t) : sizeof (sin6_t);
2158	if ((nso->so_family == AF_INET) || (nso->so_family == AF_INET6) &&
2159	    MBLKL(ack_mp) == (sizeof (struct T_ok_ack) + sinlen)) {
2160		ack_mp->b_rptr += sizeof (struct T_ok_ack);
2161		bcopy(ack_mp->b_rptr, nsti->sti_laddr_sa, sinlen);
2162		nsti->sti_laddr_len = sinlen;
2163		nsti->sti_laddr_valid = 1;
2164	} else if (nso->so_family == AF_UNIX) {
2165		ASSERT(so->so_family == AF_UNIX);
2166		nsti->sti_laddr_len = sti->sti_laddr_len;
2167		ASSERT(nsti->sti_laddr_len <= nsti->sti_laddr_maxlen);
2168		bcopy(sti->sti_laddr_sa, nsti->sti_laddr_sa,
2169		    nsti->sti_laddr_len);
2170		nsti->sti_laddr_valid = 1;
2171	} else {
2172		nsti->sti_laddr_len = sti->sti_laddr_len;
2173		ASSERT(nsti->sti_laddr_len <= nsti->sti_laddr_maxlen);
2174		bzero(nsti->sti_laddr_sa, nsti->sti_addr_size);
2175		nsti->sti_laddr_sa->sa_family = nso->so_family;
2176	}
2177	freemsg(ack_mp);
2178
2179	so_unlock_single(so, SOLOCKED);
2180	mutex_exit(&so->so_lock);
2181
2182	nso->so_state |= SS_ISCONNECTED;
2183
2184	/*
2185	 * Pass out new socket.
2186	 */
2187	if (nsop != NULL)
2188		*nsop = nso;
2189
2190	return (0);
2191
2192
2193eproto_disc_unl:
2194	error = EPROTO;
2195e_disc_unl:
2196	eprintsoline(so, error);
2197	goto disconnect_unlocked;
2198
2199pr_disc_vp_unl:
2200	eprintsoline(so, error);
2201disconnect_vp_unlocked:
2202	(void) VOP_CLOSE(nvp, 0, 1, 0, cr, NULL);
2203	VN_RELE(nvp);
2204disconnect_unlocked:
2205	(void) sodisconnect(so, SEQ_number, 0);
2206	return (error);
2207
2208pr_disc_vp:
2209	eprintsoline(so, error);
2210disconnect_vp:
2211	(void) sodisconnect(so, SEQ_number, _SODISCONNECT_LOCK_HELD);
2212	so_unlock_single(so, SOLOCKED);
2213	mutex_exit(&so->so_lock);
2214	(void) VOP_CLOSE(nvp, 0, 1, 0, cr, NULL);
2215	VN_RELE(nvp);
2216	return (error);
2217
2218conn_bad:	/* Note: SunOS 4/BSD unconditionally returns EINVAL here */
2219	error = (so->so_type == SOCK_DGRAM || so->so_type == SOCK_RAW)
2220	    ? EOPNOTSUPP : EINVAL;
2221e_bad:
2222	eprintsoline(so, error);
2223	return (error);
2224}
2225
2226/*
2227 * connect a socket.
2228 *
2229 * Allow SOCK_DGRAM sockets to reconnect (by specifying a new address) and to
2230 * unconnect (by specifying a null address).
2231 */
2232int
2233sotpi_connect(struct sonode *so,
2234	const struct sockaddr *name,
2235	socklen_t namelen,
2236	int fflag,
2237	int flags,
2238	struct cred *cr)
2239{
2240	struct T_conn_req	conn_req;
2241	int			error = 0;
2242	mblk_t			*mp;
2243	void			*src;
2244	socklen_t		srclen;
2245	void			*addr;
2246	socklen_t		addrlen;
2247	boolean_t		need_unlock;
2248	sotpi_info_t		*sti = SOTOTPI(so);
2249
2250	dprintso(so, 1, ("sotpi_connect(%p, %p, %d, 0x%x, 0x%x) %s\n",
2251	    (void *)so, (void *)name, namelen, fflag, flags,
2252	    pr_state(so->so_state, so->so_mode)));
2253
2254	/*
2255	 * Preallocate the T_CONN_REQ mblk before grabbing SOLOCKED to
2256	 * avoid sleeping for memory with SOLOCKED held.
2257	 * We know that the T_CONN_REQ can't be larger than 2 * sti_faddr_maxlen
2258	 * + sizeof (struct T_opthdr).
2259	 * (the AF_UNIX so_ux_addr_xlate() does not make the address
2260	 * exceed sti_faddr_maxlen).
2261	 */
2262	mp = soallocproto(sizeof (struct T_conn_req) +
2263	    2 * sti->sti_faddr_maxlen + sizeof (struct T_opthdr), _ALLOC_INTR,
2264	    cr);
2265	if (mp == NULL) {
2266		/*
2267		 * Connect can not fail with ENOBUFS. A signal was
2268		 * caught so return EINTR.
2269		 */
2270		error = EINTR;
2271		eprintsoline(so, error);
2272		return (error);
2273	}
2274
2275	mutex_enter(&so->so_lock);
2276	/*
2277	 * Make sure there is a preallocated T_unbind_req message
2278	 * before any binding. This message is allocated when the
2279	 * socket is created. Since another thread can consume
2280	 * so_unbind_mp by the time we return from so_lock_single(),
2281	 * we should check the availability of so_unbind_mp after
2282	 * we return from so_lock_single().
2283	 */
2284
2285	so_lock_single(so);	/* Set SOLOCKED */
2286	need_unlock = B_TRUE;
2287
2288	if (sti->sti_unbind_mp == NULL) {
2289		dprintso(so, 1, ("sotpi_connect: allocating unbind_req\n"));
2290		/* NOTE: holding so_lock while sleeping */
2291		sti->sti_unbind_mp =
2292		    soallocproto(sizeof (struct T_unbind_req), _ALLOC_INTR, cr);
2293		if (sti->sti_unbind_mp == NULL) {
2294			error = EINTR;
2295			goto done;
2296		}
2297	}
2298
2299	/*
2300	 * Can't have done a listen before connecting.
2301	 */
2302	if (so->so_state & SS_ACCEPTCONN) {
2303		error = EOPNOTSUPP;
2304		goto done;
2305	}
2306
2307	/*
2308	 * Must be bound with the transport
2309	 */
2310	if (!(so->so_state & SS_ISBOUND)) {
2311		if ((so->so_family == AF_INET || so->so_family == AF_INET6) &&
2312		    /*CONSTCOND*/
2313		    so->so_type == SOCK_STREAM && !soconnect_tpi_tcp) {
2314			/*
2315			 * Optimization for AF_INET{,6} transports
2316			 * that can handle a T_CONN_REQ without being bound.
2317			 */
2318			so_automatic_bind(so);
2319		} else {
2320			error = sotpi_bind(so, NULL, 0,
2321			    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD, cr);
2322			if (error)
2323				goto done;
2324		}
2325		ASSERT(so->so_state & SS_ISBOUND);
2326		flags |= _SOCONNECT_DID_BIND;
2327	}
2328
2329	/*
2330	 * Handle a connect to a name parameter of type AF_UNSPEC like a
2331	 * connect to a null address. This is the portable method to
2332	 * unconnect a socket.
2333	 */
2334	if ((namelen >= sizeof (sa_family_t)) &&
2335	    (name->sa_family == AF_UNSPEC)) {
2336		name = NULL;
2337		namelen = 0;
2338	}
2339
2340	/*
2341	 * Check that we are not already connected.
2342	 * A connection-oriented socket cannot be reconnected.
2343	 * A connected connection-less socket can be
2344	 * - connected to a different address by a subsequent connect
2345	 * - "unconnected" by a connect to the NULL address
2346	 */
2347	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) {
2348		ASSERT(!(flags & _SOCONNECT_DID_BIND));
2349		if (so->so_mode & SM_CONNREQUIRED) {
2350			/* Connection-oriented socket */
2351			error = so->so_state & SS_ISCONNECTED ?
2352			    EISCONN : EALREADY;
2353			goto done;
2354		}
2355		/* Connection-less socket */
2356		if (name == NULL) {
2357			/*
2358			 * Remove the connected state and clear SO_DGRAM_ERRIND
2359			 * since it was set when the socket was connected.
2360			 * If this is UDP also send down a T_DISCON_REQ.
2361			 */
2362			int val;
2363
2364			if ((so->so_family == AF_INET ||
2365			    so->so_family == AF_INET6) &&
2366			    (so->so_type == SOCK_DGRAM ||
2367			    so->so_type == SOCK_RAW) &&
2368			    /*CONSTCOND*/
2369			    !soconnect_tpi_udp) {
2370				/* XXX What about implicitly unbinding here? */
2371				error = sodisconnect(so, -1,
2372				    _SODISCONNECT_LOCK_HELD);
2373			} else {
2374				so->so_state &=
2375				    ~(SS_ISCONNECTED | SS_ISCONNECTING);
2376				sti->sti_faddr_valid = 0;
2377				sti->sti_faddr_len = 0;
2378			}
2379
2380			/* Remove SOLOCKED since setsockopt will grab it */
2381			so_unlock_single(so, SOLOCKED);
2382			mutex_exit(&so->so_lock);
2383
2384			val = 0;
2385			(void) sotpi_setsockopt(so, SOL_SOCKET,
2386			    SO_DGRAM_ERRIND, &val, (t_uscalar_t)sizeof (val),
2387			    cr);
2388
2389			mutex_enter(&so->so_lock);
2390			so_lock_single(so);	/* Set SOLOCKED */
2391			goto done;
2392		}
2393	}
2394	ASSERT(so->so_state & SS_ISBOUND);
2395
2396	if (name == NULL || namelen == 0) {
2397		error = EINVAL;
2398		goto done;
2399	}
2400	/*
2401	 * Mark the socket if sti_faddr_sa represents the transport level
2402	 * address.
2403	 */
2404	if (flags & _SOCONNECT_NOXLATE) {
2405		struct sockaddr_ux	*soaddr_ux;
2406
2407		ASSERT(so->so_family == AF_UNIX);
2408		if (namelen != sizeof (struct sockaddr_ux)) {
2409			error = EINVAL;
2410			goto done;
2411		}
2412		soaddr_ux = (struct sockaddr_ux *)name;
2413		name = (struct sockaddr *)&soaddr_ux->sou_addr;
2414		namelen = sizeof (soaddr_ux->sou_addr);
2415		sti->sti_faddr_noxlate = 1;
2416	}
2417
2418	/*
2419	 * Length and family checks.
2420	 */
2421	error = so_addr_verify(so, name, namelen);
2422	if (error)
2423		goto bad;
2424
2425	/*
2426	 * Save foreign address. Needed for AF_UNIX as well as
2427	 * transport providers that do not support TI_GETPEERNAME.
2428	 * Also used for cached foreign address for TCP and UDP.
2429	 */
2430	if (namelen > (t_uscalar_t)sti->sti_faddr_maxlen) {
2431		error = EINVAL;
2432		goto done;
2433	}
2434	sti->sti_faddr_len = (socklen_t)namelen;
2435	ASSERT(sti->sti_faddr_len <= sti->sti_faddr_maxlen);
2436	bcopy(name, sti->sti_faddr_sa, namelen);
2437	sti->sti_faddr_valid = 1;
2438
2439	if (so->so_family == AF_UNIX) {
2440		if (sti->sti_faddr_noxlate) {
2441			/*
2442			 * Already have a transport internal address. Do not
2443			 * pass any (transport internal) source address.
2444			 */
2445			addr = sti->sti_faddr_sa;
2446			addrlen = (t_uscalar_t)sti->sti_faddr_len;
2447			src = NULL;
2448			srclen = 0;
2449		} else {
2450			/*
2451			 * Pass the sockaddr_un source address as an option
2452			 * and translate the remote address.
2453			 * Holding so_lock thus sti_laddr_sa can not change.
2454			 */
2455			src = sti->sti_laddr_sa;
2456			srclen = (t_uscalar_t)sti->sti_laddr_len;
2457			dprintso(so, 1,
2458			    ("sotpi_connect UNIX: srclen %d, src %p\n",
2459			    srclen, src));
2460			error = so_ux_addr_xlate(so,
2461			    sti->sti_faddr_sa, (socklen_t)sti->sti_faddr_len,
2462			    (flags & _SOCONNECT_XPG4_2),
2463			    &addr, &addrlen);
2464			if (error)
2465				goto bad;
2466		}
2467	} else {
2468		addr = sti->sti_faddr_sa;
2469		addrlen = (t_uscalar_t)sti->sti_faddr_len;
2470		src = NULL;
2471		srclen = 0;
2472	}
2473	/*
2474	 * When connecting a datagram socket we issue the SO_DGRAM_ERRIND
2475	 * option which asks the transport provider to send T_UDERR_IND
2476	 * messages. These T_UDERR_IND messages are used to return connected
2477	 * style errors (e.g. ECONNRESET) for connected datagram sockets.
2478	 *
2479	 * In addition, for UDP (and SOCK_RAW AF_INET{,6} sockets)
2480	 * we send down a T_CONN_REQ. This is needed to let the
2481	 * transport assign a local address that is consistent with
2482	 * the remote address. Applications depend on a getsockname()
2483	 * after a connect() to retrieve the "source" IP address for
2484	 * the connected socket.  Invalidate the cached local address
2485	 * to force getsockname() to enquire of the transport.
2486	 */
2487	if (!(so->so_mode & SM_CONNREQUIRED)) {
2488		/*
2489		 * Datagram socket.
2490		 */
2491		int32_t val;
2492
2493		so_unlock_single(so, SOLOCKED);
2494		mutex_exit(&so->so_lock);
2495
2496		val = 1;
2497		(void) sotpi_setsockopt(so, SOL_SOCKET, SO_DGRAM_ERRIND,
2498		    &val, (t_uscalar_t)sizeof (val), cr);
2499
2500		mutex_enter(&so->so_lock);
2501		so_lock_single(so);	/* Set SOLOCKED */
2502		if ((so->so_family != AF_INET && so->so_family != AF_INET6) ||
2503		    (so->so_type != SOCK_DGRAM && so->so_type != SOCK_RAW) ||
2504		    soconnect_tpi_udp) {
2505			soisconnected(so);
2506			goto done;
2507		}
2508		/*
2509		 * Send down T_CONN_REQ etc.
2510		 * Clear fflag to avoid returning EWOULDBLOCK.
2511		 */
2512		fflag = 0;
2513		ASSERT(so->so_family != AF_UNIX);
2514		sti->sti_laddr_valid = 0;
2515	} else if (sti->sti_laddr_len != 0) {
2516		/*
2517		 * If the local address or port was "any" then it may be
2518		 * changed by the transport as a result of the
2519		 * connect.  Invalidate the cached version if we have one.
2520		 */
2521		switch (so->so_family) {
2522		case AF_INET:
2523			ASSERT(sti->sti_laddr_len == (socklen_t)sizeof (sin_t));
2524			if (((sin_t *)sti->sti_laddr_sa)->sin_addr.s_addr ==
2525			    INADDR_ANY ||
2526			    ((sin_t *)sti->sti_laddr_sa)->sin_port == 0)
2527				sti->sti_laddr_valid = 0;
2528			break;
2529
2530		case AF_INET6:
2531			ASSERT(sti->sti_laddr_len ==
2532			    (socklen_t)sizeof (sin6_t));
2533			if (IN6_IS_ADDR_UNSPECIFIED(
2534			    &((sin6_t *)sti->sti_laddr_sa) ->sin6_addr) ||
2535			    IN6_IS_ADDR_V4MAPPED_ANY(
2536			    &((sin6_t *)sti->sti_laddr_sa)->sin6_addr) ||
2537			    ((sin6_t *)sti->sti_laddr_sa)->sin6_port == 0)
2538				sti->sti_laddr_valid = 0;
2539			break;
2540
2541		default:
2542			break;
2543		}
2544	}
2545
2546	/*
2547	 * Check for failure of an earlier call
2548	 */
2549	if (so->so_error != 0)
2550		goto so_bad;
2551
2552	/*
2553	 * Send down T_CONN_REQ. Message was allocated above.
2554	 */
2555	conn_req.PRIM_type = T_CONN_REQ;
2556	conn_req.DEST_length = addrlen;
2557	conn_req.DEST_offset = (t_scalar_t)sizeof (conn_req);
2558	if (srclen == 0) {
2559		conn_req.OPT_length = 0;
2560		conn_req.OPT_offset = 0;
2561		soappendmsg(mp, &conn_req, sizeof (conn_req));
2562		soappendmsg(mp, addr, addrlen);
2563	} else {
2564		/*
2565		 * There is a AF_UNIX sockaddr_un to include as a source
2566		 * address option.
2567		 */
2568		struct T_opthdr toh;
2569
2570		toh.level = SOL_SOCKET;
2571		toh.name = SO_SRCADDR;
2572		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
2573		toh.status = 0;
2574		conn_req.OPT_length =
2575		    (t_scalar_t)(sizeof (toh) + _TPI_ALIGN_TOPT(srclen));
2576		conn_req.OPT_offset = (t_scalar_t)(sizeof (conn_req) +
2577		    _TPI_ALIGN_TOPT(addrlen));
2578
2579		soappendmsg(mp, &conn_req, sizeof (conn_req));
2580		soappendmsg(mp, addr, addrlen);
2581		mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
2582		soappendmsg(mp, &toh, sizeof (toh));
2583		soappendmsg(mp, src, srclen);
2584		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
2585		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
2586	}
2587	/*
2588	 * Set SS_ISCONNECTING before sending down the T_CONN_REQ
2589	 * in order to have the right state when the T_CONN_CON shows up.
2590	 */
2591	soisconnecting(so);
2592	mutex_exit(&so->so_lock);
2593
2594	if (audit_active)
2595		audit_sock(T_CONN_REQ, strvp2wq(SOTOV(so)), mp, 0);
2596
2597	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2598	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
2599	mp = NULL;
2600	mutex_enter(&so->so_lock);
2601	if (error != 0)
2602		goto bad;
2603
2604	if ((error = sowaitokack(so, T_CONN_REQ)) != 0)
2605		goto bad;
2606
2607	/* Allow other threads to access the socket */
2608	so_unlock_single(so, SOLOCKED);
2609	need_unlock = B_FALSE;
2610
2611	/*
2612	 * Wait until we get a T_CONN_CON or an error
2613	 */
2614	if ((error = sowaitconnected(so, fflag, 0)) != 0) {
2615		so_lock_single(so);	/* Set SOLOCKED */
2616		need_unlock = B_TRUE;
2617	}
2618
2619done:
2620	freemsg(mp);
2621	switch (error) {
2622	case EINPROGRESS:
2623	case EALREADY:
2624	case EISCONN:
2625	case EINTR:
2626		/* Non-fatal errors */
2627		sti->sti_laddr_valid = 0;
2628		/* FALLTHRU */
2629	case 0:
2630		break;
2631	default:
2632		ASSERT(need_unlock);
2633		/*
2634		 * Fatal errors: clear SS_ISCONNECTING in case it was set,
2635		 * and invalidate local-address cache
2636		 */
2637		so->so_state &= ~SS_ISCONNECTING;
2638		sti->sti_laddr_valid = 0;
2639		/* A discon_ind might have already unbound us */
2640		if ((flags & _SOCONNECT_DID_BIND) &&
2641		    (so->so_state & SS_ISBOUND)) {
2642			int err;
2643
2644			err = sotpi_unbind(so, 0);
2645			/* LINTED - statement has no conseq */
2646			if (err) {
2647				eprintsoline(so, err);
2648			}
2649		}
2650		break;
2651	}
2652	if (need_unlock)
2653		so_unlock_single(so, SOLOCKED);
2654	mutex_exit(&so->so_lock);
2655	return (error);
2656
2657so_bad:	error = sogeterr(so, B_TRUE);
2658bad:	eprintsoline(so, error);
2659	goto done;
2660}
2661
2662/* ARGSUSED */
2663int
2664sotpi_shutdown(struct sonode *so, int how, struct cred *cr)
2665{
2666	struct T_ordrel_req	ordrel_req;
2667	mblk_t			*mp;
2668	uint_t			old_state, state_change;
2669	int			error = 0;
2670	sotpi_info_t		*sti = SOTOTPI(so);
2671
2672	dprintso(so, 1, ("sotpi_shutdown(%p, %d) %s\n",
2673	    (void *)so, how, pr_state(so->so_state, so->so_mode)));
2674
2675	mutex_enter(&so->so_lock);
2676	so_lock_single(so);	/* Set SOLOCKED */
2677
2678	/*
2679	 * SunOS 4.X has no check for datagram sockets.
2680	 * 5.X checks that it is connected (ENOTCONN)
2681	 * X/Open requires that we check the connected state.
2682	 */
2683	if (!(so->so_state & SS_ISCONNECTED)) {
2684		if (!xnet_skip_checks) {
2685			error = ENOTCONN;
2686			if (xnet_check_print) {
2687				printf("sockfs: X/Open shutdown check "
2688				    "caused ENOTCONN\n");
2689			}
2690		}
2691		goto done;
2692	}
2693	/*
2694	 * Record the current state and then perform any state changes.
2695	 * Then use the difference between the old and new states to
2696	 * determine which messages need to be sent.
2697	 * This prevents e.g. duplicate T_ORDREL_REQ when there are
2698	 * duplicate calls to shutdown().
2699	 */
2700	old_state = so->so_state;
2701
2702	switch (how) {
2703	case 0:
2704		socantrcvmore(so);
2705		break;
2706	case 1:
2707		socantsendmore(so);
2708		break;
2709	case 2:
2710		socantsendmore(so);
2711		socantrcvmore(so);
2712		break;
2713	default:
2714		error = EINVAL;
2715		goto done;
2716	}
2717
2718	/*
2719	 * Assumes that the SS_CANT* flags are never cleared in the above code.
2720	 */
2721	state_change = (so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) -
2722	    (old_state & (SS_CANTRCVMORE|SS_CANTSENDMORE));
2723	ASSERT((state_change & ~(SS_CANTRCVMORE|SS_CANTSENDMORE)) == 0);
2724
2725	switch (state_change) {
2726	case 0:
2727		dprintso(so, 1,
2728		    ("sotpi_shutdown: nothing to send in state 0x%x\n",
2729		    so->so_state));
2730		goto done;
2731
2732	case SS_CANTRCVMORE:
2733		mutex_exit(&so->so_lock);
2734		strseteof(SOTOV(so), 1);
2735		/*
2736		 * strseteof takes care of read side wakeups,
2737		 * pollwakeups, and signals.
2738		 */
2739		/*
2740		 * Get the read lock before flushing data to avoid problems
2741		 * with the T_EXDATA_IND MSG_PEEK code in sotpi_recvmsg.
2742		 */
2743		mutex_enter(&so->so_lock);
2744		(void) so_lock_read(so, 0);	/* Set SOREADLOCKED */
2745		mutex_exit(&so->so_lock);
2746
2747		/* Flush read side queue */
2748		strflushrq(SOTOV(so), FLUSHALL);
2749
2750		mutex_enter(&so->so_lock);
2751		so_unlock_read(so);		/* Clear SOREADLOCKED */
2752		break;
2753
2754	case SS_CANTSENDMORE:
2755		mutex_exit(&so->so_lock);
2756		strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2757		mutex_enter(&so->so_lock);
2758		break;
2759
2760	case SS_CANTSENDMORE|SS_CANTRCVMORE:
2761		mutex_exit(&so->so_lock);
2762		strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2763		strseteof(SOTOV(so), 1);
2764		/*
2765		 * strseteof takes care of read side wakeups,
2766		 * pollwakeups, and signals.
2767		 */
2768		/*
2769		 * Get the read lock before flushing data to avoid problems
2770		 * with the T_EXDATA_IND MSG_PEEK code in sotpi_recvmsg.
2771		 */
2772		mutex_enter(&so->so_lock);
2773		(void) so_lock_read(so, 0);	/* Set SOREADLOCKED */
2774		mutex_exit(&so->so_lock);
2775
2776		/* Flush read side queue */
2777		strflushrq(SOTOV(so), FLUSHALL);
2778
2779		mutex_enter(&so->so_lock);
2780		so_unlock_read(so);		/* Clear SOREADLOCKED */
2781		break;
2782	}
2783
2784	ASSERT(MUTEX_HELD(&so->so_lock));
2785
2786	/*
2787	 * If either SS_CANTSENDMORE or SS_CANTRCVMORE or both of them
2788	 * was set due to this call and the new state has both of them set:
2789	 *	Send the AF_UNIX close indication
2790	 *	For T_COTS send a discon_ind
2791	 *
2792	 * If cantsend was set due to this call:
2793	 *	For T_COTSORD send an ordrel_ind
2794	 *
2795	 * Note that for T_CLTS there is no message sent here.
2796	 */
2797	if ((so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) ==
2798	    (SS_CANTRCVMORE|SS_CANTSENDMORE)) {
2799		/*
2800		 * For SunOS 4.X compatibility we tell the other end
2801		 * that we are unable to receive at this point.
2802		 */
2803		if (so->so_family == AF_UNIX && sti->sti_serv_type != T_CLTS)
2804			so_unix_close(so);
2805
2806		if (sti->sti_serv_type == T_COTS)
2807			error = sodisconnect(so, -1, _SODISCONNECT_LOCK_HELD);
2808	}
2809	if ((state_change & SS_CANTSENDMORE) &&
2810	    (sti->sti_serv_type == T_COTS_ORD)) {
2811		/* Send an orderly release */
2812		ordrel_req.PRIM_type = T_ORDREL_REQ;
2813
2814		mutex_exit(&so->so_lock);
2815		mp = soallocproto1(&ordrel_req, sizeof (ordrel_req),
2816		    0, _ALLOC_SLEEP, cr);
2817		/*
2818		 * Send down the T_ORDREL_REQ even if there is flow control.
2819		 * This prevents shutdown from blocking.
2820		 * Note that there is no T_OK_ACK for ordrel_req.
2821		 */
2822		error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2823		    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
2824		mutex_enter(&so->so_lock);
2825		if (error) {
2826			eprintsoline(so, error);
2827			goto done;
2828		}
2829	}
2830
2831done:
2832	so_unlock_single(so, SOLOCKED);
2833	mutex_exit(&so->so_lock);
2834	return (error);
2835}
2836
2837/*
2838 * For any connected SOCK_STREAM/SOCK_SEQPACKET AF_UNIX socket we send
2839 * a zero-length T_OPTDATA_REQ with the SO_UNIX_CLOSE option to inform the peer
2840 * that we have closed.
2841 * Also, for connected AF_UNIX SOCK_DGRAM sockets we send a zero-length
2842 * T_UNITDATA_REQ containing the same option.
2843 *
2844 * For SOCK_DGRAM half-connections (somebody connected to this end
2845 * but this end is not connect) we don't know where to send any
2846 * SO_UNIX_CLOSE.
2847 *
2848 * We have to ignore stream head errors just in case there has been
2849 * a shutdown(output).
2850 * Ignore any flow control to try to get the message more quickly to the peer.
2851 * While locally ignoring flow control solves the problem when there
2852 * is only the loopback transport on the stream it would not provide
2853 * the correct AF_UNIX socket semantics when one or more modules have
2854 * been pushed.
2855 */
2856void
2857so_unix_close(struct sonode *so)
2858{
2859	int		error;
2860	struct T_opthdr	toh;
2861	mblk_t		*mp;
2862	sotpi_info_t	*sti = SOTOTPI(so);
2863
2864	ASSERT(MUTEX_HELD(&so->so_lock));
2865
2866	ASSERT(so->so_family == AF_UNIX);
2867
2868	if ((so->so_state & (SS_ISCONNECTED|SS_ISBOUND)) !=
2869	    (SS_ISCONNECTED|SS_ISBOUND))
2870		return;
2871
2872	dprintso(so, 1, ("so_unix_close(%p) %s\n",
2873	    (void *)so, pr_state(so->so_state, so->so_mode)));
2874
2875	toh.level = SOL_SOCKET;
2876	toh.name = SO_UNIX_CLOSE;
2877
2878	/* zero length + header */
2879	toh.len = (t_uscalar_t)sizeof (struct T_opthdr);
2880	toh.status = 0;
2881
2882	if (so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET) {
2883		struct T_optdata_req tdr;
2884
2885		tdr.PRIM_type = T_OPTDATA_REQ;
2886		tdr.DATA_flag = 0;
2887
2888		tdr.OPT_length = (t_scalar_t)sizeof (toh);
2889		tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
2890
2891		/* NOTE: holding so_lock while sleeping */
2892		mp = soallocproto2(&tdr, sizeof (tdr),
2893		    &toh, sizeof (toh), 0, _ALLOC_SLEEP, CRED());
2894	} else {
2895		struct T_unitdata_req	tudr;
2896		void			*addr;
2897		socklen_t		addrlen;
2898		void			*src;
2899		socklen_t		srclen;
2900		struct T_opthdr		toh2;
2901		t_scalar_t		size;
2902
2903		/* Connecteded DGRAM socket */
2904
2905		/*
2906		 * For AF_UNIX the destination address is translated to
2907		 * an internal name and the source address is passed as
2908		 * an option.
2909		 */
2910		/*
2911		 * Length and family checks.
2912		 */
2913		error = so_addr_verify(so, sti->sti_faddr_sa,
2914		    (t_uscalar_t)sti->sti_faddr_len);
2915		if (error) {
2916			eprintsoline(so, error);
2917			return;
2918		}
2919		if (sti->sti_faddr_noxlate) {
2920			/*
2921			 * Already have a transport internal address. Do not
2922			 * pass any (transport internal) source address.
2923			 */
2924			addr = sti->sti_faddr_sa;
2925			addrlen = (t_uscalar_t)sti->sti_faddr_len;
2926			src = NULL;
2927			srclen = 0;
2928		} else {
2929			/*
2930			 * Pass the sockaddr_un source address as an option
2931			 * and translate the remote address.
2932			 * Holding so_lock thus sti_laddr_sa can not change.
2933			 */
2934			src = sti->sti_laddr_sa;
2935			srclen = (socklen_t)sti->sti_laddr_len;
2936			dprintso(so, 1,
2937			    ("so_ux_close: srclen %d, src %p\n",
2938			    srclen, src));
2939			error = so_ux_addr_xlate(so,
2940			    sti->sti_faddr_sa,
2941			    (socklen_t)sti->sti_faddr_len, 0,
2942			    &addr, &addrlen);
2943			if (error) {
2944				eprintsoline(so, error);
2945				return;
2946			}
2947		}
2948		tudr.PRIM_type = T_UNITDATA_REQ;
2949		tudr.DEST_length = addrlen;
2950		tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
2951		if (srclen == 0) {
2952			tudr.OPT_length = (t_scalar_t)sizeof (toh);
2953			tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
2954			    _TPI_ALIGN_TOPT(addrlen));
2955
2956			size = tudr.OPT_offset + tudr.OPT_length;
2957			/* NOTE: holding so_lock while sleeping */
2958			mp = soallocproto2(&tudr, sizeof (tudr),
2959			    addr, addrlen, size, _ALLOC_SLEEP, CRED());
2960			mp->b_wptr += (_TPI_ALIGN_TOPT(addrlen) - addrlen);
2961			soappendmsg(mp, &toh, sizeof (toh));
2962		} else {
2963			/*
2964			 * There is a AF_UNIX sockaddr_un to include as a
2965			 * source address option.
2966			 */
2967			tudr.OPT_length = (t_scalar_t)(2 * sizeof (toh) +
2968			    _TPI_ALIGN_TOPT(srclen));
2969			tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
2970			    _TPI_ALIGN_TOPT(addrlen));
2971
2972			toh2.level = SOL_SOCKET;
2973			toh2.name = SO_SRCADDR;
2974			toh2.len = (t_uscalar_t)(srclen +
2975			    sizeof (struct T_opthdr));
2976			toh2.status = 0;
2977
2978			size = tudr.OPT_offset + tudr.OPT_length;
2979
2980			/* NOTE: holding so_lock while sleeping */
2981			mp = soallocproto2(&tudr, sizeof (tudr),
2982			    addr, addrlen, size, _ALLOC_SLEEP, CRED());
2983			mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
2984			soappendmsg(mp, &toh, sizeof (toh));
2985			soappendmsg(mp, &toh2, sizeof (toh2));
2986			soappendmsg(mp, src, srclen);
2987			mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
2988		}
2989		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
2990	}
2991	mutex_exit(&so->so_lock);
2992	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2993	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
2994	mutex_enter(&so->so_lock);
2995}
2996
2997/*
2998 * Called by sotpi_recvmsg when reading a non-zero amount of data.
2999 * In addition, the caller typically verifies that there is some
3000 * potential state to clear by checking
3001 *	if (so->so_state & (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK))
3002 * before calling this routine.
3003 * Note that such a check can be made without holding so_lock since
3004 * sotpi_recvmsg is single-threaded (using SOREADLOCKED) and only sotpi_recvmsg
3005 * decrements sti_oobsigcnt.
3006 *
3007 * When data is read *after* the point that all pending
3008 * oob data has been consumed the oob indication is cleared.
3009 *
3010 * This logic keeps select/poll returning POLLRDBAND and
3011 * SIOCATMARK returning true until we have read past
3012 * the mark.
3013 */
3014static void
3015sorecv_update_oobstate(struct sonode *so)
3016{
3017	sotpi_info_t *sti = SOTOTPI(so);
3018
3019	mutex_enter(&so->so_lock);
3020	ASSERT(so_verify_oobstate(so));
3021	dprintso(so, 1,
3022	    ("sorecv_update_oobstate: counts %d/%d state %s\n",
3023	    sti->sti_oobsigcnt,
3024	    sti->sti_oobcnt, pr_state(so->so_state, so->so_mode)));
3025	if (sti->sti_oobsigcnt == 0) {
3026		/* No more pending oob indications */
3027		so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK);
3028		freemsg(so->so_oobmsg);
3029		so->so_oobmsg = NULL;
3030	}
3031	ASSERT(so_verify_oobstate(so));
3032	mutex_exit(&so->so_lock);
3033}
3034
3035/*
3036 * Handle recv* calls for an so which has NL7C saved recv mblk_t(s).
3037 */
3038static int
3039nl7c_sorecv(struct sonode *so, mblk_t **rmp, uio_t *uiop, rval_t *rp)
3040{
3041	sotpi_info_t *sti = SOTOTPI(so);
3042	int	error = 0;
3043	mblk_t *tmp = NULL;
3044	mblk_t *pmp = NULL;
3045	mblk_t *nmp = sti->sti_nl7c_rcv_mp;
3046
3047	ASSERT(nmp != NULL);
3048
3049	while (nmp != NULL && uiop->uio_resid > 0) {
3050		ssize_t n;
3051
3052		if (DB_TYPE(nmp) == M_DATA) {
3053			/*
3054			 * We have some data, uiomove up to resid bytes.
3055			 */
3056			n = MIN(MBLKL(nmp), uiop->uio_resid);
3057			if (n > 0)
3058				error = uiomove(nmp->b_rptr, n, UIO_READ, uiop);
3059			nmp->b_rptr += n;
3060			if (nmp->b_rptr == nmp->b_wptr) {
3061				pmp = nmp;
3062				nmp = nmp->b_cont;
3063			}
3064			if (error)
3065				break;
3066		} else {
3067			/*
3068			 * We only handle data, save for caller to handle.
3069			 */
3070			if (pmp != NULL) {
3071				pmp->b_cont = nmp->b_cont;
3072			}
3073			nmp->b_cont = NULL;
3074			if (*rmp == NULL) {
3075				*rmp = nmp;
3076			} else {
3077				tmp->b_cont = nmp;
3078			}
3079			nmp = nmp->b_cont;
3080			tmp = nmp;
3081		}
3082	}
3083	if (pmp != NULL) {
3084		/* Free any mblk_t(s) which we have consumed */
3085		pmp->b_cont = NULL;
3086		freemsg(sti->sti_nl7c_rcv_mp);
3087	}
3088	if ((sti->sti_nl7c_rcv_mp = nmp) == NULL) {
3089		/* Last mblk_t so return the saved kstrgetmsg() rval/error */
3090		if (error == 0) {
3091			rval_t	*p = (rval_t *)&sti->sti_nl7c_rcv_rval;
3092
3093			error = p->r_v.r_v2;
3094			p->r_v.r_v2 = 0;
3095		}
3096		rp->r_vals = sti->sti_nl7c_rcv_rval;
3097		sti->sti_nl7c_rcv_rval = 0;
3098	} else {
3099		/* More mblk_t(s) to process so no rval to return */
3100		rp->r_vals = 0;
3101	}
3102	return (error);
3103}
3104/*
3105 * Receive the next message on the queue.
3106 * If msg_controllen is non-zero when called the caller is interested in
3107 * any received control info (options).
3108 * If msg_namelen is non-zero when called the caller is interested in
3109 * any received source address.
3110 * The routine returns with msg_control and msg_name pointing to
3111 * kmem_alloc'ed memory which the caller has to free.
3112 */
3113/* ARGSUSED */
3114int
3115sotpi_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
3116    struct cred *cr)
3117{
3118	union T_primitives	*tpr;
3119	mblk_t			*mp;
3120	uchar_t			pri;
3121	int			pflag, opflag;
3122	void			*control;
3123	t_uscalar_t		controllen;
3124	t_uscalar_t		namelen;
3125	int			so_state = so->so_state; /* Snapshot */
3126	ssize_t			saved_resid;
3127	rval_t			rval;
3128	int			flags;
3129	clock_t			timout;
3130	int			error = 0;
3131	sotpi_info_t		*sti = SOTOTPI(so);
3132
3133	flags = msg->msg_flags;
3134	msg->msg_flags = 0;
3135
3136	dprintso(so, 1, ("sotpi_recvmsg(%p, %p, 0x%x) state %s err %d\n",
3137	    (void *)so, (void *)msg, flags,
3138	    pr_state(so->so_state, so->so_mode), so->so_error));
3139
3140	if (so->so_version == SOV_STREAM) {
3141		so_update_attrs(so, SOACC);
3142		/* The imaginary "sockmod" has been popped - act as a stream */
3143		return (strread(SOTOV(so), uiop, cr));
3144	}
3145
3146	/*
3147	 * If we are not connected because we have never been connected
3148	 * we return ENOTCONN. If we have been connected (but are no longer
3149	 * connected) then SS_CANTRCVMORE is set and we let kstrgetmsg return
3150	 * the EOF.
3151	 *
3152	 * An alternative would be to post an ENOTCONN error in stream head
3153	 * (read+write) and clear it when we're connected. However, that error
3154	 * would cause incorrect poll/select behavior!
3155	 */
3156	if ((so_state & (SS_ISCONNECTED|SS_CANTRCVMORE)) == 0 &&
3157	    (so->so_mode & SM_CONNREQUIRED)) {
3158		return (ENOTCONN);
3159	}
3160
3161	/*
3162	 * Note: SunOS 4.X checks uio_resid == 0 before going to sleep (but
3163	 * after checking that the read queue is empty) and returns zero.
3164	 * This implementation will sleep (in kstrgetmsg) even if uio_resid
3165	 * is zero.
3166	 */
3167
3168	if (flags & MSG_OOB) {
3169		/* Check that the transport supports OOB */
3170		if (!(so->so_mode & SM_EXDATA))
3171			return (EOPNOTSUPP);
3172		so_update_attrs(so, SOACC);
3173		return (sorecvoob(so, msg, uiop, flags,
3174		    (so->so_options & SO_OOBINLINE)));
3175	}
3176
3177	so_update_attrs(so, SOACC);
3178
3179	/*
3180	 * Set msg_controllen and msg_namelen to zero here to make it
3181	 * simpler in the cases that no control or name is returned.
3182	 */
3183	controllen = msg->msg_controllen;
3184	namelen = msg->msg_namelen;
3185	msg->msg_controllen = 0;
3186	msg->msg_namelen = 0;
3187
3188	dprintso(so, 1, ("sotpi_recvmsg: namelen %d controllen %d\n",
3189	    namelen, controllen));
3190
3191	mutex_enter(&so->so_lock);
3192	/*
3193	 * If an NL7C enabled socket and not waiting for write data.
3194	 */
3195	if ((sti->sti_nl7c_flags & (NL7C_ENABLED | NL7C_WAITWRITE)) ==
3196	    NL7C_ENABLED) {
3197		if (sti->sti_nl7c_uri) {
3198			/* Close uri processing for a previous request */
3199			nl7c_close(so);
3200		}
3201		if ((so_state & SS_CANTRCVMORE) &&
3202		    sti->sti_nl7c_rcv_mp == NULL) {
3203			/* Nothing to process, EOF */
3204			mutex_exit(&so->so_lock);
3205			return (0);
3206		} else if (sti->sti_nl7c_flags & NL7C_SOPERSIST) {
3207			/* Persistent NL7C socket, try to process request */
3208			boolean_t ret;
3209
3210			ret = nl7c_process(so,
3211			    (so->so_state & (SS_NONBLOCK|SS_NDELAY)));
3212			rval.r_vals = sti->sti_nl7c_rcv_rval;
3213			error = rval.r_v.r_v2;
3214			if (error) {
3215				/* Error of some sort, return it */
3216				mutex_exit(&so->so_lock);
3217				return (error);
3218			}
3219			if (sti->sti_nl7c_flags &&
3220			    ! (sti->sti_nl7c_flags & NL7C_WAITWRITE)) {
3221				/*
3222				 * Still an NL7C socket and no data
3223				 * to pass up to the caller.
3224				 */
3225				mutex_exit(&so->so_lock);
3226				if (ret) {
3227					/* EOF */
3228					return (0);
3229				} else {
3230					/* Need more data */
3231					return (EAGAIN);
3232				}
3233			}
3234		} else {
3235			/*
3236			 * Not persistent so no further NL7C processing.
3237			 */
3238			sti->sti_nl7c_flags = 0;
3239		}
3240	}
3241	/*
3242	 * Only one reader is allowed at any given time. This is needed
3243	 * for T_EXDATA handling and, in the future, MSG_WAITALL.
3244	 *
3245	 * This is slightly different that BSD behavior in that it fails with
3246	 * EWOULDBLOCK when using nonblocking io. In BSD the read queue access
3247	 * is single-threaded using sblock(), which is dropped while waiting
3248	 * for data to appear. The difference shows up e.g. if one
3249	 * file descriptor does not have O_NONBLOCK but a dup'ed file descriptor
3250	 * does use nonblocking io and different threads are reading each
3251	 * file descriptor. In BSD there would never be an EWOULDBLOCK error
3252	 * in this case as long as the read queue doesn't get empty.
3253	 * In this implementation the thread using nonblocking io can
3254	 * get an EWOULDBLOCK error due to the blocking thread executing
3255	 * e.g. in the uiomove in kstrgetmsg.
3256	 * This difference is not believed to be significant.
3257	 */
3258	/* Set SOREADLOCKED */
3259	error = so_lock_read_intr(so,
3260	    uiop->uio_fmode | ((flags & MSG_DONTWAIT) ? FNONBLOCK : 0));
3261	mutex_exit(&so->so_lock);
3262	if (error)
3263		return (error);
3264
3265	/*
3266	 * Tell kstrgetmsg to not inspect the stream head errors until all
3267	 * queued data has been consumed.
3268	 * Use a timeout=-1 to wait forever unless MSG_DONTWAIT is set.
3269	 * Also, If uio_fmode indicates nonblocking kstrgetmsg will not block.
3270	 *
3271	 * MSG_WAITALL only applies to M_DATA and T_DATA_IND messages and
3272	 * to T_OPTDATA_IND that do not contain any user-visible control msg.
3273	 * Note that MSG_WAITALL set with MSG_PEEK is a noop.
3274	 */
3275	pflag = MSG_ANY | MSG_DELAYERROR;
3276	if (flags & MSG_PEEK) {
3277		pflag |= MSG_IPEEK;
3278		flags &= ~MSG_WAITALL;
3279	}
3280	if (so->so_mode & SM_ATOMIC)
3281		pflag |= MSG_DISCARDTAIL;
3282
3283	if (flags & MSG_DONTWAIT)
3284		timout = 0;
3285	else
3286		timout = -1;
3287	opflag = pflag;
3288retry:
3289	saved_resid = uiop->uio_resid;
3290	pri = 0;
3291	mp = NULL;
3292	if (sti->sti_nl7c_rcv_mp != NULL) {
3293		/* Already kstrgetmsg()ed saved mblk(s) from NL7C */
3294		error = nl7c_sorecv(so, &mp, uiop, &rval);
3295	} else {
3296		error = kstrgetmsg(SOTOV(so), &mp, uiop, &pri, &pflag,
3297		    timout, &rval);
3298	}
3299	if (error != 0) {
3300		/* kstrgetmsg returns ETIME when timeout expires */
3301		if (error == ETIME)
3302			error = EWOULDBLOCK;
3303		goto out;
3304	}
3305	/*
3306	 * For datagrams the MOREDATA flag is used to set MSG_TRUNC.
3307	 * For non-datagrams MOREDATA is used to set MSG_EOR.
3308	 */
3309	ASSERT(!(rval.r_val1 & MORECTL));
3310	if ((rval.r_val1 & MOREDATA) && (so->so_mode & SM_ATOMIC))
3311		msg->msg_flags |= MSG_TRUNC;
3312
3313	if (mp == NULL) {
3314		dprintso(so, 1, ("sotpi_recvmsg: got M_DATA\n"));
3315		/*
3316		 * 4.3BSD and 4.4BSD clears the mark when peeking across it.
3317		 * The draft Posix socket spec states that the mark should
3318		 * not be cleared when peeking. We follow the latter.
3319		 */
3320		if ((so->so_state &
3321		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3322		    (uiop->uio_resid != saved_resid) &&
3323		    !(flags & MSG_PEEK)) {
3324			sorecv_update_oobstate(so);
3325		}
3326
3327		mutex_enter(&so->so_lock);
3328		/* Set MSG_EOR based on MOREDATA */
3329		if (!(rval.r_val1 & MOREDATA)) {
3330			if (so->so_state & SS_SAVEDEOR) {
3331				msg->msg_flags |= MSG_EOR;
3332				so->so_state &= ~SS_SAVEDEOR;
3333			}
3334		}
3335		/*
3336		 * If some data was received (i.e. not EOF) and the
3337		 * read/recv* has not been satisfied wait for some more.
3338		 */
3339		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3340		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3341			mutex_exit(&so->so_lock);
3342			pflag = opflag | MSG_NOMARK;
3343			goto retry;
3344		}
3345		goto out_locked;
3346	}
3347
3348	/* strsock_proto has already verified length and alignment */
3349	tpr = (union T_primitives *)mp->b_rptr;
3350	dprintso(so, 1, ("sotpi_recvmsg: type %d\n", tpr->type));
3351
3352	switch (tpr->type) {
3353	case T_DATA_IND: {
3354		if ((so->so_state &
3355		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3356		    (uiop->uio_resid != saved_resid) &&
3357		    !(flags & MSG_PEEK)) {
3358			sorecv_update_oobstate(so);
3359		}
3360
3361		/*
3362		 * Set msg_flags to MSG_EOR based on
3363		 * MORE_flag and MOREDATA.
3364		 */
3365		mutex_enter(&so->so_lock);
3366		so->so_state &= ~SS_SAVEDEOR;
3367		if (!(tpr->data_ind.MORE_flag & 1)) {
3368			if (!(rval.r_val1 & MOREDATA))
3369				msg->msg_flags |= MSG_EOR;
3370			else
3371				so->so_state |= SS_SAVEDEOR;
3372		}
3373		freemsg(mp);
3374		/*
3375		 * If some data was received (i.e. not EOF) and the
3376		 * read/recv* has not been satisfied wait for some more.
3377		 */
3378		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3379		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3380			mutex_exit(&so->so_lock);
3381			pflag = opflag | MSG_NOMARK;
3382			goto retry;
3383		}
3384		goto out_locked;
3385	}
3386	case T_UNITDATA_IND: {
3387		void *addr;
3388		t_uscalar_t addrlen;
3389		void *abuf;
3390		t_uscalar_t optlen;
3391		void *opt;
3392
3393		if ((so->so_state &
3394		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3395		    (uiop->uio_resid != saved_resid) &&
3396		    !(flags & MSG_PEEK)) {
3397			sorecv_update_oobstate(so);
3398		}
3399
3400		if (namelen != 0) {
3401			/* Caller wants source address */
3402			addrlen = tpr->unitdata_ind.SRC_length;
3403			addr = sogetoff(mp,
3404			    tpr->unitdata_ind.SRC_offset,
3405			    addrlen, 1);
3406			if (addr == NULL) {
3407				freemsg(mp);
3408				error = EPROTO;
3409				eprintsoline(so, error);
3410				goto out;
3411			}
3412			if (so->so_family == AF_UNIX) {
3413				/*
3414				 * Can not use the transport level address.
3415				 * If there is a SO_SRCADDR option carrying
3416				 * the socket level address it will be
3417				 * extracted below.
3418				 */
3419				addr = NULL;
3420				addrlen = 0;
3421			}
3422		}
3423		optlen = tpr->unitdata_ind.OPT_length;
3424		if (optlen != 0) {
3425			t_uscalar_t ncontrollen;
3426
3427			/*
3428			 * Extract any source address option.
3429			 * Determine how large cmsg buffer is needed.
3430			 */
3431			opt = sogetoff(mp,
3432			    tpr->unitdata_ind.OPT_offset,
3433			    optlen, __TPI_ALIGN_SIZE);
3434
3435			if (opt == NULL) {
3436				freemsg(mp);
3437				error = EPROTO;
3438				eprintsoline(so, error);
3439				goto out;
3440			}
3441			if (so->so_family == AF_UNIX)
3442				so_getopt_srcaddr(opt, optlen, &addr, &addrlen);
3443			ncontrollen = so_cmsglen(mp, opt, optlen,
3444			    !(flags & MSG_XPG4_2));
3445			if (controllen != 0)
3446				controllen = ncontrollen;
3447			else if (ncontrollen != 0)
3448				msg->msg_flags |= MSG_CTRUNC;
3449		} else {
3450			controllen = 0;
3451		}
3452
3453		if (namelen != 0) {
3454			/*
3455			 * Return address to caller.
3456			 * Caller handles truncation if length
3457			 * exceeds msg_namelen.
3458			 * NOTE: AF_UNIX NUL termination is ensured by
3459			 * the sender's copyin_name().
3460			 */
3461			abuf = kmem_alloc(addrlen, KM_SLEEP);
3462
3463			bcopy(addr, abuf, addrlen);
3464			msg->msg_name = abuf;
3465			msg->msg_namelen = addrlen;
3466		}
3467
3468		if (controllen != 0) {
3469			/*
3470			 * Return control msg to caller.
3471			 * Caller handles truncation if length
3472			 * exceeds msg_controllen.
3473			 */
3474			control = kmem_zalloc(controllen, KM_SLEEP);
3475
3476			error = so_opt2cmsg(mp, opt, optlen,
3477			    !(flags & MSG_XPG4_2),
3478			    control, controllen);
3479			if (error) {
3480				freemsg(mp);
3481				if (msg->msg_namelen != 0)
3482					kmem_free(msg->msg_name,
3483					    msg->msg_namelen);
3484				kmem_free(control, controllen);
3485				eprintsoline(so, error);
3486				goto out;
3487			}
3488			msg->msg_control = control;
3489			msg->msg_controllen = controllen;
3490		}
3491
3492		freemsg(mp);
3493		goto out;
3494	}
3495	case T_OPTDATA_IND: {
3496		struct T_optdata_req *tdr;
3497		void *opt;
3498		t_uscalar_t optlen;
3499
3500		if ((so->so_state &
3501		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3502		    (uiop->uio_resid != saved_resid) &&
3503		    !(flags & MSG_PEEK)) {
3504			sorecv_update_oobstate(so);
3505		}
3506
3507		tdr = (struct T_optdata_req *)mp->b_rptr;
3508		optlen = tdr->OPT_length;
3509		if (optlen != 0) {
3510			t_uscalar_t ncontrollen;
3511			/*
3512			 * Determine how large cmsg buffer is needed.
3513			 */
3514			opt = sogetoff(mp,
3515			    tpr->optdata_ind.OPT_offset,
3516			    optlen, __TPI_ALIGN_SIZE);
3517
3518			if (opt == NULL) {
3519				freemsg(mp);
3520				error = EPROTO;
3521				eprintsoline(so, error);
3522				goto out;
3523			}
3524
3525			ncontrollen = so_cmsglen(mp, opt, optlen,
3526			    !(flags & MSG_XPG4_2));
3527			if (controllen != 0)
3528				controllen = ncontrollen;
3529			else if (ncontrollen != 0)
3530				msg->msg_flags |= MSG_CTRUNC;
3531		} else {
3532			controllen = 0;
3533		}
3534
3535		if (controllen != 0) {
3536			/*
3537			 * Return control msg to caller.
3538			 * Caller handles truncation if length
3539			 * exceeds msg_controllen.
3540			 */
3541			control = kmem_zalloc(controllen, KM_SLEEP);
3542
3543			error = so_opt2cmsg(mp, opt, optlen,
3544			    !(flags & MSG_XPG4_2),
3545			    control, controllen);
3546			if (error) {
3547				freemsg(mp);
3548				kmem_free(control, controllen);
3549				eprintsoline(so, error);
3550				goto out;
3551			}
3552			msg->msg_control = control;
3553			msg->msg_controllen = controllen;
3554		}
3555
3556		/*
3557		 * Set msg_flags to MSG_EOR based on
3558		 * DATA_flag and MOREDATA.
3559		 */
3560		mutex_enter(&so->so_lock);
3561		so->so_state &= ~SS_SAVEDEOR;
3562		if (!(tpr->data_ind.MORE_flag & 1)) {
3563			if (!(rval.r_val1 & MOREDATA))
3564				msg->msg_flags |= MSG_EOR;
3565			else
3566				so->so_state |= SS_SAVEDEOR;
3567		}
3568		freemsg(mp);
3569		/*
3570		 * If some data was received (i.e. not EOF) and the
3571		 * read/recv* has not been satisfied wait for some more.
3572		 * Not possible to wait if control info was received.
3573		 */
3574		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3575		    controllen == 0 &&
3576		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3577			mutex_exit(&so->so_lock);
3578			pflag = opflag | MSG_NOMARK;
3579			goto retry;
3580		}
3581		goto out_locked;
3582	}
3583	case T_EXDATA_IND: {
3584		dprintso(so, 1,
3585		    ("sotpi_recvmsg: EXDATA_IND counts %d/%d consumed %ld "
3586		    "state %s\n",
3587		    sti->sti_oobsigcnt, sti->sti_oobcnt,
3588		    saved_resid - uiop->uio_resid,
3589		    pr_state(so->so_state, so->so_mode)));
3590		/*
3591		 * kstrgetmsg handles MSGMARK so there is nothing to
3592		 * inspect in the T_EXDATA_IND.
3593		 * strsock_proto makes the stream head queue the T_EXDATA_IND
3594		 * as a separate message with no M_DATA component. Furthermore,
3595		 * the stream head does not consolidate M_DATA messages onto
3596		 * an MSGMARK'ed message ensuring that the T_EXDATA_IND
3597		 * remains a message by itself. This is needed since MSGMARK
3598		 * marks both the whole message as well as the last byte
3599		 * of the message.
3600		 */
3601		freemsg(mp);
3602		ASSERT(uiop->uio_resid == saved_resid);	/* No data */
3603		if (flags & MSG_PEEK) {
3604			/*
3605			 * Even though we are peeking we consume the
3606			 * T_EXDATA_IND thereby moving the mark information
3607			 * to SS_RCVATMARK. Then the oob code below will
3608			 * retry the peeking kstrgetmsg.
3609			 * Note that the stream head read queue is
3610			 * never flushed without holding SOREADLOCKED
3611			 * thus the T_EXDATA_IND can not disappear
3612			 * underneath us.
3613			 */
3614			dprintso(so, 1,
3615			    ("sotpi_recvmsg: consume EXDATA_IND "
3616			    "counts %d/%d state %s\n",
3617			    sti->sti_oobsigcnt,
3618			    sti->sti_oobcnt,
3619			    pr_state(so->so_state, so->so_mode)));
3620
3621			pflag = MSG_ANY | MSG_DELAYERROR;
3622			if (so->so_mode & SM_ATOMIC)
3623				pflag |= MSG_DISCARDTAIL;
3624
3625			pri = 0;
3626			mp = NULL;
3627
3628			error = kstrgetmsg(SOTOV(so), &mp, uiop,
3629			    &pri, &pflag, (clock_t)-1, &rval);
3630			ASSERT(uiop->uio_resid == saved_resid);
3631
3632			if (error) {
3633#ifdef SOCK_DEBUG
3634				if (error != EWOULDBLOCK && error != EINTR) {
3635					eprintsoline(so, error);
3636				}
3637#endif /* SOCK_DEBUG */
3638				goto out;
3639			}
3640			ASSERT(mp);
3641			tpr = (union T_primitives *)mp->b_rptr;
3642			ASSERT(tpr->type == T_EXDATA_IND);
3643			freemsg(mp);
3644		} /* end "if (flags & MSG_PEEK)" */
3645
3646		/*
3647		 * Decrement the number of queued and pending oob.
3648		 *
3649		 * SS_RCVATMARK is cleared when we read past a mark.
3650		 * SS_HAVEOOBDATA is cleared when we've read past the
3651		 * last mark.
3652		 * SS_OOBPEND is cleared if we've read past the last
3653		 * mark and no (new) SIGURG has been posted.
3654		 */
3655		mutex_enter(&so->so_lock);
3656		ASSERT(so_verify_oobstate(so));
3657		ASSERT(sti->sti_oobsigcnt >= sti->sti_oobcnt);
3658		ASSERT(sti->sti_oobsigcnt > 0);
3659		sti->sti_oobsigcnt--;
3660		ASSERT(sti->sti_oobcnt > 0);
3661		sti->sti_oobcnt--;
3662		/*
3663		 * Since the T_EXDATA_IND has been removed from the stream
3664		 * head, but we have not read data past the mark,
3665		 * sockfs needs to track that the socket is still at the mark.
3666		 *
3667		 * Since no data was received call kstrgetmsg again to wait
3668		 * for data.
3669		 */
3670		so->so_state |= SS_RCVATMARK;
3671		mutex_exit(&so->so_lock);
3672		dprintso(so, 1,
3673		    ("sotpi_recvmsg: retry EXDATA_IND counts %d/%d state %s\n",
3674		    sti->sti_oobsigcnt, sti->sti_oobcnt,
3675		    pr_state(so->so_state, so->so_mode)));
3676		pflag = opflag;
3677		goto retry;
3678	}
3679	default:
3680		cmn_err(CE_CONT, "sotpi_recvmsg: so %p prim %d mp %p\n",
3681		    (void *)so, tpr->type, (void *)mp);
3682		ASSERT(0);
3683		freemsg(mp);
3684		error = EPROTO;
3685		eprintsoline(so, error);
3686		goto out;
3687	}
3688	/* NOTREACHED */
3689out:
3690	mutex_enter(&so->so_lock);
3691out_locked:
3692	so_unlock_read(so);	/* Clear SOREADLOCKED */
3693	mutex_exit(&so->so_lock);
3694	return (error);
3695}
3696
3697/*
3698 * Sending data with options on a datagram socket.
3699 * Assumes caller has verified that SS_ISBOUND etc. are set.
3700 */
3701static int
3702sosend_dgramcmsg(struct sonode *so, struct sockaddr *name, socklen_t namelen,
3703    struct uio *uiop, void *control, t_uscalar_t controllen, int flags)
3704{
3705	struct T_unitdata_req	tudr;
3706	mblk_t			*mp;
3707	int			error;
3708	void			*addr;
3709	socklen_t		addrlen;
3710	void			*src;
3711	socklen_t		srclen;
3712	ssize_t			len;
3713	int			size;
3714	struct T_opthdr		toh;
3715	struct fdbuf		*fdbuf;
3716	t_uscalar_t		optlen;
3717	void			*fds;
3718	int			fdlen;
3719	sotpi_info_t		*sti = SOTOTPI(so);
3720
3721	ASSERT(name && namelen);
3722	ASSERT(control && controllen);
3723
3724	len = uiop->uio_resid;
3725	if (len > (ssize_t)sti->sti_tidu_size) {
3726		return (EMSGSIZE);
3727	}
3728
3729	/*
3730	 * For AF_UNIX the destination address is translated to an internal
3731	 * name and the source address is passed as an option.
3732	 * Also, file descriptors are passed as file pointers in an
3733	 * option.
3734	 */
3735
3736	/*
3737	 * Length and family checks.
3738	 */
3739	error = so_addr_verify(so, name, namelen);
3740	if (error) {
3741		eprintsoline(so, error);
3742		return (error);
3743	}
3744	if (so->so_family == AF_UNIX) {
3745		if (sti->sti_faddr_noxlate) {
3746			/*
3747			 * Already have a transport internal address. Do not
3748			 * pass any (transport internal) source address.
3749			 */
3750			addr = name;
3751			addrlen = namelen;
3752			src = NULL;
3753			srclen = 0;
3754		} else {
3755			/*
3756			 * Pass the sockaddr_un source address as an option
3757			 * and translate the remote address.
3758			 *
3759			 * Note that this code does not prevent sti_laddr_sa
3760			 * from changing while it is being used. Thus
3761			 * if an unbind+bind occurs concurrently with this
3762			 * send the peer might see a partially new and a
3763			 * partially old "from" address.
3764			 */
3765			src = sti->sti_laddr_sa;
3766			srclen = (t_uscalar_t)sti->sti_laddr_len;
3767			dprintso(so, 1,
3768			    ("sosend_dgramcmsg UNIX: srclen %d, src %p\n",
3769			    srclen, src));
3770			error = so_ux_addr_xlate(so, name, namelen,
3771			    (flags & MSG_XPG4_2),
3772			    &addr, &addrlen);
3773			if (error) {
3774				eprintsoline(so, error);
3775				return (error);
3776			}
3777		}
3778	} else {
3779		addr = name;
3780		addrlen = namelen;
3781		src = NULL;
3782		srclen = 0;
3783	}
3784	optlen = so_optlen(control, controllen,
3785	    !(flags & MSG_XPG4_2));
3786	tudr.PRIM_type = T_UNITDATA_REQ;
3787	tudr.DEST_length = addrlen;
3788	tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
3789	if (srclen != 0)
3790		tudr.OPT_length = (t_scalar_t)(optlen + sizeof (toh) +
3791		    _TPI_ALIGN_TOPT(srclen));
3792	else
3793		tudr.OPT_length = optlen;
3794	tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
3795	    _TPI_ALIGN_TOPT(addrlen));
3796
3797	size = tudr.OPT_offset + tudr.OPT_length;
3798
3799	/*
3800	 * File descriptors only when SM_FDPASSING set.
3801	 */
3802	error = so_getfdopt(control, controllen,
3803	    !(flags & MSG_XPG4_2), &fds, &fdlen);
3804	if (error)
3805		return (error);
3806	if (fdlen != -1) {
3807		if (!(so->so_mode & SM_FDPASSING))
3808			return (EOPNOTSUPP);
3809
3810		error = fdbuf_create(fds, fdlen, &fdbuf);
3811		if (error)
3812			return (error);
3813		mp = fdbuf_allocmsg(size, fdbuf);
3814	} else {
3815		mp = soallocproto(size, _ALLOC_INTR, CRED());
3816		if (mp == NULL) {
3817			/*
3818			 * Caught a signal waiting for memory.
3819			 * Let send* return EINTR.
3820			 */
3821			return (EINTR);
3822		}
3823	}
3824	soappendmsg(mp, &tudr, sizeof (tudr));
3825	soappendmsg(mp, addr, addrlen);
3826	mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
3827
3828	if (fdlen != -1) {
3829		ASSERT(fdbuf != NULL);
3830		toh.level = SOL_SOCKET;
3831		toh.name = SO_FILEP;
3832		toh.len = fdbuf->fd_size +
3833		    (t_uscalar_t)sizeof (struct T_opthdr);
3834		toh.status = 0;
3835		soappendmsg(mp, &toh, sizeof (toh));
3836		soappendmsg(mp, fdbuf, fdbuf->fd_size);
3837		ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3838	}
3839	if (srclen != 0) {
3840		/*
3841		 * There is a AF_UNIX sockaddr_un to include as a source
3842		 * address option.
3843		 */
3844		toh.level = SOL_SOCKET;
3845		toh.name = SO_SRCADDR;
3846		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
3847		toh.status = 0;
3848		soappendmsg(mp, &toh, sizeof (toh));
3849		soappendmsg(mp, src, srclen);
3850		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
3851		ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3852	}
3853	ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3854	so_cmsg2opt(control, controllen, !(flags & MSG_XPG4_2), mp);
3855	/* At most 3 bytes left in the message */
3856	ASSERT(MBLKL(mp) > (ssize_t)(size - __TPI_ALIGN_SIZE));
3857	ASSERT(MBLKL(mp) <= (ssize_t)size);
3858
3859	ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3860	if (audit_active)
3861		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
3862
3863	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
3864#ifdef SOCK_DEBUG
3865	if (error) {
3866		eprintsoline(so, error);
3867	}
3868#endif /* SOCK_DEBUG */
3869	return (error);
3870}
3871
3872/*
3873 * Sending data with options on a connected stream socket.
3874 * Assumes caller has verified that SS_ISCONNECTED is set.
3875 */
3876static int
3877sosend_svccmsg(struct sonode *so, struct uio *uiop, int more, void *control,
3878    t_uscalar_t controllen, int flags)
3879{
3880	struct T_optdata_req	tdr;
3881	mblk_t			*mp;
3882	int			error;
3883	ssize_t			iosize;
3884	int			size;
3885	struct fdbuf		*fdbuf;
3886	t_uscalar_t		optlen;
3887	void			*fds;
3888	int			fdlen;
3889	struct T_opthdr		toh;
3890	sotpi_info_t		*sti = SOTOTPI(so);
3891
3892	dprintso(so, 1,
3893	    ("sosend_svccmsg: resid %ld bytes\n", uiop->uio_resid));
3894
3895	/*
3896	 * Has to be bound and connected. However, since no locks are
3897	 * held the state could have changed after sotpi_sendmsg checked it
3898	 * thus it is not possible to ASSERT on the state.
3899	 */
3900
3901	/* Options on connection-oriented only when SM_OPTDATA set. */
3902	if (!(so->so_mode & SM_OPTDATA))
3903		return (EOPNOTSUPP);
3904
3905	do {
3906		/*
3907		 * Set the MORE flag if uio_resid does not fit in this
3908		 * message or if the caller passed in "more".
3909		 * Error for transports with zero tidu_size.
3910		 */
3911		tdr.PRIM_type = T_OPTDATA_REQ;
3912		iosize = sti->sti_tidu_size;
3913		if (iosize <= 0)
3914			return (EMSGSIZE);
3915		if (uiop->uio_resid > iosize) {
3916			tdr.DATA_flag = 1;
3917		} else {
3918			if (more)
3919				tdr.DATA_flag = 1;
3920			else
3921				tdr.DATA_flag = 0;
3922			iosize = uiop->uio_resid;
3923		}
3924		dprintso(so, 1, ("sosend_svccmsg: sending %d, %ld bytes\n",
3925		    tdr.DATA_flag, iosize));
3926
3927		optlen = so_optlen(control, controllen, !(flags & MSG_XPG4_2));
3928		tdr.OPT_length = optlen;
3929		tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
3930
3931		size = (int)sizeof (tdr) + optlen;
3932		/*
3933		 * File descriptors only when SM_FDPASSING set.
3934		 */
3935		error = so_getfdopt(control, controllen,
3936		    !(flags & MSG_XPG4_2), &fds, &fdlen);
3937		if (error)
3938			return (error);
3939		if (fdlen != -1) {
3940			if (!(so->so_mode & SM_FDPASSING))
3941				return (EOPNOTSUPP);
3942
3943			error = fdbuf_create(fds, fdlen, &fdbuf);
3944			if (error)
3945				return (error);
3946			mp = fdbuf_allocmsg(size, fdbuf);
3947		} else {
3948			mp = soallocproto(size, _ALLOC_INTR, CRED());
3949			if (mp == NULL) {
3950				/*
3951				 * Caught a signal waiting for memory.
3952				 * Let send* return EINTR.
3953				 */
3954				return (EINTR);
3955			}
3956		}
3957		soappendmsg(mp, &tdr, sizeof (tdr));
3958
3959		if (fdlen != -1) {
3960			ASSERT(fdbuf != NULL);
3961			toh.level = SOL_SOCKET;
3962			toh.name = SO_FILEP;
3963			toh.len = fdbuf->fd_size +
3964			    (t_uscalar_t)sizeof (struct T_opthdr);
3965			toh.status = 0;
3966			soappendmsg(mp, &toh, sizeof (toh));
3967			soappendmsg(mp, fdbuf, fdbuf->fd_size);
3968			ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3969		}
3970		so_cmsg2opt(control, controllen, !(flags & MSG_XPG4_2), mp);
3971		/* At most 3 bytes left in the message */
3972		ASSERT(MBLKL(mp) > (ssize_t)(size - __TPI_ALIGN_SIZE));
3973		ASSERT(MBLKL(mp) <= (ssize_t)size);
3974
3975		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3976
3977		error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
3978		    0, MSG_BAND, 0);
3979		if (error) {
3980			eprintsoline(so, error);
3981			return (error);
3982		}
3983		control = NULL;
3984		if (uiop->uio_resid > 0) {
3985			/*
3986			 * Recheck for fatal errors. Fail write even though
3987			 * some data have been written. This is consistent
3988			 * with strwrite semantics and BSD sockets semantics.
3989			 */
3990			if (so->so_state & SS_CANTSENDMORE) {
3991				eprintsoline(so, error);
3992				return (EPIPE);
3993			}
3994			if (so->so_error != 0) {
3995				mutex_enter(&so->so_lock);
3996				error = sogeterr(so, B_TRUE);
3997				mutex_exit(&so->so_lock);
3998				if (error != 0) {
3999					eprintsoline(so, error);
4000					return (error);
4001				}
4002			}
4003		}
4004	} while (uiop->uio_resid > 0);
4005	return (0);
4006}
4007
4008/*
4009 * Sending data on a datagram socket.
4010 * Assumes caller has verified that SS_ISBOUND etc. are set.
4011 *
4012 * For AF_UNIX the destination address is translated to an internal
4013 * name and the source address is passed as an option.
4014 */
4015int
4016sosend_dgram(struct sonode *so, struct sockaddr	*name, socklen_t namelen,
4017    struct uio *uiop, int flags)
4018{
4019	struct T_unitdata_req	tudr;
4020	mblk_t			*mp;
4021	int			error;
4022	void			*addr;
4023	socklen_t		addrlen;
4024	void			*src;
4025	socklen_t		srclen;
4026	ssize_t			len;
4027	sotpi_info_t		*sti = SOTOTPI(so);
4028
4029	ASSERT(name != NULL && namelen != 0);
4030
4031	len = uiop->uio_resid;
4032	if (len > sti->sti_tidu_size) {
4033		error = EMSGSIZE;
4034		goto done;
4035	}
4036
4037	/* Length and family checks */
4038	error = so_addr_verify(so, name, namelen);
4039	if (error != 0)
4040		goto done;
4041
4042	if (sti->sti_direct)
4043		return (sodgram_direct(so, name, namelen, uiop, flags));
4044
4045	if (so->so_family == AF_UNIX) {
4046		if (sti->sti_faddr_noxlate) {
4047			/*
4048			 * Already have a transport internal address. Do not
4049			 * pass any (transport internal) source address.
4050			 */
4051			addr = name;
4052			addrlen = namelen;
4053			src = NULL;
4054			srclen = 0;
4055		} else {
4056			/*
4057			 * Pass the sockaddr_un source address as an option
4058			 * and translate the remote address.
4059			 *
4060			 * Note that this code does not prevent sti_laddr_sa
4061			 * from changing while it is being used. Thus
4062			 * if an unbind+bind occurs concurrently with this
4063			 * send the peer might see a partially new and a
4064			 * partially old "from" address.
4065			 */
4066			src = sti->sti_laddr_sa;
4067			srclen = (socklen_t)sti->sti_laddr_len;
4068			dprintso(so, 1,
4069			    ("sosend_dgram UNIX: srclen %d, src %p\n",
4070			    srclen, src));
4071			error = so_ux_addr_xlate(so, name, namelen,
4072			    (flags & MSG_XPG4_2),
4073			    &addr, &addrlen);
4074			if (error) {
4075				eprintsoline(so, error);
4076				goto done;
4077			}
4078		}
4079	} else {
4080		addr = name;
4081		addrlen = namelen;
4082		src = NULL;
4083		srclen = 0;
4084	}
4085	tudr.PRIM_type = T_UNITDATA_REQ;
4086	tudr.DEST_length = addrlen;
4087	tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
4088	if (srclen == 0) {
4089		tudr.OPT_length = 0;
4090		tudr.OPT_offset = 0;
4091
4092		mp = soallocproto2(&tudr, sizeof (tudr),
4093		    addr, addrlen, 0, _ALLOC_INTR, CRED());
4094		if (mp == NULL) {
4095			/*
4096			 * Caught a signal waiting for memory.
4097			 * Let send* return EINTR.
4098			 */
4099			error = EINTR;
4100			goto done;
4101		}
4102	} else {
4103		/*
4104		 * There is a AF_UNIX sockaddr_un to include as a source
4105		 * address option.
4106		 */
4107		struct T_opthdr toh;
4108		ssize_t size;
4109
4110		tudr.OPT_length = (t_scalar_t)(sizeof (toh) +
4111		    _TPI_ALIGN_TOPT(srclen));
4112		tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
4113		    _TPI_ALIGN_TOPT(addrlen));
4114
4115		toh.level = SOL_SOCKET;
4116		toh.name = SO_SRCADDR;
4117		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
4118		toh.status = 0;
4119
4120		size = tudr.OPT_offset + tudr.OPT_length;
4121		mp = soallocproto2(&tudr, sizeof (tudr),
4122		    addr, addrlen, size, _ALLOC_INTR, CRED());
4123		if (mp == NULL) {
4124			/*
4125			 * Caught a signal waiting for memory.
4126			 * Let send* return EINTR.
4127			 */
4128			error = EINTR;
4129			goto done;
4130		}
4131		mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
4132		soappendmsg(mp, &toh, sizeof (toh));
4133		soappendmsg(mp, src, srclen);
4134		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
4135		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
4136	}
4137
4138	if (audit_active)
4139		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4140
4141	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
4142done:
4143#ifdef SOCK_DEBUG
4144	if (error) {
4145		eprintsoline(so, error);
4146	}
4147#endif /* SOCK_DEBUG */
4148	return (error);
4149}
4150
4151/*
4152 * Sending data on a connected stream socket.
4153 * Assumes caller has verified that SS_ISCONNECTED is set.
4154 */
4155int
4156sosend_svc(struct sonode *so, struct uio *uiop, t_scalar_t prim, int more,
4157    int sflag)
4158{
4159	struct T_data_req	tdr;
4160	mblk_t			*mp;
4161	int			error;
4162	ssize_t			iosize;
4163	sotpi_info_t		*sti = SOTOTPI(so);
4164
4165	dprintso(so, 1,
4166	    ("sosend_svc: %p, resid %ld bytes, prim %d, sflag 0x%x\n",
4167	    (void *)so, uiop->uio_resid, prim, sflag));
4168
4169	/*
4170	 * Has to be bound and connected. However, since no locks are
4171	 * held the state could have changed after sotpi_sendmsg checked it
4172	 * thus it is not possible to ASSERT on the state.
4173	 */
4174
4175	do {
4176		/*
4177		 * Set the MORE flag if uio_resid does not fit in this
4178		 * message or if the caller passed in "more".
4179		 * Error for transports with zero tidu_size.
4180		 */
4181		tdr.PRIM_type = prim;
4182		iosize = sti->sti_tidu_size;
4183		if (iosize <= 0)
4184			return (EMSGSIZE);
4185		if (uiop->uio_resid > iosize) {
4186			tdr.MORE_flag = 1;
4187		} else {
4188			if (more)
4189				tdr.MORE_flag = 1;
4190			else
4191				tdr.MORE_flag = 0;
4192			iosize = uiop->uio_resid;
4193		}
4194		dprintso(so, 1, ("sosend_svc: sending 0x%x %d, %ld bytes\n",
4195		    prim, tdr.MORE_flag, iosize));
4196		mp = soallocproto1(&tdr, sizeof (tdr), 0, _ALLOC_INTR, CRED());
4197		if (mp == NULL) {
4198			/*
4199			 * Caught a signal waiting for memory.
4200			 * Let send* return EINTR.
4201			 */
4202			return (EINTR);
4203		}
4204
4205		error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
4206		    0, sflag | MSG_BAND, 0);
4207		if (error) {
4208			eprintsoline(so, error);
4209			return (error);
4210		}
4211		if (uiop->uio_resid > 0) {
4212			/*
4213			 * Recheck for fatal errors. Fail write even though
4214			 * some data have been written. This is consistent
4215			 * with strwrite semantics and BSD sockets semantics.
4216			 */
4217			if (so->so_state & SS_CANTSENDMORE) {
4218				eprintsoline(so, error);
4219				return (EPIPE);
4220			}
4221			if (so->so_error != 0) {
4222				mutex_enter(&so->so_lock);
4223				error = sogeterr(so, B_TRUE);
4224				mutex_exit(&so->so_lock);
4225				if (error != 0) {
4226					eprintsoline(so, error);
4227					return (error);
4228				}
4229			}
4230		}
4231	} while (uiop->uio_resid > 0);
4232	return (0);
4233}
4234
4235/*
4236 * Check the state for errors and call the appropriate send function.
4237 *
4238 * If MSG_DONTROUTE is set (and SO_DONTROUTE isn't already set)
4239 * this function issues a setsockopt to toggle SO_DONTROUTE before and
4240 * after sending the message.
4241 */
4242static int
4243sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
4244    struct cred *cr)
4245{
4246	int		so_state;
4247	int		so_mode;
4248	int		error;
4249	struct sockaddr *name;
4250	t_uscalar_t	namelen;
4251	int		dontroute;
4252	int		flags;
4253	sotpi_info_t	*sti = SOTOTPI(so);
4254
4255	dprintso(so, 1, ("sotpi_sendmsg(%p, %p, 0x%x) state %s, error %d\n",
4256	    (void *)so, (void *)msg, msg->msg_flags,
4257	    pr_state(so->so_state, so->so_mode), so->so_error));
4258
4259	if (so->so_version == SOV_STREAM) {
4260		/* The imaginary "sockmod" has been popped - act as a stream */
4261		so_update_attrs(so, SOMOD);
4262		return (strwrite(SOTOV(so), uiop, cr));
4263	}
4264
4265	mutex_enter(&so->so_lock);
4266	so_state = so->so_state;
4267
4268	if (so_state & SS_CANTSENDMORE) {
4269		mutex_exit(&so->so_lock);
4270		return (EPIPE);
4271	}
4272
4273	if (so->so_error != 0) {
4274		error = sogeterr(so, B_TRUE);
4275		if (error != 0) {
4276			mutex_exit(&so->so_lock);
4277			return (error);
4278		}
4279	}
4280
4281	name = (struct sockaddr *)msg->msg_name;
4282	namelen = msg->msg_namelen;
4283
4284	so_mode = so->so_mode;
4285
4286	if (name == NULL) {
4287		if (!(so_state & SS_ISCONNECTED)) {
4288			mutex_exit(&so->so_lock);
4289			if (so_mode & SM_CONNREQUIRED)
4290				return (ENOTCONN);
4291			else
4292				return (EDESTADDRREQ);
4293		}
4294		if (so_mode & SM_CONNREQUIRED) {
4295			name = NULL;
4296			namelen = 0;
4297		} else {
4298			/*
4299			 * Note that this code does not prevent sti_faddr_sa
4300			 * from changing while it is being used. Thus
4301			 * if an "unconnect"+connect occurs concurrently with
4302			 * this send the datagram might be delivered to a
4303			 * garbaled address.
4304			 */
4305			ASSERT(sti->sti_faddr_sa);
4306			name = sti->sti_faddr_sa;
4307			namelen = (t_uscalar_t)sti->sti_faddr_len;
4308		}
4309	} else {
4310		if (!(so_state & SS_ISCONNECTED) &&
4311		    (so_mode & SM_CONNREQUIRED)) {
4312			/* Required but not connected */
4313			mutex_exit(&so->so_lock);
4314			return (ENOTCONN);
4315		}
4316		/*
4317		 * Ignore the address on connection-oriented sockets.
4318		 * Just like BSD this code does not generate an error for
4319		 * TCP (a CONNREQUIRED socket) when sending to an address
4320		 * passed in with sendto/sendmsg. Instead the data is
4321		 * delivered on the connection as if no address had been
4322		 * supplied.
4323		 */
4324		if ((so_state & SS_ISCONNECTED) &&
4325		    !(so_mode & SM_CONNREQUIRED)) {
4326			mutex_exit(&so->so_lock);
4327			return (EISCONN);
4328		}
4329		if (!(so_state & SS_ISBOUND)) {
4330			so_lock_single(so);	/* Set SOLOCKED */
4331			error = sotpi_bind(so, NULL, 0,
4332			    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD, cr);
4333			so_unlock_single(so, SOLOCKED);
4334			if (error) {
4335				mutex_exit(&so->so_lock);
4336				eprintsoline(so, error);
4337				return (error);
4338			}
4339		}
4340		/*
4341		 * Handle delayed datagram errors. These are only queued
4342		 * when the application sets SO_DGRAM_ERRIND.
4343		 * Return the error if we are sending to the address
4344		 * that was returned in the last T_UDERROR_IND.
4345		 * If sending to some other address discard the delayed
4346		 * error indication.
4347		 */
4348		if (sti->sti_delayed_error) {
4349			struct T_uderror_ind	*tudi;
4350			void			*addr;
4351			t_uscalar_t		addrlen;
4352			boolean_t		match = B_FALSE;
4353
4354			ASSERT(sti->sti_eaddr_mp);
4355			error = sti->sti_delayed_error;
4356			sti->sti_delayed_error = 0;
4357			tudi =
4358			    (struct T_uderror_ind *)sti->sti_eaddr_mp->b_rptr;
4359			addrlen = tudi->DEST_length;
4360			addr = sogetoff(sti->sti_eaddr_mp,
4361			    tudi->DEST_offset, addrlen, 1);
4362			ASSERT(addr);	/* Checked by strsock_proto */
4363			switch (so->so_family) {
4364			case AF_INET: {
4365				/* Compare just IP address and port */
4366				sin_t *sin1 = (sin_t *)name;
4367				sin_t *sin2 = (sin_t *)addr;
4368
4369				if (addrlen == sizeof (sin_t) &&
4370				    namelen == addrlen &&
4371				    sin1->sin_port == sin2->sin_port &&
4372				    sin1->sin_addr.s_addr ==
4373				    sin2->sin_addr.s_addr)
4374					match = B_TRUE;
4375				break;
4376			}
4377			case AF_INET6: {
4378				/* Compare just IP address and port. Not flow */
4379				sin6_t *sin1 = (sin6_t *)name;
4380				sin6_t *sin2 = (sin6_t *)addr;
4381
4382				if (addrlen == sizeof (sin6_t) &&
4383				    namelen == addrlen &&
4384				    sin1->sin6_port == sin2->sin6_port &&
4385				    IN6_ARE_ADDR_EQUAL(&sin1->sin6_addr,
4386				    &sin2->sin6_addr))
4387					match = B_TRUE;
4388				break;
4389			}
4390			case AF_UNIX:
4391			default:
4392				if (namelen == addrlen &&
4393				    bcmp(name, addr, namelen) == 0)
4394					match = B_TRUE;
4395			}
4396			if (match) {
4397				freemsg(sti->sti_eaddr_mp);
4398				sti->sti_eaddr_mp = NULL;
4399				mutex_exit(&so->so_lock);
4400#ifdef DEBUG
4401				dprintso(so, 0,
4402				    ("sockfs delayed error %d for %s\n",
4403				    error,
4404				    pr_addr(so->so_family, name, namelen)));
4405#endif /* DEBUG */
4406				return (error);
4407			}
4408			freemsg(sti->sti_eaddr_mp);
4409			sti->sti_eaddr_mp = NULL;
4410		}
4411	}
4412	mutex_exit(&so->so_lock);
4413
4414	flags = msg->msg_flags;
4415	dontroute = 0;
4416	if ((flags & MSG_DONTROUTE) && !(so->so_options & SO_DONTROUTE)) {
4417		uint32_t	val;
4418
4419		val = 1;
4420		error = sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
4421		    &val, (t_uscalar_t)sizeof (val), cr);
4422		if (error)
4423			return (error);
4424		dontroute = 1;
4425	}
4426
4427	if ((flags & MSG_OOB) && !(so_mode & SM_EXDATA)) {
4428		error = EOPNOTSUPP;
4429		goto done;
4430	}
4431	if (msg->msg_controllen != 0) {
4432		if (!(so_mode & SM_CONNREQUIRED)) {
4433			so_update_attrs(so, SOMOD);
4434			error = sosend_dgramcmsg(so, name, namelen, uiop,
4435			    msg->msg_control, msg->msg_controllen, flags);
4436		} else {
4437			if (flags & MSG_OOB) {
4438				/* Can't generate T_EXDATA_REQ with options */
4439				error = EOPNOTSUPP;
4440				goto done;
4441			}
4442			so_update_attrs(so, SOMOD);
4443			error = sosend_svccmsg(so, uiop,
4444			    !(flags & MSG_EOR),
4445			    msg->msg_control, msg->msg_controllen,
4446			    flags);
4447		}
4448		goto done;
4449	}
4450
4451	so_update_attrs(so, SOMOD);
4452	if (!(so_mode & SM_CONNREQUIRED)) {
4453		/*
4454		 * If there is no SO_DONTROUTE to turn off return immediately
4455		 * from send_dgram. This can allow tail-call optimizations.
4456		 */
4457		if (!dontroute) {
4458			return (sosend_dgram(so, name, namelen, uiop, flags));
4459		}
4460		error = sosend_dgram(so, name, namelen, uiop, flags);
4461	} else {
4462		t_scalar_t prim;
4463		int sflag;
4464
4465		/* Ignore msg_name in the connected state */
4466		if (flags & MSG_OOB) {
4467			prim = T_EXDATA_REQ;
4468			/*
4469			 * Send down T_EXDATA_REQ even if there is flow
4470			 * control for data.
4471			 */
4472			sflag = MSG_IGNFLOW;
4473		} else {
4474			if (so_mode & SM_BYTESTREAM) {
4475				/* Byte stream transport - use write */
4476				dprintso(so, 1, ("sotpi_sendmsg: write\n"));
4477
4478				/* Send M_DATA messages */
4479				if ((sti->sti_nl7c_flags & NL7C_ENABLED) &&
4480				    (error = nl7c_data(so, uiop)) >= 0) {
4481					/* NL7C consumed the data */
4482					return (error);
4483				}
4484				/*
4485				 * If there is no SO_DONTROUTE to turn off,
4486				 * sti_direct is on, and there is no flow
4487				 * control, we can take the fast path.
4488				 */
4489				if (!dontroute && sti->sti_direct != 0 &&
4490				    canputnext(SOTOV(so)->v_stream->sd_wrq)) {
4491					return (sostream_direct(so, uiop,
4492					    NULL, cr));
4493				}
4494				error = strwrite(SOTOV(so), uiop, cr);
4495				goto done;
4496			}
4497			prim = T_DATA_REQ;
4498			sflag = 0;
4499		}
4500		/*
4501		 * If there is no SO_DONTROUTE to turn off return immediately
4502		 * from sosend_svc. This can allow tail-call optimizations.
4503		 */
4504		if (!dontroute)
4505			return (sosend_svc(so, uiop, prim,
4506			    !(flags & MSG_EOR), sflag));
4507		error = sosend_svc(so, uiop, prim,
4508		    !(flags & MSG_EOR), sflag);
4509	}
4510	ASSERT(dontroute);
4511done:
4512	if (dontroute) {
4513		uint32_t	val;
4514
4515		val = 0;
4516		(void) sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
4517		    &val, (t_uscalar_t)sizeof (val), cr);
4518	}
4519	return (error);
4520}
4521
4522/*
4523 * kstrwritemp() has very similar semantics as that of strwrite().
4524 * The main difference is it obtains mblks from the caller and also
4525 * does not do any copy as done in strwrite() from user buffers to
4526 * kernel buffers.
4527 *
4528 * Currently, this routine is used by sendfile to send data allocated
4529 * within the kernel without any copying. This interface does not use the
4530 * synchronous stream interface as synch. stream interface implies
4531 * copying.
4532 */
4533int
4534kstrwritemp(struct vnode *vp, mblk_t *mp, ushort_t fmode)
4535{
4536	struct stdata *stp;
4537	struct queue *wqp;
4538	mblk_t *newmp;
4539	char waitflag;
4540	int tempmode;
4541	int error = 0;
4542	int done = 0;
4543	struct sonode *so;
4544	boolean_t direct;
4545
4546	ASSERT(vp->v_stream);
4547	stp = vp->v_stream;
4548
4549	so = VTOSO(vp);
4550	direct = _SOTOTPI(so)->sti_direct;
4551
4552	/*
4553	 * This is the sockfs direct fast path. canputnext() need
4554	 * not be accurate so we don't grab the sd_lock here. If
4555	 * we get flow-controlled, we grab sd_lock just before the
4556	 * do..while loop below to emulate what strwrite() does.
4557	 */
4558	wqp = stp->sd_wrq;
4559	if (canputnext(wqp) && direct &&
4560	    !(stp->sd_flag & (STWRERR|STRHUP|STPLEX))) {
4561		return (sostream_direct(so, NULL, mp, CRED()));
4562	} else if (stp->sd_flag & (STWRERR|STRHUP|STPLEX)) {
4563		/* Fast check of flags before acquiring the lock */
4564		mutex_enter(&stp->sd_lock);
4565		error = strgeterr(stp, STWRERR|STRHUP|STPLEX, 0);
4566		mutex_exit(&stp->sd_lock);
4567		if (error != 0) {
4568			if (!(stp->sd_flag & STPLEX) &&
4569			    (stp->sd_wput_opt & SW_SIGPIPE)) {
4570				error = EPIPE;
4571			}
4572			return (error);
4573		}
4574	}
4575
4576	waitflag = WRITEWAIT;
4577	if (stp->sd_flag & OLDNDELAY)
4578		tempmode = fmode & ~FNDELAY;
4579	else
4580		tempmode = fmode;
4581
4582	mutex_enter(&stp->sd_lock);
4583	do {
4584		if (canputnext(wqp)) {
4585			mutex_exit(&stp->sd_lock);
4586			if (stp->sd_wputdatafunc != NULL) {
4587				newmp = (stp->sd_wputdatafunc)(vp, mp, NULL,
4588				    NULL, NULL, NULL);
4589				if (newmp == NULL) {
4590					/* The caller will free mp */
4591					return (ECOMM);
4592				}
4593				mp = newmp;
4594			}
4595			putnext(wqp, mp);
4596			return (0);
4597		}
4598		error = strwaitq(stp, waitflag, (ssize_t)0, tempmode, -1,
4599		    &done);
4600	} while (error == 0 && !done);
4601
4602	mutex_exit(&stp->sd_lock);
4603	/*
4604	 * EAGAIN tells the application to try again. ENOMEM
4605	 * is returned only if the memory allocation size
4606	 * exceeds the physical limits of the system. ENOMEM
4607	 * can't be true here.
4608	 */
4609	if (error == ENOMEM)
4610		error = EAGAIN;
4611	return (error);
4612}
4613
4614/* ARGSUSED */
4615static int
4616sotpi_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag,
4617    struct cred *cr, mblk_t **mpp)
4618{
4619	int error;
4620
4621	if (so->so_family != AF_INET && so->so_family != AF_INET6)
4622		return (EAFNOSUPPORT);
4623
4624	if (so->so_state & SS_CANTSENDMORE)
4625		return (EPIPE);
4626
4627	if (so->so_type != SOCK_STREAM)
4628		return (EOPNOTSUPP);
4629
4630	if ((so->so_state & SS_ISCONNECTED) == 0)
4631		return (ENOTCONN);
4632
4633	error = kstrwritemp(so->so_vnode, *mpp, fflag);
4634	if (error == 0)
4635		*mpp = NULL;
4636	return (error);
4637}
4638
4639/*
4640 * Sending data on a datagram socket.
4641 * Assumes caller has verified that SS_ISBOUND etc. are set.
4642 */
4643/* ARGSUSED */
4644static int
4645sodgram_direct(struct sonode *so, struct sockaddr *name,
4646    socklen_t namelen, struct uio *uiop, int flags)
4647{
4648	struct T_unitdata_req	tudr;
4649	mblk_t			*mp = NULL;
4650	int			error = 0;
4651	void			*addr;
4652	socklen_t		addrlen;
4653	ssize_t			len;
4654	struct stdata		*stp = SOTOV(so)->v_stream;
4655	int			so_state;
4656	queue_t			*udp_wq;
4657	boolean_t		connected;
4658	mblk_t			*mpdata = NULL;
4659	sotpi_info_t		*sti = SOTOTPI(so);
4660
4661	ASSERT(name != NULL && namelen != 0);
4662	ASSERT(!(so->so_mode & SM_CONNREQUIRED));
4663	ASSERT(!(so->so_mode & SM_EXDATA));
4664	ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
4665	ASSERT(SOTOV(so)->v_type == VSOCK);
4666
4667	/* Caller checked for proper length */
4668	len = uiop->uio_resid;
4669	ASSERT(len <= sti->sti_tidu_size);
4670
4671	/* Length and family checks have been done by caller */
4672	ASSERT(name->sa_family == so->so_family);
4673	ASSERT(so->so_family == AF_INET ||
4674	    (namelen == (socklen_t)sizeof (struct sockaddr_in6)));
4675	ASSERT(so->so_family == AF_INET6 ||
4676	    (namelen == (socklen_t)sizeof (struct sockaddr_in)));
4677
4678	addr = name;
4679	addrlen = namelen;
4680
4681	if (stp->sd_sidp != NULL &&
4682	    (error = straccess(stp, JCWRITE)) != 0)
4683		goto done;
4684
4685	so_state = so->so_state;
4686
4687	connected = so_state & SS_ISCONNECTED;
4688	if (!connected) {
4689		tudr.PRIM_type = T_UNITDATA_REQ;
4690		tudr.DEST_length = addrlen;
4691		tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
4692		tudr.OPT_length = 0;
4693		tudr.OPT_offset = 0;
4694
4695		mp = soallocproto2(&tudr, sizeof (tudr), addr, addrlen, 0,
4696		    _ALLOC_INTR, CRED());
4697		if (mp == NULL) {
4698			/*
4699			 * Caught a signal waiting for memory.
4700			 * Let send* return EINTR.
4701			 */
4702			error = EINTR;
4703			goto done;
4704		}
4705	}
4706
4707	/*
4708	 * For UDP we don't break up the copyin into smaller pieces
4709	 * as in the TCP case.  That means if ENOMEM is returned by
4710	 * mcopyinuio() then the uio vector has not been modified at
4711	 * all and we fallback to either strwrite() or kstrputmsg()
4712	 * below.  Note also that we never generate priority messages
4713	 * from here.
4714	 */
4715	udp_wq = stp->sd_wrq->q_next;
4716	if (canput(udp_wq) &&
4717	    (mpdata = mcopyinuio(stp, uiop, -1, -1, &error)) != NULL) {
4718		ASSERT(DB_TYPE(mpdata) == M_DATA);
4719		ASSERT(uiop->uio_resid == 0);
4720		if (!connected)
4721			linkb(mp, mpdata);
4722		else
4723			mp = mpdata;
4724		if (audit_active)
4725			audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4726
4727		udp_wput(udp_wq, mp);
4728		return (0);
4729	}
4730
4731	ASSERT(mpdata == NULL);
4732	if (error != 0 && error != ENOMEM) {
4733		freemsg(mp);
4734		return (error);
4735	}
4736
4737	/*
4738	 * For connected, let strwrite() handle the blocking case.
4739	 * Otherwise we fall thru and use kstrputmsg().
4740	 */
4741	if (connected)
4742		return (strwrite(SOTOV(so), uiop, CRED()));
4743
4744	if (audit_active)
4745		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4746
4747	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
4748done:
4749#ifdef SOCK_DEBUG
4750	if (error != 0) {
4751		eprintsoline(so, error);
4752	}
4753#endif /* SOCK_DEBUG */
4754	return (error);
4755}
4756
4757int
4758sostream_direct(struct sonode *so, struct uio *uiop, mblk_t *mp, cred_t *cr)
4759{
4760	struct stdata *stp = SOTOV(so)->v_stream;
4761	ssize_t iosize, rmax, maxblk;
4762	queue_t *tcp_wq = stp->sd_wrq->q_next;
4763	mblk_t *newmp;
4764	int error = 0, wflag = 0;
4765
4766	ASSERT(so->so_mode & SM_BYTESTREAM);
4767	ASSERT(SOTOV(so)->v_type == VSOCK);
4768
4769	if (stp->sd_sidp != NULL &&
4770	    (error = straccess(stp, JCWRITE)) != 0)
4771		return (error);
4772
4773	if (uiop == NULL) {
4774		/*
4775		 * kstrwritemp() should have checked sd_flag and
4776		 * flow-control before coming here.  If we end up
4777		 * here it means that we can simply pass down the
4778		 * data to tcp.
4779		 */
4780		ASSERT(mp != NULL);
4781		if (stp->sd_wputdatafunc != NULL) {
4782			newmp = (stp->sd_wputdatafunc)(SOTOV(so), mp, NULL,
4783			    NULL, NULL, NULL);
4784			if (newmp == NULL) {
4785				/* The caller will free mp */
4786				return (ECOMM);
4787			}
4788			mp = newmp;
4789		}
4790		tcp_wput(tcp_wq, mp);
4791		return (0);
4792	}
4793
4794	/* Fallback to strwrite() to do proper error handling */
4795	if (stp->sd_flag & (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))
4796		return (strwrite(SOTOV(so), uiop, cr));
4797
4798	rmax = stp->sd_qn_maxpsz;
4799	ASSERT(rmax >= 0 || rmax == INFPSZ);
4800	if (rmax == 0 || uiop->uio_resid <= 0)
4801		return (0);
4802
4803	if (rmax == INFPSZ)
4804		rmax = uiop->uio_resid;
4805
4806	maxblk = stp->sd_maxblk;
4807
4808	for (;;) {
4809		iosize = MIN(uiop->uio_resid, rmax);
4810
4811		mp = mcopyinuio(stp, uiop, iosize, maxblk, &error);
4812		if (mp == NULL) {
4813			/*
4814			 * Fallback to strwrite() for ENOMEM; if this
4815			 * is our first time in this routine and the uio
4816			 * vector has not been modified, we will end up
4817			 * calling strwrite() without any flag set.
4818			 */
4819			if (error == ENOMEM)
4820				goto slow_send;
4821			else
4822				return (error);
4823		}
4824		ASSERT(uiop->uio_resid >= 0);
4825		/*
4826		 * If mp is non-NULL and ENOMEM is set, it means that
4827		 * mcopyinuio() was able to break down some of the user
4828		 * data into one or more mblks.  Send the partial data
4829		 * to tcp and let the rest be handled in strwrite().
4830		 */
4831		ASSERT(error == 0 || error == ENOMEM);
4832		if (stp->sd_wputdatafunc != NULL) {
4833			newmp = (stp->sd_wputdatafunc)(SOTOV(so), mp, NULL,
4834			    NULL, NULL, NULL);
4835			if (newmp == NULL) {
4836				/* The caller will free mp */
4837				return (ECOMM);
4838			}
4839			mp = newmp;
4840		}
4841		tcp_wput(tcp_wq, mp);
4842
4843		wflag |= NOINTR;
4844
4845		if (uiop->uio_resid == 0) {	/* No more data; we're done */
4846			ASSERT(error == 0);
4847			break;
4848		} else if (error == ENOMEM || !canput(tcp_wq) || (stp->sd_flag &
4849		    (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))) {
4850slow_send:
4851			/*
4852			 * We were able to send down partial data using
4853			 * the direct call interface, but are now relying
4854			 * on strwrite() to handle the non-fastpath cases.
4855			 * If the socket is blocking we will sleep in
4856			 * strwaitq() until write is permitted, otherwise,
4857			 * we will need to return the amount of bytes
4858			 * written so far back to the app.  This is the
4859			 * reason why we pass NOINTR flag to strwrite()
4860			 * for non-blocking socket, because we don't want
4861			 * to return EAGAIN when portion of the user data
4862			 * has actually been sent down.
4863			 */
4864			return (strwrite_common(SOTOV(so), uiop, cr, wflag));
4865		}
4866	}
4867	return (0);
4868}
4869
4870/*
4871 * Update sti_faddr by asking the transport (unless AF_UNIX).
4872 */
4873/* ARGSUSED */
4874int
4875sotpi_getpeername(struct sonode *so, struct sockaddr *name, socklen_t *namelen,
4876    boolean_t accept, struct cred *cr)
4877{
4878	struct strbuf	strbuf;
4879	int		error = 0, res;
4880	void		*addr;
4881	t_uscalar_t	addrlen;
4882	k_sigset_t	smask;
4883	sotpi_info_t	*sti = SOTOTPI(so);
4884
4885	dprintso(so, 1, ("sotpi_getpeername(%p) %s\n",
4886	    (void *)so, pr_state(so->so_state, so->so_mode)));
4887
4888	ASSERT(*namelen > 0);
4889	mutex_enter(&so->so_lock);
4890	so_lock_single(so);	/* Set SOLOCKED */
4891
4892	if (accept) {
4893		bcopy(sti->sti_faddr_sa, name,
4894		    MIN(*namelen, sti->sti_faddr_len));
4895		*namelen = sti->sti_faddr_noxlate ? 0: sti->sti_faddr_len;
4896		goto done;
4897	}
4898
4899	if (!(so->so_state & SS_ISCONNECTED)) {
4900		error = ENOTCONN;
4901		goto done;
4902	}
4903	/* Added this check for X/Open */
4904	if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
4905		error = EINVAL;
4906		if (xnet_check_print) {
4907			printf("sockfs: X/Open getpeername check => EINVAL\n");
4908		}
4909		goto done;
4910	}
4911
4912	if (sti->sti_faddr_valid) {
4913		bcopy(sti->sti_faddr_sa, name,
4914		    MIN(*namelen, sti->sti_faddr_len));
4915		*namelen = sti->sti_faddr_noxlate ? 0: sti->sti_faddr_len;
4916		goto done;
4917	}
4918
4919#ifdef DEBUG
4920	dprintso(so, 1, ("sotpi_getpeername (local): %s\n",
4921	    pr_addr(so->so_family, sti->sti_faddr_sa,
4922	    (t_uscalar_t)sti->sti_faddr_len)));
4923#endif /* DEBUG */
4924
4925	if (so->so_family == AF_UNIX) {
4926		/* Transport has different name space - return local info */
4927		if (sti->sti_faddr_noxlate)
4928			*namelen = 0;
4929		error = 0;
4930		goto done;
4931	}
4932
4933	ASSERT(so->so_family != AF_UNIX && sti->sti_faddr_noxlate == 0);
4934
4935	ASSERT(sti->sti_faddr_sa);
4936	/* Allocate local buffer to use with ioctl */
4937	addrlen = (t_uscalar_t)sti->sti_faddr_maxlen;
4938	mutex_exit(&so->so_lock);
4939	addr = kmem_alloc(addrlen, KM_SLEEP);
4940
4941	/*
4942	 * Issue TI_GETPEERNAME with signals masked.
4943	 * Put the result in sti_faddr_sa so that getpeername works after
4944	 * a shutdown(output).
4945	 * If the ioctl fails (e.g. due to a ECONNRESET) the error is reposted
4946	 * back to the socket.
4947	 */
4948	strbuf.buf = addr;
4949	strbuf.maxlen = addrlen;
4950	strbuf.len = 0;
4951
4952	sigintr(&smask, 0);
4953	res = 0;
4954	ASSERT(cr);
4955	error = strioctl(SOTOV(so), TI_GETPEERNAME, (intptr_t)&strbuf,
4956	    0, K_TO_K, cr, &res);
4957	sigunintr(&smask);
4958
4959	mutex_enter(&so->so_lock);
4960	/*
4961	 * If there is an error record the error in so_error put don't fail
4962	 * the getpeername. Instead fallback on the recorded
4963	 * sti->sti_faddr_sa.
4964	 */
4965	if (error) {
4966		/*
4967		 * Various stream head errors can be returned to the ioctl.
4968		 * However, it is impossible to determine which ones of
4969		 * these are really socket level errors that were incorrectly
4970		 * consumed by the ioctl. Thus this code silently ignores the
4971		 * error - to code explicitly does not reinstate the error
4972		 * using soseterror().
4973		 * Experiments have shows that at least this set of
4974		 * errors are reported and should not be reinstated on the
4975		 * socket:
4976		 *	EINVAL	E.g. if an I_LINK was in effect when
4977		 *		getpeername was called.
4978		 *	EPIPE	The ioctl error semantics prefer the write
4979		 *		side error over the read side error.
4980		 *	ENOTCONN The transport just got disconnected but
4981		 *		sockfs had not yet seen the T_DISCON_IND
4982		 *		when issuing the ioctl.
4983		 */
4984		error = 0;
4985	} else if (res == 0 && strbuf.len > 0 &&
4986	    (so->so_state & SS_ISCONNECTED)) {
4987		ASSERT(strbuf.len <= (int)sti->sti_faddr_maxlen);
4988		sti->sti_faddr_len = (socklen_t)strbuf.len;
4989		bcopy(addr, sti->sti_faddr_sa, sti->sti_faddr_len);
4990		sti->sti_faddr_valid = 1;
4991
4992		bcopy(addr, name, MIN(*namelen, sti->sti_faddr_len));
4993		*namelen = sti->sti_faddr_len;
4994	}
4995	kmem_free(addr, addrlen);
4996#ifdef DEBUG
4997	dprintso(so, 1, ("sotpi_getpeername (tp): %s\n",
4998	    pr_addr(so->so_family, sti->sti_faddr_sa,
4999	    (t_uscalar_t)sti->sti_faddr_len)));
5000#endif /* DEBUG */
5001done:
5002	so_unlock_single(so, SOLOCKED);
5003	mutex_exit(&so->so_lock);
5004	return (error);
5005}
5006
5007/*
5008 * Update sti_laddr by asking the transport (unless AF_UNIX).
5009 */
5010int
5011sotpi_getsockname(struct sonode *so, struct sockaddr *name, socklen_t *namelen,
5012    struct cred *cr)
5013{
5014	struct strbuf	strbuf;
5015	int		error = 0, res;
5016	void		*addr;
5017	t_uscalar_t	addrlen;
5018	k_sigset_t	smask;
5019	sotpi_info_t	*sti = SOTOTPI(so);
5020
5021	dprintso(so, 1, ("sotpi_getsockname(%p) %s\n",
5022	    (void *)so, pr_state(so->so_state, so->so_mode)));
5023
5024	ASSERT(*namelen > 0);
5025	mutex_enter(&so->so_lock);
5026	so_lock_single(so);	/* Set SOLOCKED */
5027
5028#ifdef DEBUG
5029
5030	dprintso(so, 1, ("sotpi_getsockname (local): %s\n",
5031	    pr_addr(so->so_family, sti->sti_laddr_sa,
5032	    (t_uscalar_t)sti->sti_laddr_len)));
5033#endif /* DEBUG */
5034	if (sti->sti_laddr_valid) {
5035		bcopy(sti->sti_laddr_sa, name,
5036		    MIN(*namelen, sti->sti_laddr_len));
5037		*namelen = sti->sti_laddr_len;
5038		goto done;
5039	}
5040
5041	if (so->so_family == AF_UNIX) {
5042		/* Transport has different name space - return local info */
5043		error = 0;
5044		*namelen = 0;
5045		goto done;
5046	}
5047	if (!(so->so_state & SS_ISBOUND)) {
5048		/* If not bound, then nothing to return. */
5049		error = 0;
5050		goto done;
5051	}
5052
5053	/* Allocate local buffer to use with ioctl */
5054	addrlen = (t_uscalar_t)sti->sti_laddr_maxlen;
5055	mutex_exit(&so->so_lock);
5056	addr = kmem_alloc(addrlen, KM_SLEEP);
5057
5058	/*
5059	 * Issue TI_GETMYNAME with signals masked.
5060	 * Put the result in sti_laddr_sa so that getsockname works after
5061	 * a shutdown(output).
5062	 * If the ioctl fails (e.g. due to a ECONNRESET) the error is reposted
5063	 * back to the socket.
5064	 */
5065	strbuf.buf = addr;
5066	strbuf.maxlen = addrlen;
5067	strbuf.len = 0;
5068
5069	sigintr(&smask, 0);
5070	res = 0;
5071	ASSERT(cr);
5072	error = strioctl(SOTOV(so), TI_GETMYNAME, (intptr_t)&strbuf,
5073	    0, K_TO_K, cr, &res);
5074	sigunintr(&smask);
5075
5076	mutex_enter(&so->so_lock);
5077	/*
5078	 * If there is an error record the error in so_error put don't fail
5079	 * the getsockname. Instead fallback on the recorded
5080	 * sti->sti_laddr_sa.
5081	 */
5082	if (error) {
5083		/*
5084		 * Various stream head errors can be returned to the ioctl.
5085		 * However, it is impossible to determine which ones of
5086		 * these are really socket level errors that were incorrectly
5087		 * consumed by the ioctl. Thus this code silently ignores the
5088		 * error - to code explicitly does not reinstate the error
5089		 * using soseterror().
5090		 * Experiments have shows that at least this set of
5091		 * errors are reported and should not be reinstated on the
5092		 * socket:
5093		 *	EINVAL	E.g. if an I_LINK was in effect when
5094		 *		getsockname was called.
5095		 *	EPIPE	The ioctl error semantics prefer the write
5096		 *		side error over the read side error.
5097		 */
5098		error = 0;
5099	} else if (res == 0 && strbuf.len > 0 &&
5100	    (so->so_state & SS_ISBOUND)) {
5101		ASSERT(strbuf.len <= (int)sti->sti_laddr_maxlen);
5102		sti->sti_laddr_len = (socklen_t)strbuf.len;
5103		bcopy(addr, sti->sti_laddr_sa, sti->sti_laddr_len);
5104		sti->sti_laddr_valid = 1;
5105
5106		bcopy(addr, name, MIN(sti->sti_laddr_len, *namelen));
5107		*namelen = sti->sti_laddr_len;
5108	}
5109	kmem_free(addr, addrlen);
5110#ifdef DEBUG
5111	dprintso(so, 1, ("sotpi_getsockname (tp): %s\n",
5112	    pr_addr(so->so_family, sti->sti_laddr_sa,
5113	    (t_uscalar_t)sti->sti_laddr_len)));
5114#endif /* DEBUG */
5115done:
5116	so_unlock_single(so, SOLOCKED);
5117	mutex_exit(&so->so_lock);
5118	return (error);
5119}
5120
5121/*
5122 * Get socket options. For SOL_SOCKET options some options are handled
5123 * by the sockfs while others use the value recorded in the sonode as a
5124 * fallback should the T_SVR4_OPTMGMT_REQ fail.
5125 *
5126 * On the return most *optlenp bytes are copied to optval.
5127 */
5128/* ARGSUSED */
5129int
5130sotpi_getsockopt(struct sonode *so, int level, int option_name,
5131		void *optval, socklen_t *optlenp, int flags, struct cred *cr)
5132{
5133	struct T_optmgmt_req	optmgmt_req;
5134	struct T_optmgmt_ack	*optmgmt_ack;
5135	struct opthdr		oh;
5136	struct opthdr		*opt_res;
5137	mblk_t			*mp = NULL;
5138	int			error = 0;
5139	void			*option = NULL;	/* Set if fallback value */
5140	t_uscalar_t		maxlen = *optlenp;
5141	t_uscalar_t		len;
5142	uint32_t		value;
5143	struct timeval		tmo_val; /* used for SO_RCVTIMEO, SO_SNDTIMEO */
5144	struct timeval32	tmo_val32;
5145	struct so_snd_bufinfo	snd_bufinfo;	/* used for zero copy */
5146
5147	dprintso(so, 1, ("sotpi_getsockopt(%p, 0x%x, 0x%x, %p, %p) %s\n",
5148	    (void *)so, level, option_name, optval, (void *)optlenp,
5149	    pr_state(so->so_state, so->so_mode)));
5150
5151	mutex_enter(&so->so_lock);
5152	so_lock_single(so);	/* Set SOLOCKED */
5153
5154	/*
5155	 * Check for SOL_SOCKET options.
5156	 * Certain SOL_SOCKET options are returned directly whereas
5157	 * others only provide a default (fallback) value should
5158	 * the T_SVR4_OPTMGMT_REQ fail.
5159	 */
5160	if (level == SOL_SOCKET) {
5161		/* Check parameters */
5162		switch (option_name) {
5163		case SO_TYPE:
5164		case SO_ERROR:
5165		case SO_DEBUG:
5166		case SO_ACCEPTCONN:
5167		case SO_REUSEADDR:
5168		case SO_KEEPALIVE:
5169		case SO_DONTROUTE:
5170		case SO_BROADCAST:
5171		case SO_USELOOPBACK:
5172		case SO_OOBINLINE:
5173		case SO_SNDBUF:
5174		case SO_RCVBUF:
5175#ifdef notyet
5176		case SO_SNDLOWAT:
5177		case SO_RCVLOWAT:
5178#endif /* notyet */
5179		case SO_DOMAIN:
5180		case SO_DGRAM_ERRIND:
5181			if (maxlen < (t_uscalar_t)sizeof (int32_t)) {
5182				error = EINVAL;
5183				eprintsoline(so, error);
5184				goto done2;
5185			}
5186			break;
5187		case SO_RCVTIMEO:
5188		case SO_SNDTIMEO:
5189			if (get_udatamodel() == DATAMODEL_NONE ||
5190			    get_udatamodel() == DATAMODEL_NATIVE) {
5191				if (maxlen < sizeof (struct timeval)) {
5192					error = EINVAL;
5193					eprintsoline(so, error);
5194					goto done2;
5195				}
5196			} else {
5197				if (maxlen < sizeof (struct timeval32)) {
5198					error = EINVAL;
5199					eprintsoline(so, error);
5200					goto done2;
5201				}
5202
5203			}
5204			break;
5205		case SO_LINGER:
5206			if (maxlen < (t_uscalar_t)sizeof (struct linger)) {
5207				error = EINVAL;
5208				eprintsoline(so, error);
5209				goto done2;
5210			}
5211			break;
5212		case SO_SND_BUFINFO:
5213			if (maxlen < (t_uscalar_t)
5214			    sizeof (struct so_snd_bufinfo)) {
5215				error = EINVAL;
5216				eprintsoline(so, error);
5217				goto done2;
5218			}
5219			break;
5220		}
5221
5222		len = (t_uscalar_t)sizeof (uint32_t);	/* Default */
5223
5224		switch (option_name) {
5225		case SO_TYPE:
5226			value = so->so_type;
5227			option = &value;
5228			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5229
5230		case SO_ERROR:
5231			value = sogeterr(so, B_TRUE);
5232			option = &value;
5233			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5234
5235		case SO_ACCEPTCONN:
5236			if (so->so_state & SS_ACCEPTCONN)
5237				value = SO_ACCEPTCONN;
5238			else
5239				value = 0;
5240#ifdef DEBUG
5241			if (value) {
5242				dprintso(so, 1,
5243				    ("sotpi_getsockopt: 0x%x is set\n",
5244				    option_name));
5245			} else {
5246				dprintso(so, 1,
5247				    ("sotpi_getsockopt: 0x%x not set\n",
5248				    option_name));
5249			}
5250#endif /* DEBUG */
5251			option = &value;
5252			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5253
5254		case SO_DEBUG:
5255		case SO_REUSEADDR:
5256		case SO_KEEPALIVE:
5257		case SO_DONTROUTE:
5258		case SO_BROADCAST:
5259		case SO_USELOOPBACK:
5260		case SO_OOBINLINE:
5261		case SO_DGRAM_ERRIND:
5262			value = (so->so_options & option_name);
5263#ifdef DEBUG
5264			if (value) {
5265				dprintso(so, 1,
5266				    ("sotpi_getsockopt: 0x%x is set\n",
5267				    option_name));
5268			} else {
5269				dprintso(so, 1,
5270				    ("sotpi_getsockopt: 0x%x not set\n",
5271				    option_name));
5272			}
5273#endif /* DEBUG */
5274			option = &value;
5275			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5276
5277		/*
5278		 * The following options are only returned by sockfs when the
5279		 * T_SVR4_OPTMGMT_REQ fails.
5280		 */
5281		case SO_LINGER:
5282			option = &so->so_linger;
5283			len = (t_uscalar_t)sizeof (struct linger);
5284			break;
5285		case SO_SNDBUF: {
5286			ssize_t lvalue;
5287
5288			/*
5289			 * If the option has not been set then get a default
5290			 * value from the read queue. This value is
5291			 * returned if the transport fails
5292			 * the T_SVR4_OPTMGMT_REQ.
5293			 */
5294			lvalue = so->so_sndbuf;
5295			if (lvalue == 0) {
5296				mutex_exit(&so->so_lock);
5297				(void) strqget(strvp2wq(SOTOV(so))->q_next,
5298				    QHIWAT, 0, &lvalue);
5299				mutex_enter(&so->so_lock);
5300				dprintso(so, 1,
5301				    ("got SO_SNDBUF %ld from q\n", lvalue));
5302			}
5303			value = (int)lvalue;
5304			option = &value;
5305			len = (t_uscalar_t)sizeof (so->so_sndbuf);
5306			break;
5307		}
5308		case SO_RCVBUF: {
5309			ssize_t lvalue;
5310
5311			/*
5312			 * If the option has not been set then get a default
5313			 * value from the read queue. This value is
5314			 * returned if the transport fails
5315			 * the T_SVR4_OPTMGMT_REQ.
5316			 *
5317			 * XXX If SO_RCVBUF has been set and this is an
5318			 * XPG 4.2 application then do not ask the transport
5319			 * since the transport might adjust the value and not
5320			 * return exactly what was set by the application.
5321			 * For non-XPG 4.2 application we return the value
5322			 * that the transport is actually using.
5323			 */
5324			lvalue = so->so_rcvbuf;
5325			if (lvalue == 0) {
5326				mutex_exit(&so->so_lock);
5327				(void) strqget(RD(strvp2wq(SOTOV(so))),
5328				    QHIWAT, 0, &lvalue);
5329				mutex_enter(&so->so_lock);
5330				dprintso(so, 1,
5331				    ("got SO_RCVBUF %ld from q\n", lvalue));
5332			} else if (flags & _SOGETSOCKOPT_XPG4_2) {
5333				value = (int)lvalue;
5334				option = &value;
5335				goto copyout;	/* skip asking transport */
5336			}
5337			value = (int)lvalue;
5338			option = &value;
5339			len = (t_uscalar_t)sizeof (so->so_rcvbuf);
5340			break;
5341		}
5342		case SO_DOMAIN:
5343			value = so->so_family;
5344			option = &value;
5345			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5346
5347#ifdef notyet
5348		/*
5349		 * We do not implement the semantics of these options
5350		 * thus we shouldn't implement the options either.
5351		 */
5352		case SO_SNDLOWAT:
5353			value = so->so_sndlowat;
5354			option = &value;
5355			break;
5356		case SO_RCVLOWAT:
5357			value = so->so_rcvlowat;
5358			option = &value;
5359			break;
5360#endif /* notyet */
5361		case SO_SNDTIMEO:
5362		case SO_RCVTIMEO: {
5363			clock_t val;
5364
5365			if (option_name == SO_RCVTIMEO)
5366				val = drv_hztousec(so->so_rcvtimeo);
5367			else
5368				val = drv_hztousec(so->so_sndtimeo);
5369			tmo_val.tv_sec = val / (1000 * 1000);
5370			tmo_val.tv_usec = val % (1000 * 1000);
5371			if (get_udatamodel() == DATAMODEL_NONE ||
5372			    get_udatamodel() == DATAMODEL_NATIVE) {
5373				option = &tmo_val;
5374				len = sizeof (struct timeval);
5375			} else {
5376				TIMEVAL_TO_TIMEVAL32(&tmo_val32, &tmo_val);
5377				option = &tmo_val32;
5378				len = sizeof (struct timeval32);
5379			}
5380			break;
5381		}
5382		case SO_SND_BUFINFO: {
5383			snd_bufinfo.sbi_wroff =
5384			    (so->so_proto_props).sopp_wroff;
5385			snd_bufinfo.sbi_maxblk =
5386			    (so->so_proto_props).sopp_maxblk;
5387			snd_bufinfo.sbi_maxpsz =
5388			    (so->so_proto_props).sopp_maxpsz;
5389			snd_bufinfo.sbi_tail =
5390			    (so->so_proto_props).sopp_tail;
5391			option = &snd_bufinfo;
5392			len = (t_uscalar_t)sizeof (struct so_snd_bufinfo);
5393			break;
5394		}
5395		}
5396	}
5397
5398	mutex_exit(&so->so_lock);
5399
5400	/* Send request */
5401	optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
5402	optmgmt_req.MGMT_flags = T_CHECK;
5403	optmgmt_req.OPT_length = (t_scalar_t)(sizeof (oh) + maxlen);
5404	optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);
5405
5406	oh.level = level;
5407	oh.name = option_name;
5408	oh.len = maxlen;
5409
5410	mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
5411	    &oh, sizeof (oh), NULL, maxlen, 0, _ALLOC_SLEEP, cr);
5412	/* Let option management work in the presence of data flow control */
5413	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
5414	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
5415	mp = NULL;
5416	mutex_enter(&so->so_lock);
5417	if (error) {
5418		eprintsoline(so, error);
5419		goto done2;
5420	}
5421	error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
5422	    (t_uscalar_t)(sizeof (*optmgmt_ack) + sizeof (*opt_res)), &mp, 0);
5423	if (error) {
5424		if (option != NULL) {
5425			/* We have a fallback value */
5426			error = 0;
5427			goto copyout;
5428		}
5429		eprintsoline(so, error);
5430		goto done2;
5431	}
5432	ASSERT(mp);
5433	optmgmt_ack = (struct T_optmgmt_ack *)mp->b_rptr;
5434	opt_res = (struct opthdr *)sogetoff(mp, optmgmt_ack->OPT_offset,
5435	    optmgmt_ack->OPT_length, __TPI_ALIGN_SIZE);
5436	if (opt_res == NULL) {
5437		if (option != NULL) {
5438			/* We have a fallback value */
5439			error = 0;
5440			goto copyout;
5441		}
5442		error = EPROTO;
5443		eprintsoline(so, error);
5444		goto done;
5445	}
5446	option = &opt_res[1];
5447
5448	/* check to ensure that the option is within bounds */
5449	if (((uintptr_t)option + opt_res->len < (uintptr_t)option) ||
5450	    (uintptr_t)option + opt_res->len > (uintptr_t)mp->b_wptr) {
5451		if (option != NULL) {
5452			/* We have a fallback value */
5453			error = 0;
5454			goto copyout;
5455		}
5456		error = EPROTO;
5457		eprintsoline(so, error);
5458		goto done;
5459	}
5460
5461	len = opt_res->len;
5462
5463copyout: {
5464		t_uscalar_t size = MIN(len, maxlen);
5465		bcopy(option, optval, size);
5466		bcopy(&size, optlenp, sizeof (size));
5467	}
5468done:
5469	freemsg(mp);
5470done2:
5471	so_unlock_single(so, SOLOCKED);
5472	mutex_exit(&so->so_lock);
5473
5474	return (error);
5475}
5476
5477/*
5478 * Set socket options. All options are passed down in a T_SVR4_OPTMGMT_REQ.
5479 * SOL_SOCKET options are also recorded in the sonode. A setsockopt for
5480 * SOL_SOCKET options will not fail just because the T_SVR4_OPTMGMT_REQ fails -
5481 * setsockopt has to work even if the transport does not support the option.
5482 */
5483/* ARGSUSED */
5484int
5485sotpi_setsockopt(struct sonode *so, int level, int option_name,
5486	const void *optval, t_uscalar_t optlen, struct cred *cr)
5487{
5488	struct T_optmgmt_req	optmgmt_req;
5489	struct opthdr		oh;
5490	mblk_t			*mp;
5491	int			error = 0;
5492	boolean_t		handled = B_FALSE;
5493
5494	dprintso(so, 1, ("sotpi_setsockopt(%p, 0x%x, 0x%x, %p, %d) %s\n",
5495	    (void *)so, level, option_name, optval, optlen,
5496	    pr_state(so->so_state, so->so_mode)));
5497
5498	/* X/Open requires this check */
5499	if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
5500		if (xnet_check_print)
5501			printf("sockfs: X/Open setsockopt check => EINVAL\n");
5502		return (EINVAL);
5503	}
5504
5505	mutex_enter(&so->so_lock);
5506	so_lock_single(so);	/* Set SOLOCKED */
5507	mutex_exit(&so->so_lock);
5508
5509	optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
5510	optmgmt_req.MGMT_flags = T_NEGOTIATE;
5511	optmgmt_req.OPT_length = (t_scalar_t)sizeof (oh) + optlen;
5512	optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);
5513
5514	oh.level = level;
5515	oh.name = option_name;
5516	oh.len = optlen;
5517
5518	mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
5519	    &oh, sizeof (oh), optval, optlen, 0, _ALLOC_SLEEP, cr);
5520	/* Let option management work in the presence of data flow control */
5521	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
5522	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
5523	mp = NULL;
5524	mutex_enter(&so->so_lock);
5525	if (error) {
5526		eprintsoline(so, error);
5527		goto done2;
5528	}
5529	error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
5530	    (t_uscalar_t)sizeof (struct T_optmgmt_ack), &mp, 0);
5531	if (error) {
5532		eprintsoline(so, error);
5533		goto done;
5534	}
5535	ASSERT(mp);
5536	/* No need to verify T_optmgmt_ack */
5537	freemsg(mp);
5538done:
5539	/*
5540	 * Check for SOL_SOCKET options and record their values.
5541	 * If we know about a SOL_SOCKET parameter and the transport
5542	 * failed it with TBADOPT or TOUTSTATE (i.e. ENOPROTOOPT or
5543	 * EPROTO) we let the setsockopt succeed.
5544	 */
5545	if (level == SOL_SOCKET) {
5546		/* Check parameters */
5547		switch (option_name) {
5548		case SO_DEBUG:
5549		case SO_REUSEADDR:
5550		case SO_KEEPALIVE:
5551		case SO_DONTROUTE:
5552		case SO_BROADCAST:
5553		case SO_USELOOPBACK:
5554		case SO_OOBINLINE:
5555		case SO_SNDBUF:
5556		case SO_RCVBUF:
5557#ifdef notyet
5558		case SO_SNDLOWAT:
5559		case SO_RCVLOWAT:
5560#endif /* notyet */
5561		case SO_DGRAM_ERRIND:
5562			if (optlen != (t_uscalar_t)sizeof (int32_t)) {
5563				error = EINVAL;
5564				eprintsoline(so, error);
5565				goto done2;
5566			}
5567			ASSERT(optval);
5568			handled = B_TRUE;
5569			break;
5570		case SO_SNDTIMEO:
5571		case SO_RCVTIMEO:
5572			if (get_udatamodel() == DATAMODEL_NONE ||
5573			    get_udatamodel() == DATAMODEL_NATIVE) {
5574				if (optlen != sizeof (struct timeval)) {
5575					error = EINVAL;
5576					eprintsoline(so, error);
5577					goto done2;
5578				}
5579			} else {
5580				if (optlen != sizeof (struct timeval32)) {
5581					error = EINVAL;
5582					eprintsoline(so, error);
5583					goto done2;
5584				}
5585			}
5586			ASSERT(optval);
5587			handled = B_TRUE;
5588			break;
5589		case SO_LINGER:
5590			if (optlen != (t_uscalar_t)sizeof (struct linger)) {
5591				error = EINVAL;
5592				eprintsoline(so, error);
5593				goto done2;
5594			}
5595			ASSERT(optval);
5596			handled = B_TRUE;
5597			break;
5598		}
5599
5600#define	intvalue	(*(int32_t *)optval)
5601
5602		switch (option_name) {
5603		case SO_TYPE:
5604		case SO_ERROR:
5605		case SO_ACCEPTCONN:
5606			/* Can't be set */
5607			error = ENOPROTOOPT;
5608			goto done2;
5609		case SO_LINGER: {
5610			struct linger *l = (struct linger *)optval;
5611
5612			so->so_linger.l_linger = l->l_linger;
5613			if (l->l_onoff) {
5614				so->so_linger.l_onoff = SO_LINGER;
5615				so->so_options |= SO_LINGER;
5616			} else {
5617				so->so_linger.l_onoff = 0;
5618				so->so_options &= ~SO_LINGER;
5619			}
5620			break;
5621		}
5622
5623		case SO_DEBUG:
5624#ifdef SOCK_TEST
5625			if (intvalue & 2)
5626				sock_test_timelimit = 10 * hz;
5627			else
5628				sock_test_timelimit = 0;
5629
5630			if (intvalue & 4)
5631				do_useracc = 0;
5632			else
5633				do_useracc = 1;
5634#endif /* SOCK_TEST */
5635			/* FALLTHRU */
5636		case SO_REUSEADDR:
5637		case SO_KEEPALIVE:
5638		case SO_DONTROUTE:
5639		case SO_BROADCAST:
5640		case SO_USELOOPBACK:
5641		case SO_OOBINLINE:
5642		case SO_DGRAM_ERRIND:
5643			if (intvalue != 0) {
5644				dprintso(so, 1,
5645				    ("socket_setsockopt: setting 0x%x\n",
5646				    option_name));
5647				so->so_options |= option_name;
5648			} else {
5649				dprintso(so, 1,
5650				    ("socket_setsockopt: clearing 0x%x\n",
5651				    option_name));
5652				so->so_options &= ~option_name;
5653			}
5654			break;
5655		/*
5656		 * The following options are only returned by us when the
5657		 * transport layer fails.
5658		 * XXX XPG 4.2 applications retrieve SO_RCVBUF from sockfs
5659		 * since the transport might adjust the value and not
5660		 * return exactly what was set by the application.
5661		 */
5662		case SO_SNDBUF:
5663			so->so_sndbuf = intvalue;
5664			break;
5665		case SO_RCVBUF:
5666			so->so_rcvbuf = intvalue;
5667			break;
5668		case SO_RCVPSH:
5669			so->so_rcv_timer_interval = intvalue;
5670			break;
5671#ifdef notyet
5672		/*
5673		 * We do not implement the semantics of these options
5674		 * thus we shouldn't implement the options either.
5675		 */
5676		case SO_SNDLOWAT:
5677			so->so_sndlowat = intvalue;
5678			break;
5679		case SO_RCVLOWAT:
5680			so->so_rcvlowat = intvalue;
5681			break;
5682#endif /* notyet */
5683		case SO_SNDTIMEO:
5684		case SO_RCVTIMEO: {
5685			struct timeval tl;
5686			clock_t val;
5687
5688			if (get_udatamodel() == DATAMODEL_NONE ||
5689			    get_udatamodel() == DATAMODEL_NATIVE)
5690				bcopy(&tl, (struct timeval *)optval,
5691				    sizeof (struct timeval));
5692			else
5693				TIMEVAL32_TO_TIMEVAL(&tl,
5694				    (struct timeval32 *)optval);
5695			val = tl.tv_sec * 1000 * 1000 + tl.tv_usec;
5696			if (option_name == SO_RCVTIMEO)
5697				so->so_rcvtimeo = drv_usectohz(val);
5698			else
5699				so->so_sndtimeo = drv_usectohz(val);
5700			break;
5701		}
5702		}
5703#undef	intvalue
5704
5705		if (error) {
5706			if ((error == ENOPROTOOPT || error == EPROTO ||
5707			    error == EINVAL) && handled) {
5708				dprintso(so, 1,
5709				    ("setsockopt: ignoring error %d for 0x%x\n",
5710				    error, option_name));
5711				error = 0;
5712			}
5713		}
5714	}
5715done2:
5716	so_unlock_single(so, SOLOCKED);
5717	mutex_exit(&so->so_lock);
5718	return (error);
5719}
5720
5721/*
5722 * sotpi_close() is called when the last open reference goes away.
5723 */
5724/* ARGSUSED */
5725int
5726sotpi_close(struct sonode *so, int flag, struct cred *cr)
5727{
5728	struct vnode *vp = SOTOV(so);
5729	dev_t dev;
5730	int error = 0;
5731	sotpi_info_t *sti = SOTOTPI(so);
5732
5733	dprintso(so, 1, ("sotpi_close(%p, %x) %s\n",
5734	    (void *)vp, flag, pr_state(so->so_state, so->so_mode)));
5735
5736	dev = sti->sti_dev;
5737
5738	ASSERT(STREAMSTAB(getmajor(dev)));
5739
5740	mutex_enter(&so->so_lock);
5741	so_lock_single(so);	/* Set SOLOCKED */
5742
5743	ASSERT(so_verify_oobstate(so));
5744
5745	if (sti->sti_nl7c_flags & NL7C_ENABLED) {
5746		sti->sti_nl7c_flags = 0;
5747		nl7c_close(so);
5748	}
5749
5750	if (vp->v_stream != NULL) {
5751		vnode_t *ux_vp;
5752
5753		if (so->so_family == AF_UNIX) {
5754			/* Could avoid this when CANTSENDMORE for !dgram */
5755			so_unix_close(so);
5756		}
5757
5758		mutex_exit(&so->so_lock);
5759		/*
5760		 * Disassemble the linkage from the AF_UNIX underlying file
5761		 * system vnode to this socket (by atomically clearing
5762		 * v_stream in vn_rele_stream) before strclose clears sd_vnode
5763		 * and frees the stream head.
5764		 */
5765		if ((ux_vp = sti->sti_ux_bound_vp) != NULL) {
5766			ASSERT(ux_vp->v_stream);
5767			sti->sti_ux_bound_vp = NULL;
5768			vn_rele_stream(ux_vp);
5769		}
5770		if (so->so_family == AF_INET || so->so_family == AF_INET6) {
5771			strsetrwputdatahooks(SOTOV(so), NULL, NULL);
5772			if (sti->sti_kssl_ent != NULL) {
5773				kssl_release_ent(sti->sti_kssl_ent, so,
5774				    sti->sti_kssl_type);
5775				sti->sti_kssl_ent = NULL;
5776			}
5777			if (sti->sti_kssl_ctx != NULL) {
5778				kssl_release_ctx(sti->sti_kssl_ctx);
5779				sti->sti_kssl_ctx = NULL;
5780			}
5781			sti->sti_kssl_type = KSSL_NO_PROXY;
5782		}
5783		error = strclose(vp, flag, cr);
5784		vp->v_stream = NULL;
5785		mutex_enter(&so->so_lock);
5786	}
5787
5788	/*
5789	 * Flush the T_DISCON_IND on sti_discon_ind_mp.
5790	 */
5791	so_flush_discon_ind(so);
5792
5793	so_unlock_single(so, SOLOCKED);
5794	mutex_exit(&so->so_lock);
5795
5796	/*
5797	 * Needed for STREAMs.
5798	 * Decrement the device driver's reference count for streams
5799	 * opened via the clone dip. The driver was held in clone_open().
5800	 * The absence of clone_close() forces this asymmetry.
5801	 */
5802	if (so->so_flag & SOCLONE)
5803		ddi_rele_driver(getmajor(dev));
5804
5805	return (error);
5806}
5807
5808static int
5809sotpi_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode,
5810    struct cred *cr, int32_t *rvalp)
5811{
5812	struct vnode *vp = SOTOV(so);
5813	sotpi_info_t *sti = SOTOTPI(so);
5814	int error = 0;
5815
5816	dprintso(so, 0, ("sotpi_ioctl: cmd 0x%x, arg 0x%lx, state %s\n",
5817	    cmd, arg, pr_state(so->so_state, so->so_mode)));
5818
5819	switch (cmd) {
5820	case SIOCSQPTR:
5821		/*
5822		 * SIOCSQPTR is valid only when helper stream is created
5823		 * by the protocol.
5824		 */
5825	case _I_INSERT:
5826	case _I_REMOVE:
5827		/*
5828		 * Since there's no compelling reason to support these ioctls
5829		 * on sockets, and doing so would increase the complexity
5830		 * markedly, prevent it.
5831		 */
5832		return (EOPNOTSUPP);
5833
5834	case I_FIND:
5835	case I_LIST:
5836	case I_LOOK:
5837	case I_POP:
5838	case I_PUSH:
5839		/*
5840		 * To prevent races and inconsistencies between the actual
5841		 * state of the stream and the state according to the sonode,
5842		 * we serialize all operations which modify or operate on the
5843		 * list of modules on the socket's stream.
5844		 */
5845		mutex_enter(&sti->sti_plumb_lock);
5846		error = socktpi_plumbioctl(vp, cmd, arg, mode, cr, rvalp);
5847		mutex_exit(&sti->sti_plumb_lock);
5848		return (error);
5849
5850	default:
5851		if (so->so_version != SOV_STREAM)
5852			break;
5853
5854		/*
5855		 * The imaginary "sockmod" has been popped; act as a stream.
5856		 */
5857		return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
5858	}
5859
5860	ASSERT(so->so_version != SOV_STREAM);
5861
5862	/*
5863	 * Process socket-specific ioctls.
5864	 */
5865	switch (cmd) {
5866	case FIONBIO: {
5867		int32_t value;
5868
5869		if (so_copyin((void *)arg, &value, sizeof (int32_t),
5870		    (mode & (int)FKIOCTL)))
5871			return (EFAULT);
5872
5873		mutex_enter(&so->so_lock);
5874		if (value) {
5875			so->so_state |= SS_NDELAY;
5876		} else {
5877			so->so_state &= ~SS_NDELAY;
5878		}
5879		mutex_exit(&so->so_lock);
5880		return (0);
5881	}
5882
5883	case FIOASYNC: {
5884		int32_t value;
5885
5886		if (so_copyin((void *)arg, &value, sizeof (int32_t),
5887		    (mode & (int)FKIOCTL)))
5888			return (EFAULT);
5889
5890		mutex_enter(&so->so_lock);
5891		/*
5892		 * SS_ASYNC flag not already set correctly?
5893		 * (!value != !(so->so_state & SS_ASYNC))
5894		 * but some engineers find that too hard to read.
5895		 */
5896		if (value == 0 && (so->so_state & SS_ASYNC) != 0 ||
5897		    value != 0 && (so->so_state & SS_ASYNC) == 0)
5898			error = so_flip_async(so, vp, mode, cr);
5899		mutex_exit(&so->so_lock);
5900		return (error);
5901	}
5902
5903	case SIOCSPGRP:
5904	case FIOSETOWN: {
5905		pid_t pgrp;
5906
5907		if (so_copyin((void *)arg, &pgrp, sizeof (pid_t),
5908		    (mode & (int)FKIOCTL)))
5909			return (EFAULT);
5910
5911		mutex_enter(&so->so_lock);
5912		dprintso(so, 1, ("setown: new %d old %d\n", pgrp, so->so_pgrp));
5913		/* Any change? */
5914		if (pgrp != so->so_pgrp)
5915			error = so_set_siggrp(so, vp, pgrp, mode, cr);
5916		mutex_exit(&so->so_lock);
5917		return (error);
5918	}
5919	case SIOCGPGRP:
5920	case FIOGETOWN:
5921		if (so_copyout(&so->so_pgrp, (void *)arg,
5922		    sizeof (pid_t), (mode & (int)FKIOCTL)))
5923			return (EFAULT);
5924		return (0);
5925
5926	case SIOCATMARK: {
5927		int retval;
5928		uint_t so_state;
5929
5930		/*
5931		 * strwaitmark has a finite timeout after which it
5932		 * returns -1 if the mark state is undetermined.
5933		 * In order to avoid any race between the mark state
5934		 * in sockfs and the mark state in the stream head this
5935		 * routine loops until the mark state can be determined
5936		 * (or the urgent data indication has been removed by some
5937		 * other thread).
5938		 */
5939		do {
5940			mutex_enter(&so->so_lock);
5941			so_state = so->so_state;
5942			mutex_exit(&so->so_lock);
5943			if (so_state & SS_RCVATMARK) {
5944				retval = 1;
5945			} else if (!(so_state & SS_OOBPEND)) {
5946				/*
5947				 * No SIGURG has been generated -- there is no
5948				 * pending or present urgent data. Thus can't
5949				 * possibly be at the mark.
5950				 */
5951				retval = 0;
5952			} else {
5953				/*
5954				 * Have the stream head wait until there is
5955				 * either some messages on the read queue, or
5956				 * STRATMARK or STRNOTATMARK gets set. The
5957				 * STRNOTATMARK flag is used so that the
5958				 * transport can send up a MSGNOTMARKNEXT
5959				 * M_DATA to indicate that it is not
5960				 * at the mark and additional data is not about
5961				 * to be send upstream.
5962				 *
5963				 * If the mark state is undetermined this will
5964				 * return -1 and we will loop rechecking the
5965				 * socket state.
5966				 */
5967				retval = strwaitmark(vp);
5968			}
5969		} while (retval == -1);
5970
5971		if (so_copyout(&retval, (void *)arg, sizeof (int),
5972		    (mode & (int)FKIOCTL)))
5973			return (EFAULT);
5974		return (0);
5975	}
5976
5977	case I_FDINSERT:
5978	case I_SENDFD:
5979	case I_RECVFD:
5980	case I_ATMARK:
5981	case _SIOCSOCKFALLBACK:
5982		/*
5983		 * These ioctls do not apply to sockets. I_FDINSERT can be
5984		 * used to send M_PROTO messages without modifying the socket
5985		 * state. I_SENDFD/RECVFD should not be used for socket file
5986		 * descriptor passing since they assume a twisted stream.
5987		 * SIOCATMARK must be used instead of I_ATMARK.
5988		 *
5989		 * _SIOCSOCKFALLBACK from an application should never be
5990		 * processed.  It is only generated by socktpi_open() or
5991		 * in response to I_POP or I_PUSH.
5992		 */
5993#ifdef DEBUG
5994		zcmn_err(getzoneid(), CE_WARN,
5995		    "Unsupported STREAMS ioctl 0x%x on socket. "
5996		    "Pid = %d\n", cmd, curproc->p_pid);
5997#endif /* DEBUG */
5998		return (EOPNOTSUPP);
5999
6000	case _I_GETPEERCRED:
6001		if ((mode & FKIOCTL) == 0)
6002			return (EINVAL);
6003
6004		mutex_enter(&so->so_lock);
6005		if ((so->so_mode & SM_CONNREQUIRED) == 0) {
6006			error = ENOTSUP;
6007		} else if ((so->so_state & SS_ISCONNECTED) == 0) {
6008			error = ENOTCONN;
6009		} else if (so->so_peercred != NULL) {
6010			k_peercred_t *kp = (k_peercred_t *)arg;
6011			kp->pc_cr = so->so_peercred;
6012			kp->pc_cpid = so->so_cpid;
6013			crhold(so->so_peercred);
6014		} else {
6015			error = EINVAL;
6016		}
6017		mutex_exit(&so->so_lock);
6018		return (error);
6019
6020	default:
6021		/*
6022		 * Do the higher-order bits of the ioctl cmd indicate
6023		 * that it is an I_* streams ioctl?
6024		 */
6025		if ((cmd & 0xffffff00U) == STR &&
6026		    so->so_version == SOV_SOCKBSD) {
6027#ifdef DEBUG
6028			zcmn_err(getzoneid(), CE_WARN,
6029			    "Unsupported STREAMS ioctl 0x%x on socket. "
6030			    "Pid = %d\n", cmd, 	curproc->p_pid);
6031#endif /* DEBUG */
6032			return (EOPNOTSUPP);
6033		}
6034		return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
6035	}
6036}
6037
6038/*
6039 * Handle plumbing-related ioctls.
6040 */
6041static int
6042socktpi_plumbioctl(struct vnode *vp, int cmd, intptr_t arg, int mode,
6043    struct cred *cr, int32_t *rvalp)
6044{
6045	static const char sockmod_name[] = "sockmod";
6046	struct sonode	*so = VTOSO(vp);
6047	char		mname[FMNAMESZ + 1];
6048	int		error;
6049	sotpi_info_t	*sti = SOTOTPI(so);
6050
6051	ASSERT(MUTEX_HELD(&sti->sti_plumb_lock));
6052
6053	if (so->so_version == SOV_SOCKBSD)
6054		return (EOPNOTSUPP);
6055
6056	if (so->so_version == SOV_STREAM) {
6057		/*
6058		 * The imaginary "sockmod" has been popped - act as a stream.
6059		 * If this is a push of sockmod then change back to a socket.
6060		 */
6061		if (cmd == I_PUSH) {
6062			error = ((mode & FKIOCTL) ? copystr : copyinstr)(
6063			    (void *)arg, mname, sizeof (mname), NULL);
6064
6065			if (error == 0 && strcmp(mname, sockmod_name) == 0) {
6066				dprintso(so, 0, ("socktpi_ioctl: going to "
6067				    "socket version\n"));
6068				so_stream2sock(so);
6069				return (0);
6070			}
6071		}
6072		return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
6073	}
6074
6075	switch (cmd) {
6076	case I_PUSH:
6077		if (sti->sti_direct) {
6078			mutex_enter(&so->so_lock);
6079			so_lock_single(so);
6080			mutex_exit(&so->so_lock);
6081
6082			error = strioctl(vp, _SIOCSOCKFALLBACK, 0, 0, K_TO_K,
6083			    cr, rvalp);
6084
6085			mutex_enter(&so->so_lock);
6086			if (error == 0)
6087				sti->sti_direct = 0;
6088			so_unlock_single(so, SOLOCKED);
6089			mutex_exit(&so->so_lock);
6090
6091			if (error != 0)
6092				return (error);
6093		}
6094
6095		error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6096		if (error == 0)
6097			sti->sti_pushcnt++;
6098		return (error);
6099
6100	case I_POP:
6101		if (sti->sti_pushcnt == 0) {
6102			/* Emulate sockmod being popped */
6103			dprintso(so, 0,
6104			    ("socktpi_ioctl: going to STREAMS version\n"));
6105			return (so_sock2stream(so));
6106		}
6107
6108		error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6109		if (error == 0)
6110			sti->sti_pushcnt--;
6111		return (error);
6112
6113	case I_LIST: {
6114		struct str_mlist *kmlistp, *umlistp;
6115		struct str_list	kstrlist;
6116		ssize_t		kstrlistsize;
6117		int		i, nmods;
6118
6119		STRUCT_DECL(str_list, ustrlist);
6120		STRUCT_INIT(ustrlist, mode);
6121
6122		if (arg == NULL) {
6123			error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6124			if (error == 0)
6125				(*rvalp)++;	/* Add one for sockmod */
6126			return (error);
6127		}
6128
6129		error = so_copyin((void *)arg, STRUCT_BUF(ustrlist),
6130		    STRUCT_SIZE(ustrlist), mode & FKIOCTL);
6131		if (error != 0)
6132			return (error);
6133
6134		nmods = STRUCT_FGET(ustrlist, sl_nmods);
6135		if (nmods <= 0)
6136			return (EINVAL);
6137		/*
6138		 * Ceiling nmods at nstrpush to prevent someone from
6139		 * maliciously consuming lots of kernel memory.
6140		 */
6141		nmods = MIN(nmods, nstrpush);
6142
6143		kstrlistsize = (nmods + 1) * sizeof (struct str_mlist);
6144		kstrlist.sl_nmods = nmods;
6145		kstrlist.sl_modlist = kmem_zalloc(kstrlistsize, KM_SLEEP);
6146
6147		error = strioctl(vp, cmd, (intptr_t)&kstrlist, mode, K_TO_K,
6148		    cr, rvalp);
6149		if (error != 0)
6150			goto done;
6151
6152		/*
6153		 * Considering the module list as a 0-based array of sl_nmods
6154		 * modules, sockmod should conceptually exist at slot
6155		 * sti_pushcnt.  Insert sockmod at this location by sliding all
6156		 * of the module names after so_pushcnt over by one.  We know
6157		 * that there will be room to do this since we allocated
6158		 * sl_modlist with an additional slot.
6159		 */
6160		for (i = kstrlist.sl_nmods; i > sti->sti_pushcnt; i--)
6161			kstrlist.sl_modlist[i] = kstrlist.sl_modlist[i - 1];
6162
6163		(void) strcpy(kstrlist.sl_modlist[i].l_name, sockmod_name);
6164		kstrlist.sl_nmods++;
6165
6166		/*
6167		 * Copy all of the entries out to ustrlist.
6168		 */
6169		kmlistp = kstrlist.sl_modlist;
6170		umlistp = STRUCT_FGETP(ustrlist, sl_modlist);
6171		for (i = 0; i < nmods && i < kstrlist.sl_nmods; i++) {
6172			error = so_copyout(kmlistp++, umlistp++,
6173			    sizeof (struct str_mlist), mode & FKIOCTL);
6174			if (error != 0)
6175				goto done;
6176		}
6177
6178		error = so_copyout(&i, (void *)arg, sizeof (int32_t),
6179		    mode & FKIOCTL);
6180		if (error == 0)
6181			*rvalp = 0;
6182	done:
6183		kmem_free(kstrlist.sl_modlist, kstrlistsize);
6184		return (error);
6185	}
6186	case I_LOOK:
6187		if (sti->sti_pushcnt == 0) {
6188			return (so_copyout(sockmod_name, (void *)arg,
6189			    sizeof (sockmod_name), mode & FKIOCTL));
6190		}
6191		return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
6192
6193	case I_FIND:
6194		error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6195		if (error && error != EINVAL)
6196			return (error);
6197
6198		/* if not found and string was sockmod return 1 */
6199		if (*rvalp == 0 || error == EINVAL) {
6200			error = ((mode & FKIOCTL) ? copystr : copyinstr)(
6201			    (void *)arg, mname, sizeof (mname), NULL);
6202			if (error == ENAMETOOLONG)
6203				error = EINVAL;
6204
6205			if (error == 0 && strcmp(mname, sockmod_name) == 0)
6206				*rvalp = 1;
6207		}
6208		return (error);
6209
6210	default:
6211		panic("socktpi_plumbioctl: unknown ioctl %d", cmd);
6212		break;
6213	}
6214
6215	return (0);
6216}
6217
6218/*
6219 * Wrapper around the streams poll routine that implements socket poll
6220 * semantics.
6221 * The sockfs never calls pollwakeup itself - the stream head takes care
6222 * of all pollwakeups. Since sockfs never holds so_lock when calling the
6223 * stream head there can never be a deadlock due to holding so_lock across
6224 * pollwakeup and acquiring so_lock in this routine.
6225 *
6226 * However, since the performance of VOP_POLL is critical we avoid
6227 * acquiring so_lock here. This is based on two assumptions:
6228 *  - The poll implementation holds locks to serialize the VOP_POLL call
6229 *    and a pollwakeup for the same pollhead. This ensures that should
6230 *    e.g. so_state change during a socktpi_poll call the pollwakeup
6231 *    (which strsock_* and strrput conspire to issue) is issued after
6232 *    the state change. Thus the pollwakeup will block until VOP_POLL has
6233 *    returned and then wake up poll and have it call VOP_POLL again.
6234 *  - The reading of so_state without holding so_lock does not result in
6235 *    stale data that is older than the latest state change that has dropped
6236 *    so_lock. This is ensured by the mutex_exit issuing the appropriate
6237 *    memory barrier to force the data into the coherency domain.
6238 */
6239static int
6240sotpi_poll(
6241	struct sonode	*so,
6242	short		events,
6243	int		anyyet,
6244	short		*reventsp,
6245	struct pollhead **phpp)
6246{
6247	short origevents = events;
6248	struct vnode *vp = SOTOV(so);
6249	int error;
6250	int so_state = so->so_state;	/* snapshot */
6251	sotpi_info_t *sti = SOTOTPI(so);
6252
6253	dprintso(so, 0, ("socktpi_poll(%p): state %s err %d\n",
6254	    (void *)vp, pr_state(so_state, so->so_mode), so->so_error));
6255
6256	ASSERT(vp->v_type == VSOCK);
6257	ASSERT(vp->v_stream != NULL);
6258
6259	if (so->so_version == SOV_STREAM) {
6260		/* The imaginary "sockmod" has been popped - act as a stream */
6261		return (strpoll(vp->v_stream, events, anyyet,
6262		    reventsp, phpp));
6263	}
6264
6265	if (!(so_state & SS_ISCONNECTED) &&
6266	    (so->so_mode & SM_CONNREQUIRED)) {
6267		/* Not connected yet - turn off write side events */
6268		events &= ~(POLLOUT|POLLWRBAND);
6269	}
6270	/*
6271	 * Check for errors without calling strpoll if the caller wants them.
6272	 * In sockets the errors are represented as input/output events
6273	 * and there is no need to ask the stream head for this information.
6274	 */
6275	if (so->so_error != 0 &&
6276	    ((POLLIN|POLLRDNORM|POLLOUT) & origevents)  != 0) {
6277		*reventsp = (POLLIN|POLLRDNORM|POLLOUT) & origevents;
6278		return (0);
6279	}
6280	/*
6281	 * Ignore M_PROTO only messages such as the T_EXDATA_IND messages.
6282	 * These message with only an M_PROTO/M_PCPROTO part and no M_DATA
6283	 * will not trigger a POLLIN event with POLLRDDATA set.
6284	 * The handling of urgent data (causing POLLRDBAND) is done by
6285	 * inspecting SS_OOBPEND below.
6286	 */
6287	events |= POLLRDDATA;
6288
6289	/*
6290	 * After shutdown(output) a stream head write error is set.
6291	 * However, we should not return output events.
6292	 */
6293	events |= POLLNOERR;
6294	error = strpoll(vp->v_stream, events, anyyet,
6295	    reventsp, phpp);
6296	if (error)
6297		return (error);
6298
6299	ASSERT(!(*reventsp & POLLERR));
6300
6301	/*
6302	 * Notes on T_CONN_IND handling for sockets.
6303	 *
6304	 * If strpoll() returned without events, SR_POLLIN is guaranteed
6305	 * to be set, ensuring any subsequent strrput() runs pollwakeup().
6306	 *
6307	 * Since the so_lock is not held, soqueueconnind() may have run
6308	 * and a T_CONN_IND may be waiting. We now check for any queued
6309	 * T_CONN_IND msgs on sti_conn_ind_head and set appropriate events
6310	 * to ensure poll returns.
6311	 *
6312	 * However:
6313	 * If the T_CONN_IND hasn't arrived by the time strpoll() returns,
6314	 * when strrput() does run for an arriving M_PROTO with T_CONN_IND
6315	 * the following actions will occur; taken together they ensure the
6316	 * syscall will return.
6317	 *
6318	 * 1. If a socket, soqueueconnind() will queue the T_CONN_IND but if
6319	 *    the accept() was run on a non-blocking socket sowaitconnind()
6320	 *    may have already returned EWOULDBLOCK, so not be waiting to
6321	 *    process the message. Additionally socktpi_poll() has probably
6322	 *    proceeded past the sti_conn_ind_head check below.
6323	 * 2. strrput() runs pollwakeup()->pollnotify()->cv_signal() to wake
6324	 *    this thread,  however that could occur before poll_common()
6325	 *    has entered cv_wait.
6326	 * 3. pollnotify() sets T_POLLWAKE, while holding the pc_lock.
6327	 *
6328	 * Before proceeding to cv_wait() in poll_common() for an event,
6329	 * poll_common() atomically checks for T_POLLWAKE under the pc_lock,
6330	 * and if set, re-calls strpoll() to ensure the late arriving
6331	 * T_CONN_IND is recognized, and pollsys() returns.
6332	 */
6333
6334	if (sti->sti_conn_ind_head != NULL)
6335		*reventsp |= (POLLIN|POLLRDNORM) & events;
6336
6337	if (so->so_state & SS_OOBPEND)
6338		*reventsp |= POLLRDBAND & events;
6339
6340	if (sti->sti_nl7c_rcv_mp != NULL) {
6341		*reventsp |= (POLLIN|POLLRDNORM) & events;
6342	}
6343	if ((sti->sti_nl7c_flags & NL7C_ENABLED) &&
6344	    ((POLLIN|POLLRDNORM) & *reventsp)) {
6345		sti->sti_nl7c_flags |= NL7C_POLLIN;
6346	}
6347
6348	return (0);
6349}
6350
6351/*ARGSUSED*/
6352static int
6353socktpi_constructor(void *buf, void *cdrarg, int kmflags)
6354{
6355	sotpi_sonode_t *st = (sotpi_sonode_t *)buf;
6356	int error = 0;
6357
6358	error = sonode_constructor(buf, cdrarg, kmflags);
6359	if (error != 0)
6360		return (error);
6361
6362	error = i_sotpi_info_constructor(&st->st_info);
6363	if (error != 0)
6364		sonode_destructor(buf, cdrarg);
6365
6366	st->st_sonode.so_priv = &st->st_info;
6367
6368	return (error);
6369}
6370
6371/*ARGSUSED1*/
6372static void
6373socktpi_destructor(void *buf, void *cdrarg)
6374{
6375	sotpi_sonode_t *st = (sotpi_sonode_t *)buf;
6376
6377	ASSERT(st->st_sonode.so_priv == &st->st_info);
6378	st->st_sonode.so_priv = NULL;
6379
6380	i_sotpi_info_destructor(&st->st_info);
6381	sonode_destructor(buf, cdrarg);
6382}
6383
6384static int
6385socktpi_unix_constructor(void *buf, void *cdrarg, int kmflags)
6386{
6387	int retval;
6388
6389	if ((retval = socktpi_constructor(buf, cdrarg, kmflags)) == 0) {
6390		struct sonode *so = (struct sonode *)buf;
6391		sotpi_info_t *sti = SOTOTPI(so);
6392
6393		mutex_enter(&socklist.sl_lock);
6394
6395		sti->sti_next_so = socklist.sl_list;
6396		sti->sti_prev_so = NULL;
6397		if (sti->sti_next_so != NULL)
6398			SOTOTPI(sti->sti_next_so)->sti_prev_so = so;
6399		socklist.sl_list = so;
6400
6401		mutex_exit(&socklist.sl_lock);
6402
6403	}
6404	return (retval);
6405}
6406
6407static void
6408socktpi_unix_destructor(void *buf, void *cdrarg)
6409{
6410	struct sonode	*so = (struct sonode *)buf;
6411	sotpi_info_t	*sti = SOTOTPI(so);
6412
6413	mutex_enter(&socklist.sl_lock);
6414
6415	if (sti->sti_next_so != NULL)
6416		SOTOTPI(sti->sti_next_so)->sti_prev_so = sti->sti_prev_so;
6417	if (sti->sti_prev_so != NULL)
6418		SOTOTPI(sti->sti_prev_so)->sti_next_so = sti->sti_next_so;
6419	else
6420		socklist.sl_list = sti->sti_next_so;
6421
6422	mutex_exit(&socklist.sl_lock);
6423
6424	socktpi_destructor(buf, cdrarg);
6425}
6426
6427int
6428socktpi_init(void)
6429{
6430	/*
6431	 * Create sonode caches.  We create a special one for AF_UNIX so
6432	 * that we can track them for netstat(1m).
6433	 */
6434	socktpi_cache = kmem_cache_create("socktpi_cache",
6435	    sizeof (struct sotpi_sonode), 0, socktpi_constructor,
6436	    socktpi_destructor, NULL, NULL, NULL, 0);
6437
6438	socktpi_unix_cache = kmem_cache_create("socktpi_unix_cache",
6439	    sizeof (struct sotpi_sonode), 0, socktpi_unix_constructor,
6440	    socktpi_unix_destructor, NULL, NULL, NULL, 0);
6441
6442	return (0);
6443}
6444
6445/*
6446 * Given a non-TPI sonode, allocate and prep it to be ready for TPI.
6447 *
6448 * Caller must still update state and mode using sotpi_update_state().
6449 */
6450int
6451sotpi_convert_sonode(struct sonode *so, struct sockparams *newsp,
6452    boolean_t *direct, queue_t **qp, struct cred *cr)
6453{
6454	sotpi_info_t *sti;
6455	struct sockparams *origsp = so->so_sockparams;
6456	sock_lower_handle_t handle = so->so_proto_handle;
6457	struct stdata *stp;
6458	struct vnode *vp;
6459	queue_t *q;
6460	int error = 0;
6461
6462	ASSERT((so->so_state & (SS_FALLBACK_PENDING|SS_FALLBACK_COMP)) ==
6463	    SS_FALLBACK_PENDING);
6464	ASSERT(SOCK_IS_NONSTR(so));
6465
6466	*qp = NULL;
6467	*direct = B_FALSE;
6468	so->so_sockparams = newsp;
6469	/*
6470	 * Allocate and initalize fields required by TPI.
6471	 */
6472	(void) sotpi_info_create(so, KM_SLEEP);
6473	sotpi_info_init(so);
6474
6475	if ((error = sotpi_init(so, NULL, cr, SO_FALLBACK)) != 0) {
6476		sotpi_info_fini(so);
6477		sotpi_info_destroy(so);
6478		return (error);
6479	}
6480	ASSERT(handle == so->so_proto_handle);
6481	sti = SOTOTPI(so);
6482	if (sti->sti_direct != 0)
6483		*direct = B_TRUE;
6484
6485	/*
6486	 * When it comes to urgent data we have two cases to deal with;
6487	 * (1) The oob byte has already arrived, or (2) the protocol has
6488	 * notified that oob data is pending, but it has not yet arrived.
6489	 *
6490	 * For (1) all we need to do is send a T_EXDATA_IND to indicate were
6491	 * in the byte stream the oob byte is. For (2) we have to send a
6492	 * SIGURG (M_PCSIG), followed by a zero-length mblk indicating whether
6493	 * the oob byte will be the next byte from the protocol.
6494	 *
6495	 * So in the worst case we need two mblks, one for the signal, another
6496	 * for mark indication. In that case we use the exdata_mp for the sig.
6497	 */
6498	sti->sti_exdata_mp = allocb_wait(sizeof (struct T_exdata_ind), BPRI_MED,
6499	    STR_NOSIG, NULL);
6500	sti->sti_urgmark_mp = allocb_wait(0, BPRI_MED, STR_NOSIG, NULL);
6501
6502	/*
6503	 * Keep the original sp around so we can properly dispose of the
6504	 * sonode when the socket is being closed.
6505	 */
6506	sti->sti_orig_sp = origsp;
6507
6508	so_basic_strinit(so);	/* skips the T_CAPABILITY_REQ */
6509	so_alloc_addr(so, so->so_max_addr_len);
6510
6511	/*
6512	 * If the application has done a SIOCSPGRP, make sure the
6513	 * STREAM head is aware. This needs to take place before
6514	 * the protocol start sending up messages. Otherwise we
6515	 * might miss to generate SIGPOLL.
6516	 *
6517	 * It is possible that the application will receive duplicate
6518	 * signals if some were already generated for either data or
6519	 * connection indications.
6520	 */
6521	if (so->so_pgrp != 0) {
6522		if (so_set_events(so, so->so_vnode, cr) != 0)
6523			so->so_pgrp = 0;
6524	}
6525
6526	/*
6527	 * Determine which queue to use.
6528	 */
6529	vp = SOTOV(so);
6530	stp = vp->v_stream;
6531	ASSERT(stp != NULL);
6532	q = stp->sd_wrq->q_next;
6533
6534	/*
6535	 * Skip any modules that may have been auto pushed when the device
6536	 * was opened
6537	 */
6538	while (q->q_next != NULL)
6539		q = q->q_next;
6540	*qp = _RD(q);
6541
6542	/* This is now a STREAMS sockets */
6543	so->so_not_str = B_FALSE;
6544
6545	return (error);
6546}
6547
6548/*
6549 * Revert a TPI sonode. It is only allowed to revert the sonode during
6550 * the fallback process.
6551 */
6552void
6553sotpi_revert_sonode(struct sonode *so, struct cred *cr)
6554{
6555	vnode_t *vp = SOTOV(so);
6556
6557	ASSERT((so->so_state & (SS_FALLBACK_PENDING|SS_FALLBACK_COMP)) ==
6558	    SS_FALLBACK_PENDING);
6559	ASSERT(!SOCK_IS_NONSTR(so));
6560	ASSERT(vp->v_stream != NULL);
6561
6562	if (SOTOTPI(so)->sti_exdata_mp != NULL) {
6563		freeb(SOTOTPI(so)->sti_exdata_mp);
6564		SOTOTPI(so)->sti_exdata_mp = NULL;
6565	}
6566
6567	if (SOTOTPI(so)->sti_urgmark_mp != NULL) {
6568		freeb(SOTOTPI(so)->sti_urgmark_mp);
6569		SOTOTPI(so)->sti_urgmark_mp = NULL;
6570	}
6571
6572	strclean(vp);
6573	(void) strclose(vp, FREAD|FWRITE|SO_FALLBACK, cr);
6574
6575	/*
6576	 * Restore the original sockparams. The caller is responsible for
6577	 * dropping the ref to the new sp.
6578	 */
6579	so->so_sockparams = SOTOTPI(so)->sti_orig_sp;
6580
6581	sotpi_info_fini(so);
6582	sotpi_info_destroy(so);
6583
6584	/* This is no longer a STREAMS sockets */
6585	so->so_not_str = B_TRUE;
6586}
6587
6588void
6589sotpi_update_state(struct sonode *so, struct T_capability_ack *tcap,
6590    struct sockaddr *laddr, socklen_t laddrlen, struct sockaddr *faddr,
6591    socklen_t faddrlen, short opts)
6592{
6593	sotpi_info_t *sti = SOTOTPI(so);
6594
6595	so_proc_tcapability_ack(so, tcap);
6596
6597	so->so_options |= opts;
6598
6599	/*
6600	 * Determine whether the foreign and local address are valid
6601	 */
6602	if (laddrlen != 0) {
6603		ASSERT(laddrlen <= sti->sti_laddr_maxlen);
6604		sti->sti_laddr_len = laddrlen;
6605		bcopy(laddr, sti->sti_laddr_sa, laddrlen);
6606		sti->sti_laddr_valid = (so->so_state & SS_ISBOUND);
6607	}
6608
6609	if (faddrlen != 0) {
6610		ASSERT(faddrlen <= sti->sti_faddr_maxlen);
6611		sti->sti_faddr_len = faddrlen;
6612		bcopy(faddr, sti->sti_faddr_sa, faddrlen);
6613		sti->sti_faddr_valid = (so->so_state & SS_ISCONNECTED);
6614	}
6615
6616}
6617
6618/*
6619 * Allocate enough space to cache the local and foreign addresses.
6620 */
6621void
6622so_alloc_addr(struct sonode *so, t_uscalar_t maxlen)
6623{
6624	sotpi_info_t *sti = SOTOTPI(so);
6625
6626	ASSERT(sti->sti_laddr_sa == NULL && sti->sti_faddr_sa == NULL);
6627	ASSERT(sti->sti_laddr_len == 0 && sti->sti_faddr_len == 0);
6628	sti->sti_laddr_maxlen = sti->sti_faddr_maxlen =
6629	    P2ROUNDUP(maxlen, KMEM_ALIGN);
6630	so->so_max_addr_len = sti->sti_laddr_maxlen;
6631	sti->sti_laddr_sa = kmem_alloc(sti->sti_laddr_maxlen * 2, KM_SLEEP);
6632	sti->sti_faddr_sa = (struct sockaddr *)((caddr_t)sti->sti_laddr_sa
6633	    + sti->sti_laddr_maxlen);
6634
6635	if (so->so_family == AF_UNIX) {
6636		/*
6637		 * Initialize AF_UNIX related fields.
6638		 */
6639		bzero(&sti->sti_ux_laddr, sizeof (sti->sti_ux_laddr));
6640		bzero(&sti->sti_ux_faddr, sizeof (sti->sti_ux_faddr));
6641	}
6642}
6643
6644
6645sotpi_info_t *
6646sotpi_sototpi(struct sonode *so)
6647{
6648	sotpi_info_t *sti;
6649
6650	ASSERT(so != NULL);
6651
6652	sti = (sotpi_info_t *)so->so_priv;
6653
6654	ASSERT(sti != NULL);
6655	ASSERT(sti->sti_magic == SOTPI_INFO_MAGIC);
6656
6657	return (sti);
6658}
6659
6660static int
6661i_sotpi_info_constructor(sotpi_info_t *sti)
6662{
6663	sti->sti_magic		= SOTPI_INFO_MAGIC;
6664	sti->sti_ack_mp		= NULL;
6665	sti->sti_discon_ind_mp	= NULL;
6666	sti->sti_ux_bound_vp	= NULL;
6667	sti->sti_unbind_mp	= NULL;
6668
6669	sti->sti_conn_ind_head	= NULL;
6670	sti->sti_conn_ind_tail	= NULL;
6671
6672	sti->sti_laddr_sa	= NULL;
6673	sti->sti_faddr_sa	= NULL;
6674
6675	sti->sti_nl7c_flags	= 0;
6676	sti->sti_nl7c_uri	= NULL;
6677	sti->sti_nl7c_rcv_mp	= NULL;
6678
6679	sti->sti_exdata_mp	= NULL;
6680	sti->sti_urgmark_mp	= NULL;
6681
6682	mutex_init(&sti->sti_plumb_lock, NULL, MUTEX_DEFAULT, NULL);
6683	cv_init(&sti->sti_ack_cv, NULL, CV_DEFAULT, NULL);
6684
6685	return (0);
6686}
6687
6688static void
6689i_sotpi_info_destructor(sotpi_info_t *sti)
6690{
6691	ASSERT(sti->sti_magic == SOTPI_INFO_MAGIC);
6692	ASSERT(sti->sti_ack_mp == NULL);
6693	ASSERT(sti->sti_discon_ind_mp == NULL);
6694	ASSERT(sti->sti_ux_bound_vp == NULL);
6695	ASSERT(sti->sti_unbind_mp == NULL);
6696
6697	ASSERT(sti->sti_conn_ind_head == NULL);
6698	ASSERT(sti->sti_conn_ind_tail == NULL);
6699
6700	ASSERT(sti->sti_laddr_sa == NULL);
6701	ASSERT(sti->sti_faddr_sa == NULL);
6702
6703	ASSERT(sti->sti_nl7c_flags == 0);
6704	ASSERT(sti->sti_nl7c_uri == NULL);
6705	ASSERT(sti->sti_nl7c_rcv_mp == NULL);
6706
6707	ASSERT(sti->sti_exdata_mp == NULL);
6708	ASSERT(sti->sti_urgmark_mp == NULL);
6709
6710	mutex_destroy(&sti->sti_plumb_lock);
6711	cv_destroy(&sti->sti_ack_cv);
6712}
6713
6714/*
6715 * Creates and attaches TPI information to the given sonode
6716 */
6717static boolean_t
6718sotpi_info_create(struct sonode *so, int kmflags)
6719{
6720	sotpi_info_t *sti;
6721
6722	ASSERT(so->so_priv == NULL);
6723
6724	if ((sti = kmem_zalloc(sizeof (*sti), kmflags)) == NULL)
6725		return (B_FALSE);
6726
6727	if (i_sotpi_info_constructor(sti) != 0) {
6728		kmem_free(sti, sizeof (*sti));
6729		return (B_FALSE);
6730	}
6731
6732	so->so_priv = (void *)sti;
6733	return (B_TRUE);
6734}
6735
6736/*
6737 * Initializes the TPI information.
6738 */
6739static void
6740sotpi_info_init(struct sonode *so)
6741{
6742	struct vnode *vp = SOTOV(so);
6743	sotpi_info_t *sti = SOTOTPI(so);
6744	time_t now;
6745
6746	sti->sti_dev 	= so->so_sockparams->sp_sdev_info.sd_vnode->v_rdev;
6747	vp->v_rdev	= sti->sti_dev;
6748
6749	sti->sti_orig_sp = NULL;
6750
6751	sti->sti_pushcnt = 0;
6752
6753	now = gethrestime_sec();
6754	sti->sti_atime	= now;
6755	sti->sti_mtime	= now;
6756	sti->sti_ctime	= now;
6757
6758	sti->sti_eaddr_mp = NULL;
6759	sti->sti_delayed_error = 0;
6760
6761	sti->sti_provinfo = NULL;
6762
6763	sti->sti_oobcnt = 0;
6764	sti->sti_oobsigcnt = 0;
6765
6766	ASSERT(sti->sti_laddr_sa == NULL && sti->sti_faddr_sa == NULL);
6767
6768	sti->sti_laddr_sa	= 0;
6769	sti->sti_faddr_sa	= 0;
6770	sti->sti_laddr_maxlen = sti->sti_faddr_maxlen = 0;
6771	sti->sti_laddr_len = sti->sti_faddr_len = 0;
6772
6773	sti->sti_laddr_valid = 0;
6774	sti->sti_faddr_valid = 0;
6775	sti->sti_faddr_noxlate = 0;
6776
6777	sti->sti_direct = 0;
6778
6779	ASSERT(sti->sti_ack_mp == NULL);
6780	ASSERT(sti->sti_ux_bound_vp == NULL);
6781	ASSERT(sti->sti_unbind_mp == NULL);
6782
6783	ASSERT(sti->sti_conn_ind_head == NULL);
6784	ASSERT(sti->sti_conn_ind_tail == NULL);
6785
6786	/* Initialize the kernel SSL proxy fields */
6787	sti->sti_kssl_type = KSSL_NO_PROXY;
6788	sti->sti_kssl_ent = NULL;
6789	sti->sti_kssl_ctx = NULL;
6790}
6791
6792/*
6793 * Given a sonode, grab the TPI info and free any data.
6794 */
6795static void
6796sotpi_info_fini(struct sonode *so)
6797{
6798	sotpi_info_t *sti = SOTOTPI(so);
6799	mblk_t *mp;
6800
6801	ASSERT(sti->sti_discon_ind_mp == NULL);
6802
6803	if ((mp = sti->sti_conn_ind_head) != NULL) {
6804		mblk_t *mp1;
6805
6806		while (mp) {
6807			mp1 = mp->b_next;
6808			mp->b_next = NULL;
6809			freemsg(mp);
6810			mp = mp1;
6811		}
6812		sti->sti_conn_ind_head = sti->sti_conn_ind_tail = NULL;
6813	}
6814
6815	/*
6816	 * Protect so->so_[lf]addr_sa so that sockfs_snapshot() can safely
6817	 * indirect them.  It also uses so_count as a validity test.
6818	 */
6819	mutex_enter(&so->so_lock);
6820
6821	if (sti->sti_laddr_sa) {
6822		ASSERT((caddr_t)sti->sti_faddr_sa ==
6823		    (caddr_t)sti->sti_laddr_sa + sti->sti_laddr_maxlen);
6824		ASSERT(sti->sti_faddr_maxlen == sti->sti_laddr_maxlen);
6825		sti->sti_laddr_valid = 0;
6826		sti->sti_faddr_valid = 0;
6827		kmem_free(sti->sti_laddr_sa, sti->sti_laddr_maxlen * 2);
6828		sti->sti_laddr_sa = NULL;
6829		sti->sti_laddr_len = sti->sti_laddr_maxlen = 0;
6830		sti->sti_faddr_sa = NULL;
6831		sti->sti_faddr_len = sti->sti_faddr_maxlen = 0;
6832	}
6833
6834	mutex_exit(&so->so_lock);
6835
6836	if ((mp = sti->sti_eaddr_mp) != NULL) {
6837		freemsg(mp);
6838		sti->sti_eaddr_mp = NULL;
6839		sti->sti_delayed_error = 0;
6840	}
6841
6842	if ((mp = sti->sti_ack_mp) != NULL) {
6843		freemsg(mp);
6844		sti->sti_ack_mp = NULL;
6845	}
6846
6847	if ((mp = sti->sti_nl7c_rcv_mp) != NULL) {
6848		sti->sti_nl7c_rcv_mp = NULL;
6849		freemsg(mp);
6850	}
6851	sti->sti_nl7c_rcv_rval = 0;
6852	if (sti->sti_nl7c_uri != NULL) {
6853		nl7c_urifree(so);
6854		/* urifree() cleared nl7c_uri */
6855	}
6856	if (sti->sti_nl7c_flags) {
6857		sti->sti_nl7c_flags = 0;
6858	}
6859
6860	ASSERT(sti->sti_ux_bound_vp == NULL);
6861	if ((mp = sti->sti_unbind_mp) != NULL) {
6862		freemsg(mp);
6863		sti->sti_unbind_mp = NULL;
6864	}
6865}
6866
6867/*
6868 * Destroys the TPI information attached to a sonode.
6869 */
6870static void
6871sotpi_info_destroy(struct sonode *so)
6872{
6873	sotpi_info_t *sti = SOTOTPI(so);
6874
6875	i_sotpi_info_destructor(sti);
6876	kmem_free(sti, sizeof (*sti));
6877
6878	so->so_priv = NULL;
6879}
6880
6881/*
6882 * Create the global sotpi socket module entry. It will never be freed.
6883 */
6884smod_info_t *
6885sotpi_smod_create(void)
6886{
6887	smod_info_t *smodp;
6888
6889	smodp = kmem_zalloc(sizeof (*smodp), KM_SLEEP);
6890	smodp->smod_name = kmem_alloc(sizeof (SOTPI_SMOD_NAME), KM_SLEEP);
6891	(void) strcpy(smodp->smod_name, SOTPI_SMOD_NAME);
6892	/*
6893	 * Initialize the smod_refcnt to 1 so it will never be freed.
6894	 */
6895	smodp->smod_refcnt = 1;
6896	smodp->smod_uc_version = SOCK_UC_VERSION;
6897	smodp->smod_dc_version = SOCK_DC_VERSION;
6898	smodp->smod_sock_create_func = &sotpi_create;
6899	smodp->smod_sock_destroy_func = &sotpi_destroy;
6900	return (smodp);
6901}
6902