sockstr.c revision 8348:4137e18bfaf0
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27#include <sys/types.h>
28#include <sys/inttypes.h>
29#include <sys/t_lock.h>
30#include <sys/param.h>
31#include <sys/systm.h>
32#include <sys/buf.h>
33#include <sys/conf.h>
34#include <sys/cred.h>
35#include <sys/kmem.h>
36#include <sys/sysmacros.h>
37#include <sys/vfs.h>
38#include <sys/vnode.h>
39#include <sys/debug.h>
40#include <sys/errno.h>
41#include <sys/time.h>
42#include <sys/file.h>
43#include <sys/user.h>
44#include <sys/stream.h>
45#include <sys/strsubr.h>
46#include <sys/esunddi.h>
47#include <sys/flock.h>
48#include <sys/modctl.h>
49#include <sys/vtrace.h>
50#include <sys/strsun.h>
51#include <sys/cmn_err.h>
52#include <sys/proc.h>
53#include <sys/ddi.h>
54
55#include <sys/suntpi.h>
56#include <sys/socket.h>
57#include <sys/sockio.h>
58#include <sys/socketvar.h>
59#include <sys/sodirect.h>
60#include <netinet/in.h>
61#include <inet/common.h>
62#include <inet/proto_set.h>
63
64#include <sys/tiuser.h>
65#define	_SUN_TPI_VERSION	2
66#include <sys/tihdr.h>
67
68#include <inet/kssl/ksslapi.h>
69
70#include <c2/audit.h>
71
72#include <fs/sockfs/socktpi.h>
73#include <fs/sockfs/socktpi_impl.h>
74#include <sys/dcopy.h>
75
76int so_default_version = SOV_SOCKSTREAM;
77
78#ifdef DEBUG
79/* Set sockdebug to print debug messages when SO_DEBUG is set */
80int sockdebug = 0;
81
82/* Set sockprinterr to print error messages when SO_DEBUG is set */
83int sockprinterr = 0;
84
85/*
86 * Set so_default_options to SO_DEBUG is all sockets should be created
87 * with SO_DEBUG set. This is needed to get debug printouts from the
88 * socket() call itself.
89 */
90int so_default_options = 0;
91#endif /* DEBUG */
92
93#ifdef SOCK_TEST
94/*
95 * Set to number of ticks to limit cv_waits for code coverage testing.
96 * Set to 1000 when SO_DEBUG is set to 2.
97 */
98clock_t sock_test_timelimit = 0;
99#endif /* SOCK_TEST */
100
101/*
102 * For concurrency testing of e.g. opening /dev/ip which does not
103 * handle T_INFO_REQ messages.
104 */
105int so_no_tinfo = 0;
106
107/*
108 * Timeout for getting a T_CAPABILITY_ACK - it is possible for a provider
109 * to simply ignore the T_CAPABILITY_REQ.
110 */
111clock_t	sock_capability_timeout	= 2;	/* seconds */
112
113static int	do_tcapability(struct sonode *so, t_uscalar_t cap_bits1);
114static void	so_removehooks(struct sonode *so);
115
116static mblk_t *strsock_proto(vnode_t *vp, mblk_t *mp,
117		strwakeup_t *wakeups, strsigset_t *firstmsgsigs,
118		strsigset_t *allmsgsigs, strpollset_t *pollwakeups);
119static mblk_t *strsock_misc(vnode_t *vp, mblk_t *mp,
120		strwakeup_t *wakeups, strsigset_t *firstmsgsigs,
121		strsigset_t *allmsgsigs, strpollset_t *pollwakeups);
122/*
123 * STREAMS based sodirect put/wakeup functions.
124 */
125static int sodput(sodirect_t *, mblk_t *);
126static void sodwakeup(sodirect_t *);
127
128/*
129 * Called by sockinit() when sockfs is loaded.
130 */
131int
132sostr_init()
133{
134	sod_init();
135	return (0);
136}
137
138/*
139 * Convert a socket to a stream. Invoked when the illusory sockmod
140 * is popped from the stream.
141 * Change the stream head back to default operation without losing
142 * any messages (T_conn_ind's are moved to the stream head queue).
143 */
144int
145so_sock2stream(struct sonode *so)
146{
147	struct vnode		*vp = SOTOV(so);
148	queue_t			*rq;
149	mblk_t			*mp;
150	int			error = 0;
151	sotpi_info_t		*sti = SOTOTPI(so);
152
153	ASSERT(MUTEX_HELD(&sti->sti_plumb_lock));
154
155	mutex_enter(&so->so_lock);
156	so_lock_single(so);
157
158	ASSERT(so->so_version != SOV_STREAM);
159
160	if (sti->sti_direct) {
161		mblk_t **mpp;
162		int rval;
163
164		/*
165		 * Tell the transport below that sockmod is being popped
166		 */
167		mutex_exit(&so->so_lock);
168		error = strioctl(vp, _SIOCSOCKFALLBACK, 0, 0, K_TO_K, CRED(),
169		    &rval);
170		mutex_enter(&so->so_lock);
171		if (error != 0) {
172			dprintso(so, 0, ("so_sock2stream(%p): "
173			    "_SIOCSOCKFALLBACK failed\n", (void *)so));
174			goto exit;
175		}
176		sti->sti_direct = 0;
177
178		for (mpp = &sti->sti_conn_ind_head; (mp = *mpp) != NULL;
179		    mpp = &mp->b_next) {
180			struct T_conn_ind	*conn_ind;
181
182			/*
183			 * strsock_proto() has already verified the length of
184			 * this message block.
185			 */
186			ASSERT(MBLKL(mp) >= sizeof (struct T_conn_ind));
187
188			conn_ind = (struct T_conn_ind *)mp->b_rptr;
189			if (conn_ind->OPT_length == 0 &&
190			    conn_ind->OPT_offset == 0)
191				continue;
192
193			if (DB_REF(mp) > 1) {
194				mblk_t	*newmp;
195				size_t	length;
196				cred_t	*cr;
197
198				/*
199				 * Copy the message block because it is used
200				 * elsewhere, too.
201				 */
202				length = MBLKL(mp);
203				newmp = soallocproto(length, _ALLOC_INTR);
204				if (newmp == NULL) {
205					error = EINTR;
206					goto exit;
207				}
208				bcopy(mp->b_rptr, newmp->b_wptr, length);
209				newmp->b_wptr += length;
210				newmp->b_next = mp->b_next;
211				cr = DB_CRED(mp);
212				if (cr != NULL)
213					mblk_setcred(newmp, cr);
214				DB_CPID(newmp) = DB_CPID(mp);
215
216				/*
217				 * Link the new message block into the queue
218				 * and free the old one.
219				 */
220				*mpp = newmp;
221				mp->b_next = NULL;
222				freemsg(mp);
223
224				mp = newmp;
225				conn_ind = (struct T_conn_ind *)mp->b_rptr;
226			}
227
228			/*
229			 * Remove options added by TCP for accept fast-path.
230			 */
231			conn_ind->OPT_length = 0;
232			conn_ind->OPT_offset = 0;
233		}
234	}
235
236	so->so_version = SOV_STREAM;
237	so->so_proto_handle = NULL;
238
239	/*
240	 * Remove the hooks in the stream head to avoid queuing more
241	 * packets in sockfs.
242	 */
243	mutex_exit(&so->so_lock);
244	so_removehooks(so);
245	mutex_enter(&so->so_lock);
246
247	/*
248	 * Clear any state related to urgent data. Leave any T_EXDATA_IND
249	 * on the queue - the behavior of urgent data after a switch is
250	 * left undefined.
251	 */
252	so->so_error = sti->sti_delayed_error = 0;
253	freemsg(so->so_oobmsg);
254	so->so_oobmsg = NULL;
255	sti->sti_oobsigcnt = sti->sti_oobcnt = 0;
256
257	so->so_state &= ~(SS_RCVATMARK|SS_OOBPEND|SS_HAVEOOBDATA|SS_HADOOBDATA|
258	    SS_SAVEDEOR);
259	ASSERT(so_verify_oobstate(so));
260
261	freemsg(sti->sti_ack_mp);
262	sti->sti_ack_mp = NULL;
263
264	/*
265	 * Flush the T_DISCON_IND on sti_discon_ind_mp.
266	 */
267	so_flush_discon_ind(so);
268
269	/*
270	 * Move any queued T_CONN_IND messages to stream head queue.
271	 */
272	rq = RD(strvp2wq(vp));
273	while ((mp = sti->sti_conn_ind_head) != NULL) {
274		sti->sti_conn_ind_head = mp->b_next;
275		mp->b_next = NULL;
276		if (sti->sti_conn_ind_head == NULL) {
277			ASSERT(sti->sti_conn_ind_tail == mp);
278			sti->sti_conn_ind_tail = NULL;
279		}
280		dprintso(so, 0,
281		    ("so_sock2stream(%p): moving T_CONN_IND\n", (void *)so));
282
283		/* Drop lock across put() */
284		mutex_exit(&so->so_lock);
285		put(rq, mp);
286		mutex_enter(&so->so_lock);
287	}
288
289exit:
290	ASSERT(MUTEX_HELD(&so->so_lock));
291	so_unlock_single(so, SOLOCKED);
292	mutex_exit(&so->so_lock);
293	return (error);
294}
295
296/*
297 * Covert a stream back to a socket. This is invoked when the illusory
298 * sockmod is pushed on a stream (where the stream was "created" by
299 * popping the illusory sockmod).
300 * This routine can not recreate the socket state (certain aspects of
301 * it like urgent data state and the bound/connected addresses for AF_UNIX
302 * sockets can not be recreated by asking the transport for information).
303 * Thus this routine implicitly assumes that the socket is in an initial
304 * state (as if it was just created). It flushes any messages queued on the
305 * read queue to avoid dealing with e.g. TPI acks or T_exdata_ind messages.
306 */
307void
308so_stream2sock(struct sonode *so)
309{
310	struct vnode *vp = SOTOV(so);
311	sotpi_info_t *sti = SOTOTPI(so);
312
313	ASSERT(MUTEX_HELD(&sti->sti_plumb_lock));
314
315	mutex_enter(&so->so_lock);
316	so_lock_single(so);
317	ASSERT(so->so_version == SOV_STREAM);
318	so->so_version = SOV_SOCKSTREAM;
319	sti->sti_pushcnt = 0;
320	mutex_exit(&so->so_lock);
321
322	/*
323	 * Set a permenent error to force any thread in sorecvmsg to
324	 * return (and drop SOREADLOCKED). Clear the error once
325	 * we have SOREADLOCKED.
326	 * This makes a read sleeping during the I_PUSH of sockmod return
327	 * EIO.
328	 */
329	strsetrerror(SOTOV(so), EIO, 1, NULL);
330
331	/*
332	 * Get the read lock before flushing data to avoid
333	 * problems with the T_EXDATA_IND MSG_PEEK code in sorecvmsg.
334	 */
335	mutex_enter(&so->so_lock);
336	(void) so_lock_read(so, 0);	/* Set SOREADLOCKED */
337	mutex_exit(&so->so_lock);
338
339	strsetrerror(SOTOV(so), 0, 0, NULL);
340	so_installhooks(so);
341
342	/*
343	 * Flush everything on the read queue.
344	 * This ensures that no T_CONN_IND remain and that no T_EXDATA_IND
345	 * remain; those types of messages would confuse sockfs.
346	 */
347	strflushrq(vp, FLUSHALL);
348	mutex_enter(&so->so_lock);
349
350	/*
351	 * Flush the T_DISCON_IND on sti_discon_ind_mp.
352	 */
353	so_flush_discon_ind(so);
354	so_unlock_read(so);	/* Clear SOREADLOCKED */
355
356	so_unlock_single(so, SOLOCKED);
357	mutex_exit(&so->so_lock);
358}
359
360/*
361 * Install the hooks in the stream head.
362 */
363void
364so_installhooks(struct sonode *so)
365{
366	struct vnode *vp = SOTOV(so);
367
368	strsetrputhooks(vp, SH_SIGALLDATA | SH_IGN_ZEROLEN | SH_CONSOL_DATA,
369	    strsock_proto, strsock_misc);
370	strsetwputhooks(vp, SH_SIGPIPE | SH_RECHECK_ERR, 0);
371}
372
373/*
374 * Remove the hooks in the stream head.
375 */
376static void
377so_removehooks(struct sonode *so)
378{
379	struct vnode *vp = SOTOV(so);
380
381	strsetrputhooks(vp, 0, NULL, NULL);
382	strsetwputhooks(vp, 0, STRTIMOUT);
383	/*
384	 * Leave read behavior as it would have been for a normal
385	 * stream i.e. a read of an M_PROTO will fail.
386	 */
387}
388
389void
390so_basic_strinit(struct sonode *so)
391{
392	struct vnode *vp = SOTOV(so);
393	struct stdata *stp;
394	mblk_t *mp;
395	sotpi_info_t *sti = SOTOTPI(so);
396
397	/* Preallocate an unbind_req message */
398	mp = soallocproto(sizeof (struct T_unbind_req), _ALLOC_SLEEP);
399	mutex_enter(&so->so_lock);
400	sti->sti_unbind_mp = mp;
401#ifdef DEBUG
402	so->so_options = so_default_options;
403#endif /* DEBUG */
404	mutex_exit(&so->so_lock);
405
406	so_installhooks(so);
407
408	stp = vp->v_stream;
409	/*
410	 * Have to keep minpsz at zero in order to allow write/send of zero
411	 * bytes.
412	 */
413	mutex_enter(&stp->sd_lock);
414	if (stp->sd_qn_minpsz == 1)
415		stp->sd_qn_minpsz = 0;
416	mutex_exit(&stp->sd_lock);
417
418	/*
419	 * If sodirect capable allocate and initialize sodirect_t.
420	 * Note, SS_SODIRECT is set in socktpi_open().
421	 */
422	if ((so->so_state & SS_SODIRECT) &&
423	    !(so->so_state & SS_FALLBACK_PENDING)) {
424		sod_sock_init(so, stp, sodput, sodwakeup, &stp->sd_lock);
425	}
426}
427
428/*
429 * Initialize the streams side of a socket including
430 * T_info_req/ack processing. If tso is not NULL its values are used thereby
431 * avoiding the T_INFO_REQ.
432 */
433int
434so_strinit(struct sonode *so, struct sonode *tso)
435{
436	sotpi_info_t *sti = SOTOTPI(so);
437	sotpi_info_t *tsti;
438	int error;
439
440	so_basic_strinit(so);
441
442	/*
443	 * The T_CAPABILITY_REQ should be the first message sent down because
444	 * at least TCP has a fast-path for this which avoids timeouts while
445	 * waiting for the T_CAPABILITY_ACK under high system load.
446	 */
447	if (tso == NULL) {
448		error = do_tcapability(so, TC1_ACCEPTOR_ID | TC1_INFO);
449		if (error)
450			return (error);
451	} else {
452		tsti = SOTOTPI(tso);
453
454		mutex_enter(&so->so_lock);
455		sti->sti_tsdu_size = tsti->sti_tsdu_size;
456		sti->sti_etsdu_size = tsti->sti_etsdu_size;
457		sti->sti_addr_size = tsti->sti_addr_size;
458		sti->sti_opt_size = tsti->sti_opt_size;
459		sti->sti_tidu_size = tsti->sti_tidu_size;
460		sti->sti_serv_type = tsti->sti_serv_type;
461		so->so_mode = tso->so_mode & ~SM_ACCEPTOR_ID;
462		mutex_exit(&so->so_lock);
463
464		/* the following do_tcapability may update so->so_mode */
465		if ((tsti->sti_serv_type != T_CLTS) &&
466		    (sti->sti_direct == 0)) {
467			error = do_tcapability(so, TC1_ACCEPTOR_ID);
468			if (error)
469				return (error);
470		}
471	}
472	/*
473	 * If the addr_size is 0 we treat it as already bound
474	 * and connected. This is used by the routing socket.
475	 * We set the addr_size to something to allocate a the address
476	 * structures.
477	 */
478	if (sti->sti_addr_size == 0) {
479		so->so_state |= SS_ISBOUND | SS_ISCONNECTED;
480		/* Address size can vary with address families. */
481		if (so->so_family == AF_INET6)
482			sti->sti_addr_size =
483			    (t_scalar_t)sizeof (struct sockaddr_in6);
484		else
485			sti->sti_addr_size =
486			    (t_scalar_t)sizeof (struct sockaddr_in);
487		ASSERT(sti->sti_unbind_mp);
488	}
489
490	so_alloc_addr(so, sti->sti_addr_size);
491
492	return (0);
493}
494
495static void
496copy_tinfo(struct sonode *so, struct T_info_ack *tia)
497{
498	sotpi_info_t *sti = SOTOTPI(so);
499
500	sti->sti_tsdu_size = tia->TSDU_size;
501	sti->sti_etsdu_size = tia->ETSDU_size;
502	sti->sti_addr_size = tia->ADDR_size;
503	sti->sti_opt_size = tia->OPT_size;
504	sti->sti_tidu_size = tia->TIDU_size;
505	sti->sti_serv_type = tia->SERV_type;
506	switch (tia->CURRENT_state) {
507	case TS_UNBND:
508		break;
509	case TS_IDLE:
510		so->so_state |= SS_ISBOUND;
511		sti->sti_laddr_len = 0;
512		sti->sti_laddr_valid = 0;
513		break;
514	case TS_DATA_XFER:
515		so->so_state |= SS_ISBOUND|SS_ISCONNECTED;
516		sti->sti_laddr_len = 0;
517		sti->sti_faddr_len = 0;
518		sti->sti_laddr_valid = 0;
519		sti->sti_faddr_valid = 0;
520		break;
521	}
522
523	/*
524	 * Heuristics for determining the socket mode flags
525	 * (SM_ATOMIC, SM_CONNREQUIRED, SM_ADDR, SM_FDPASSING,
526	 * and SM_EXDATA, SM_OPTDATA, and SM_BYTESTREAM)
527	 * from the info ack.
528	 */
529	if (sti->sti_serv_type == T_CLTS) {
530		so->so_mode |= SM_ATOMIC | SM_ADDR;
531	} else {
532		so->so_mode |= SM_CONNREQUIRED;
533		if (sti->sti_etsdu_size != 0 && sti->sti_etsdu_size != -2)
534			so->so_mode |= SM_EXDATA;
535	}
536	if (so->so_type == SOCK_SEQPACKET || so->so_type == SOCK_RAW) {
537		/* Semantics are to discard tail end of messages */
538		so->so_mode |= SM_ATOMIC;
539	}
540	if (so->so_family == AF_UNIX) {
541		so->so_mode |= SM_FDPASSING | SM_OPTDATA;
542		if (sti->sti_addr_size == -1) {
543			/* MAXPATHLEN + soun_family + nul termination */
544			sti->sti_addr_size = (t_scalar_t)(MAXPATHLEN +
545			    sizeof (short) + 1);
546		}
547		if (so->so_type == SOCK_STREAM) {
548			/*
549			 * Make it into a byte-stream transport.
550			 * SOCK_SEQPACKET sockets are unchanged.
551			 */
552			sti->sti_tsdu_size = 0;
553		}
554	} else if (sti->sti_addr_size == -1) {
555		/*
556		 * Logic extracted from sockmod - have to pick some max address
557		 * length in order to preallocate the addresses.
558		 */
559		sti->sti_addr_size = SOA_DEFSIZE;
560	}
561	if (sti->sti_tsdu_size == 0)
562		so->so_mode |= SM_BYTESTREAM;
563}
564
565static int
566check_tinfo(struct sonode *so)
567{
568	sotpi_info_t *sti = SOTOTPI(so);
569
570	/* Consistency checks */
571	if (so->so_type == SOCK_DGRAM && sti->sti_serv_type != T_CLTS) {
572		eprintso(so, ("service type and socket type mismatch\n"));
573		eprintsoline(so, EPROTO);
574		return (EPROTO);
575	}
576	if (so->so_type == SOCK_STREAM && sti->sti_serv_type == T_CLTS) {
577		eprintso(so, ("service type and socket type mismatch\n"));
578		eprintsoline(so, EPROTO);
579		return (EPROTO);
580	}
581	if (so->so_type == SOCK_SEQPACKET && sti->sti_serv_type == T_CLTS) {
582		eprintso(so, ("service type and socket type mismatch\n"));
583		eprintsoline(so, EPROTO);
584		return (EPROTO);
585	}
586	if (so->so_family == AF_INET &&
587	    sti->sti_addr_size != (t_scalar_t)sizeof (struct sockaddr_in)) {
588		eprintso(so,
589		    ("AF_INET must have sockaddr_in address length. Got %d\n",
590		    sti->sti_addr_size));
591		eprintsoline(so, EMSGSIZE);
592		return (EMSGSIZE);
593	}
594	if (so->so_family == AF_INET6 &&
595	    sti->sti_addr_size != (t_scalar_t)sizeof (struct sockaddr_in6)) {
596		eprintso(so,
597		    ("AF_INET6 must have sockaddr_in6 address length. Got %d\n",
598		    sti->sti_addr_size));
599		eprintsoline(so, EMSGSIZE);
600		return (EMSGSIZE);
601	}
602
603	dprintso(so, 1, (
604	    "tinfo: serv %d tsdu %d, etsdu %d, addr %d, opt %d, tidu %d\n",
605	    sti->sti_serv_type, sti->sti_tsdu_size, sti->sti_etsdu_size,
606	    sti->sti_addr_size, sti->sti_opt_size,
607	    sti->sti_tidu_size));
608	dprintso(so, 1, ("tinfo: so_state %s\n",
609	    pr_state(so->so_state, so->so_mode)));
610	return (0);
611}
612
613/*
614 * Send down T_info_req and wait for the ack.
615 * Record interesting T_info_ack values in the sonode.
616 */
617static int
618do_tinfo(struct sonode *so)
619{
620	struct T_info_req tir;
621	mblk_t *mp;
622	int error;
623
624	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
625
626	if (so_no_tinfo) {
627		SOTOTPI(so)->sti_addr_size = 0;
628		return (0);
629	}
630
631	dprintso(so, 1, ("do_tinfo(%p)\n", (void *)so));
632
633	/* Send T_INFO_REQ */
634	tir.PRIM_type = T_INFO_REQ;
635	mp = soallocproto1(&tir, sizeof (tir),
636	    sizeof (struct T_info_req) + sizeof (struct T_info_ack),
637	    _ALLOC_INTR);
638	if (mp == NULL) {
639		eprintsoline(so, ENOBUFS);
640		return (ENOBUFS);
641	}
642	/* T_INFO_REQ has to be M_PCPROTO */
643	DB_TYPE(mp) = M_PCPROTO;
644
645	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
646	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
647	if (error) {
648		eprintsoline(so, error);
649		return (error);
650	}
651	mutex_enter(&so->so_lock);
652	/* Wait for T_INFO_ACK */
653	if ((error = sowaitprim(so, T_INFO_REQ, T_INFO_ACK,
654	    (t_uscalar_t)sizeof (struct T_info_ack), &mp, 0))) {
655		mutex_exit(&so->so_lock);
656		eprintsoline(so, error);
657		return (error);
658	}
659
660	ASSERT(mp);
661	copy_tinfo(so, (struct T_info_ack *)mp->b_rptr);
662	mutex_exit(&so->so_lock);
663	freemsg(mp);
664	return (check_tinfo(so));
665}
666
667/*
668 * Send down T_capability_req and wait for the ack.
669 * Record interesting T_capability_ack values in the sonode.
670 */
671static int
672do_tcapability(struct sonode *so, t_uscalar_t cap_bits1)
673{
674	struct T_capability_req tcr;
675	struct T_capability_ack *tca;
676	mblk_t *mp;
677	int error;
678	sotpi_info_t *sti = SOTOTPI(so);
679
680	ASSERT(cap_bits1 != 0);
681	ASSERT((cap_bits1 & ~(TC1_ACCEPTOR_ID | TC1_INFO)) == 0);
682	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
683
684	if (sti->sti_provinfo->tpi_capability == PI_NO)
685		return (do_tinfo(so));
686
687	if (so_no_tinfo) {
688		sti->sti_addr_size = 0;
689		if ((cap_bits1 &= ~TC1_INFO) == 0)
690			return (0);
691	}
692
693	dprintso(so, 1, ("do_tcapability(%p)\n", (void *)so));
694
695	/* Send T_CAPABILITY_REQ */
696	tcr.PRIM_type = T_CAPABILITY_REQ;
697	tcr.CAP_bits1 = cap_bits1;
698	mp = soallocproto1(&tcr, sizeof (tcr),
699	    sizeof (struct T_capability_req) + sizeof (struct T_capability_ack),
700	    _ALLOC_INTR);
701	if (mp == NULL) {
702		eprintsoline(so, ENOBUFS);
703		return (ENOBUFS);
704	}
705	/* T_CAPABILITY_REQ should be M_PCPROTO here */
706	DB_TYPE(mp) = M_PCPROTO;
707
708	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
709	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
710	if (error) {
711		eprintsoline(so, error);
712		return (error);
713	}
714	mutex_enter(&so->so_lock);
715	/* Wait for T_CAPABILITY_ACK */
716	if ((error = sowaitprim(so, T_CAPABILITY_REQ, T_CAPABILITY_ACK,
717	    (t_uscalar_t)sizeof (*tca), &mp, sock_capability_timeout * hz))) {
718		mutex_exit(&so->so_lock);
719		PI_PROVLOCK(sti->sti_provinfo);
720		if (sti->sti_provinfo->tpi_capability == PI_DONTKNOW)
721			sti->sti_provinfo->tpi_capability = PI_NO;
722		PI_PROVUNLOCK(sti->sti_provinfo);
723		ASSERT((so->so_mode & SM_ACCEPTOR_ID) == 0);
724		if (cap_bits1 & TC1_INFO) {
725			/*
726			 * If the T_CAPABILITY_REQ timed out and then a
727			 * T_INFO_REQ gets a protocol error, most likely
728			 * the capability was slow (vs. unsupported). Return
729			 * ENOSR for this case as a best guess.
730			 */
731			if (error == ETIME) {
732				return ((error = do_tinfo(so)) == EPROTO ?
733				    ENOSR : error);
734			}
735			return (do_tinfo(so));
736		}
737		return (0);
738	}
739
740	ASSERT(mp);
741	tca = (struct T_capability_ack *)mp->b_rptr;
742
743	ASSERT((cap_bits1 & TC1_INFO) == (tca->CAP_bits1 & TC1_INFO));
744	so_proc_tcapability_ack(so, tca);
745
746	cap_bits1 = tca->CAP_bits1;
747
748	mutex_exit(&so->so_lock);
749	freemsg(mp);
750
751	if (cap_bits1 & TC1_INFO)
752		return (check_tinfo(so));
753
754	return (0);
755}
756
757/*
758 * Process a T_CAPABILITY_ACK
759 */
760void
761so_proc_tcapability_ack(struct sonode *so, struct T_capability_ack *tca)
762{
763	sotpi_info_t *sti = SOTOTPI(so);
764
765	if (sti->sti_provinfo->tpi_capability == PI_DONTKNOW) {
766		PI_PROVLOCK(sti->sti_provinfo);
767		sti->sti_provinfo->tpi_capability = PI_YES;
768		PI_PROVUNLOCK(sti->sti_provinfo);
769	}
770
771	if (tca->CAP_bits1 & TC1_ACCEPTOR_ID) {
772		sti->sti_acceptor_id = tca->ACCEPTOR_id;
773		so->so_mode |= SM_ACCEPTOR_ID;
774	}
775
776	if (tca->CAP_bits1 & TC1_INFO)
777		copy_tinfo(so, &tca->INFO_ack);
778}
779
780/*
781 * Retrieve socket error, clear error if not peek.
782 */
783int
784sogeterr(struct sonode *so, boolean_t clear_err)
785{
786	int error;
787
788	ASSERT(MUTEX_HELD(&so->so_lock));
789
790	error = so->so_error;
791	if (clear_err)
792		so->so_error = 0;
793
794	return (error);
795}
796
797/*
798 * This routine is registered with the stream head to retrieve read
799 * side errors.
800 * It does not clear the socket error for a peeking read side operation.
801 * It the error is to be cleared it sets *clearerr.
802 */
803int
804sogetrderr(vnode_t *vp, int ispeek, int *clearerr)
805{
806	struct sonode *so = VTOSO(vp);
807	int error;
808
809	mutex_enter(&so->so_lock);
810	if (ispeek) {
811		error = so->so_error;
812		*clearerr = 0;
813	} else {
814		error = so->so_error;
815		so->so_error = 0;
816		*clearerr = 1;
817	}
818	mutex_exit(&so->so_lock);
819	return (error);
820}
821
822/*
823 * This routine is registered with the stream head to retrieve write
824 * side errors.
825 * It does not clear the socket error for a peeking read side operation.
826 * It the error is to be cleared it sets *clearerr.
827 */
828int
829sogetwrerr(vnode_t *vp, int ispeek, int *clearerr)
830{
831	struct sonode *so = VTOSO(vp);
832	int error;
833
834	mutex_enter(&so->so_lock);
835	if (so->so_state & SS_CANTSENDMORE) {
836		error = EPIPE;
837		*clearerr = 0;
838	} else {
839		error = so->so_error;
840		if (ispeek) {
841			*clearerr = 0;
842		} else {
843			so->so_error = 0;
844			*clearerr = 1;
845		}
846	}
847	mutex_exit(&so->so_lock);
848	return (error);
849}
850
851/*
852 * Set a nonpersistent read and write error on the socket.
853 * Used when there is a T_uderror_ind for a connected socket.
854 * The caller also needs to call strsetrerror and strsetwerror
855 * after dropping the lock.
856 */
857void
858soseterror(struct sonode *so, int error)
859{
860	ASSERT(error != 0);
861
862	ASSERT(MUTEX_HELD(&so->so_lock));
863	so->so_error = (ushort_t)error;
864}
865
866void
867soisconnecting(struct sonode *so)
868{
869	ASSERT(MUTEX_HELD(&so->so_lock));
870	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
871	so->so_state |= SS_ISCONNECTING;
872	cv_broadcast(&so->so_state_cv);
873}
874
875void
876soisconnected(struct sonode *so)
877{
878	ASSERT(MUTEX_HELD(&so->so_lock));
879	so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING);
880	so->so_state |= SS_ISCONNECTED;
881	cv_broadcast(&so->so_state_cv);
882}
883
884/*
885 * The caller also needs to call strsetrerror, strsetwerror and strseteof.
886 */
887void
888soisdisconnected(struct sonode *so, int error)
889{
890	ASSERT(MUTEX_HELD(&so->so_lock));
891	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
892	so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE);
893	so->so_error = (ushort_t)error;
894	if (so->so_peercred != NULL) {
895		crfree(so->so_peercred);
896		so->so_peercred = NULL;
897	}
898	cv_broadcast(&so->so_state_cv);
899}
900
901/*
902 * For connected AF_UNIX SOCK_DGRAM sockets when the peer closes.
903 * Does not affect write side.
904 * The caller also has to call strsetrerror.
905 */
906static void
907sobreakconn(struct sonode *so, int error)
908{
909	ASSERT(MUTEX_HELD(&so->so_lock));
910	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
911	so->so_error = (ushort_t)error;
912	cv_broadcast(&so->so_state_cv);
913}
914
915/*
916 * Can no longer send.
917 * Caller must also call strsetwerror.
918 *
919 * We mark the peer address as no longer valid for getpeername, but
920 * leave it around for so_unix_close to notify the peer (that
921 * transport has no addressing held at that layer).
922 */
923void
924socantsendmore(struct sonode *so)
925{
926	ASSERT(MUTEX_HELD(&so->so_lock));
927	so->so_state |= SS_CANTSENDMORE;
928	cv_broadcast(&so->so_state_cv);
929}
930
931/*
932 * The caller must call strseteof(,1) as well as this routine
933 * to change the socket state.
934 */
935void
936socantrcvmore(struct sonode *so)
937{
938	ASSERT(MUTEX_HELD(&so->so_lock));
939	so->so_state |= SS_CANTRCVMORE;
940	cv_broadcast(&so->so_state_cv);
941}
942
943/*
944 * The caller has sent down a "request_prim" primitive and wants to wait for
945 * an ack ("ack_prim") or an T_ERROR_ACK for it.
946 * The specified "ack_prim" can be a T_OK_ACK.
947 *
948 * Assumes that all the TPI acks are M_PCPROTO messages.
949 *
950 * Note that the socket is single-threaded (using so_lock_single)
951 * for all operations that generate TPI ack messages. Since
952 * only TPI ack messages are M_PCPROTO we should never receive
953 * anything except either the ack we are expecting or a T_ERROR_ACK
954 * for the same primitive.
955 */
956int
957sowaitprim(struct sonode *so, t_scalar_t request_prim, t_scalar_t ack_prim,
958	    t_uscalar_t min_size, mblk_t **mpp, clock_t wait)
959{
960	mblk_t *mp;
961	union T_primitives *tpr;
962	int error;
963
964	dprintso(so, 1, ("sowaitprim(%p, %d, %d, %d, %p, %lu)\n",
965	    (void *)so, request_prim, ack_prim, min_size, (void *)mpp, wait));
966
967	ASSERT(MUTEX_HELD(&so->so_lock));
968
969	error = sowaitack(so, &mp, wait);
970	if (error)
971		return (error);
972
973	dprintso(so, 1, ("got msg %p\n", (void *)mp));
974	if (DB_TYPE(mp) != M_PCPROTO ||
975	    MBLKL(mp) < sizeof (tpr->type)) {
976		freemsg(mp);
977		eprintsoline(so, EPROTO);
978		return (EPROTO);
979	}
980	tpr = (union T_primitives *)mp->b_rptr;
981	/*
982	 * Did we get the primitive that we were asking for?
983	 * For T_OK_ACK we also check that it matches the request primitive.
984	 */
985	if (tpr->type == ack_prim &&
986	    (ack_prim != T_OK_ACK ||
987	    tpr->ok_ack.CORRECT_prim == request_prim)) {
988		if (MBLKL(mp) >= (ssize_t)min_size) {
989			/* Found what we are looking for */
990			*mpp = mp;
991			return (0);
992		}
993		/* Too short */
994		freemsg(mp);
995		eprintsoline(so, EPROTO);
996		return (EPROTO);
997	}
998
999	if (tpr->type == T_ERROR_ACK &&
1000	    tpr->error_ack.ERROR_prim == request_prim) {
1001		/* Error to the primitive we were looking for */
1002		if (tpr->error_ack.TLI_error == TSYSERR) {
1003			error = tpr->error_ack.UNIX_error;
1004		} else {
1005			error = proto_tlitosyserr(tpr->error_ack.TLI_error);
1006		}
1007		dprintso(so, 0, ("error_ack for %d: %d/%d ->%d\n",
1008		    tpr->error_ack.ERROR_prim, tpr->error_ack.TLI_error,
1009		    tpr->error_ack.UNIX_error, error));
1010		freemsg(mp);
1011		return (error);
1012	}
1013	/*
1014	 * Wrong primitive or T_ERROR_ACK for the wrong primitive
1015	 */
1016#ifdef DEBUG
1017	if (tpr->type == T_ERROR_ACK) {
1018		dprintso(so, 0, ("error_ack for %d: %d/%d\n",
1019		    tpr->error_ack.ERROR_prim, tpr->error_ack.TLI_error,
1020		    tpr->error_ack.UNIX_error));
1021	} else if (tpr->type == T_OK_ACK) {
1022		dprintso(so, 0, ("ok_ack for %d, expected %d for %d\n",
1023		    tpr->ok_ack.CORRECT_prim, ack_prim, request_prim));
1024	} else {
1025		dprintso(so, 0,
1026		    ("unexpected primitive %d, expected %d for %d\n",
1027		    tpr->type, ack_prim, request_prim));
1028	}
1029#endif /* DEBUG */
1030
1031	freemsg(mp);
1032	eprintsoline(so, EPROTO);
1033	return (EPROTO);
1034}
1035
1036/*
1037 * Wait for a T_OK_ACK for the specified primitive.
1038 */
1039int
1040sowaitokack(struct sonode *so, t_scalar_t request_prim)
1041{
1042	mblk_t *mp;
1043	int error;
1044
1045	error = sowaitprim(so, request_prim, T_OK_ACK,
1046	    (t_uscalar_t)sizeof (struct T_ok_ack), &mp, 0);
1047	if (error)
1048		return (error);
1049	freemsg(mp);
1050	return (0);
1051}
1052
1053/*
1054 * Queue a received TPI ack message on sti_ack_mp.
1055 */
1056void
1057soqueueack(struct sonode *so, mblk_t *mp)
1058{
1059	sotpi_info_t *sti = SOTOTPI(so);
1060
1061	if (DB_TYPE(mp) != M_PCPROTO) {
1062		zcmn_err(getzoneid(), CE_WARN,
1063		    "sockfs: received unexpected M_PROTO TPI ack. Prim %d\n",
1064		    *(t_scalar_t *)mp->b_rptr);
1065		freemsg(mp);
1066		return;
1067	}
1068
1069	mutex_enter(&so->so_lock);
1070	if (sti->sti_ack_mp != NULL) {
1071		dprintso(so, 1, ("sti_ack_mp already set\n"));
1072		freemsg(sti->sti_ack_mp);
1073		sti->sti_ack_mp = NULL;
1074	}
1075	sti->sti_ack_mp = mp;
1076	cv_broadcast(&sti->sti_ack_cv);
1077	mutex_exit(&so->so_lock);
1078}
1079
1080/*
1081 * Wait for a TPI ack ignoring signals and errors.
1082 */
1083int
1084sowaitack(struct sonode *so, mblk_t **mpp, clock_t wait)
1085{
1086	sotpi_info_t *sti = SOTOTPI(so);
1087
1088	ASSERT(MUTEX_HELD(&so->so_lock));
1089
1090	while (sti->sti_ack_mp == NULL) {
1091#ifdef SOCK_TEST
1092		if (wait == 0 && sock_test_timelimit != 0)
1093			wait = sock_test_timelimit;
1094#endif
1095		if (wait != 0) {
1096			/*
1097			 * Only wait for the time limit.
1098			 */
1099			clock_t now;
1100
1101			time_to_wait(&now, wait);
1102			if (cv_timedwait(&sti->sti_ack_cv, &so->so_lock,
1103			    now) == -1) {
1104				eprintsoline(so, ETIME);
1105				return (ETIME);
1106			}
1107		}
1108		else
1109			cv_wait(&sti->sti_ack_cv, &so->so_lock);
1110	}
1111	*mpp = sti->sti_ack_mp;
1112#ifdef DEBUG
1113	{
1114		union T_primitives *tpr;
1115		mblk_t *mp = *mpp;
1116
1117		tpr = (union T_primitives *)mp->b_rptr;
1118		ASSERT(DB_TYPE(mp) == M_PCPROTO);
1119		ASSERT(tpr->type == T_OK_ACK ||
1120		    tpr->type == T_ERROR_ACK ||
1121		    tpr->type == T_BIND_ACK ||
1122		    tpr->type == T_CAPABILITY_ACK ||
1123		    tpr->type == T_INFO_ACK ||
1124		    tpr->type == T_OPTMGMT_ACK);
1125	}
1126#endif /* DEBUG */
1127	sti->sti_ack_mp = NULL;
1128	return (0);
1129}
1130
1131/*
1132 * Queue a received T_CONN_IND message on sti_conn_ind_head/tail.
1133 */
1134void
1135soqueueconnind(struct sonode *so, mblk_t *mp)
1136{
1137	sotpi_info_t *sti = SOTOTPI(so);
1138
1139	if (DB_TYPE(mp) != M_PROTO) {
1140		zcmn_err(getzoneid(), CE_WARN,
1141		    "sockfs: received unexpected M_PCPROTO T_CONN_IND\n");
1142		freemsg(mp);
1143		return;
1144	}
1145
1146	mutex_enter(&so->so_lock);
1147	ASSERT(mp->b_next == NULL);
1148	if (sti->sti_conn_ind_head == NULL) {
1149		sti->sti_conn_ind_head = mp;
1150	} else {
1151		ASSERT(sti->sti_conn_ind_tail->b_next == NULL);
1152		sti->sti_conn_ind_tail->b_next = mp;
1153	}
1154	sti->sti_conn_ind_tail = mp;
1155	/* Wakeup a single consumer of the T_CONN_IND */
1156	cv_signal(&so->so_acceptq_cv);
1157	mutex_exit(&so->so_lock);
1158}
1159
1160/*
1161 * Wait for a T_CONN_IND.
1162 * Don't wait if nonblocking.
1163 * Accept signals and socket errors.
1164 */
1165int
1166sowaitconnind(struct sonode *so, int fmode, mblk_t **mpp)
1167{
1168	mblk_t *mp;
1169	sotpi_info_t *sti = SOTOTPI(so);
1170	int error = 0;
1171
1172	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1173	mutex_enter(&so->so_lock);
1174check_error:
1175	if (so->so_error) {
1176		error = sogeterr(so, B_TRUE);
1177		if (error) {
1178			mutex_exit(&so->so_lock);
1179			return (error);
1180		}
1181	}
1182
1183	if (sti->sti_conn_ind_head == NULL) {
1184		if (fmode & (FNDELAY|FNONBLOCK)) {
1185			error = EWOULDBLOCK;
1186			goto done;
1187		}
1188
1189		if (so->so_state & SS_CLOSING) {
1190			error = EINTR;
1191			goto done;
1192		}
1193
1194		if (!cv_wait_sig_swap(&so->so_acceptq_cv, &so->so_lock)) {
1195			error = EINTR;
1196			goto done;
1197		}
1198		goto check_error;
1199	}
1200	mp = sti->sti_conn_ind_head;
1201	sti->sti_conn_ind_head = mp->b_next;
1202	mp->b_next = NULL;
1203	if (sti->sti_conn_ind_head == NULL) {
1204		ASSERT(sti->sti_conn_ind_tail == mp);
1205		sti->sti_conn_ind_tail = NULL;
1206	}
1207	*mpp = mp;
1208done:
1209	mutex_exit(&so->so_lock);
1210	return (error);
1211}
1212
1213/*
1214 * Flush a T_CONN_IND matching the sequence number from the list.
1215 * Return zero if found; non-zero otherwise.
1216 * This is called very infrequently thus it is ok to do a linear search.
1217 */
1218int
1219soflushconnind(struct sonode *so, t_scalar_t seqno)
1220{
1221	mblk_t *prevmp, *mp;
1222	struct T_conn_ind *tci;
1223	sotpi_info_t *sti = SOTOTPI(so);
1224
1225	mutex_enter(&so->so_lock);
1226	for (prevmp = NULL, mp = sti->sti_conn_ind_head; mp != NULL;
1227	    prevmp = mp, mp = mp->b_next) {
1228		tci = (struct T_conn_ind *)mp->b_rptr;
1229		if (tci->SEQ_number == seqno) {
1230			dprintso(so, 1,
1231			    ("t_discon_ind: found T_CONN_IND %d\n", seqno));
1232			/* Deleting last? */
1233			if (sti->sti_conn_ind_tail == mp) {
1234				sti->sti_conn_ind_tail = prevmp;
1235			}
1236			if (prevmp == NULL) {
1237				/* Deleting first */
1238				sti->sti_conn_ind_head = mp->b_next;
1239			} else {
1240				prevmp->b_next = mp->b_next;
1241			}
1242			mp->b_next = NULL;
1243
1244			ASSERT((sti->sti_conn_ind_head == NULL &&
1245			    sti->sti_conn_ind_tail == NULL) ||
1246			    (sti->sti_conn_ind_head != NULL &&
1247			    sti->sti_conn_ind_tail != NULL));
1248
1249			so->so_error = ECONNABORTED;
1250			mutex_exit(&so->so_lock);
1251
1252			/*
1253			 * T_KSSL_PROXY_CONN_IND may carry a handle for
1254			 * an SSL context, and needs to be released.
1255			 */
1256			if ((tci->PRIM_type == T_SSL_PROXY_CONN_IND) &&
1257			    (mp->b_cont != NULL)) {
1258				kssl_ctx_t kssl_ctx;
1259
1260				ASSERT(MBLKL(mp->b_cont) ==
1261				    sizeof (kssl_ctx_t));
1262				kssl_ctx = *((kssl_ctx_t *)mp->b_cont->b_rptr);
1263				kssl_release_ctx(kssl_ctx);
1264			}
1265			freemsg(mp);
1266			return (0);
1267		}
1268	}
1269	mutex_exit(&so->so_lock);
1270	dprintso(so, 1,	("t_discon_ind: NOT found T_CONN_IND %d\n", seqno));
1271	return (-1);
1272}
1273
1274/*
1275 * Wait until the socket is connected or there is an error.
1276 * fmode should contain any nonblocking flags. nosig should be
1277 * set if the caller does not want the wait to be interrupted by a signal.
1278 */
1279int
1280sowaitconnected(struct sonode *so, int fmode, int nosig)
1281{
1282	int error;
1283
1284	ASSERT(MUTEX_HELD(&so->so_lock));
1285
1286	while ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) ==
1287	    SS_ISCONNECTING && so->so_error == 0) {
1288
1289		dprintso(so, 1, ("waiting for SS_ISCONNECTED on %p\n",
1290		    (void *)so));
1291		if (fmode & (FNDELAY|FNONBLOCK))
1292			return (EINPROGRESS);
1293
1294		if (so->so_state & SS_CLOSING)
1295			return (EINTR);
1296
1297		if (nosig)
1298			cv_wait(&so->so_state_cv, &so->so_lock);
1299		else if (!cv_wait_sig_swap(&so->so_state_cv, &so->so_lock)) {
1300			/*
1301			 * Return EINTR and let the application use
1302			 * nonblocking techniques for detecting when
1303			 * the connection has been established.
1304			 */
1305			return (EINTR);
1306		}
1307		dprintso(so, 1, ("awoken on %p\n", (void *)so));
1308	}
1309
1310	if (so->so_error != 0) {
1311		error = sogeterr(so, B_TRUE);
1312		ASSERT(error != 0);
1313		dprintso(so, 1, ("sowaitconnected: error %d\n", error));
1314		return (error);
1315	}
1316	if (!(so->so_state & SS_ISCONNECTED)) {
1317		/*
1318		 * Could have received a T_ORDREL_IND or a T_DISCON_IND with
1319		 * zero errno. Or another thread could have consumed so_error
1320		 * e.g. by calling read.
1321		 */
1322		error = ECONNREFUSED;
1323		dprintso(so, 1, ("sowaitconnected: error %d\n", error));
1324		return (error);
1325	}
1326	return (0);
1327}
1328
1329
1330/*
1331 * Handle the signal generation aspect of urgent data.
1332 */
1333static void
1334so_oob_sig(struct sonode *so, int extrasig,
1335    strsigset_t *signals, strpollset_t *pollwakeups)
1336{
1337	sotpi_info_t *sti = SOTOTPI(so);
1338
1339	ASSERT(MUTEX_HELD(&so->so_lock));
1340
1341	ASSERT(so_verify_oobstate(so));
1342	ASSERT(sti->sti_oobsigcnt >= sti->sti_oobcnt);
1343	if (sti->sti_oobsigcnt > sti->sti_oobcnt) {
1344		/*
1345		 * Signal has already been generated once for this
1346		 * urgent "event". However, since TCP can receive updated
1347		 * urgent pointers we still generate a signal.
1348		 */
1349		ASSERT(so->so_state & SS_OOBPEND);
1350		if (extrasig) {
1351			*signals |= S_RDBAND;
1352			*pollwakeups |= POLLRDBAND;
1353		}
1354		return;
1355	}
1356
1357	sti->sti_oobsigcnt++;
1358	ASSERT(sti->sti_oobsigcnt > 0);	/* Wraparound */
1359	ASSERT(sti->sti_oobsigcnt > sti->sti_oobcnt);
1360
1361	/*
1362	 * Record (for select/poll) that urgent data is pending.
1363	 */
1364	so->so_state |= SS_OOBPEND;
1365	/*
1366	 * New urgent data on the way so forget about any old
1367	 * urgent data.
1368	 */
1369	so->so_state &= ~(SS_HAVEOOBDATA|SS_HADOOBDATA);
1370	if (so->so_oobmsg != NULL) {
1371		dprintso(so, 1, ("sock: discarding old oob\n"));
1372		freemsg(so->so_oobmsg);
1373		so->so_oobmsg = NULL;
1374	}
1375	*signals |= S_RDBAND;
1376	*pollwakeups |= POLLRDBAND;
1377	ASSERT(so_verify_oobstate(so));
1378}
1379
1380/*
1381 * Handle the processing of the T_EXDATA_IND with urgent data.
1382 * Returns the T_EXDATA_IND if it should be queued on the read queue.
1383 */
1384/* ARGSUSED2 */
1385static mblk_t *
1386so_oob_exdata(struct sonode *so, mblk_t *mp,
1387	strsigset_t *signals, strpollset_t *pollwakeups)
1388{
1389	sotpi_info_t *sti = SOTOTPI(so);
1390
1391	ASSERT(MUTEX_HELD(&so->so_lock));
1392
1393	ASSERT(so_verify_oobstate(so));
1394
1395	ASSERT(sti->sti_oobsigcnt > sti->sti_oobcnt);
1396
1397	sti->sti_oobcnt++;
1398	ASSERT(sti->sti_oobcnt > 0);	/* wraparound? */
1399	ASSERT(sti->sti_oobsigcnt >= sti->sti_oobcnt);
1400
1401	/*
1402	 * Set MSGMARK for SIOCATMARK.
1403	 */
1404	mp->b_flag |= MSGMARK;
1405
1406	ASSERT(so_verify_oobstate(so));
1407	return (mp);
1408}
1409
1410/*
1411 * Handle the processing of the actual urgent data.
1412 * Returns the data mblk if it should be queued on the read queue.
1413 */
1414static mblk_t *
1415so_oob_data(struct sonode *so, mblk_t *mp,
1416	strsigset_t *signals, strpollset_t *pollwakeups)
1417{
1418	sotpi_info_t *sti = SOTOTPI(so);
1419
1420	ASSERT(MUTEX_HELD(&so->so_lock));
1421
1422	ASSERT(so_verify_oobstate(so));
1423
1424	ASSERT(sti->sti_oobsigcnt >= sti->sti_oobcnt);
1425	ASSERT(mp != NULL);
1426	/*
1427	 * For OOBINLINE we keep the data in the T_EXDATA_IND.
1428	 * Otherwise we store it in so_oobmsg.
1429	 */
1430	ASSERT(so->so_oobmsg == NULL);
1431	if (so->so_options & SO_OOBINLINE) {
1432		*pollwakeups |= POLLIN | POLLRDNORM | POLLRDBAND;
1433		*signals |= S_INPUT | S_RDNORM;
1434	} else {
1435		*pollwakeups |= POLLRDBAND;
1436		so->so_state |= SS_HAVEOOBDATA;
1437		so->so_oobmsg = mp;
1438		mp = NULL;
1439	}
1440	ASSERT(so_verify_oobstate(so));
1441	return (mp);
1442}
1443
1444/*
1445 * Caller must hold the mutex.
1446 * For delayed processing, save the T_DISCON_IND received
1447 * from below on sti_discon_ind_mp.
1448 * When the message is processed the framework will call:
1449 *      (*func)(so, mp);
1450 */
1451static void
1452so_save_discon_ind(struct sonode *so,
1453	mblk_t *mp,
1454	void (*func)(struct sonode *so, mblk_t *))
1455{
1456	sotpi_info_t *sti = SOTOTPI(so);
1457
1458	ASSERT(MUTEX_HELD(&so->so_lock));
1459
1460	/*
1461	 * Discard new T_DISCON_IND if we have already received another.
1462	 * Currently the earlier message can either be on sti_discon_ind_mp
1463	 * or being processed.
1464	 */
1465	if (sti->sti_discon_ind_mp != NULL || (so->so_flag & SOASYNC_UNBIND)) {
1466		zcmn_err(getzoneid(), CE_WARN,
1467		    "sockfs: received unexpected additional T_DISCON_IND\n");
1468		freemsg(mp);
1469		return;
1470	}
1471	mp->b_prev = (mblk_t *)func;
1472	mp->b_next = NULL;
1473	sti->sti_discon_ind_mp = mp;
1474}
1475
1476/*
1477 * Caller must hold the mutex and make sure that either SOLOCKED
1478 * or SOASYNC_UNBIND is set. Called from so_unlock_single().
1479 * Perform delayed processing of T_DISCON_IND message on sti_discon_ind_mp.
1480 * Need to ensure that strsock_proto() will not end up sleeping for
1481 * SOASYNC_UNBIND, while executing this function.
1482 */
1483void
1484so_drain_discon_ind(struct sonode *so)
1485{
1486	mblk_t	*bp;
1487	void (*func)(struct sonode *so, mblk_t *);
1488	sotpi_info_t *sti = SOTOTPI(so);
1489
1490	ASSERT(MUTEX_HELD(&so->so_lock));
1491	ASSERT(so->so_flag & (SOLOCKED|SOASYNC_UNBIND));
1492
1493	/* Process T_DISCON_IND on sti_discon_ind_mp */
1494	if ((bp = sti->sti_discon_ind_mp) != NULL) {
1495		sti->sti_discon_ind_mp = NULL;
1496		func = (void (*)())bp->b_prev;
1497		bp->b_prev = NULL;
1498
1499		/*
1500		 * This (*func) is supposed to generate a message downstream
1501		 * and we need to have a flag set until the corresponding
1502		 * upstream message reaches stream head.
1503		 * When processing T_DISCON_IND in strsock_discon_ind
1504		 * we hold SOASYN_UNBIND when sending T_UNBIND_REQ down and
1505		 * drop the flag after we get the ACK in strsock_proto.
1506		 */
1507		(void) (*func)(so, bp);
1508	}
1509}
1510
1511/*
1512 * Caller must hold the mutex.
1513 * Remove the T_DISCON_IND on sti_discon_ind_mp.
1514 */
1515void
1516so_flush_discon_ind(struct sonode *so)
1517{
1518	mblk_t	*bp;
1519	sotpi_info_t *sti = SOTOTPI(so);
1520
1521	ASSERT(MUTEX_HELD(&so->so_lock));
1522
1523	/*
1524	 * Remove T_DISCON_IND mblk at sti_discon_ind_mp.
1525	 */
1526	if ((bp = sti->sti_discon_ind_mp) != NULL) {
1527		sti->sti_discon_ind_mp = NULL;
1528		bp->b_prev = NULL;
1529		freemsg(bp);
1530	}
1531}
1532
1533/*
1534 * Caller must hold the mutex.
1535 *
1536 * This function is used to process the T_DISCON_IND message. It does
1537 * immediate processing when called from strsock_proto and delayed
1538 * processing of discon_ind saved on sti_discon_ind_mp when called from
1539 * so_drain_discon_ind. When a T_DISCON_IND message is saved in
1540 * sti_discon_ind_mp for delayed processing, this function is registered
1541 * as the callback function to process the message.
1542 *
1543 * SOASYNC_UNBIND should be held in this function, during the non-blocking
1544 * unbind operation, and should be released only after we receive the ACK
1545 * in strsock_proto, for the T_UNBIND_REQ sent here. Since SOLOCKED is not set,
1546 * no TPI messages would be sent down at this time. This is to prevent M_FLUSH
1547 * sent from either this function or tcp_unbind(), flushing away any TPI
1548 * message that is being sent down and stays in a lower module's queue.
1549 *
1550 * This function drops so_lock and grabs it again.
1551 */
1552static void
1553strsock_discon_ind(struct sonode *so, mblk_t *discon_mp)
1554{
1555	struct vnode *vp;
1556	struct stdata *stp;
1557	union T_primitives *tpr;
1558	struct T_unbind_req *ubr;
1559	mblk_t *mp;
1560	int error;
1561	sotpi_info_t *sti = SOTOTPI(so);
1562
1563	ASSERT(MUTEX_HELD(&so->so_lock));
1564	ASSERT(discon_mp);
1565	ASSERT(discon_mp->b_rptr);
1566
1567	tpr = (union T_primitives *)discon_mp->b_rptr;
1568	ASSERT(tpr->type == T_DISCON_IND);
1569
1570	vp = SOTOV(so);
1571	stp = vp->v_stream;
1572	ASSERT(stp);
1573
1574	/*
1575	 * Not a listener
1576	 */
1577	ASSERT((so->so_state & SS_ACCEPTCONN) == 0);
1578
1579	/*
1580	 * This assumes that the name space for DISCON_reason
1581	 * is the errno name space.
1582	 */
1583	soisdisconnected(so, tpr->discon_ind.DISCON_reason);
1584	sti->sti_laddr_valid = 0;
1585	sti->sti_faddr_valid = 0;
1586
1587	/*
1588	 * Unbind with the transport without blocking.
1589	 * If we've already received a T_DISCON_IND do not unbind.
1590	 *
1591	 * If there is no preallocated unbind message, we have already
1592	 * unbound with the transport
1593	 *
1594	 * If the socket is not bound, no need to unbind.
1595	 */
1596	mp = sti->sti_unbind_mp;
1597	if (mp == NULL) {
1598		ASSERT(!(so->so_state & SS_ISBOUND));
1599		mutex_exit(&so->so_lock);
1600	} else if (!(so->so_state & SS_ISBOUND))  {
1601		mutex_exit(&so->so_lock);
1602	} else {
1603		sti->sti_unbind_mp = NULL;
1604
1605		/*
1606		 * Is another T_DISCON_IND being processed.
1607		 */
1608		ASSERT((so->so_flag & SOASYNC_UNBIND) == 0);
1609
1610		/*
1611		 * Make strsock_proto ignore T_OK_ACK and T_ERROR_ACK for
1612		 * this unbind. Set SOASYNC_UNBIND. This should be cleared
1613		 * only after we receive the ACK in strsock_proto.
1614		 */
1615		so->so_flag |= SOASYNC_UNBIND;
1616		ASSERT(!(so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)));
1617		so->so_state &= ~(SS_ISBOUND|SS_ACCEPTCONN);
1618		sti->sti_laddr_valid = 0;
1619		mutex_exit(&so->so_lock);
1620
1621		/*
1622		 * Send down T_UNBIND_REQ ignoring flow control.
1623		 * XXX Assumes that MSG_IGNFLOW implies that this thread
1624		 * does not run service procedures.
1625		 */
1626		ASSERT(DB_TYPE(mp) == M_PROTO);
1627		ubr = (struct T_unbind_req *)mp->b_rptr;
1628		mp->b_wptr += sizeof (*ubr);
1629		ubr->PRIM_type = T_UNBIND_REQ;
1630
1631		/*
1632		 * Flush the read and write side (except stream head read queue)
1633		 * and send down T_UNBIND_REQ.
1634		 */
1635		(void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHRW);
1636		error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1637		    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
1638		/* LINTED - warning: statement has no consequent: if */
1639		if (error) {
1640			eprintsoline(so, error);
1641		}
1642	}
1643
1644	if (tpr->discon_ind.DISCON_reason != 0)
1645		strsetrerror(SOTOV(so), 0, 0, sogetrderr);
1646	strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
1647	strseteof(SOTOV(so), 1);
1648	/*
1649	 * strseteof takes care of read side wakeups,
1650	 * pollwakeups, and signals.
1651	 */
1652	dprintso(so, 1, ("T_DISCON_IND: error %d\n", so->so_error));
1653	freemsg(discon_mp);
1654
1655
1656	pollwakeup(&stp->sd_pollist, POLLOUT);
1657	mutex_enter(&stp->sd_lock);
1658
1659	/*
1660	 * Wake sleeping write
1661	 */
1662	if (stp->sd_flag & WSLEEP) {
1663		stp->sd_flag &= ~WSLEEP;
1664		cv_broadcast(&stp->sd_wrq->q_wait);
1665	}
1666
1667	/*
1668	 * strsendsig can handle multiple signals with a
1669	 * single call.  Send SIGPOLL for S_OUTPUT event.
1670	 */
1671	if (stp->sd_sigflags & S_OUTPUT)
1672		strsendsig(stp->sd_siglist, S_OUTPUT, 0, 0);
1673
1674	mutex_exit(&stp->sd_lock);
1675	mutex_enter(&so->so_lock);
1676}
1677
1678/*
1679 * This routine is registered with the stream head to receive M_PROTO
1680 * and M_PCPROTO messages.
1681 *
1682 * Returns NULL if the message was consumed.
1683 * Returns an mblk to make that mblk be processed (and queued) by the stream
1684 * head.
1685 *
1686 * Sets the return parameters (*wakeups, *firstmsgsigs, *allmsgsigs, and
1687 * *pollwakeups) for the stream head to take action on. Note that since
1688 * sockets always deliver SIGIO for every new piece of data this routine
1689 * never sets *firstmsgsigs; any signals are returned in *allmsgsigs.
1690 *
1691 * This routine handles all data related TPI messages independent of
1692 * the type of the socket i.e. it doesn't care if T_UNITDATA_IND message
1693 * arrive on a SOCK_STREAM.
1694 */
1695static mblk_t *
1696strsock_proto(vnode_t *vp, mblk_t *mp,
1697		strwakeup_t *wakeups, strsigset_t *firstmsgsigs,
1698		strsigset_t *allmsgsigs, strpollset_t *pollwakeups)
1699{
1700	union T_primitives *tpr;
1701	struct sonode *so;
1702	sotpi_info_t *sti;
1703
1704	so = VTOSO(vp);
1705	sti = SOTOTPI(so);
1706
1707	dprintso(so, 1, ("strsock_proto(%p, %p)\n", (void *)vp, (void *)mp));
1708
1709	/* Set default return values */
1710	*firstmsgsigs = *wakeups = *allmsgsigs = *pollwakeups = 0;
1711
1712	ASSERT(DB_TYPE(mp) == M_PROTO ||
1713	    DB_TYPE(mp) == M_PCPROTO);
1714
1715	if (MBLKL(mp) < sizeof (tpr->type)) {
1716		/* The message is too short to even contain the primitive */
1717		zcmn_err(getzoneid(), CE_WARN,
1718		    "sockfs: Too short TPI message received. Len = %ld\n",
1719		    (ptrdiff_t)(MBLKL(mp)));
1720		freemsg(mp);
1721		return (NULL);
1722	}
1723	if (!__TPI_PRIM_ISALIGNED(mp->b_rptr)) {
1724		/* The read pointer is not aligned correctly for TPI */
1725		zcmn_err(getzoneid(), CE_WARN,
1726		    "sockfs: Unaligned TPI message received. rptr = %p\n",
1727		    (void *)mp->b_rptr);
1728		freemsg(mp);
1729		return (NULL);
1730	}
1731	tpr = (union T_primitives *)mp->b_rptr;
1732	dprintso(so, 1, ("strsock_proto: primitive %d\n", tpr->type));
1733
1734	switch (tpr->type) {
1735
1736	case T_DATA_IND:
1737		if (MBLKL(mp) < sizeof (struct T_data_ind)) {
1738			zcmn_err(getzoneid(), CE_WARN,
1739			    "sockfs: Too short T_DATA_IND. Len = %ld\n",
1740			    (ptrdiff_t)(MBLKL(mp)));
1741			freemsg(mp);
1742			return (NULL);
1743		}
1744		/*
1745		 * Ignore zero-length T_DATA_IND messages. These might be
1746		 * generated by some transports.
1747		 * This is needed to prevent read (which skips the M_PROTO
1748		 * part) to unexpectedly return 0 (or return EWOULDBLOCK
1749		 * on a non-blocking socket after select/poll has indicated
1750		 * that data is available).
1751		 */
1752		if (msgdsize(mp->b_cont) == 0) {
1753			dprintso(so, 0,
1754			    ("strsock_proto: zero length T_DATA_IND\n"));
1755			freemsg(mp);
1756			return (NULL);
1757		}
1758		*allmsgsigs = S_INPUT | S_RDNORM;
1759		*pollwakeups = POLLIN | POLLRDNORM;
1760		*wakeups = RSLEEP;
1761		return (mp);
1762
1763	case T_UNITDATA_IND: {
1764		struct T_unitdata_ind	*tudi = &tpr->unitdata_ind;
1765		void			*addr;
1766		t_uscalar_t		addrlen;
1767
1768		if (MBLKL(mp) < sizeof (struct T_unitdata_ind)) {
1769			zcmn_err(getzoneid(), CE_WARN,
1770			    "sockfs: Too short T_UNITDATA_IND. Len = %ld\n",
1771			    (ptrdiff_t)(MBLKL(mp)));
1772			freemsg(mp);
1773			return (NULL);
1774		}
1775
1776		/* Is this is not a connected datagram socket? */
1777		if ((so->so_mode & SM_CONNREQUIRED) ||
1778		    !(so->so_state & SS_ISCONNECTED)) {
1779			/*
1780			 * Not a connected datagram socket. Look for
1781			 * the SO_UNIX_CLOSE option. If such an option is found
1782			 * discard the message (since it has no meaning
1783			 * unless connected).
1784			 */
1785			if (so->so_family == AF_UNIX && msgdsize(mp) == 0 &&
1786			    tudi->OPT_length != 0) {
1787				void *opt;
1788				t_uscalar_t optlen = tudi->OPT_length;
1789
1790				opt = sogetoff(mp, tudi->OPT_offset,
1791				    optlen, __TPI_ALIGN_SIZE);
1792				if (opt == NULL) {
1793					/* The len/off falls outside mp */
1794					freemsg(mp);
1795					mutex_enter(&so->so_lock);
1796					soseterror(so, EPROTO);
1797					mutex_exit(&so->so_lock);
1798					zcmn_err(getzoneid(), CE_WARN,
1799					    "sockfs: T_unidata_ind with "
1800					    "invalid optlen/offset %u/%d\n",
1801					    optlen, tudi->OPT_offset);
1802					return (NULL);
1803				}
1804				if (so_getopt_unix_close(opt, optlen)) {
1805					freemsg(mp);
1806					return (NULL);
1807				}
1808			}
1809			*allmsgsigs = S_INPUT | S_RDNORM;
1810			*pollwakeups = POLLIN | POLLRDNORM;
1811			*wakeups = RSLEEP;
1812			if (audit_active)
1813				audit_sock(T_UNITDATA_IND, strvp2wq(vp),
1814				    mp, 0);
1815			return (mp);
1816		}
1817
1818		/*
1819		 * A connect datagram socket. For AF_INET{,6} we verify that
1820		 * the source address matches the "connected to" address.
1821		 * The semantics of AF_UNIX sockets is to not verify
1822		 * the source address.
1823		 * Note that this source address verification is transport
1824		 * specific. Thus the real fix would be to extent TPI
1825		 * to allow T_CONN_REQ messages to be send to connectionless
1826		 * transport providers and always let the transport provider
1827		 * do whatever filtering is needed.
1828		 *
1829		 * The verification/filtering semantics for transports
1830		 * other than AF_INET and AF_UNIX are unknown. The choice
1831		 * would be to either filter using bcmp or let all messages
1832		 * get through. This code does not filter other address
1833		 * families since this at least allows the application to
1834		 * work around any missing filtering.
1835		 *
1836		 * XXX Should we move filtering to UDP/ICMP???
1837		 * That would require passing e.g. a T_DISCON_REQ to UDP
1838		 * when the socket becomes unconnected.
1839		 */
1840		addrlen = tudi->SRC_length;
1841		/*
1842		 * The alignment restriction is really to strict but
1843		 * we want enough alignment to inspect the fields of
1844		 * a sockaddr_in.
1845		 */
1846		addr = sogetoff(mp, tudi->SRC_offset, addrlen,
1847		    __TPI_ALIGN_SIZE);
1848		if (addr == NULL) {
1849			freemsg(mp);
1850			mutex_enter(&so->so_lock);
1851			soseterror(so, EPROTO);
1852			mutex_exit(&so->so_lock);
1853			zcmn_err(getzoneid(), CE_WARN,
1854			    "sockfs: T_unidata_ind with invalid "
1855			    "addrlen/offset %u/%d\n",
1856			    addrlen, tudi->SRC_offset);
1857			return (NULL);
1858		}
1859
1860		if (so->so_family == AF_INET) {
1861			/*
1862			 * For AF_INET we allow wildcarding both sin_addr
1863			 * and sin_port.
1864			 */
1865			struct sockaddr_in *faddr, *sin;
1866
1867			/* Prevent sti_faddr_sa from changing while accessed */
1868			mutex_enter(&so->so_lock);
1869			ASSERT(sti->sti_faddr_len ==
1870			    (socklen_t)sizeof (struct sockaddr_in));
1871			faddr = (struct sockaddr_in *)sti->sti_faddr_sa;
1872			sin = (struct sockaddr_in *)addr;
1873			if (addrlen !=
1874			    (t_uscalar_t)sizeof (struct sockaddr_in) ||
1875			    (sin->sin_addr.s_addr != faddr->sin_addr.s_addr &&
1876			    faddr->sin_addr.s_addr != INADDR_ANY) ||
1877			    (so->so_type != SOCK_RAW &&
1878			    sin->sin_port != faddr->sin_port &&
1879			    faddr->sin_port != 0)) {
1880#ifdef DEBUG
1881				dprintso(so, 0,
1882				    ("sockfs: T_UNITDATA_IND mismatch: %s",
1883				    pr_addr(so->so_family,
1884				    (struct sockaddr *)addr, addrlen)));
1885				dprintso(so, 0, (" - %s\n",
1886				    pr_addr(so->so_family, sti->sti_faddr_sa,
1887				    (t_uscalar_t)sti->sti_faddr_len)));
1888#endif /* DEBUG */
1889				mutex_exit(&so->so_lock);
1890				freemsg(mp);
1891				return (NULL);
1892			}
1893			mutex_exit(&so->so_lock);
1894		} else if (so->so_family == AF_INET6) {
1895			/*
1896			 * For AF_INET6 we allow wildcarding both sin6_addr
1897			 * and sin6_port.
1898			 */
1899			struct sockaddr_in6 *faddr6, *sin6;
1900			static struct in6_addr zeroes; /* inits to all zeros */
1901
1902			/* Prevent sti_faddr_sa from changing while accessed */
1903			mutex_enter(&so->so_lock);
1904			ASSERT(sti->sti_faddr_len ==
1905			    (socklen_t)sizeof (struct sockaddr_in6));
1906			faddr6 = (struct sockaddr_in6 *)sti->sti_faddr_sa;
1907			sin6 = (struct sockaddr_in6 *)addr;
1908			/* XXX could we get a mapped address ::ffff:0.0.0.0 ? */
1909			if (addrlen !=
1910			    (t_uscalar_t)sizeof (struct sockaddr_in6) ||
1911			    (!IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr,
1912			    &faddr6->sin6_addr) &&
1913			    !IN6_ARE_ADDR_EQUAL(&faddr6->sin6_addr, &zeroes)) ||
1914			    (so->so_type != SOCK_RAW &&
1915			    sin6->sin6_port != faddr6->sin6_port &&
1916			    faddr6->sin6_port != 0)) {
1917#ifdef DEBUG
1918				dprintso(so, 0,
1919				    ("sockfs: T_UNITDATA_IND mismatch: %s",
1920				    pr_addr(so->so_family,
1921				    (struct sockaddr *)addr, addrlen)));
1922				dprintso(so, 0, (" - %s\n",
1923				    pr_addr(so->so_family, sti->sti_faddr_sa,
1924				    (t_uscalar_t)sti->sti_faddr_len)));
1925#endif /* DEBUG */
1926				mutex_exit(&so->so_lock);
1927				freemsg(mp);
1928				return (NULL);
1929			}
1930			mutex_exit(&so->so_lock);
1931		} else if (so->so_family == AF_UNIX &&
1932		    msgdsize(mp->b_cont) == 0 &&
1933		    tudi->OPT_length != 0) {
1934			/*
1935			 * Attempt to extract AF_UNIX
1936			 * SO_UNIX_CLOSE indication from options.
1937			 */
1938			void *opt;
1939			t_uscalar_t optlen = tudi->OPT_length;
1940
1941			opt = sogetoff(mp, tudi->OPT_offset,
1942			    optlen, __TPI_ALIGN_SIZE);
1943			if (opt == NULL) {
1944				/* The len/off falls outside mp */
1945				freemsg(mp);
1946				mutex_enter(&so->so_lock);
1947				soseterror(so, EPROTO);
1948				mutex_exit(&so->so_lock);
1949				zcmn_err(getzoneid(), CE_WARN,
1950				    "sockfs: T_unitdata_ind with invalid "
1951				    "optlen/offset %u/%d\n",
1952				    optlen, tudi->OPT_offset);
1953				return (NULL);
1954			}
1955			/*
1956			 * If we received a unix close indication mark the
1957			 * socket and discard this message.
1958			 */
1959			if (so_getopt_unix_close(opt, optlen)) {
1960				mutex_enter(&so->so_lock);
1961				sobreakconn(so, ECONNRESET);
1962				mutex_exit(&so->so_lock);
1963				strsetrerror(SOTOV(so), 0, 0, sogetrderr);
1964				freemsg(mp);
1965				*pollwakeups = POLLIN | POLLRDNORM;
1966				*allmsgsigs = S_INPUT | S_RDNORM;
1967				*wakeups = RSLEEP;
1968				return (NULL);
1969			}
1970		}
1971		*allmsgsigs = S_INPUT | S_RDNORM;
1972		*pollwakeups = POLLIN | POLLRDNORM;
1973		*wakeups = RSLEEP;
1974		return (mp);
1975	}
1976
1977	case T_OPTDATA_IND: {
1978		struct T_optdata_ind	*tdi = &tpr->optdata_ind;
1979
1980		if (MBLKL(mp) < sizeof (struct T_optdata_ind)) {
1981			zcmn_err(getzoneid(), CE_WARN,
1982			    "sockfs: Too short T_OPTDATA_IND. Len = %ld\n",
1983			    (ptrdiff_t)(MBLKL(mp)));
1984			freemsg(mp);
1985			return (NULL);
1986		}
1987		/*
1988		 * Allow zero-length messages carrying options.
1989		 * This is used when carrying the SO_UNIX_CLOSE option.
1990		 */
1991		if (so->so_family == AF_UNIX && msgdsize(mp->b_cont) == 0 &&
1992		    tdi->OPT_length != 0) {
1993			/*
1994			 * Attempt to extract AF_UNIX close indication
1995			 * from the options. Ignore any other options -
1996			 * those are handled once the message is removed
1997			 * from the queue.
1998			 * The close indication message should not carry data.
1999			 */
2000			void *opt;
2001			t_uscalar_t optlen = tdi->OPT_length;
2002
2003			opt = sogetoff(mp, tdi->OPT_offset,
2004			    optlen, __TPI_ALIGN_SIZE);
2005			if (opt == NULL) {
2006				/* The len/off falls outside mp */
2007				freemsg(mp);
2008				mutex_enter(&so->so_lock);
2009				soseterror(so, EPROTO);
2010				mutex_exit(&so->so_lock);
2011				zcmn_err(getzoneid(), CE_WARN,
2012				    "sockfs: T_optdata_ind with invalid "
2013				    "optlen/offset %u/%d\n",
2014				    optlen, tdi->OPT_offset);
2015				return (NULL);
2016			}
2017			/*
2018			 * If we received a close indication mark the
2019			 * socket and discard this message.
2020			 */
2021			if (so_getopt_unix_close(opt, optlen)) {
2022				mutex_enter(&so->so_lock);
2023				socantsendmore(so);
2024				sti->sti_faddr_valid = 0;
2025				mutex_exit(&so->so_lock);
2026				strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2027				freemsg(mp);
2028				return (NULL);
2029			}
2030		}
2031		*allmsgsigs = S_INPUT | S_RDNORM;
2032		*pollwakeups = POLLIN | POLLRDNORM;
2033		*wakeups = RSLEEP;
2034		return (mp);
2035	}
2036
2037	case T_EXDATA_IND: {
2038		mblk_t		*mctl, *mdata;
2039		mblk_t *lbp;
2040		union T_primitives *tprp;
2041		struct stdata   *stp;
2042		queue_t *qp;
2043
2044		if (MBLKL(mp) < sizeof (struct T_exdata_ind)) {
2045			zcmn_err(getzoneid(), CE_WARN,
2046			    "sockfs: Too short T_EXDATA_IND. Len = %ld\n",
2047			    (ptrdiff_t)(MBLKL(mp)));
2048			freemsg(mp);
2049			return (NULL);
2050		}
2051		/*
2052		 * Ignore zero-length T_EXDATA_IND messages. These might be
2053		 * generated by some transports.
2054		 *
2055		 * This is needed to prevent read (which skips the M_PROTO
2056		 * part) from unexpectedly returning 0 (or EWOULDBLOCK
2057		 * on a non-blocking socket after select/poll has indicated
2058		 * that data is available).
2059		 */
2060		dprintso(so, 1,
2061		    ("T_EXDATA_IND(%p): counts %d/%d state %s\n",
2062		    (void *)vp, sti->sti_oobsigcnt, sti->sti_oobcnt,
2063		    pr_state(so->so_state, so->so_mode)));
2064
2065		if (msgdsize(mp->b_cont) == 0) {
2066			dprintso(so, 0,
2067			    ("strsock_proto: zero length T_EXDATA_IND\n"));
2068			freemsg(mp);
2069			return (NULL);
2070		}
2071
2072		/*
2073		 * Split into the T_EXDATA_IND and the M_DATA part.
2074		 * We process these three pieces separately:
2075		 *	signal generation
2076		 *	handling T_EXDATA_IND
2077		 *	handling M_DATA component
2078		 */
2079		mctl = mp;
2080		mdata = mctl->b_cont;
2081		mctl->b_cont = NULL;
2082		mutex_enter(&so->so_lock);
2083		so_oob_sig(so, 0, allmsgsigs, pollwakeups);
2084		mctl = so_oob_exdata(so, mctl, allmsgsigs, pollwakeups);
2085		mdata = so_oob_data(so, mdata, allmsgsigs, pollwakeups);
2086
2087		stp = vp->v_stream;
2088		ASSERT(stp != NULL);
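		/* Stream head read queue; we will examine its last message. */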
2089		qp = _RD(stp->sd_wrq);
2090
2091		mutex_enter(QLOCK(qp));
2092		lbp = qp->q_last;
2093
2094		/*
2095		 * We want to avoid queueing up a string of T_EXDATA_IND
2096		 * messages with no intervening data messages at the stream
2097		 * head. These messages contribute to the total message
2098		 * count. Eventually this can lead to STREAMS flow control
2099		 * and also cause TCP to advertise a zero window condition
2100		 * to the peer. This can happen in the degenerate case where
2101		 * the sender and receiver exchange only OOB data. The sender
2102		 * only sends messages with the MSG_OOB flag and the receiver
2103		 * receives only MSG_OOB messages and does not use SO_OOBINLINE.
2104		 * An example of this scenario has been reported in applications
2105		 * that use OOB data to exchange heart beats. Flow control
2106		 * relief will never happen if the application only reads OOB
2107		 * data which is done directly by sorecvoob() and the
2108		 * T_EXDATA_IND messages at the streamhead won't be consumed.
2109		 * Note that there is no correctness issue in compressing the
2110		 * string of T_EXDATA_IND messages into a single T_EXDATA_IND
2111		 * message. A single read that does not specify MSG_OOB will
2112		 * read across all the marks in a loop in sotpi_recvmsg().
2113		 * Each mark is individually distinguishable only if the
2114		 * T_EXDATA_IND messages are separated by data messages.
2115		 */
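		/*
		 * If the last message queued at the stream head is already a
		 * T_EXDATA_IND and OOB data is not being delivered inline,
		 * drop this new T_EXDATA_IND and undo the counts bumped
		 * above.
		 */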
2116		if ((qp->q_first != NULL) && (DB_TYPE(lbp) == M_PROTO)) {
2117			tprp = (union T_primitives *)lbp->b_rptr;
2118			if ((tprp->type == T_EXDATA_IND) &&
2119			    !(so->so_options & SO_OOBINLINE)) {
2120
2121				/*
2122				 * free the new M_PROTO message
2123				 */
2124				freemsg(mctl);
2125
2126				/*
2127				 * adjust the OOB count and OOB signal count
2128				 * just incremented for the new OOB data.
2129				 */
2130				sti->sti_oobcnt--;
2131				sti->sti_oobsigcnt--;
2132				mutex_exit(QLOCK(qp));
2133				mutex_exit(&so->so_lock);
2134				return (NULL);
2135			}
2136		}
2137		mutex_exit(QLOCK(qp));
2138
2139		/*
2140		 * Pass the T_EXDATA_IND and the M_DATA back separately
2141		 * by using b_next linkage. (The stream head will queue any
2142		 * b_next linked messages separately.) This is needed
2143		 * since MSGMARK applies to the last byte of the message,
2144		 * hence we cannot have any M_DATA component attached
2145		 * to the marked T_EXDATA_IND. Note that the stream head
2146		 * will not consolidate M_DATA messages onto an MSGMARK'ed
2147		 * message in order to preserve the constraint that
2148		 * the T_EXDATA_IND always is a separate message.
2149		 */
2150		ASSERT(mctl != NULL);
2151		mctl->b_next = mdata;
2152		mp = mctl;
2153#ifdef DEBUG
2154		if (mdata == NULL) {
2155			dprintso(so, 1,
2156			    ("after outofline T_EXDATA_IND(%p): "
2157			    "counts %d/%d  poll 0x%x sig 0x%x state %s\n",
2158			    (void *)vp, sti->sti_oobsigcnt,
2159			    sti->sti_oobcnt, *pollwakeups, *allmsgsigs,
2160			    pr_state(so->so_state, so->so_mode)));
2161		} else {
2162			dprintso(so, 1,
2163			    ("after inline T_EXDATA_IND(%p): "
2164			    "counts %d/%d  poll 0x%x sig 0x%x state %s\n",
2165			    (void *)vp, sti->sti_oobsigcnt,
2166			    sti->sti_oobcnt, *pollwakeups, *allmsgsigs,
2167			    pr_state(so->so_state, so->so_mode)));
2168		}
2169#endif /* DEBUG */
2170		mutex_exit(&so->so_lock);
2171		*wakeups = RSLEEP;
2172		return (mp);
2173	}
2174
2175	case T_CONN_CON: {
2176		struct T_conn_con	*conn_con;
2177		void			*addr;
2178		t_uscalar_t		addrlen;
2179
2180		/*
2181		 * Verify the state, update the state to ISCONNECTED,
2182		 * record the potentially new address in the message,
2183		 * and drop the message.
2184		 */
2185		if (MBLKL(mp) < sizeof (struct T_conn_con)) {
2186			zcmn_err(getzoneid(), CE_WARN,
2187			    "sockfs: Too short T_CONN_CON. Len = %ld\n",
2188			    (ptrdiff_t)(MBLKL(mp)));
2189			freemsg(mp);
2190			return (NULL);
2191		}
2192
2193		mutex_enter(&so->so_lock);
2194		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) !=
2195		    SS_ISCONNECTING) {
2196			mutex_exit(&so->so_lock);
2197			dprintso(so, 1,
2198			    ("T_CONN_CON: state %x\n", so->so_state));
2199			freemsg(mp);
2200			return (NULL);
2201		}
2202
2203		conn_con = &tpr->conn_con;
2204		addrlen = conn_con->RES_length;
2205		/*
2206		 * Allow the address to be of a different size than the one
2207		 * sent down in the T_CONN_REQ as long as it doesn't exceed
2208		 * the maxlen. For AF_UNIX require the identical length.
2209		 */
2210		if (so->so_family == AF_UNIX ?
2211		    addrlen != (t_uscalar_t)sizeof (sti->sti_ux_laddr) :
2212		    addrlen > (t_uscalar_t)sti->sti_faddr_maxlen) {
2213			zcmn_err(getzoneid(), CE_WARN,
2214			    "sockfs: T_conn_con with different "
2215			    "length %u/%d\n",
2216			    addrlen, conn_con->RES_length);
2217			soisdisconnected(so, EPROTO);
2218			sti->sti_laddr_valid = 0;
2219			sti->sti_faddr_valid = 0;
2220			mutex_exit(&so->so_lock);
2221			strsetrerror(SOTOV(so), 0, 0, sogetrderr);
2222			strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2223			strseteof(SOTOV(so), 1);
2224			freemsg(mp);
2225			/*
2226			 * strseteof takes care of read side wakeups,
2227			 * pollwakeups, and signals.
2228			 */
2229			*wakeups = WSLEEP;
2230			*allmsgsigs = S_OUTPUT;
2231			*pollwakeups = POLLOUT;
2232			return (NULL);
2233		}
2234		addr = sogetoff(mp, conn_con->RES_offset, addrlen, 1);
2235		if (addr == NULL) {
2236			zcmn_err(getzoneid(), CE_WARN,
2237			    "sockfs: T_conn_con with invalid "
2238			    "addrlen/offset %u/%d\n",
2239			    addrlen, conn_con->RES_offset);
2240			mutex_exit(&so->so_lock);
2241			strsetrerror(SOTOV(so), 0, 0, sogetrderr);
2242			strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2243			strseteof(SOTOV(so), 1);
2244			freemsg(mp);
2245			/*
2246			 * strseteof takes care of read side wakeups,
2247			 * pollwakeups, and signals.
2248			 */
2249			*wakeups = WSLEEP;
2250			*allmsgsigs = S_OUTPUT;
2251			*pollwakeups = POLLOUT;
2252			return (NULL);
2253		}
2254
2255		/*
2256		 * Save for getpeername.
2257		 */
2258		if (so->so_family != AF_UNIX) {
2259			sti->sti_faddr_len = (socklen_t)addrlen;
2260			ASSERT(sti->sti_faddr_len <= sti->sti_faddr_maxlen);
2261			bcopy(addr, sti->sti_faddr_sa, addrlen);
2262			sti->sti_faddr_valid = 1;
2263		}
2264
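		/* Save the peer credentials and pid, if supplied. */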
2265		if (so->so_peercred != NULL)
2266			crfree(so->so_peercred);
2267		so->so_peercred = DB_CRED(mp);
2268		so->so_cpid = DB_CPID(mp);
2269		if (so->so_peercred != NULL)
2270			crhold(so->so_peercred);
2271
2272		/* Wakeup anybody sleeping in sowaitconnected */
2273		soisconnected(so);
2274		mutex_exit(&so->so_lock);
2275
2276		/*
2277		 * The socket is now available for sending data.
2278		 */
2279		*wakeups = WSLEEP;
2280		*allmsgsigs = S_OUTPUT;
2281		*pollwakeups = POLLOUT;
2282		freemsg(mp);
2283		return (NULL);
2284	}
2285
2286	/*
2287	 * Extra processing in case of an SSL proxy, before queuing or
2288	 * forwarding to the fallback endpoint
2289	 */
2290	case T_SSL_PROXY_CONN_IND:
2291	case T_CONN_IND:
2292		/*
2293		 * Verify the min size and queue the message on
2294		 * the sti_conn_ind_head/tail list.
2295		 */
2296		if (MBLKL(mp) < sizeof (struct T_conn_ind)) {
2297			zcmn_err(getzoneid(), CE_WARN,
2298			    "sockfs: Too short T_CONN_IND. Len = %ld\n",
2299			    (ptrdiff_t)(MBLKL(mp)));
2300			freemsg(mp);
2301			return (NULL);
2302		}
2303
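		/* Audit the incoming connection indication. */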
2304		if (audit_active)
2305			audit_sock(T_CONN_IND, strvp2wq(vp), mp, 0);
2306		if (!(so->so_state & SS_ACCEPTCONN)) {
2307			zcmn_err(getzoneid(), CE_WARN,
2308			    "sockfs: T_conn_ind on non-listening socket\n");
2309			freemsg(mp);
2310			return (NULL);
2311		}
2312
2313		if (tpr->type == T_SSL_PROXY_CONN_IND && mp->b_cont == NULL) {
2314			/* No context: need to fall back */
2315			struct sonode *fbso;
2316			stdata_t *fbstp;
2317
2318			tpr->type = T_CONN_IND;
2319
2320			fbso = kssl_find_fallback(sti->sti_kssl_ent);
2321
2322			/*
2323			 * No fallback: the remote will time out and
2324			 * disconnect.
2325			 */
2326			if (fbso == NULL) {
2327				freemsg(mp);
2328				return (NULL);
2329			}
2330			fbstp = SOTOV(fbso)->v_stream;
2331			qreply(fbstp->sd_wrq->q_next, mp);
2332			return (NULL);
2333		}
2334		soqueueconnind(so, mp);
2335		*allmsgsigs = S_INPUT | S_RDNORM;
2336		*pollwakeups = POLLIN | POLLRDNORM;
2337		*wakeups = RSLEEP;
2338		return (NULL);
2339
2340	case T_ORDREL_IND:
2341		if (MBLKL(mp) < sizeof (struct T_ordrel_ind)) {
2342			zcmn_err(getzoneid(), CE_WARN,
2343			    "sockfs: Too short T_ORDREL_IND. Len = %ld\n",
2344			    (ptrdiff_t)(MBLKL(mp)));
2345			freemsg(mp);
2346			return (NULL);
2347		}
2348
2349		/*
2350		 * Some providers send this when not fully connected.
2351		 * SunLink X.25 needs to retrieve disconnect reason after
2352		 * disconnect for compatibility. It uses T_ORDREL_IND
2353		 * instead of T_DISCON_IND so that it may use the
2354		 * endpoint after a connect failure to retrieve the
2355		 * reason using an ioctl. Thus we explicitly clear
2356		 * SS_ISCONNECTING here for SunLink X.25.
2357		 * This is a needed TPI violation.
2358		 */
2359		mutex_enter(&so->so_lock);
2360		so->so_state &= ~SS_ISCONNECTING;
2361		socantrcvmore(so);
2362		mutex_exit(&so->so_lock);
2363		strseteof(SOTOV(so), 1);
2364		/*
2365		 * strseteof takes care of read side wakeups,
2366		 * pollwakeups, and signals.
2367		 */
2368		freemsg(mp);
2369		return (NULL);
2370
2371	case T_DISCON_IND:
2372		if (MBLKL(mp) < sizeof (struct T_discon_ind)) {
2373			zcmn_err(getzoneid(), CE_WARN,
2374			    "sockfs: Too short T_DISCON_IND. Len = %ld\n",
2375			    (ptrdiff_t)(MBLKL(mp)));
2376			freemsg(mp);
2377			return (NULL);
2378		}
2379		if (so->so_state & SS_ACCEPTCONN) {
2380			/*
2381			 * This is a listener. Look for a queued T_CONN_IND
2382			 * with a matching sequence number and remove it
2383			 * from the list.
2384			 * It is normal not to find the sequence number since
2385			 * soaccept might have already dequeued it
2386			 * (in which case the T_CONN_RES will fail with
2387			 * TBADSEQ).
2388			 */
2389			(void) soflushconnind(so, tpr->discon_ind.SEQ_number);
2390			freemsg(mp);
2391			return (NULL);
2392		}
2393
2394		/*
2395		 * Not a listener
2396		 *
2397		 * If SS_CANTRCVMORE for AF_UNIX ignore the discon_reason.
2398		 * Such a discon_ind appears when the peer has first done
2399		 * a shutdown() followed by a close() in which case we just
2400		 * want to record socantsendmore.
2401		 * In this case sockfs first receives a T_ORDREL_IND followed
2402		 * by a T_DISCON_IND.
2403		 * Note that for other transports (e.g. TCP) we need to handle
2404		 * the discon_ind in this case since it signals an error.
2405		 */
2406		mutex_enter(&so->so_lock);
2407		if ((so->so_state & SS_CANTRCVMORE) &&
2408		    (so->so_family == AF_UNIX)) {
2409			socantsendmore(so);
2410			sti->sti_faddr_valid = 0;
2411			mutex_exit(&so->so_lock);
2412			strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2413			dprintso(so, 1,
2414			    ("T_DISCON_IND: error %d\n", so->so_error));
2415			freemsg(mp);
2416			/*
2417			 * Set these values for the caller to process.
2418			 * For the else branches, where the T_DISCON_IND is
2419			 * processed, this is done in the called function
2420			 * (strsock_discon_ind()).
2421			 */
2422			*wakeups = WSLEEP;
2423			*allmsgsigs = S_OUTPUT;
2424			*pollwakeups = POLLOUT;
2425		} else if (so->so_flag & (SOASYNC_UNBIND | SOLOCKED)) {
2426			/*
2427			 * Deferred processing of T_DISCON_IND
2428			 */
2429			so_save_discon_ind(so, mp, strsock_discon_ind);
2430			mutex_exit(&so->so_lock);
2431		} else {
2432			/*
2433			 * Process T_DISCON_IND now
2434			 */
2435			(void) strsock_discon_ind(so, mp);
2436			mutex_exit(&so->so_lock);
2437		}
2438		return (NULL);
2439
2440	case T_UDERROR_IND: {
2441		struct T_uderror_ind	*tudi = &tpr->uderror_ind;
2442		void			*addr;
2443		t_uscalar_t		addrlen;
2444		int			error;
2445
2446		dprintso(so, 0,
2447		    ("T_UDERROR_IND: error %d\n", tudi->ERROR_type));
2448
2449		if (MBLKL(mp) < sizeof (struct T_uderror_ind)) {
2450			zcmn_err(getzoneid(), CE_WARN,
2451			    "sockfs: Too short T_UDERROR_IND. Len = %ld\n",
2452			    (ptrdiff_t)(MBLKL(mp)));
2453			freemsg(mp);
2454			return (NULL);
2455		}
2456		/* Ignore on connection-oriented transports */
2457		if (so->so_mode & SM_CONNREQUIRED) {
2458			freemsg(mp);
2459			eprintsoline(so, 0);
2460			zcmn_err(getzoneid(), CE_WARN,
2461			    "sockfs: T_uderror_ind on connection-oriented "
2462			    "transport\n");
2463			return (NULL);
2464		}
2465		addrlen = tudi->DEST_length;
2466		addr = sogetoff(mp, tudi->DEST_offset, addrlen, 1);
2467		if (addr == NULL) {
2468			zcmn_err(getzoneid(), CE_WARN,
2469			    "sockfs: T_uderror_ind with invalid "
2470			    "addrlen/offset %u/%d\n",
2471			    addrlen, tudi->DEST_offset);
2472			freemsg(mp);
2473			return (NULL);
2474		}
2475
2476		/* Verify source address for connected socket. */
2477		mutex_enter(&so->so_lock);
2478		if (so->so_state & SS_ISCONNECTED) {
2479			void *faddr;
2480			t_uscalar_t faddr_len;
2481			boolean_t match = B_FALSE;
2482
2483			switch (so->so_family) {
2484			case AF_INET: {
2485				/* Compare just IP address and port */
2486				struct sockaddr_in *sin1, *sin2;
2487
2488				sin1 = (struct sockaddr_in *)sti->sti_faddr_sa;
2489				sin2 = (struct sockaddr_in *)addr;
2490				if (addrlen == sizeof (struct sockaddr_in) &&
2491				    sin1->sin_port == sin2->sin_port &&
2492				    sin1->sin_addr.s_addr ==
2493				    sin2->sin_addr.s_addr)
2494					match = B_TRUE;
2495				break;
2496			}
2497			case AF_INET6: {
2498				/* Compare just IP address and port. Not flow */
2499				struct sockaddr_in6 *sin1, *sin2;
2500
2501				sin1 = (struct sockaddr_in6 *)sti->sti_faddr_sa;
2502				sin2 = (struct sockaddr_in6 *)addr;
2503				if (addrlen == sizeof (struct sockaddr_in6) &&
2504				    sin1->sin6_port == sin2->sin6_port &&
2505				    IN6_ARE_ADDR_EQUAL(&sin1->sin6_addr,
2506				    &sin2->sin6_addr))
2507					match = B_TRUE;
2508				break;
2509			}
2510			case AF_UNIX:
2511				faddr = &sti->sti_ux_faddr;
2512				faddr_len =
2513				    (t_uscalar_t)sizeof (sti->sti_ux_faddr);
2514				if (faddr_len == addrlen &&
2515				    bcmp(addr, faddr, addrlen) == 0)
2516					match = B_TRUE;
2517				break;
2518			default:
2519				faddr = sti->sti_faddr_sa;
2520				faddr_len = (t_uscalar_t)sti->sti_faddr_len;
2521				if (faddr_len == addrlen &&
2522				    bcmp(addr, faddr, addrlen) == 0)
2523					match = B_TRUE;
2524				break;
2525			}
2526
2527			if (!match) {
2528#ifdef DEBUG
2529				dprintso(so, 0,
2530				    ("sockfs: T_UDERR_IND mismatch: %s - ",
2531				    pr_addr(so->so_family,
2532				    (struct sockaddr *)addr, addrlen)));
2533				dprintso(so, 0, ("%s\n",
2534				    pr_addr(so->so_family, sti->sti_faddr_sa,
2535				    sti->sti_faddr_len)));
2536#endif /* DEBUG */
2537				mutex_exit(&so->so_lock);
2538				freemsg(mp);
2539				return (NULL);
2540			}
2541			/*
2542			 * Make the write error nonpersistent. If the error
2543			 * is zero we use ECONNRESET.
2544			 * This assumes that the name space for ERROR_type
2545			 * is the errno name space.
2546			 */
2547			if (tudi->ERROR_type != 0)
2548				error = tudi->ERROR_type;
2549			else
2550				error = ECONNRESET;
2551
2552			soseterror(so, error);
2553			mutex_exit(&so->so_lock);
2554			strsetrerror(SOTOV(so), 0, 0, sogetrderr);
2555			strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2556			*wakeups = RSLEEP | WSLEEP;
2557			*allmsgsigs = S_INPUT | S_RDNORM | S_OUTPUT;
2558			*pollwakeups = POLLIN | POLLRDNORM | POLLOUT;
2559			freemsg(mp);
2560			return (NULL);
2561		}
2562		/*
2563		 * If the application asked for delayed errors
2564		 * record the T_UDERROR_IND in sti_eaddr_mp and the reason in
2565		 * sti_delayed_error for delayed error posting. If the reason
2566		 * is zero use ECONNRESET.
2567		 * Note that delayed error indications do not make sense for
2568		 * AF_UNIX sockets since sendto checks that the destination
2569		 * address is valid at the time of the sendto.
2570		 */
2571		if (!(so->so_options & SO_DGRAM_ERRIND)) {
2572			mutex_exit(&so->so_lock);
2573			freemsg(mp);
2574			return (NULL);
2575		}
2576		if (sti->sti_eaddr_mp != NULL)
2577			freemsg(sti->sti_eaddr_mp);
2578
2579		sti->sti_eaddr_mp = mp;
2580		if (tudi->ERROR_type != 0)
2581			error = tudi->ERROR_type;
2582		else
2583			error = ECONNRESET;
2584		sti->sti_delayed_error = (ushort_t)error;
2585		mutex_exit(&so->so_lock);
2586		return (NULL);
2587	}
2588
2589	case T_ERROR_ACK:
2590		dprintso(so, 0,
2591		    ("strsock_proto: T_ERROR_ACK for %d, error %d/%d\n",
2592		    tpr->error_ack.ERROR_prim,
2593		    tpr->error_ack.TLI_error,
2594		    tpr->error_ack.UNIX_error));
2595
2596		if (MBLKL(mp) < sizeof (struct T_error_ack)) {
2597			zcmn_err(getzoneid(), CE_WARN,
2598			    "sockfs: Too short T_ERROR_ACK. Len = %ld\n",
2599			    (ptrdiff_t)(MBLKL(mp)));
2600			freemsg(mp);
2601			return (NULL);
2602		}
2603		/*
2604		 * Check if we were waiting for the async message
2605		 */
2606		mutex_enter(&so->so_lock);
2607		if ((so->so_flag & SOASYNC_UNBIND) &&
2608		    tpr->error_ack.ERROR_prim == T_UNBIND_REQ) {
2609			so_unlock_single(so, SOASYNC_UNBIND);
2610			mutex_exit(&so->so_lock);
2611			freemsg(mp);
2612			return (NULL);
2613		}
2614		mutex_exit(&so->so_lock);
2615		soqueueack(so, mp);
2616		return (NULL);
2617
2618	case T_OK_ACK:
2619		if (MBLKL(mp) < sizeof (struct T_ok_ack)) {
2620			zcmn_err(getzoneid(), CE_WARN,
2621			    "sockfs: Too short T_OK_ACK. Len = %ld\n",
2622			    (ptrdiff_t)(MBLKL(mp)));
2623			freemsg(mp);
2624			return (NULL);
2625		}
2626		/*
2627		 * Check if we were waiting for the async message
2628		 */
2629		mutex_enter(&so->so_lock);
2630		if ((so->so_flag & SOASYNC_UNBIND) &&
2631		    tpr->ok_ack.CORRECT_prim == T_UNBIND_REQ) {
2632			dprintso(so, 1,
2633			    ("strsock_proto: T_OK_ACK async unbind\n"));
2634			so_unlock_single(so, SOASYNC_UNBIND);
2635			mutex_exit(&so->so_lock);
2636			freemsg(mp);
2637			return (NULL);
2638		}
2639		mutex_exit(&so->so_lock);
2640		soqueueack(so, mp);
2641		return (NULL);
2642
2643	case T_INFO_ACK:
2644		if (MBLKL(mp) < sizeof (struct T_info_ack)) {
2645			zcmn_err(getzoneid(), CE_WARN,
2646			    "sockfs: Too short T_INFO_ACK. Len = %ld\n",
2647			    (ptrdiff_t)(MBLKL(mp)));
2648			freemsg(mp);
2649			return (NULL);
2650		}
2651		soqueueack(so, mp);
2652		return (NULL);
2653
2654	case T_CAPABILITY_ACK:
2655		/*
2656		 * A T_capability_ack need only be large enough to hold
2657		 * the PRIM_type and CAP_bits1 fields; checking for anything
2658		 * larger might reject a correct response from an older
2659		 * provider.
2660		 */
2661		if (MBLKL(mp) < 2 * sizeof (t_uscalar_t)) {
2662			zcmn_err(getzoneid(), CE_WARN,
2663			    "sockfs: Too short T_CAPABILITY_ACK. Len = %ld\n",
2664			    (ptrdiff_t)(MBLKL(mp)));
2665			freemsg(mp);
2666			return (NULL);
2667		}
2668		soqueueack(so, mp);
2669		return (NULL);
2670
2671	case T_BIND_ACK:
2672		if (MBLKL(mp) < sizeof (struct T_bind_ack)) {
2673			zcmn_err(getzoneid(), CE_WARN,
2674			    "sockfs: Too short T_BIND_ACK. Len = %ld\n",
2675			    (ptrdiff_t)(MBLKL(mp)));
2676			freemsg(mp);
2677			return (NULL);
2678		}
2679		soqueueack(so, mp);
2680		return (NULL);
2681
2682	case T_OPTMGMT_ACK:
2683		if (MBLKL(mp) < sizeof (struct T_optmgmt_ack)) {
2684			zcmn_err(getzoneid(), CE_WARN,
2685			    "sockfs: Too short T_OPTMGMT_ACK. Len = %ld\n",
2686			    (ptrdiff_t)(MBLKL(mp)));
2687			freemsg(mp);
2688			return (NULL);
2689		}
2690		soqueueack(so, mp);
2691		return (NULL);
2692	default:
2693#ifdef DEBUG
2694		zcmn_err(getzoneid(), CE_WARN,
2695		    "sockfs: unknown TPI primitive %d received\n",
2696		    tpr->type);
2697#endif /* DEBUG */
2698		freemsg(mp);
2699		return (NULL);
2700	}
2701}
2702
2703/*
2704 * This routine is registered with the stream head to receive other
2705 * (non-data and non-proto) messages.
2706 *
2707 * Returns NULL if the message was consumed.
2708 * Returns an mblk to have that mblk processed by the stream head.
2709 *
2710 * Sets the return parameters (*wakeups, *firstmsgsigs, *allmsgsigs, and
2711 * *pollwakeups) for the stream head to take action on.
2712 */
2713static mblk_t *
2714strsock_misc(vnode_t *vp, mblk_t *mp,
2715		strwakeup_t *wakeups, strsigset_t *firstmsgsigs,
2716		strsigset_t *allmsgsigs, strpollset_t *pollwakeups)
2717{
2718	struct sonode *so;
2719	sotpi_info_t *sti;
2720
2721	so = VTOSO(vp);
2722	sti = SOTOTPI(so);
2723
2724	dprintso(so, 1, ("strsock_misc(%p, %p, 0x%x)\n",
2725	    (void *)vp, (void *)mp, DB_TYPE(mp)));
2726
2727	/* Set default return values */
2728	*wakeups = *allmsgsigs = *firstmsgsigs = *pollwakeups = 0;
2729
2730	switch (DB_TYPE(mp)) {
2731	case M_PCSIG:
2732		/*
2733		 * This assumes that an M_PCSIG for the urgent data arrives
2734		 * before the corresponding T_EXDATA_IND.
2735		 *
2736		 * Note: Just like in SunOS 4.X and 4.4BSD a poll will be
2737		 * awoken before the urgent data shows up.
2738		 * For OOBINLINE this can result in select returning
2739		 * only exceptions as opposed to except|read.
2740		 */
2741		if (*mp->b_rptr == SIGURG) {
2742			mutex_enter(&so->so_lock);
2743			dprintso(so, 1,
2744			    ("SIGURG(%p): counts %d/%d state %s\n",
2745			    (void *)vp, sti->sti_oobsigcnt, sti->sti_oobcnt,
2746			    pr_state(so->so_state, so->so_mode)));
2747			so_oob_sig(so, 1, allmsgsigs, pollwakeups);
2748			dprintso(so, 1,
2749			    ("after SIGURG(%p): counts %d/%d "
2750			    " poll 0x%x sig 0x%x state %s\n",
2751			    (void *)vp, sti->sti_oobsigcnt, sti->sti_oobcnt,
2752			    *pollwakeups, *allmsgsigs,
2753			    pr_state(so->so_state, so->so_mode)));
2754			mutex_exit(&so->so_lock);
2755		}
2756		freemsg(mp);
2757		return (NULL);
2758
2759	case M_SIG:
2760	case M_HANGUP:
2761	case M_UNHANGUP:
2762	case M_ERROR:
2763		/* M_ERRORs etc are ignored */
2764		freemsg(mp);
2765		return (NULL);
2766
2767	case M_FLUSH:
2768		/*
2769		 * Do not flush read queue. If the M_FLUSH
2770		 * arrives because of an impending T_discon_ind
2771		 * we still have to keep any queued data - this is part of
2772		 * socket semantics.
2773		 */
2774		if (*mp->b_rptr & FLUSHW) {
2775			*mp->b_rptr &= ~FLUSHR;
2776			return (mp);
2777		}
2778		freemsg(mp);
2779		return (NULL);
2780
2781	default:
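		/* All other message types are passed on to the stream head. */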
2782		return (mp);
2783	}
2784}
2785
2786
2787/* Register to receive signals for certain events */
2788int
2789so_set_asyncsigs(vnode_t *vp, pid_t pgrp, int events, int mode, cred_t *cr)
2790{
2791	struct strsigset ss;
2792	int32_t rval;
2793
2794	/*
2795	 * Note that SOLOCKED will be set except for the call from soaccept().
2796	 */
2797	ASSERT(!mutex_owned(&VTOSO(vp)->so_lock));
2798	ss.ss_pid = pgrp;
2799	ss.ss_events = events;
2800	return (strioctl(vp, I_ESETSIG, (intptr_t)&ss, mode, K_TO_K, cr,
2801	    &rval));
2802}
2803
2804
2805/* Register for events matching the SS_ASYNC flag */
2806int
2807so_set_events(struct sonode *so, vnode_t *vp, cred_t *cr)
2808{
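	/* In SS_ASYNC mode also sign up for normal read/write signals. */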
2809	int events = so->so_state & SS_ASYNC ?
2810	    S_RDBAND | S_BANDURG | S_RDNORM | S_OUTPUT :
2811	    S_RDBAND | S_BANDURG;
2812
2813	return (so_set_asyncsigs(vp, so->so_pgrp, events, 0, cr));
2814}
2815
2816
2817/* Change the SS_ASYNC flag, and update signal delivery if needed */
2818int
2819so_flip_async(struct sonode *so, vnode_t *vp, int mode, cred_t *cr)
2820{
2821	ASSERT(mutex_owned(&so->so_lock));
2822	if (so->so_pgrp != 0) {
2823		int error;
2824		int events = so->so_state & SS_ASYNC ?		/* Old flag */
2825		    S_RDBAND | S_BANDURG :			/* New sigs */
2826		    S_RDBAND | S_BANDURG | S_RDNORM | S_OUTPUT;
2827
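		/* Single-thread the socket; drop so_lock across the ioctl. */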
2828		so_lock_single(so);
2829		mutex_exit(&so->so_lock);
2830
2831		error = so_set_asyncsigs(vp, so->so_pgrp, events, mode, cr);
2832
2833		mutex_enter(&so->so_lock);
2834		so_unlock_single(so, SOLOCKED);
2835		if (error)
2836			return (error);
2837	}
2838	so->so_state ^= SS_ASYNC;
2839	return (0);
2840}
2841
2842/*
2843 * Set new pid/pgrp for SIGPOLL (or SIGIO for FIOASYNC mode), replacing
2844 * any existing one.  If passed zero, just clear the existing one.
2845 */
2846int
2847so_set_siggrp(struct sonode *so, vnode_t *vp, pid_t pgrp, int mode, cred_t *cr)
2848{
2849	int events = so->so_state & SS_ASYNC ?
2850	    S_RDBAND | S_BANDURG | S_RDNORM | S_OUTPUT :
2851	    S_RDBAND | S_BANDURG;
2852	int error;
2853
2854	ASSERT(mutex_owned(&so->so_lock));
2855
2856	/*
2857	 * Change socket process (group).
2858	 *
2859	 * strioctl (via so_set_asyncsigs) will perform permission check and
2860	 * also keep a PID_HOLD to prevent the pid from being reused.
2861	 */
2862	so_lock_single(so);
2863	mutex_exit(&so->so_lock);
2864
2865	if (pgrp != 0) {
2866		dprintso(so, 1, ("setown: adding pgrp %d ev 0x%x\n",
2867		    pgrp, events));
2868		error = so_set_asyncsigs(vp, pgrp, events, mode, cr);
2869		if (error != 0) {
2870			eprintsoline(so, error);
2871			goto bad;
2872		}
2873	}
2874	/* Remove the previously registered process/group */
2875	if (so->so_pgrp != 0) {
2876		dprintso(so, 1, ("setown: removing pgrp %d\n", so->so_pgrp));
2877		error = so_set_asyncsigs(vp, so->so_pgrp, 0, mode, cr);
2878		if (error != 0) {
2879			eprintsoline(so, error);
2880			error = 0;
2881		}
2882	}
2883	mutex_enter(&so->so_lock);
2884	so_unlock_single(so, SOLOCKED);
2885	so->so_pgrp = pgrp;
2886	return (0);
2887bad:
2888	mutex_enter(&so->so_lock);
2889	so_unlock_single(so, SOLOCKED);
2890	return (error);
2891}
2892
2893/*
2894 * Wrapper for getmsg. If the socket has been converted to a stream
2895 * pass the request to the stream head.
2896 */
2897int
2898sock_getmsg(
2899	struct vnode *vp,
2900	struct strbuf *mctl,
2901	struct strbuf *mdata,
2902	uchar_t *prip,
2903	int *flagsp,
2904	int fmode,
2905	rval_t *rvp
2906)
2907{
2908	struct sonode *so;
2909
2910	ASSERT(vp->v_type == VSOCK);
2911	/*
2912	 * Use the stream head to find the real socket vnode.
2913	 * This is needed when namefs sits above sockfs.  Some
2914	 * sockets (like SCTP) are not streams.
2915	 */
2916	if (!vp->v_stream) {
2917		return (ENOSTR);
2918	}
2919	ASSERT(vp->v_stream->sd_vnode);
2920	vp = vp->v_stream->sd_vnode;
2921	ASSERT(vn_matchops(vp, socket_vnodeops));
2922	so = VTOSO(vp);
2923
2924	dprintso(so, 1, ("sock_getmsg(%p) %s\n",
2925	    (void *)so, pr_state(so->so_state, so->so_mode)));
2926
2927	if (so->so_version == SOV_STREAM) {
2928		/* The imaginary "sockmod" has been popped - act as a stream */
2929		return (strgetmsg(vp, mctl, mdata, prip, flagsp, fmode, rvp));
2930	}
2931	eprintsoline(so, ENOSTR);
2932	return (ENOSTR);
2933}
2934
2935/*
2936 * Wrapper for putmsg. If the socket has been converted to a stream
2937 * pass the request to the stream head.
2938 *
2939 * Note that while a regular socket (SOV_SOCKSTREAM) does support the
2940 * streams ioctl set, it does not support putmsg and getmsg.
2941 * Allowing putmsg would prevent sockfs from tracking the state of
2942 * the socket/transport and would also invalidate the locking in sockfs.
2943 */
2944int
2945sock_putmsg(
2946	struct vnode *vp,
2947	struct strbuf *mctl,
2948	struct strbuf *mdata,
2949	uchar_t pri,
2950	int flag,
2951	int fmode
2952)
2953{
2954	struct sonode *so;
2955
2956	ASSERT(vp->v_type == VSOCK);
2957	/*
2958	 * Use the stream head to find the real socket vnode.
2959	 * This is needed when namefs sits above sockfs.
2960	 */
2961	if (!vp->v_stream) {
2962		return (ENOSTR);
2963	}
2964	ASSERT(vp->v_stream->sd_vnode);
2965	vp = vp->v_stream->sd_vnode;
2966	ASSERT(vn_matchops(vp, socket_vnodeops));
2967	so = VTOSO(vp);
2968
2969	dprintso(so, 1, ("sock_putmsg(%p) %s\n",
2970	    (void *)so, pr_state(so->so_state, so->so_mode)));
2971
2972	if (so->so_version == SOV_STREAM) {
2973		/* The imaginary "sockmod" has been popped - act as a stream */
2974		return (strputmsg(vp, mctl, mdata, pri, flag, fmode));
2975	}
2976	eprintsoline(so, ENOSTR);
2977	return (ENOSTR);
2978}
2979
2980/*
2981 * Special function called only from f_getfl().
2982 * Returns FASYNC if the SS_ASYNC flag is set on a socket, else 0.
2983 * No locks are acquired here, so it is safe to use while uf_lock is held.
2984 * This exists solely for BSD fcntl() FASYNC compatibility.
2985 */
2986int
2987sock_getfasync(vnode_t *vp)
2988{
2989	struct sonode *so;
2990
2991	ASSERT(vp->v_type == VSOCK);
2992	/*
2993	 * For the stream model, v_stream is used; for non-stream sockets,
2994	 * v_stream is always NULL.
2995	 */
2996	if (vp->v_stream != NULL)
2997		so = VTOSO(vp->v_stream->sd_vnode);
2998	else
2999		so = VTOSO(vp);
3000
3001	if (so->so_version == SOV_STREAM || !(so->so_state & SS_ASYNC))
3002		return (0);
3003
3004	return (FASYNC);
3005}
3006
3007/*
3008 * Sockfs sodirect STREAMS read put procedure. Called from a sodirect
3009 * enabled transport driver/module with an mblk_t chain.
3010 *
3011 * Note, we in-line putq() for the fast-path cases where q is empty, or
3012 * q_last and bp are both of type M_DATA. Otherwise we call putq().
3013 *
3014 * On success zero will be returned, else an errno will be returned.
3015 */
3016int
3017sodput(sodirect_t *sodp, mblk_t *bp)
3018{
3019	queue_t		*q = sodp->sod_q;
3020	struct stdata	*stp = (struct stdata *)q->q_ptr;
3021	mblk_t		*nbp;
3022	mblk_t		*last = q->q_last;
3023	int		bytecnt = 0;
3024	int		mblkcnt = 0;
3025
3026
3027	ASSERT(MUTEX_HELD(sodp->sod_lockp));
3028
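	/* Stream is at EOF; free the entire b_next chain of messages. */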
3029	if (stp->sd_flag == STREOF) {
3030		do {
3031			if ((nbp = bp->b_next) != NULL)
3032				bp->b_next = NULL;
3033			freemsg(bp);
3034		} while ((bp = nbp) != NULL);
3035
3036		return (0);
3037	}
3038
3039	mutex_enter(QLOCK(q));
3040	if (q->q_first == NULL) {
3041		/* Q empty, really fast fast-path */
3042		bp->b_prev = NULL;
3043		bp->b_next = NULL;
3044		q->q_first = bp;
3045		q->q_last = bp;
3046
3047	} else if (last->b_datap->db_type == M_DATA &&
3048	    bp->b_datap->db_type == M_DATA) {
3049		/*
3050		 * Last mblk_t chain and bp are both type M_DATA so
3051		 * in-line putq() here; if the DBLK_UIOA states match,
3052		 * add bp to the end of the current last chain, else
3053		 * start a new last chain with bp.
3054		 */
3055		if ((last->b_datap->db_flags & DBLK_UIOA) ==
3056		    (bp->b_datap->db_flags & DBLK_UIOA)) {
3057			/* Added to end */
3058			while ((nbp = last->b_cont) != NULL)
3059				last = nbp;
3060			last->b_cont = bp;
3061		} else {
3062			/* New last */
3063			ASSERT((bp->b_datap->db_flags & DBLK_UIOA) == 0 ||
3064			    msgdsize(bp) == sodp->sod_uioa.uioa_mbytes);
3065			last->b_next = bp;
3066			bp->b_next = NULL;
3067			bp->b_prev = last;
3068			q->q_last = bp;
3069		}
3070	} else {
3071		/*
3072		 * Can't use q_last so just call putq().
3073		 */
3074		mutex_exit(QLOCK(q));
3075
3076		ASSERT((bp->b_datap->db_flags & DBLK_UIOA) == 0 ||
3077		    msgdsize(bp) == sodp->sod_uioa.uioa_mbytes);
3078		(void) putq(q, bp);
3079		return (0);
3080	}
3081
3082	/* Count bytes and mblk_t's */
3083	do {
3084		bytecnt += MBLKL(bp);
3085		mblkcnt++;
3086	} while ((bp = bp->b_cont) != NULL);
3087	q->q_count += bytecnt;
3088	q->q_mblkcnt += mblkcnt;
3089
3090	/* Check for QFULL */
3091	if (q->q_count >= q->q_hiwat + sodp->sod_want ||
3092	    q->q_mblkcnt >= q->q_hiwat) {
3093		q->q_flag |= QFULL;
3094	}
3095
3096	mutex_exit(QLOCK(q));
3097	return (0);
3098}
3099
3100/*
3101 * Sockfs sodirect read wakeup. Called from a sodirect enabled transport
3102 * driver/module to indicate that read-side data is available.
3103 *
3104 * On return the sodirect_t.lock mutex will be exited so this must be the
3105 * last sodirect_t call to guarantee atomic access of *sodp.
3106 */
3107void
3108sodwakeup(sodirect_t *sodp)
3109{
3110	queue_t		*q = sodp->sod_q;
3111	struct stdata	*stp = (struct stdata *)q->q_ptr;
3112
3113	ASSERT(MUTEX_HELD(sodp->sod_lockp));
3114
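	/* Wake up any thread sleeping in a read on the stream head. */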
3115	if (stp->sd_flag & RSLEEP) {
3116		stp->sd_flag &= ~RSLEEP;
3117		cv_broadcast(&q->q_wait);
3118	}
3119
3120	if (stp->sd_rput_opt & SR_POLLIN) {
3121		stp->sd_rput_opt &= ~SR_POLLIN;
3122		mutex_exit(sodp->sod_lockp);
3123		pollwakeup(&stp->sd_pollist, POLLIN | POLLRDNORM);
3124	} else
3125		mutex_exit(sodp->sod_lockp);
3126}
3127