1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26#include <sys/types.h>
27#include <sys/inttypes.h>
28#include <sys/t_lock.h>
29#include <sys/param.h>
30#include <sys/systm.h>
31#include <sys/buf.h>
32#include <sys/conf.h>
33#include <sys/cred.h>
34#include <sys/kmem.h>
35#include <sys/sysmacros.h>
36#include <sys/vfs.h>
37#include <sys/vnode.h>
38#include <sys/debug.h>
39#include <sys/errno.h>
40#include <sys/time.h>
41#include <sys/file.h>
42#include <sys/user.h>
43#include <sys/stream.h>
44#include <sys/strsubr.h>
45#include <sys/esunddi.h>
46#include <sys/flock.h>
47#include <sys/modctl.h>
48#include <sys/vtrace.h>
49#include <sys/strsun.h>
50#include <sys/cmn_err.h>
51#include <sys/proc.h>
52#include <sys/ddi.h>
53
54#include <sys/suntpi.h>
55#include <sys/socket.h>
56#include <sys/sockio.h>
57#include <sys/socketvar.h>
58#include <netinet/in.h>
59#include <inet/common.h>
60#include <inet/proto_set.h>
61
62#include <sys/tiuser.h>
63#define	_SUN_TPI_VERSION	2
64#include <sys/tihdr.h>
65
66#include <c2/audit.h>
67
68#include <fs/sockfs/socktpi.h>
69#include <fs/sockfs/socktpi_impl.h>
70
/*
 * Global tunable, initialized to SOV_SOCKSTREAM.
 * NOTE(review): presumably the so_version given to newly created sockets;
 * the consumer is not in this part of the file - confirm.
 */
int so_default_version = SOV_SOCKSTREAM;

#ifdef DEBUG
/* Set sockdebug to print debug messages when SO_DEBUG is set */
int sockdebug = 0;

/* Set sockprinterr to print error messages when SO_DEBUG is set */
int sockprinterr = 0;

/*
 * Set so_default_options to SO_DEBUG if all sockets should be created
 * with SO_DEBUG set. This is needed to get debug printouts from the
 * socket() call itself.
 * The value is copied into so_options by so_basic_strinit().
 */
int so_default_options = 0;
#endif /* DEBUG */

#ifdef SOCK_TEST
/*
 * Set to number of ticks to limit cv_waits for code coverage testing.
 * Set to 1000 when SO_DEBUG is set to 2.
 * sowaitack() substitutes this value when it is asked to wait without a
 * time limit.
 */
clock_t sock_test_timelimit = 0;
#endif /* SOCK_TEST */

/*
 * For concurrency testing of e.g. opening /dev/ip which does not
 * handle T_INFO_REQ messages.
 * When non-zero, do_tinfo() and do_tcapability() skip the info request
 * and record an address size of 0 instead.
 */
int so_no_tinfo = 0;

/*
 * Timeout for getting a T_CAPABILITY_ACK - it is possible for a provider
 * to simply ignore the T_CAPABILITY_REQ.
 * do_tcapability() multiplies this by hz to obtain the wait in ticks.
 */
clock_t	sock_capability_timeout	= 2;	/* seconds */

/* Forward declarations of static helpers defined further down. */
static int	do_tcapability(struct sonode *so, t_uscalar_t cap_bits1);
static void	so_removehooks(struct sonode *so);

/*
 * Stream head read-side callbacks; so_installhooks() registers them with
 * strsetrputhooks(). Their definitions are outside this part of the file.
 */
static mblk_t *strsock_proto(vnode_t *vp, mblk_t *mp,
		strwakeup_t *wakeups, strsigset_t *firstmsgsigs,
		strsigset_t *allmsgsigs, strpollset_t *pollwakeups);
static mblk_t *strsock_misc(vnode_t *vp, mblk_t *mp,
		strwakeup_t *wakeups, strsigset_t *firstmsgsigs,
		strsigset_t *allmsgsigs, strpollset_t *pollwakeups);
117
118/*
119 * Convert a socket to a stream. Invoked when the illusory sockmod
120 * is popped from the stream.
121 * Change the stream head back to default operation without losing
122 * any messages (T_conn_ind's are moved to the stream head queue).
123 */
124int
125so_sock2stream(struct sonode *so)
126{
127	struct vnode		*vp = SOTOV(so);
128	queue_t			*rq;
129	mblk_t			*mp;
130	int			error = 0;
131	sotpi_info_t		*sti = SOTOTPI(so);
132
133	ASSERT(MUTEX_HELD(&sti->sti_plumb_lock));
134
135	mutex_enter(&so->so_lock);
136	so_lock_single(so);
137
138	ASSERT(so->so_version != SOV_STREAM);
139
140	if (sti->sti_direct) {
141		mblk_t **mpp;
142		int rval;
143
144		/*
145		 * Tell the transport below that sockmod is being popped
146		 */
147		mutex_exit(&so->so_lock);
148		error = strioctl(vp, _SIOCSOCKFALLBACK, 0, 0, K_TO_K, CRED(),
149		    &rval);
150		mutex_enter(&so->so_lock);
151		if (error != 0) {
152			dprintso(so, 0, ("so_sock2stream(%p): "
153			    "_SIOCSOCKFALLBACK failed\n", (void *)so));
154			goto exit;
155		}
156		sti->sti_direct = 0;
157
158		for (mpp = &sti->sti_conn_ind_head; (mp = *mpp) != NULL;
159		    mpp = &mp->b_next) {
160			struct T_conn_ind	*conn_ind;
161
162			/*
163			 * strsock_proto() has already verified the length of
164			 * this message block.
165			 */
166			ASSERT(MBLKL(mp) >= sizeof (struct T_conn_ind));
167
168			conn_ind = (struct T_conn_ind *)mp->b_rptr;
169			if (conn_ind->OPT_length == 0 &&
170			    conn_ind->OPT_offset == 0)
171				continue;
172
173			if (DB_REF(mp) > 1) {
174				mblk_t	*newmp;
175				size_t	length;
176				cred_t	*cr;
177				pid_t	cpid;
178				int error;	/* Dummy - error not returned */
179
180				/*
181				 * Copy the message block because it is used
182				 * elsewhere, too.
183				 * Can't use copyb since we want to wait
184				 * yet allow for EINTR.
185				 */
186				/* Round up size for reuse */
187				length = MAX(MBLKL(mp), 64);
188				cr = msg_getcred(mp, &cpid);
189				if (cr != NULL) {
190					newmp = allocb_cred_wait(length, 0,
191					    &error, cr, cpid);
192				} else {
193					newmp = allocb_wait(length, 0, 0,
194					    &error);
195				}
196				if (newmp == NULL) {
197					error = EINTR;
198					goto exit;
199				}
200				bcopy(mp->b_rptr, newmp->b_wptr, length);
201				newmp->b_wptr += length;
202				newmp->b_next = mp->b_next;
203
204				/*
205				 * Link the new message block into the queue
206				 * and free the old one.
207				 */
208				*mpp = newmp;
209				mp->b_next = NULL;
210				freemsg(mp);
211
212				mp = newmp;
213				conn_ind = (struct T_conn_ind *)mp->b_rptr;
214			}
215
216			/*
217			 * Remove options added by TCP for accept fast-path.
218			 */
219			conn_ind->OPT_length = 0;
220			conn_ind->OPT_offset = 0;
221		}
222	}
223
224	so->so_version = SOV_STREAM;
225	so->so_proto_handle = NULL;
226
227	/*
228	 * Remove the hooks in the stream head to avoid queuing more
229	 * packets in sockfs.
230	 */
231	mutex_exit(&so->so_lock);
232	so_removehooks(so);
233	mutex_enter(&so->so_lock);
234
235	/*
236	 * Clear any state related to urgent data. Leave any T_EXDATA_IND
237	 * on the queue - the behavior of urgent data after a switch is
238	 * left undefined.
239	 */
240	so->so_error = sti->sti_delayed_error = 0;
241	freemsg(so->so_oobmsg);
242	so->so_oobmsg = NULL;
243	sti->sti_oobsigcnt = sti->sti_oobcnt = 0;
244
245	so->so_state &= ~(SS_RCVATMARK|SS_OOBPEND|SS_HAVEOOBDATA|SS_HADOOBDATA|
246	    SS_SAVEDEOR);
247	ASSERT(so_verify_oobstate(so));
248
249	freemsg(sti->sti_ack_mp);
250	sti->sti_ack_mp = NULL;
251
252	/*
253	 * Flush the T_DISCON_IND on sti_discon_ind_mp.
254	 */
255	so_flush_discon_ind(so);
256
257	/*
258	 * Move any queued T_CONN_IND messages to stream head queue.
259	 */
260	rq = RD(strvp2wq(vp));
261	while ((mp = sti->sti_conn_ind_head) != NULL) {
262		sti->sti_conn_ind_head = mp->b_next;
263		mp->b_next = NULL;
264		if (sti->sti_conn_ind_head == NULL) {
265			ASSERT(sti->sti_conn_ind_tail == mp);
266			sti->sti_conn_ind_tail = NULL;
267		}
268		dprintso(so, 0,
269		    ("so_sock2stream(%p): moving T_CONN_IND\n", (void *)so));
270
271		/* Drop lock across put() */
272		mutex_exit(&so->so_lock);
273		put(rq, mp);
274		mutex_enter(&so->so_lock);
275	}
276
277exit:
278	ASSERT(MUTEX_HELD(&so->so_lock));
279	so_unlock_single(so, SOLOCKED);
280	mutex_exit(&so->so_lock);
281	return (error);
282}
283
/*
 * Convert a stream back to a socket. This is invoked when the illusory
 * sockmod is pushed on a stream (where the stream was "created" by
 * popping the illusory sockmod).
 * This routine can not recreate the socket state (certain aspects of
 * it like urgent data state and the bound/connected addresses for AF_UNIX
 * sockets can not be recreated by asking the transport for information).
 * Thus this routine implicitly assumes that the socket is in an initial
 * state (as if it was just created). It flushes any messages queued on the
 * read queue to avoid dealing with e.g. TPI acks or T_exdata_ind messages.
 *
 * The caller must hold sti_plumb_lock. so_lock is taken and dropped
 * several times inside; it is not held across the stream head calls.
 */
void
so_stream2sock(struct sonode *so)
{
	struct vnode *vp = SOTOV(so);
	sotpi_info_t *sti = SOTOTPI(so);

	ASSERT(MUTEX_HELD(&sti->sti_plumb_lock));

	/* Single-thread the socket and flip the version back */
	mutex_enter(&so->so_lock);
	so_lock_single(so);
	ASSERT(so->so_version == SOV_STREAM);
	so->so_version = SOV_SOCKSTREAM;
	sti->sti_pushcnt = 0;
	mutex_exit(&so->so_lock);

	/*
	 * Set a permanent error to force any thread in sorecvmsg to
	 * return (and drop SOREADLOCKED). Clear the error once
	 * we have SOREADLOCKED.
	 * This makes a read sleeping during the I_PUSH of sockmod return
	 * EIO.
	 */
	strsetrerror(SOTOV(so), EIO, 1, NULL);

	/*
	 * Get the read lock before flushing data to avoid
	 * problems with the T_EXDATA_IND MSG_PEEK code in sorecvmsg.
	 */
	mutex_enter(&so->so_lock);
	(void) so_lock_read(so, 0);	/* Set SOREADLOCKED */
	mutex_exit(&so->so_lock);

	/* Readers are locked out now: clear the error, reinstall the hooks */
	strsetrerror(SOTOV(so), 0, 0, NULL);
	so_installhooks(so);

	/*
	 * Flush everything on the read queue.
	 * This ensures that no T_CONN_IND remain and that no T_EXDATA_IND
	 * remain; those types of messages would confuse sockfs.
	 */
	strflushrq(vp, FLUSHALL);
	mutex_enter(&so->so_lock);

	/*
	 * Flush the T_DISCON_IND on sti_discon_ind_mp.
	 */
	so_flush_discon_ind(so);
	so_unlock_read(so);	/* Clear SOREADLOCKED */

	so_unlock_single(so, SOLOCKED);
	mutex_exit(&so->so_lock);
}
347
348/*
349 * Install the hooks in the stream head.
350 */
351void
352so_installhooks(struct sonode *so)
353{
354	struct vnode *vp = SOTOV(so);
355
356	strsetrputhooks(vp, SH_SIGALLDATA | SH_IGN_ZEROLEN | SH_CONSOL_DATA,
357	    strsock_proto, strsock_misc);
358	strsetwputhooks(vp, SH_SIGPIPE | SH_RECHECK_ERR, 0);
359}
360
361/*
362 * Remove the hooks in the stream head.
363 */
364static void
365so_removehooks(struct sonode *so)
366{
367	struct vnode *vp = SOTOV(so);
368
369	strsetrputhooks(vp, 0, NULL, NULL);
370	strsetwputhooks(vp, 0, STRTIMOUT);
371	/*
372	 * Leave read behavior as it would have been for a normal
373	 * stream i.e. a read of an M_PROTO will fail.
374	 */
375}
376
377void
378so_basic_strinit(struct sonode *so)
379{
380	struct vnode *vp = SOTOV(so);
381	struct stdata *stp;
382	mblk_t *mp;
383	sotpi_info_t *sti = SOTOTPI(so);
384
385	/* Preallocate an unbind_req message */
386	mp = soallocproto(sizeof (struct T_unbind_req), _ALLOC_SLEEP, CRED());
387	mutex_enter(&so->so_lock);
388	sti->sti_unbind_mp = mp;
389#ifdef DEBUG
390	so->so_options = so_default_options;
391#endif /* DEBUG */
392	mutex_exit(&so->so_lock);
393
394	so_installhooks(so);
395
396	stp = vp->v_stream;
397	/*
398	 * Have to keep minpsz at zero in order to allow write/send of zero
399	 * bytes.
400	 */
401	mutex_enter(&stp->sd_lock);
402	if (stp->sd_qn_minpsz == 1)
403		stp->sd_qn_minpsz = 0;
404	mutex_exit(&stp->sd_lock);
405}
406
/*
 * Initialize the streams side of a socket including
 * T_info_req/ack processing. If tso is not NULL its values are used thereby
 * avoiding the T_INFO_REQ.
 *
 * so  - socket being initialized.
 * tso - optional template socket whose transport parameters are copied.
 *
 * Returns 0 on success or the error from do_tcapability().
 */
int
so_strinit(struct sonode *so, struct sonode *tso)
{
	sotpi_info_t *sti = SOTOTPI(so);
	sotpi_info_t *tsti;
	int error;

	so_basic_strinit(so);

	/*
	 * The T_CAPABILITY_REQ should be the first message sent down because
	 * at least TCP has a fast-path for this which avoids timeouts while
	 * waiting for the T_CAPABILITY_ACK under high system load.
	 */
	if (tso == NULL) {
		/* No template: ask the transport for everything */
		error = do_tcapability(so, TC1_ACCEPTOR_ID | TC1_INFO);
		if (error)
			return (error);
	} else {
		tsti = SOTOTPI(tso);

		/*
		 * Inherit the transport parameters from the template. The
		 * acceptor id flag is not inherited.
		 */
		mutex_enter(&so->so_lock);
		sti->sti_tsdu_size = tsti->sti_tsdu_size;
		sti->sti_etsdu_size = tsti->sti_etsdu_size;
		sti->sti_addr_size = tsti->sti_addr_size;
		sti->sti_opt_size = tsti->sti_opt_size;
		sti->sti_tidu_size = tsti->sti_tidu_size;
		sti->sti_serv_type = tsti->sti_serv_type;
		so->so_mode = tso->so_mode & ~SM_ACCEPTOR_ID;
		mutex_exit(&so->so_lock);

		/* the following do_tcapability may update so->so_mode */
		if ((tsti->sti_serv_type != T_CLTS) &&
		    (sti->sti_direct == 0)) {
			error = do_tcapability(so, TC1_ACCEPTOR_ID);
			if (error)
				return (error);
		}
	}
	/*
	 * If the addr_size is 0 we treat it as already bound
	 * and connected. This is used by the routing socket.
	 * We set the addr_size to something in order to allocate the
	 * address structures.
	 */
	if (sti->sti_addr_size == 0) {
		/* NOTE(review): so_state is updated without so_lock here */
		so->so_state |= SS_ISBOUND | SS_ISCONNECTED;
		/* Address size can vary with address families. */
		if (so->so_family == AF_INET6)
			sti->sti_addr_size =
			    (t_scalar_t)sizeof (struct sockaddr_in6);
		else
			sti->sti_addr_size =
			    (t_scalar_t)sizeof (struct sockaddr_in);
		ASSERT(sti->sti_unbind_mp);
	}

	so_alloc_addr(so, sti->sti_addr_size);

	return (0);
}
473
474static void
475copy_tinfo(struct sonode *so, struct T_info_ack *tia)
476{
477	sotpi_info_t *sti = SOTOTPI(so);
478
479	sti->sti_tsdu_size = tia->TSDU_size;
480	sti->sti_etsdu_size = tia->ETSDU_size;
481	sti->sti_addr_size = tia->ADDR_size;
482	sti->sti_opt_size = tia->OPT_size;
483	sti->sti_tidu_size = tia->TIDU_size;
484	sti->sti_serv_type = tia->SERV_type;
485	switch (tia->CURRENT_state) {
486	case TS_UNBND:
487		break;
488	case TS_IDLE:
489		so->so_state |= SS_ISBOUND;
490		sti->sti_laddr_len = 0;
491		sti->sti_laddr_valid = 0;
492		break;
493	case TS_DATA_XFER:
494		so->so_state |= SS_ISBOUND|SS_ISCONNECTED;
495		sti->sti_laddr_len = 0;
496		sti->sti_faddr_len = 0;
497		sti->sti_laddr_valid = 0;
498		sti->sti_faddr_valid = 0;
499		break;
500	}
501
502	/*
503	 * Heuristics for determining the socket mode flags
504	 * (SM_ATOMIC, SM_CONNREQUIRED, SM_ADDR, SM_FDPASSING,
505	 * and SM_EXDATA, SM_OPTDATA, and SM_BYTESTREAM)
506	 * from the info ack.
507	 */
508	if (sti->sti_serv_type == T_CLTS) {
509		so->so_mode |= SM_ATOMIC | SM_ADDR;
510	} else {
511		so->so_mode |= SM_CONNREQUIRED;
512		if (sti->sti_etsdu_size != 0 && sti->sti_etsdu_size != -2)
513			so->so_mode |= SM_EXDATA;
514	}
515	if (so->so_type == SOCK_SEQPACKET || so->so_type == SOCK_RAW) {
516		/* Semantics are to discard tail end of messages */
517		so->so_mode |= SM_ATOMIC;
518	}
519	if (so->so_family == AF_UNIX) {
520		so->so_mode |= SM_FDPASSING | SM_OPTDATA;
521		if (sti->sti_addr_size == -1) {
522			/* MAXPATHLEN + soun_family + nul termination */
523			sti->sti_addr_size = (t_scalar_t)(MAXPATHLEN +
524			    sizeof (short) + 1);
525		}
526		if (so->so_type == SOCK_STREAM) {
527			/*
528			 * Make it into a byte-stream transport.
529			 * SOCK_SEQPACKET sockets are unchanged.
530			 */
531			sti->sti_tsdu_size = 0;
532		}
533	} else if (sti->sti_addr_size == -1) {
534		/*
535		 * Logic extracted from sockmod - have to pick some max address
536		 * length in order to preallocate the addresses.
537		 */
538		sti->sti_addr_size = SOA_DEFSIZE;
539	}
540	if (sti->sti_tsdu_size == 0)
541		so->so_mode |= SM_BYTESTREAM;
542}
543
544static int
545check_tinfo(struct sonode *so)
546{
547	sotpi_info_t *sti = SOTOTPI(so);
548
549	/* Consistency checks */
550	if (so->so_type == SOCK_DGRAM && sti->sti_serv_type != T_CLTS) {
551		eprintso(so, ("service type and socket type mismatch\n"));
552		eprintsoline(so, EPROTO);
553		return (EPROTO);
554	}
555	if (so->so_type == SOCK_STREAM && sti->sti_serv_type == T_CLTS) {
556		eprintso(so, ("service type and socket type mismatch\n"));
557		eprintsoline(so, EPROTO);
558		return (EPROTO);
559	}
560	if (so->so_type == SOCK_SEQPACKET && sti->sti_serv_type == T_CLTS) {
561		eprintso(so, ("service type and socket type mismatch\n"));
562		eprintsoline(so, EPROTO);
563		return (EPROTO);
564	}
565	if (so->so_family == AF_INET &&
566	    sti->sti_addr_size != (t_scalar_t)sizeof (struct sockaddr_in)) {
567		eprintso(so,
568		    ("AF_INET must have sockaddr_in address length. Got %d\n",
569		    sti->sti_addr_size));
570		eprintsoline(so, EMSGSIZE);
571		return (EMSGSIZE);
572	}
573	if (so->so_family == AF_INET6 &&
574	    sti->sti_addr_size != (t_scalar_t)sizeof (struct sockaddr_in6)) {
575		eprintso(so,
576		    ("AF_INET6 must have sockaddr_in6 address length. Got %d\n",
577		    sti->sti_addr_size));
578		eprintsoline(so, EMSGSIZE);
579		return (EMSGSIZE);
580	}
581
582	dprintso(so, 1, (
583	    "tinfo: serv %d tsdu %d, etsdu %d, addr %d, opt %d, tidu %d\n",
584	    sti->sti_serv_type, sti->sti_tsdu_size, sti->sti_etsdu_size,
585	    sti->sti_addr_size, sti->sti_opt_size,
586	    sti->sti_tidu_size));
587	dprintso(so, 1, ("tinfo: so_state %s\n",
588	    pr_state(so->so_state, so->so_mode)));
589	return (0);
590}
591
/*
 * Send down T_info_req and wait for the ack.
 * Record interesting T_info_ack values in the sonode.
 *
 * Must be called without so_lock held.
 *
 * Returns 0 on success (also when so_no_tinfo is set, in which case the
 * address size is recorded as 0 and nothing is sent), ENOBUFS if the
 * request could not be allocated, the error from kstrputmsg() or
 * sowaitprim(), or the result of check_tinfo().
 */
static int
do_tinfo(struct sonode *so)
{
	struct T_info_req tir;
	mblk_t *mp;
	int error;

	ASSERT(MUTEX_NOT_HELD(&so->so_lock));

	if (so_no_tinfo) {
		SOTOTPI(so)->sti_addr_size = 0;
		return (0);
	}

	dprintso(so, 1, ("do_tinfo(%p)\n", (void *)so));

	/* Send T_INFO_REQ */
	tir.PRIM_type = T_INFO_REQ;
	/*
	 * The size passed also covers a T_info_ack - presumably so that
	 * the block is large enough to be reused for the reply; confirm
	 * against soallocproto1().
	 */
	mp = soallocproto1(&tir, sizeof (tir),
	    sizeof (struct T_info_req) + sizeof (struct T_info_ack),
	    _ALLOC_INTR, CRED());
	if (mp == NULL) {
		eprintsoline(so, ENOBUFS);
		return (ENOBUFS);
	}
	/* T_INFO_REQ has to be M_PCPROTO */
	DB_TYPE(mp) = M_PCPROTO;

	/*
	 * mp is not freed here on failure - presumably kstrputmsg()
	 * consumes it either way; confirm.
	 */
	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
	if (error) {
		eprintsoline(so, error);
		return (error);
	}
	mutex_enter(&so->so_lock);
	/* Wait for T_INFO_ACK */
	if ((error = sowaitprim(so, T_INFO_REQ, T_INFO_ACK,
	    (t_uscalar_t)sizeof (struct T_info_ack), &mp, 0))) {
		mutex_exit(&so->so_lock);
		eprintsoline(so, error);
		return (error);
	}

	/* mp now refers to the ack; copy it out under so_lock, then free */
	ASSERT(mp);
	copy_tinfo(so, (struct T_info_ack *)mp->b_rptr);
	mutex_exit(&so->so_lock);
	freemsg(mp);
	return (check_tinfo(so));
}
645
/*
 * Send down T_capability_req and wait for the ack.
 * Record interesting T_capability_ack values in the sonode.
 *
 * cap_bits1 selects what is requested; it must be non-zero and contain
 * only TC1_ACCEPTOR_ID and/or TC1_INFO. Must be called without so_lock
 * held.
 *
 * If the provider is already known not to support T_CAPABILITY_REQ
 * (tpi_capability == PI_NO) this simply calls do_tinfo(). If waiting for
 * the ack fails, a provider in state PI_DONTKNOW is marked PI_NO and,
 * when TC1_INFO was requested, do_tinfo() is used instead.
 *
 * Returns 0 on success, otherwise an errno (ENOBUFS, the error from
 * kstrputmsg(), do_tinfo() or check_tinfo(), or ENOSR as described
 * in the body).
 */
static int
do_tcapability(struct sonode *so, t_uscalar_t cap_bits1)
{
	struct T_capability_req tcr;
	struct T_capability_ack *tca;
	mblk_t *mp;
	int error;
	sotpi_info_t *sti = SOTOTPI(so);

	ASSERT(cap_bits1 != 0);
	ASSERT((cap_bits1 & ~(TC1_ACCEPTOR_ID | TC1_INFO)) == 0);
	ASSERT(MUTEX_NOT_HELD(&so->so_lock));

	if (sti->sti_provinfo->tpi_capability == PI_NO)
		return (do_tinfo(so));

	if (so_no_tinfo) {
		/* Testing aid: skip the info part, done if nothing remains */
		sti->sti_addr_size = 0;
		if ((cap_bits1 &= ~TC1_INFO) == 0)
			return (0);
	}

	dprintso(so, 1, ("do_tcapability(%p)\n", (void *)so));

	/* Send T_CAPABILITY_REQ */
	tcr.PRIM_type = T_CAPABILITY_REQ;
	tcr.CAP_bits1 = cap_bits1;
	mp = soallocproto1(&tcr, sizeof (tcr),
	    sizeof (struct T_capability_req) + sizeof (struct T_capability_ack),
	    _ALLOC_INTR, CRED());
	if (mp == NULL) {
		eprintsoline(so, ENOBUFS);
		return (ENOBUFS);
	}
	/* T_CAPABILITY_REQ should be M_PCPROTO here */
	DB_TYPE(mp) = M_PCPROTO;

	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
	if (error) {
		eprintsoline(so, error);
		return (error);
	}
	mutex_enter(&so->so_lock);
	/* Wait for T_CAPABILITY_ACK */
	if ((error = sowaitprim(so, T_CAPABILITY_REQ, T_CAPABILITY_ACK,
	    (t_uscalar_t)sizeof (*tca), &mp, sock_capability_timeout * hz))) {
		mutex_exit(&so->so_lock);
		/* Remember that this provider did not answer the request */
		PI_PROVLOCK(sti->sti_provinfo);
		if (sti->sti_provinfo->tpi_capability == PI_DONTKNOW)
			sti->sti_provinfo->tpi_capability = PI_NO;
		PI_PROVUNLOCK(sti->sti_provinfo);
		ASSERT((so->so_mode & SM_ACCEPTOR_ID) == 0);
		if (cap_bits1 & TC1_INFO) {
			/*
			 * If the T_CAPABILITY_REQ timed out and then a
			 * T_INFO_REQ gets a protocol error, most likely
			 * the capability was slow (vs. unsupported). Return
			 * ENOSR for this case as a best guess.
			 */
			if (error == ETIME) {
				return ((error = do_tinfo(so)) == EPROTO ?
				    ENOSR : error);
			}
			return (do_tinfo(so));
		}
		/* Only the acceptor id was wanted; do without it */
		return (0);
	}

	ASSERT(mp);
	tca = (struct T_capability_ack *)mp->b_rptr;

	ASSERT((cap_bits1 & TC1_INFO) == (tca->CAP_bits1 & TC1_INFO));
	so_proc_tcapability_ack(so, tca);

	/* Keep the acked bits; tca is invalid once mp is freed */
	cap_bits1 = tca->CAP_bits1;

	mutex_exit(&so->so_lock);
	freemsg(mp);

	if (cap_bits1 & TC1_INFO)
		return (check_tinfo(so));

	return (0);
}
735
736/*
737 * Process a T_CAPABILITY_ACK
738 */
739void
740so_proc_tcapability_ack(struct sonode *so, struct T_capability_ack *tca)
741{
742	sotpi_info_t *sti = SOTOTPI(so);
743
744	if (sti->sti_provinfo->tpi_capability == PI_DONTKNOW) {
745		PI_PROVLOCK(sti->sti_provinfo);
746		sti->sti_provinfo->tpi_capability = PI_YES;
747		PI_PROVUNLOCK(sti->sti_provinfo);
748	}
749
750	if (tca->CAP_bits1 & TC1_ACCEPTOR_ID) {
751		sti->sti_acceptor_id = tca->ACCEPTOR_id;
752		so->so_mode |= SM_ACCEPTOR_ID;
753	}
754
755	if (tca->CAP_bits1 & TC1_INFO)
756		copy_tinfo(so, &tca->INFO_ack);
757}
758
759/*
760 * Retrieve socket error, clear error if not peek.
761 */
762int
763sogeterr(struct sonode *so, boolean_t clear_err)
764{
765	int error;
766
767	ASSERT(MUTEX_HELD(&so->so_lock));
768
769	error = so->so_error;
770	if (clear_err)
771		so->so_error = 0;
772
773	return (error);
774}
775
776/*
777 * This routine is registered with the stream head to retrieve read
778 * side errors.
779 * It does not clear the socket error for a peeking read side operation.
780 * It the error is to be cleared it sets *clearerr.
781 */
782int
783sogetrderr(vnode_t *vp, int ispeek, int *clearerr)
784{
785	struct sonode *so = VTOSO(vp);
786	int error;
787
788	mutex_enter(&so->so_lock);
789	if (ispeek) {
790		error = so->so_error;
791		*clearerr = 0;
792	} else {
793		error = so->so_error;
794		so->so_error = 0;
795		*clearerr = 1;
796	}
797	mutex_exit(&so->so_lock);
798	return (error);
799}
800
801/*
802 * This routine is registered with the stream head to retrieve write
803 * side errors.
804 * It does not clear the socket error for a peeking read side operation.
805 * It the error is to be cleared it sets *clearerr.
806 */
807int
808sogetwrerr(vnode_t *vp, int ispeek, int *clearerr)
809{
810	struct sonode *so = VTOSO(vp);
811	int error;
812
813	mutex_enter(&so->so_lock);
814	if (so->so_state & SS_CANTSENDMORE) {
815		error = EPIPE;
816		*clearerr = 0;
817	} else {
818		error = so->so_error;
819		if (ispeek) {
820			*clearerr = 0;
821		} else {
822			so->so_error = 0;
823			*clearerr = 1;
824		}
825	}
826	mutex_exit(&so->so_lock);
827	return (error);
828}
829
830/*
831 * Set a nonpersistent read and write error on the socket.
832 * Used when there is a T_uderror_ind for a connected socket.
833 * The caller also needs to call strsetrerror and strsetwerror
834 * after dropping the lock.
835 */
836void
837soseterror(struct sonode *so, int error)
838{
839	ASSERT(error != 0);
840
841	ASSERT(MUTEX_HELD(&so->so_lock));
842	so->so_error = (ushort_t)error;
843}
844
845void
846soisconnecting(struct sonode *so)
847{
848	ASSERT(MUTEX_HELD(&so->so_lock));
849	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
850	so->so_state |= SS_ISCONNECTING;
851	cv_broadcast(&so->so_state_cv);
852}
853
854void
855soisconnected(struct sonode *so)
856{
857	ASSERT(MUTEX_HELD(&so->so_lock));
858	so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING);
859	so->so_state |= SS_ISCONNECTED;
860	cv_broadcast(&so->so_state_cv);
861}
862
863/*
864 * The caller also needs to call strsetrerror, strsetwerror and strseteof.
865 */
866void
867soisdisconnected(struct sonode *so, int error)
868{
869	ASSERT(MUTEX_HELD(&so->so_lock));
870	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
871	so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE);
872	so->so_error = (ushort_t)error;
873	if (so->so_peercred != NULL) {
874		crfree(so->so_peercred);
875		so->so_peercred = NULL;
876	}
877	cv_broadcast(&so->so_state_cv);
878}
879
880/*
881 * For connected AF_UNIX SOCK_DGRAM sockets when the peer closes.
882 * Does not affect write side.
883 * The caller also has to call strsetrerror.
884 */
885static void
886sobreakconn(struct sonode *so, int error)
887{
888	ASSERT(MUTEX_HELD(&so->so_lock));
889	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
890	so->so_error = (ushort_t)error;
891	cv_broadcast(&so->so_state_cv);
892}
893
894/*
895 * Can no longer send.
896 * Caller must also call strsetwerror.
897 *
898 * We mark the peer address as no longer valid for getpeername, but
899 * leave it around for so_unix_close to notify the peer (that
900 * transport has no addressing held at that layer).
901 */
902void
903socantsendmore(struct sonode *so)
904{
905	ASSERT(MUTEX_HELD(&so->so_lock));
906	so->so_state |= SS_CANTSENDMORE;
907	cv_broadcast(&so->so_state_cv);
908}
909
910/*
911 * The caller must call strseteof(,1) as well as this routine
912 * to change the socket state.
913 */
914void
915socantrcvmore(struct sonode *so)
916{
917	ASSERT(MUTEX_HELD(&so->so_lock));
918	so->so_state |= SS_CANTRCVMORE;
919	cv_broadcast(&so->so_state_cv);
920}
921
922/*
923 * The caller has sent down a "request_prim" primitive and wants to wait for
924 * an ack ("ack_prim") or an T_ERROR_ACK for it.
925 * The specified "ack_prim" can be a T_OK_ACK.
926 *
927 * Assumes that all the TPI acks are M_PCPROTO messages.
928 *
929 * Note that the socket is single-threaded (using so_lock_single)
930 * for all operations that generate TPI ack messages. Since
931 * only TPI ack messages are M_PCPROTO we should never receive
932 * anything except either the ack we are expecting or a T_ERROR_ACK
933 * for the same primitive.
934 */
935int
936sowaitprim(struct sonode *so, t_scalar_t request_prim, t_scalar_t ack_prim,
937	    t_uscalar_t min_size, mblk_t **mpp, clock_t wait)
938{
939	mblk_t *mp;
940	union T_primitives *tpr;
941	int error;
942
943	dprintso(so, 1, ("sowaitprim(%p, %d, %d, %d, %p, %lu)\n",
944	    (void *)so, request_prim, ack_prim, min_size, (void *)mpp, wait));
945
946	ASSERT(MUTEX_HELD(&so->so_lock));
947
948	error = sowaitack(so, &mp, wait);
949	if (error)
950		return (error);
951
952	dprintso(so, 1, ("got msg %p\n", (void *)mp));
953	if (DB_TYPE(mp) != M_PCPROTO ||
954	    MBLKL(mp) < sizeof (tpr->type)) {
955		freemsg(mp);
956		eprintsoline(so, EPROTO);
957		return (EPROTO);
958	}
959	tpr = (union T_primitives *)mp->b_rptr;
960	/*
961	 * Did we get the primitive that we were asking for?
962	 * For T_OK_ACK we also check that it matches the request primitive.
963	 */
964	if (tpr->type == ack_prim &&
965	    (ack_prim != T_OK_ACK ||
966	    tpr->ok_ack.CORRECT_prim == request_prim)) {
967		if (MBLKL(mp) >= (ssize_t)min_size) {
968			/* Found what we are looking for */
969			*mpp = mp;
970			return (0);
971		}
972		/* Too short */
973		freemsg(mp);
974		eprintsoline(so, EPROTO);
975		return (EPROTO);
976	}
977
978	if (tpr->type == T_ERROR_ACK &&
979	    tpr->error_ack.ERROR_prim == request_prim) {
980		/* Error to the primitive we were looking for */
981		if (tpr->error_ack.TLI_error == TSYSERR) {
982			error = tpr->error_ack.UNIX_error;
983		} else {
984			error = proto_tlitosyserr(tpr->error_ack.TLI_error);
985		}
986		dprintso(so, 0, ("error_ack for %d: %d/%d ->%d\n",
987		    tpr->error_ack.ERROR_prim, tpr->error_ack.TLI_error,
988		    tpr->error_ack.UNIX_error, error));
989		freemsg(mp);
990		return (error);
991	}
992	/*
993	 * Wrong primitive or T_ERROR_ACK for the wrong primitive
994	 */
995#ifdef DEBUG
996	if (tpr->type == T_ERROR_ACK) {
997		dprintso(so, 0, ("error_ack for %d: %d/%d\n",
998		    tpr->error_ack.ERROR_prim, tpr->error_ack.TLI_error,
999		    tpr->error_ack.UNIX_error));
1000	} else if (tpr->type == T_OK_ACK) {
1001		dprintso(so, 0, ("ok_ack for %d, expected %d for %d\n",
1002		    tpr->ok_ack.CORRECT_prim, ack_prim, request_prim));
1003	} else {
1004		dprintso(so, 0,
1005		    ("unexpected primitive %d, expected %d for %d\n",
1006		    tpr->type, ack_prim, request_prim));
1007	}
1008#endif /* DEBUG */
1009
1010	freemsg(mp);
1011	eprintsoline(so, EPROTO);
1012	return (EPROTO);
1013}
1014
1015/*
1016 * Wait for a T_OK_ACK for the specified primitive.
1017 */
1018int
1019sowaitokack(struct sonode *so, t_scalar_t request_prim)
1020{
1021	mblk_t *mp;
1022	int error;
1023
1024	error = sowaitprim(so, request_prim, T_OK_ACK,
1025	    (t_uscalar_t)sizeof (struct T_ok_ack), &mp, 0);
1026	if (error)
1027		return (error);
1028	freemsg(mp);
1029	return (0);
1030}
1031
1032/*
1033 * Queue a received TPI ack message on sti_ack_mp.
1034 */
1035void
1036soqueueack(struct sonode *so, mblk_t *mp)
1037{
1038	sotpi_info_t *sti = SOTOTPI(so);
1039
1040	if (DB_TYPE(mp) != M_PCPROTO) {
1041		zcmn_err(getzoneid(), CE_WARN,
1042		    "sockfs: received unexpected M_PROTO TPI ack. Prim %d\n",
1043		    *(t_scalar_t *)mp->b_rptr);
1044		freemsg(mp);
1045		return;
1046	}
1047
1048	mutex_enter(&so->so_lock);
1049	if (sti->sti_ack_mp != NULL) {
1050		dprintso(so, 1, ("sti_ack_mp already set\n"));
1051		freemsg(sti->sti_ack_mp);
1052		sti->sti_ack_mp = NULL;
1053	}
1054	sti->sti_ack_mp = mp;
1055	cv_broadcast(&sti->sti_ack_cv);
1056	mutex_exit(&so->so_lock);
1057}
1058
1059/*
1060 * Wait for a TPI ack ignoring signals and errors.
1061 */
1062int
1063sowaitack(struct sonode *so, mblk_t **mpp, clock_t wait)
1064{
1065	sotpi_info_t *sti = SOTOTPI(so);
1066
1067	ASSERT(MUTEX_HELD(&so->so_lock));
1068
1069	while (sti->sti_ack_mp == NULL) {
1070#ifdef SOCK_TEST
1071		if (wait == 0 && sock_test_timelimit != 0)
1072			wait = sock_test_timelimit;
1073#endif
1074		if (wait != 0) {
1075			/*
1076			 * Only wait for the time limit.
1077			 */
1078			if (cv_reltimedwait(&sti->sti_ack_cv, &so->so_lock,
1079			    wait, TR_CLOCK_TICK) == -1) {
1080				eprintsoline(so, ETIME);
1081				return (ETIME);
1082			}
1083		}
1084		else
1085			cv_wait(&sti->sti_ack_cv, &so->so_lock);
1086	}
1087	*mpp = sti->sti_ack_mp;
1088#ifdef DEBUG
1089	{
1090		union T_primitives *tpr;
1091		mblk_t *mp = *mpp;
1092
1093		tpr = (union T_primitives *)mp->b_rptr;
1094		ASSERT(DB_TYPE(mp) == M_PCPROTO);
1095		ASSERT(tpr->type == T_OK_ACK ||
1096		    tpr->type == T_ERROR_ACK ||
1097		    tpr->type == T_BIND_ACK ||
1098		    tpr->type == T_CAPABILITY_ACK ||
1099		    tpr->type == T_INFO_ACK ||
1100		    tpr->type == T_OPTMGMT_ACK);
1101	}
1102#endif /* DEBUG */
1103	sti->sti_ack_mp = NULL;
1104	return (0);
1105}
1106
1107/*
1108 * Queue a received T_CONN_IND message on sti_conn_ind_head/tail.
1109 */
1110void
1111soqueueconnind(struct sonode *so, mblk_t *mp)
1112{
1113	sotpi_info_t *sti = SOTOTPI(so);
1114
1115	if (DB_TYPE(mp) != M_PROTO) {
1116		zcmn_err(getzoneid(), CE_WARN,
1117		    "sockfs: received unexpected M_PCPROTO T_CONN_IND\n");
1118		freemsg(mp);
1119		return;
1120	}
1121
1122	mutex_enter(&so->so_lock);
1123	ASSERT(mp->b_next == NULL);
1124	if (sti->sti_conn_ind_head == NULL) {
1125		sti->sti_conn_ind_head = mp;
1126	} else {
1127		ASSERT(sti->sti_conn_ind_tail->b_next == NULL);
1128		sti->sti_conn_ind_tail->b_next = mp;
1129	}
1130	sti->sti_conn_ind_tail = mp;
1131	/* Wakeup a single consumer of the T_CONN_IND */
1132	cv_signal(&so->so_acceptq_cv);
1133	mutex_exit(&so->so_lock);
1134}
1135
1136/*
1137 * Wait for a T_CONN_IND.
1138 * Don't wait if nonblocking.
1139 * Accept signals and socket errors.
1140 */
1141int
1142sowaitconnind(struct sonode *so, int fmode, mblk_t **mpp)
1143{
1144	mblk_t *mp;
1145	sotpi_info_t *sti = SOTOTPI(so);
1146	int error = 0;
1147
1148	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1149	mutex_enter(&so->so_lock);
1150check_error:
1151	if (so->so_error) {
1152		error = sogeterr(so, B_TRUE);
1153		if (error) {
1154			mutex_exit(&so->so_lock);
1155			return (error);
1156		}
1157	}
1158
1159	if (sti->sti_conn_ind_head == NULL) {
1160		if (fmode & (FNDELAY|FNONBLOCK)) {
1161			error = EWOULDBLOCK;
1162			goto done;
1163		}
1164
1165		if (so->so_state & SS_CLOSING) {
1166			error = EINTR;
1167			goto done;
1168		}
1169
1170		if (!cv_wait_sig_swap(&so->so_acceptq_cv, &so->so_lock)) {
1171			error = EINTR;
1172			goto done;
1173		}
1174		goto check_error;
1175	}
1176	mp = sti->sti_conn_ind_head;
1177	sti->sti_conn_ind_head = mp->b_next;
1178	mp->b_next = NULL;
1179	if (sti->sti_conn_ind_head == NULL) {
1180		ASSERT(sti->sti_conn_ind_tail == mp);
1181		sti->sti_conn_ind_tail = NULL;
1182	}
1183	*mpp = mp;
1184done:
1185	mutex_exit(&so->so_lock);
1186	return (error);
1187}
1188
1189/*
1190 * Flush a T_CONN_IND matching the sequence number from the list.
1191 * Return zero if found; non-zero otherwise.
1192 * This is called very infrequently thus it is ok to do a linear search.
1193 */
1194int
1195soflushconnind(struct sonode *so, t_scalar_t seqno)
1196{
1197	mblk_t *prevmp, *mp;
1198	struct T_conn_ind *tci;
1199	sotpi_info_t *sti = SOTOTPI(so);
1200
1201	mutex_enter(&so->so_lock);
1202	for (prevmp = NULL, mp = sti->sti_conn_ind_head; mp != NULL;
1203	    prevmp = mp, mp = mp->b_next) {
1204		tci = (struct T_conn_ind *)mp->b_rptr;
1205		if (tci->SEQ_number == seqno) {
1206			dprintso(so, 1,
1207			    ("t_discon_ind: found T_CONN_IND %d\n", seqno));
1208			/* Deleting last? */
1209			if (sti->sti_conn_ind_tail == mp) {
1210				sti->sti_conn_ind_tail = prevmp;
1211			}
1212			if (prevmp == NULL) {
1213				/* Deleting first */
1214				sti->sti_conn_ind_head = mp->b_next;
1215			} else {
1216				prevmp->b_next = mp->b_next;
1217			}
1218			mp->b_next = NULL;
1219
1220			ASSERT((sti->sti_conn_ind_head == NULL &&
1221			    sti->sti_conn_ind_tail == NULL) ||
1222			    (sti->sti_conn_ind_head != NULL &&
1223			    sti->sti_conn_ind_tail != NULL));
1224
1225			so->so_error = ECONNABORTED;
1226			mutex_exit(&so->so_lock);
1227
1228			freemsg(mp);
1229			return (0);
1230		}
1231	}
1232	mutex_exit(&so->so_lock);
1233	dprintso(so, 1,	("t_discon_ind: NOT found T_CONN_IND %d\n", seqno));
1234	return (-1);
1235}
1236
/*
 * Sleep while the socket is connecting (SS_ISCONNECTING set,
 * SS_ISCONNECTED clear) and no socket error is pending.
 *
 * The caller must hold so_lock. fmode carries any nonblocking flags and a
 * non-zero nosig asks for a sleep that signals cannot interrupt.
 *
 * Returns:
 *	0		the socket is connected
 *	EINPROGRESS	still connecting and fmode has FNDELAY/FNONBLOCK
 *	EINTR		still connecting and the socket is SS_CLOSING, or
 *			the sleep was interrupted by a signal
 *	sogeterr()	a socket error is pending once the wait is over
 *	ECONNREFUSED	wait is over with no error and no connection
 */
int
sowaitconnected(struct sonode *so, int fmode, int nosig)
{
	int error;

	ASSERT(MUTEX_HELD(&so->so_lock));

	for (;;) {
		/* Done waiting once the state moved on or an error is set. */
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) !=
		    SS_ISCONNECTING || so->so_error != 0)
			break;

		dprintso(so, 1, ("waiting for SS_ISCONNECTED on %p\n",
		    (void *)so));

		if (fmode & (FNDELAY|FNONBLOCK))
			return (EINPROGRESS);

		if (so->so_state & SS_CLOSING)
			return (EINTR);

		if (!nosig) {
			if (!cv_wait_sig_swap(&so->so_state_cv,
			    &so->so_lock)) {
				/*
				 * Interrupted. Hand back EINTR; the
				 * application can use nonblocking techniques
				 * to find out when the connection has been
				 * established.
				 */
				return (EINTR);
			}
		} else {
			cv_wait(&so->so_state_cv, &so->so_lock);
		}
		dprintso(so, 1, ("awoken on %p\n", (void *)so));
	}

	if (so->so_error != 0) {
		error = sogeterr(so, B_TRUE);
		ASSERT(error != 0);
		dprintso(so, 1, ("sowaitconnected: error %d\n", error));
		return (error);
	}

	if (so->so_state & SS_ISCONNECTED)
		return (0);

	/*
	 * Neither connected nor an error to report. A T_ORDREL_IND or a
	 * T_DISCON_IND with zero errno may have been received, or another
	 * thread may have consumed so_error (e.g. by calling read).
	 */
	error = ECONNREFUSED;
	dprintso(so, 1, ("sowaitconnected: error %d\n", error));
	return (error);
}
1291
1292
1293/*
1294 * Handle the signal generation aspect of urgent data.
1295 */
1296static void
1297so_oob_sig(struct sonode *so, int extrasig,
1298    strsigset_t *signals, strpollset_t *pollwakeups)
1299{
1300	sotpi_info_t *sti = SOTOTPI(so);
1301
1302	ASSERT(MUTEX_HELD(&so->so_lock));
1303
1304	ASSERT(so_verify_oobstate(so));
1305	ASSERT(sti->sti_oobsigcnt >= sti->sti_oobcnt);
1306	if (sti->sti_oobsigcnt > sti->sti_oobcnt) {
1307		/*
1308		 * Signal has already been generated once for this
1309		 * urgent "event". However, since TCP can receive updated
1310		 * urgent pointers we still generate a signal.
1311		 */
1312		ASSERT(so->so_state & SS_OOBPEND);
1313		if (extrasig) {
1314			*signals |= S_RDBAND;
1315			*pollwakeups |= POLLRDBAND;
1316		}
1317		return;
1318	}
1319
1320	sti->sti_oobsigcnt++;
1321	ASSERT(sti->sti_oobsigcnt > 0);	/* Wraparound */
1322	ASSERT(sti->sti_oobsigcnt > sti->sti_oobcnt);
1323
1324	/*
1325	 * Record (for select/poll) that urgent data is pending.
1326	 */
1327	so->so_state |= SS_OOBPEND;
1328	/*
1329	 * New urgent data on the way so forget about any old
1330	 * urgent data.
1331	 */
1332	so->so_state &= ~(SS_HAVEOOBDATA|SS_HADOOBDATA);
1333	if (so->so_oobmsg != NULL) {
1334		dprintso(so, 1, ("sock: discarding old oob\n"));
1335		freemsg(so->so_oobmsg);
1336		so->so_oobmsg = NULL;
1337	}
1338	*signals |= S_RDBAND;
1339	*pollwakeups |= POLLRDBAND;
1340	ASSERT(so_verify_oobstate(so));
1341}
1342
1343/*
1344 * Handle the processing of the T_EXDATA_IND with urgent data.
1345 * Returns the T_EXDATA_IND if it should be queued on the read queue.
1346 */
1347/* ARGSUSED2 */
1348static mblk_t *
1349so_oob_exdata(struct sonode *so, mblk_t *mp,
1350	strsigset_t *signals, strpollset_t *pollwakeups)
1351{
1352	sotpi_info_t *sti = SOTOTPI(so);
1353
1354	ASSERT(MUTEX_HELD(&so->so_lock));
1355
1356	ASSERT(so_verify_oobstate(so));
1357
1358	ASSERT(sti->sti_oobsigcnt > sti->sti_oobcnt);
1359
1360	sti->sti_oobcnt++;
1361	ASSERT(sti->sti_oobcnt > 0);	/* wraparound? */
1362	ASSERT(sti->sti_oobsigcnt >= sti->sti_oobcnt);
1363
1364	/*
1365	 * Set MSGMARK for SIOCATMARK.
1366	 */
1367	mp->b_flag |= MSGMARK;
1368
1369	ASSERT(so_verify_oobstate(so));
1370	return (mp);
1371}
1372
1373/*
1374 * Handle the processing of the actual urgent data.
1375 * Returns the data mblk if it should be queued on the read queue.
1376 */
1377static mblk_t *
1378so_oob_data(struct sonode *so, mblk_t *mp,
1379	strsigset_t *signals, strpollset_t *pollwakeups)
1380{
1381	sotpi_info_t *sti = SOTOTPI(so);
1382
1383	ASSERT(MUTEX_HELD(&so->so_lock));
1384
1385	ASSERT(so_verify_oobstate(so));
1386
1387	ASSERT(sti->sti_oobsigcnt >= sti->sti_oobcnt);
1388	ASSERT(mp != NULL);
1389	/*
1390	 * For OOBINLINE we keep the data in the T_EXDATA_IND.
1391	 * Otherwise we store it in so_oobmsg.
1392	 */
1393	ASSERT(so->so_oobmsg == NULL);
1394	if (so->so_options & SO_OOBINLINE) {
1395		*pollwakeups |= POLLIN | POLLRDNORM | POLLRDBAND;
1396		*signals |= S_INPUT | S_RDNORM;
1397	} else {
1398		*pollwakeups |= POLLRDBAND;
1399		so->so_state |= SS_HAVEOOBDATA;
1400		so->so_oobmsg = mp;
1401		mp = NULL;
1402	}
1403	ASSERT(so_verify_oobstate(so));
1404	return (mp);
1405}
1406
1407/*
1408 * Caller must hold the mutex.
1409 * For delayed processing, save the T_DISCON_IND received
1410 * from below on sti_discon_ind_mp.
1411 * When the message is processed the framework will call:
1412 *      (*func)(so, mp);
1413 */
1414static void
1415so_save_discon_ind(struct sonode *so,
1416	mblk_t *mp,
1417	void (*func)(struct sonode *so, mblk_t *))
1418{
1419	sotpi_info_t *sti = SOTOTPI(so);
1420
1421	ASSERT(MUTEX_HELD(&so->so_lock));
1422
1423	/*
1424	 * Discard new T_DISCON_IND if we have already received another.
1425	 * Currently the earlier message can either be on sti_discon_ind_mp
1426	 * or being processed.
1427	 */
1428	if (sti->sti_discon_ind_mp != NULL || (so->so_flag & SOASYNC_UNBIND)) {
1429		zcmn_err(getzoneid(), CE_WARN,
1430		    "sockfs: received unexpected additional T_DISCON_IND\n");
1431		freemsg(mp);
1432		return;
1433	}
1434	mp->b_prev = (mblk_t *)func;
1435	mp->b_next = NULL;
1436	sti->sti_discon_ind_mp = mp;
1437}
1438
1439/*
1440 * Caller must hold the mutex and make sure that either SOLOCKED
1441 * or SOASYNC_UNBIND is set. Called from so_unlock_single().
1442 * Perform delayed processing of T_DISCON_IND message on sti_discon_ind_mp.
1443 * Need to ensure that strsock_proto() will not end up sleeping for
1444 * SOASYNC_UNBIND, while executing this function.
1445 */
1446void
1447so_drain_discon_ind(struct sonode *so)
1448{
1449	mblk_t	*bp;
1450	void (*func)(struct sonode *so, mblk_t *);
1451	sotpi_info_t *sti = SOTOTPI(so);
1452
1453	ASSERT(MUTEX_HELD(&so->so_lock));
1454	ASSERT(so->so_flag & (SOLOCKED|SOASYNC_UNBIND));
1455
1456	/* Process T_DISCON_IND on sti_discon_ind_mp */
1457	if ((bp = sti->sti_discon_ind_mp) != NULL) {
1458		sti->sti_discon_ind_mp = NULL;
1459		func = (void (*)())bp->b_prev;
1460		bp->b_prev = NULL;
1461
1462		/*
1463		 * This (*func) is supposed to generate a message downstream
1464		 * and we need to have a flag set until the corresponding
1465		 * upstream message reaches stream head.
1466		 * When processing T_DISCON_IND in strsock_discon_ind
1467		 * we hold SOASYN_UNBIND when sending T_UNBIND_REQ down and
1468		 * drop the flag after we get the ACK in strsock_proto.
1469		 */
1470		(void) (*func)(so, bp);
1471	}
1472}
1473
1474/*
1475 * Caller must hold the mutex.
1476 * Remove the T_DISCON_IND on sti_discon_ind_mp.
1477 */
1478void
1479so_flush_discon_ind(struct sonode *so)
1480{
1481	mblk_t	*bp;
1482	sotpi_info_t *sti = SOTOTPI(so);
1483
1484	ASSERT(MUTEX_HELD(&so->so_lock));
1485
1486	/*
1487	 * Remove T_DISCON_IND mblk at sti_discon_ind_mp.
1488	 */
1489	if ((bp = sti->sti_discon_ind_mp) != NULL) {
1490		sti->sti_discon_ind_mp = NULL;
1491		bp->b_prev = NULL;
1492		freemsg(bp);
1493	}
1494}
1495
1496/*
1497 * Caller must hold the mutex.
1498 *
1499 * This function is used to process the T_DISCON_IND message. It does
1500 * immediate processing when called from strsock_proto and delayed
1501 * processing of discon_ind saved on sti_discon_ind_mp when called from
1502 * so_drain_discon_ind. When a T_DISCON_IND message is saved in
1503 * sti_discon_ind_mp for delayed processing, this function is registered
1504 * as the callback function to process the message.
1505 *
1506 * SOASYNC_UNBIND should be held in this function, during the non-blocking
1507 * unbind operation, and should be released only after we receive the ACK
1508 * in strsock_proto, for the T_UNBIND_REQ sent here. Since SOLOCKED is not set,
1509 * no TPI messages would be sent down at this time. This is to prevent M_FLUSH
1510 * sent from either this function or tcp_unbind(), flushing away any TPI
1511 * message that is being sent down and stays in a lower module's queue.
1512 *
1513 * This function drops so_lock and grabs it again.
1514 */
1515static void
1516strsock_discon_ind(struct sonode *so, mblk_t *discon_mp)
1517{
1518	struct vnode *vp;
1519	struct stdata *stp;
1520	union T_primitives *tpr;
1521	struct T_unbind_req *ubr;
1522	mblk_t *mp;
1523	int error;
1524	sotpi_info_t *sti = SOTOTPI(so);
1525
1526	ASSERT(MUTEX_HELD(&so->so_lock));
1527	ASSERT(discon_mp);
1528	ASSERT(discon_mp->b_rptr);
1529
1530	tpr = (union T_primitives *)discon_mp->b_rptr;
1531	ASSERT(tpr->type == T_DISCON_IND);
1532
1533	vp = SOTOV(so);
1534	stp = vp->v_stream;
1535	ASSERT(stp);
1536
1537	/*
1538	 * Not a listener
1539	 */
1540	ASSERT((so->so_state & SS_ACCEPTCONN) == 0);
1541
1542	/*
1543	 * This assumes that the name space for DISCON_reason
1544	 * is the errno name space.
1545	 */
1546	soisdisconnected(so, tpr->discon_ind.DISCON_reason);
1547	sti->sti_laddr_valid = 0;
1548	sti->sti_faddr_valid = 0;
1549
1550	/*
1551	 * Unbind with the transport without blocking.
1552	 * If we've already received a T_DISCON_IND do not unbind.
1553	 *
1554	 * If there is no preallocated unbind message, we have already
1555	 * unbound with the transport
1556	 *
1557	 * If the socket is not bound, no need to unbind.
1558	 */
1559	mp = sti->sti_unbind_mp;
1560	if (mp == NULL) {
1561		ASSERT(!(so->so_state & SS_ISBOUND));
1562		mutex_exit(&so->so_lock);
1563	} else if (!(so->so_state & SS_ISBOUND))  {
1564		mutex_exit(&so->so_lock);
1565	} else {
1566		sti->sti_unbind_mp = NULL;
1567
1568		/*
1569		 * Is another T_DISCON_IND being processed.
1570		 */
1571		ASSERT((so->so_flag & SOASYNC_UNBIND) == 0);
1572
1573		/*
1574		 * Make strsock_proto ignore T_OK_ACK and T_ERROR_ACK for
1575		 * this unbind. Set SOASYNC_UNBIND. This should be cleared
1576		 * only after we receive the ACK in strsock_proto.
1577		 */
1578		so->so_flag |= SOASYNC_UNBIND;
1579		ASSERT(!(so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)));
1580		so->so_state &= ~(SS_ISBOUND|SS_ACCEPTCONN);
1581		sti->sti_laddr_valid = 0;
1582		mutex_exit(&so->so_lock);
1583
1584		/*
1585		 * Send down T_UNBIND_REQ ignoring flow control.
1586		 * XXX Assumes that MSG_IGNFLOW implies that this thread
1587		 * does not run service procedures.
1588		 */
1589		ASSERT(DB_TYPE(mp) == M_PROTO);
1590		ubr = (struct T_unbind_req *)mp->b_rptr;
1591		mp->b_wptr += sizeof (*ubr);
1592		ubr->PRIM_type = T_UNBIND_REQ;
1593
1594		/*
1595		 * Flush the read and write side (except stream head read queue)
1596		 * and send down T_UNBIND_REQ.
1597		 */
1598		(void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHRW);
1599		error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1600		    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
1601		/* LINTED - warning: statement has no consequent: if */
1602		if (error) {
1603			eprintsoline(so, error);
1604		}
1605	}
1606
1607	if (tpr->discon_ind.DISCON_reason != 0)
1608		strsetrerror(SOTOV(so), 0, 0, sogetrderr);
1609	strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
1610	strseteof(SOTOV(so), 1);
1611	/*
1612	 * strseteof takes care of read side wakeups,
1613	 * pollwakeups, and signals.
1614	 */
1615	dprintso(so, 1, ("T_DISCON_IND: error %d\n", so->so_error));
1616	freemsg(discon_mp);
1617
1618
1619	pollwakeup(&stp->sd_pollist, POLLOUT);
1620	mutex_enter(&stp->sd_lock);
1621
1622	/*
1623	 * Wake sleeping write
1624	 */
1625	if (stp->sd_flag & WSLEEP) {
1626		stp->sd_flag &= ~WSLEEP;
1627		cv_broadcast(&stp->sd_wrq->q_wait);
1628	}
1629
1630	/*
1631	 * strsendsig can handle multiple signals with a
1632	 * single call.  Send SIGPOLL for S_OUTPUT event.
1633	 */
1634	if (stp->sd_sigflags & S_OUTPUT)
1635		strsendsig(stp->sd_siglist, S_OUTPUT, 0, 0);
1636
1637	mutex_exit(&stp->sd_lock);
1638	mutex_enter(&so->so_lock);
1639}
1640
1641/*
1642 * This routine is registered with the stream head to receive M_PROTO
1643 * and M_PCPROTO messages.
1644 *
1645 * Returns NULL if the message was consumed.
1646 * Returns an mblk to make that mblk be processed (and queued) by the stream
1647 * head.
1648 *
1649 * Sets the return parameters (*wakeups, *firstmsgsigs, *allmsgsigs, and
1650 * *pollwakeups) for the stream head to take action on. Note that since
1651 * sockets always deliver SIGIO for every new piece of data this routine
1652 * never sets *firstmsgsigs; any signals are returned in *allmsgsigs.
1653 *
1654 * This routine handles all data related TPI messages independent of
 * the type of the socket i.e. it doesn't care if a T_UNITDATA_IND message
 * arrives on a SOCK_STREAM.
1657 */
1658static mblk_t *
1659strsock_proto(vnode_t *vp, mblk_t *mp,
1660		strwakeup_t *wakeups, strsigset_t *firstmsgsigs,
1661		strsigset_t *allmsgsigs, strpollset_t *pollwakeups)
1662{
1663	union T_primitives *tpr;
1664	struct sonode *so;
1665	sotpi_info_t *sti;
1666	uint32_t auditing = AU_AUDITING();
1667
1668	so = VTOSO(vp);
1669	sti = SOTOTPI(so);
1670
1671	dprintso(so, 1, ("strsock_proto(%p, %p)\n", (void *)vp, (void *)mp));
1672
1673	/* Set default return values */
1674	*firstmsgsigs = *wakeups = *allmsgsigs = *pollwakeups = 0;
1675
1676	ASSERT(DB_TYPE(mp) == M_PROTO ||
1677	    DB_TYPE(mp) == M_PCPROTO);
1678
1679	if (MBLKL(mp) < sizeof (tpr->type)) {
1680		/* The message is too short to even contain the primitive */
1681		zcmn_err(getzoneid(), CE_WARN,
1682		    "sockfs: Too short TPI message received. Len = %ld\n",
1683		    (ptrdiff_t)(MBLKL(mp)));
1684		freemsg(mp);
1685		return (NULL);
1686	}
1687	if (!__TPI_PRIM_ISALIGNED(mp->b_rptr)) {
1688		/* The read pointer is not aligned correctly for TPI */
1689		zcmn_err(getzoneid(), CE_WARN,
1690		    "sockfs: Unaligned TPI message received. rptr = %p\n",
1691		    (void *)mp->b_rptr);
1692		freemsg(mp);
1693		return (NULL);
1694	}
1695	tpr = (union T_primitives *)mp->b_rptr;
1696	dprintso(so, 1, ("strsock_proto: primitive %d\n", tpr->type));
1697
1698	switch (tpr->type) {
1699
1700	case T_DATA_IND:
1701		if (MBLKL(mp) < sizeof (struct T_data_ind)) {
1702			zcmn_err(getzoneid(), CE_WARN,
1703			    "sockfs: Too short T_DATA_IND. Len = %ld\n",
1704			    (ptrdiff_t)(MBLKL(mp)));
1705			freemsg(mp);
1706			return (NULL);
1707		}
1708		/*
1709		 * Ignore zero-length T_DATA_IND messages. These might be
1710		 * generated by some transports.
1711		 * This is needed to prevent read (which skips the M_PROTO
1712		 * part) to unexpectedly return 0 (or return EWOULDBLOCK
1713		 * on a non-blocking socket after select/poll has indicated
1714		 * that data is available).
1715		 */
1716		if (msgdsize(mp->b_cont) == 0) {
1717			dprintso(so, 0,
1718			    ("strsock_proto: zero length T_DATA_IND\n"));
1719			freemsg(mp);
1720			return (NULL);
1721		}
1722		*allmsgsigs = S_INPUT | S_RDNORM;
1723		*pollwakeups = POLLIN | POLLRDNORM;
1724		*wakeups = RSLEEP;
1725		return (mp);
1726
1727	case T_UNITDATA_IND: {
1728		struct T_unitdata_ind	*tudi = &tpr->unitdata_ind;
1729		void			*addr;
1730		t_uscalar_t		addrlen;
1731
1732		if (MBLKL(mp) < sizeof (struct T_unitdata_ind)) {
1733			zcmn_err(getzoneid(), CE_WARN,
1734			    "sockfs: Too short T_UNITDATA_IND. Len = %ld\n",
1735			    (ptrdiff_t)(MBLKL(mp)));
1736			freemsg(mp);
1737			return (NULL);
1738		}
1739
		/* Is this not a connected datagram socket? */
1741		if ((so->so_mode & SM_CONNREQUIRED) ||
1742		    !(so->so_state & SS_ISCONNECTED)) {
1743			/*
1744			 * Not a connected datagram socket. Look for
1745			 * the SO_UNIX_CLOSE option. If such an option is found
1746			 * discard the message (since it has no meaning
1747			 * unless connected).
1748			 */
1749			if (so->so_family == AF_UNIX && msgdsize(mp) == 0 &&
1750			    tudi->OPT_length != 0) {
1751				void *opt;
1752				t_uscalar_t optlen = tudi->OPT_length;
1753
1754				opt = sogetoff(mp, tudi->OPT_offset,
1755				    optlen, __TPI_ALIGN_SIZE);
1756				if (opt == NULL) {
1757					/* The len/off falls outside mp */
1758					freemsg(mp);
1759					mutex_enter(&so->so_lock);
1760					soseterror(so, EPROTO);
1761					mutex_exit(&so->so_lock);
1762					zcmn_err(getzoneid(), CE_WARN,
1763					    "sockfs: T_unidata_ind with "
1764					    "invalid optlen/offset %u/%d\n",
1765					    optlen, tudi->OPT_offset);
1766					return (NULL);
1767				}
1768				if (so_getopt_unix_close(opt, optlen)) {
1769					freemsg(mp);
1770					return (NULL);
1771				}
1772			}
1773			*allmsgsigs = S_INPUT | S_RDNORM;
1774			*pollwakeups = POLLIN | POLLRDNORM;
1775			*wakeups = RSLEEP;
1776			if (auditing)
1777				audit_sock(T_UNITDATA_IND, strvp2wq(vp),
1778				    mp, 0);
1779			return (mp);
1780		}
1781
1782		/*
1783		 * A connect datagram socket. For AF_INET{,6} we verify that
1784		 * the source address matches the "connected to" address.
1785		 * The semantics of AF_UNIX sockets is to not verify
1786		 * the source address.
1787		 * Note that this source address verification is transport
		 * specific. Thus the real fix would be to extend TPI
		 * to allow T_CONN_REQ messages to be sent to connectionless
1790		 * transport providers and always let the transport provider
1791		 * do whatever filtering is needed.
1792		 *
1793		 * The verification/filtering semantics for transports
1794		 * other than AF_INET and AF_UNIX are unknown. The choice
1795		 * would be to either filter using bcmp or let all messages
1796		 * get through. This code does not filter other address
1797		 * families since this at least allows the application to
1798		 * work around any missing filtering.
1799		 *
1800		 * XXX Should we move filtering to UDP/ICMP???
1801		 * That would require passing e.g. a T_DISCON_REQ to UDP
1802		 * when the socket becomes unconnected.
1803		 */
1804		addrlen = tudi->SRC_length;
1805		/*
		 * The alignment restriction is really too strict but
1807		 * we want enough alignment to inspect the fields of
1808		 * a sockaddr_in.
1809		 */
1810		addr = sogetoff(mp, tudi->SRC_offset, addrlen,
1811		    __TPI_ALIGN_SIZE);
1812		if (addr == NULL) {
1813			freemsg(mp);
1814			mutex_enter(&so->so_lock);
1815			soseterror(so, EPROTO);
1816			mutex_exit(&so->so_lock);
1817			zcmn_err(getzoneid(), CE_WARN,
1818			    "sockfs: T_unidata_ind with invalid "
1819			    "addrlen/offset %u/%d\n",
1820			    addrlen, tudi->SRC_offset);
1821			return (NULL);
1822		}
1823
1824		if (so->so_family == AF_INET) {
1825			/*
1826			 * For AF_INET we allow wildcarding both sin_addr
1827			 * and sin_port.
1828			 */
1829			struct sockaddr_in *faddr, *sin;
1830
1831			/* Prevent sti_faddr_sa from changing while accessed */
1832			mutex_enter(&so->so_lock);
1833			ASSERT(sti->sti_faddr_len ==
1834			    (socklen_t)sizeof (struct sockaddr_in));
1835			faddr = (struct sockaddr_in *)sti->sti_faddr_sa;
1836			sin = (struct sockaddr_in *)addr;
1837			if (addrlen !=
1838			    (t_uscalar_t)sizeof (struct sockaddr_in) ||
1839			    (sin->sin_addr.s_addr != faddr->sin_addr.s_addr &&
1840			    faddr->sin_addr.s_addr != INADDR_ANY) ||
1841			    (so->so_type != SOCK_RAW &&
1842			    sin->sin_port != faddr->sin_port &&
1843			    faddr->sin_port != 0)) {
1844#ifdef DEBUG
1845				dprintso(so, 0,
1846				    ("sockfs: T_UNITDATA_IND mismatch: %s",
1847				    pr_addr(so->so_family,
1848				    (struct sockaddr *)addr, addrlen)));
1849				dprintso(so, 0, (" - %s\n",
1850				    pr_addr(so->so_family, sti->sti_faddr_sa,
1851				    (t_uscalar_t)sti->sti_faddr_len)));
1852#endif /* DEBUG */
1853				mutex_exit(&so->so_lock);
1854				freemsg(mp);
1855				return (NULL);
1856			}
1857			mutex_exit(&so->so_lock);
1858		} else if (so->so_family == AF_INET6) {
1859			/*
1860			 * For AF_INET6 we allow wildcarding both sin6_addr
1861			 * and sin6_port.
1862			 */
1863			struct sockaddr_in6 *faddr6, *sin6;
1864			static struct in6_addr zeroes; /* inits to all zeros */
1865
1866			/* Prevent sti_faddr_sa from changing while accessed */
1867			mutex_enter(&so->so_lock);
1868			ASSERT(sti->sti_faddr_len ==
1869			    (socklen_t)sizeof (struct sockaddr_in6));
1870			faddr6 = (struct sockaddr_in6 *)sti->sti_faddr_sa;
1871			sin6 = (struct sockaddr_in6 *)addr;
1872			/* XXX could we get a mapped address ::ffff:0.0.0.0 ? */
1873			if (addrlen !=
1874			    (t_uscalar_t)sizeof (struct sockaddr_in6) ||
1875			    (!IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr,
1876			    &faddr6->sin6_addr) &&
1877			    !IN6_ARE_ADDR_EQUAL(&faddr6->sin6_addr, &zeroes)) ||
1878			    (so->so_type != SOCK_RAW &&
1879			    sin6->sin6_port != faddr6->sin6_port &&
1880			    faddr6->sin6_port != 0)) {
1881#ifdef DEBUG
1882				dprintso(so, 0,
1883				    ("sockfs: T_UNITDATA_IND mismatch: %s",
1884				    pr_addr(so->so_family,
1885				    (struct sockaddr *)addr, addrlen)));
1886				dprintso(so, 0, (" - %s\n",
1887				    pr_addr(so->so_family, sti->sti_faddr_sa,
1888				    (t_uscalar_t)sti->sti_faddr_len)));
1889#endif /* DEBUG */
1890				mutex_exit(&so->so_lock);
1891				freemsg(mp);
1892				return (NULL);
1893			}
1894			mutex_exit(&so->so_lock);
1895		} else if (so->so_family == AF_UNIX &&
1896		    msgdsize(mp->b_cont) == 0 &&
1897		    tudi->OPT_length != 0) {
1898			/*
1899			 * Attempt to extract AF_UNIX
1900			 * SO_UNIX_CLOSE indication from options.
1901			 */
1902			void *opt;
1903			t_uscalar_t optlen = tudi->OPT_length;
1904
1905			opt = sogetoff(mp, tudi->OPT_offset,
1906			    optlen, __TPI_ALIGN_SIZE);
1907			if (opt == NULL) {
1908				/* The len/off falls outside mp */
1909				freemsg(mp);
1910				mutex_enter(&so->so_lock);
1911				soseterror(so, EPROTO);
1912				mutex_exit(&so->so_lock);
1913				zcmn_err(getzoneid(), CE_WARN,
1914				    "sockfs: T_unidata_ind with invalid "
1915				    "optlen/offset %u/%d\n",
1916				    optlen, tudi->OPT_offset);
1917				return (NULL);
1918			}
1919			/*
1920			 * If we received a unix close indication mark the
1921			 * socket and discard this message.
1922			 */
1923			if (so_getopt_unix_close(opt, optlen)) {
1924				mutex_enter(&so->so_lock);
1925				sobreakconn(so, ECONNRESET);
1926				mutex_exit(&so->so_lock);
1927				strsetrerror(SOTOV(so), 0, 0, sogetrderr);
1928				freemsg(mp);
1929				*pollwakeups = POLLIN | POLLRDNORM;
1930				*allmsgsigs = S_INPUT | S_RDNORM;
1931				*wakeups = RSLEEP;
1932				return (NULL);
1933			}
1934		}
1935		*allmsgsigs = S_INPUT | S_RDNORM;
1936		*pollwakeups = POLLIN | POLLRDNORM;
1937		*wakeups = RSLEEP;
1938		return (mp);
1939	}
1940
1941	case T_OPTDATA_IND: {
1942		struct T_optdata_ind	*tdi = &tpr->optdata_ind;
1943
1944		if (MBLKL(mp) < sizeof (struct T_optdata_ind)) {
1945			zcmn_err(getzoneid(), CE_WARN,
1946			    "sockfs: Too short T_OPTDATA_IND. Len = %ld\n",
1947			    (ptrdiff_t)(MBLKL(mp)));
1948			freemsg(mp);
1949			return (NULL);
1950		}
1951		/*
1952		 * Allow zero-length messages carrying options.
1953		 * This is used when carrying the SO_UNIX_CLOSE option.
1954		 */
1955		if (so->so_family == AF_UNIX && msgdsize(mp->b_cont) == 0 &&
1956		    tdi->OPT_length != 0) {
1957			/*
1958			 * Attempt to extract AF_UNIX close indication
1959			 * from the options. Ignore any other options -
1960			 * those are handled once the message is removed
1961			 * from the queue.
1962			 * The close indication message should not carry data.
1963			 */
1964			void *opt;
1965			t_uscalar_t optlen = tdi->OPT_length;
1966
1967			opt = sogetoff(mp, tdi->OPT_offset,
1968			    optlen, __TPI_ALIGN_SIZE);
1969			if (opt == NULL) {
1970				/* The len/off falls outside mp */
1971				freemsg(mp);
1972				mutex_enter(&so->so_lock);
1973				soseterror(so, EPROTO);
1974				mutex_exit(&so->so_lock);
1975				zcmn_err(getzoneid(), CE_WARN,
1976				    "sockfs: T_optdata_ind with invalid "
1977				    "optlen/offset %u/%d\n",
1978				    optlen, tdi->OPT_offset);
1979				return (NULL);
1980			}
1981			/*
1982			 * If we received a close indication mark the
1983			 * socket and discard this message.
1984			 */
1985			if (so_getopt_unix_close(opt, optlen)) {
1986				mutex_enter(&so->so_lock);
1987				socantsendmore(so);
1988				sti->sti_faddr_valid = 0;
1989				mutex_exit(&so->so_lock);
1990				strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
1991				freemsg(mp);
1992				return (NULL);
1993			}
1994		}
1995		*allmsgsigs = S_INPUT | S_RDNORM;
1996		*pollwakeups = POLLIN | POLLRDNORM;
1997		*wakeups = RSLEEP;
1998		return (mp);
1999	}
2000
2001	case T_EXDATA_IND: {
2002		mblk_t		*mctl, *mdata;
2003		mblk_t *lbp;
2004		union T_primitives *tprp;
2005		struct stdata   *stp;
2006		queue_t *qp;
2007
2008		if (MBLKL(mp) < sizeof (struct T_exdata_ind)) {
2009			zcmn_err(getzoneid(), CE_WARN,
2010			    "sockfs: Too short T_EXDATA_IND. Len = %ld\n",
2011			    (ptrdiff_t)(MBLKL(mp)));
2012			freemsg(mp);
2013			return (NULL);
2014		}
2015		/*
2016		 * Ignore zero-length T_EXDATA_IND messages. These might be
2017		 * generated by some transports.
2018		 *
2019		 * This is needed to prevent read (which skips the M_PROTO
2020		 * part) to unexpectedly return 0 (or return EWOULDBLOCK
2021		 * on a non-blocking socket after select/poll has indicated
2022		 * that data is available).
2023		 */
2024		dprintso(so, 1,
2025		    ("T_EXDATA_IND(%p): counts %d/%d state %s\n",
2026		    (void *)vp, sti->sti_oobsigcnt, sti->sti_oobcnt,
2027		    pr_state(so->so_state, so->so_mode)));
2028
2029		if (msgdsize(mp->b_cont) == 0) {
2030			dprintso(so, 0,
2031			    ("strsock_proto: zero length T_EXDATA_IND\n"));
2032			freemsg(mp);
2033			return (NULL);
2034		}
2035
2036		/*
2037		 * Split into the T_EXDATA_IND and the M_DATA part.
2038		 * We process these three pieces separately:
2039		 *	signal generation
2040		 *	handling T_EXDATA_IND
2041		 *	handling M_DATA component
2042		 */
2043		mctl = mp;
2044		mdata = mctl->b_cont;
2045		mctl->b_cont = NULL;
2046		mutex_enter(&so->so_lock);
2047		so_oob_sig(so, 0, allmsgsigs, pollwakeups);
2048		mctl = so_oob_exdata(so, mctl, allmsgsigs, pollwakeups);
2049		mdata = so_oob_data(so, mdata, allmsgsigs, pollwakeups);
2050
2051		stp = vp->v_stream;
2052		ASSERT(stp != NULL);
2053		qp = _RD(stp->sd_wrq);
2054
2055		mutex_enter(QLOCK(qp));
2056		lbp = qp->q_last;
2057
2058		/*
2059		 * We want to avoid queueing up a string of T_EXDATA_IND
2060		 * messages with no intervening data messages at the stream
2061		 * head. These messages contribute to the total message
		 * count. Eventually this can lead to STREAMS flow control
2063		 * and also cause TCP to advertise a zero window condition
2064		 * to the peer. This can happen in the degenerate case where
2065		 * the sender and receiver exchange only OOB data. The sender
2066		 * only sends messages with MSG_OOB flag and the receiver
2067		 * receives only MSG_OOB messages and does not use SO_OOBINLINE.
2068		 * An example of this scenario has been reported in applications
2069		 * that use OOB data to exchange heart beats. Flow control
2070		 * relief will never happen if the application only reads OOB
2071		 * data which is done directly by sorecvoob() and the
2072		 * T_EXDATA_IND messages at the streamhead won't be consumed.
2073		 * Note that there is no correctness issue in compressing the
2074		 * string of T_EXDATA_IND messages into a single T_EXDATA_IND
2075		 * message. A single read that does not specify MSG_OOB will
2076		 * read across all the marks in a loop in sotpi_recvmsg().
2077		 * Each mark is individually distinguishable only if the
2078		 * T_EXDATA_IND messages are separated by data messages.
2079		 */
2080		if ((qp->q_first != NULL) && (DB_TYPE(lbp) == M_PROTO)) {
2081			tprp = (union T_primitives *)lbp->b_rptr;
2082			if ((tprp->type == T_EXDATA_IND) &&
2083			    !(so->so_options & SO_OOBINLINE)) {
2084
2085				/*
2086				 * free the new M_PROTO message
2087				 */
2088				freemsg(mctl);
2089
2090				/*
2091				 * adjust the OOB count and OOB	signal count
2092				 * just incremented for the new OOB data.
2093				 */
2094				sti->sti_oobcnt--;
2095				sti->sti_oobsigcnt--;
2096				mutex_exit(QLOCK(qp));
2097				mutex_exit(&so->so_lock);
2098				return (NULL);
2099			}
2100		}
2101		mutex_exit(QLOCK(qp));
2102
2103		/*
2104		 * Pass the T_EXDATA_IND and the M_DATA back separately
2105		 * by using b_next linkage. (The stream head will queue any
2106		 * b_next linked messages separately.) This is needed
		 * since MSGMARK applies to the last byte of the message
2108		 * hence we can not have any M_DATA component attached
2109		 * to the marked T_EXDATA_IND. Note that the stream head
2110		 * will not consolidate M_DATA messages onto an MSGMARK'ed
2111		 * message in order to preserve the constraint that
2112		 * the T_EXDATA_IND always is a separate message.
2113		 */
2114		ASSERT(mctl != NULL);
2115		mctl->b_next = mdata;
2116		mp = mctl;
2117#ifdef DEBUG
2118		if (mdata == NULL) {
2119			dprintso(so, 1,
2120			    ("after outofline T_EXDATA_IND(%p): "
2121			    "counts %d/%d  poll 0x%x sig 0x%x state %s\n",
2122			    (void *)vp, sti->sti_oobsigcnt,
2123			    sti->sti_oobcnt, *pollwakeups, *allmsgsigs,
2124			    pr_state(so->so_state, so->so_mode)));
2125		} else {
2126			dprintso(so, 1,
2127			    ("after inline T_EXDATA_IND(%p): "
2128			    "counts %d/%d  poll 0x%x sig 0x%x state %s\n",
2129			    (void *)vp, sti->sti_oobsigcnt,
2130			    sti->sti_oobcnt, *pollwakeups, *allmsgsigs,
2131			    pr_state(so->so_state, so->so_mode)));
2132		}
2133#endif /* DEBUG */
2134		mutex_exit(&so->so_lock);
2135		*wakeups = RSLEEP;
2136		return (mp);
2137	}
2138
2139	case T_CONN_CON: {
2140		struct T_conn_con	*conn_con;
2141		void			*addr;
2142		t_uscalar_t		addrlen;
2143
2144		/*
2145		 * Verify the state, update the state to ISCONNECTED,
2146		 * record the potentially new address in the message,
2147		 * and drop the message.
2148		 */
2149		if (MBLKL(mp) < sizeof (struct T_conn_con)) {
2150			zcmn_err(getzoneid(), CE_WARN,
2151			    "sockfs: Too short T_CONN_CON. Len = %ld\n",
2152			    (ptrdiff_t)(MBLKL(mp)));
2153			freemsg(mp);
2154			return (NULL);
2155		}
2156
2157		mutex_enter(&so->so_lock);
2158		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) !=
2159		    SS_ISCONNECTING) {
2160			mutex_exit(&so->so_lock);
2161			dprintso(so, 1,
2162			    ("T_CONN_CON: state %x\n", so->so_state));
2163			freemsg(mp);
2164			return (NULL);
2165		}
2166
2167		conn_con = &tpr->conn_con;
2168		addrlen = conn_con->RES_length;
2169		/*
2170		 * Allow the address to be of different size than sent down
2171		 * in the T_CONN_REQ as long as it doesn't exceed the maxlen.
2172		 * For AF_UNIX require the identical length.
2173		 */
2174		if (so->so_family == AF_UNIX ?
2175		    addrlen != (t_uscalar_t)sizeof (sti->sti_ux_laddr) :
2176		    addrlen > (t_uscalar_t)sti->sti_faddr_maxlen) {
2177			zcmn_err(getzoneid(), CE_WARN,
2178			    "sockfs: T_conn_con with different "
2179			    "length %u/%d\n",
2180			    addrlen, conn_con->RES_length);
2181			soisdisconnected(so, EPROTO);
2182			sti->sti_laddr_valid = 0;
2183			sti->sti_faddr_valid = 0;
2184			mutex_exit(&so->so_lock);
2185			strsetrerror(SOTOV(so), 0, 0, sogetrderr);
2186			strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2187			strseteof(SOTOV(so), 1);
2188			freemsg(mp);
2189			/*
2190			 * strseteof takes care of read side wakeups,
2191			 * pollwakeups, and signals.
2192			 */
2193			*wakeups = WSLEEP;
2194			*allmsgsigs = S_OUTPUT;
2195			*pollwakeups = POLLOUT;
2196			return (NULL);
2197		}
2198		addr = sogetoff(mp, conn_con->RES_offset, addrlen, 1);
2199		if (addr == NULL) {
2200			zcmn_err(getzoneid(), CE_WARN,
2201			    "sockfs: T_conn_con with invalid "
2202			    "addrlen/offset %u/%d\n",
2203			    addrlen, conn_con->RES_offset);
2204			mutex_exit(&so->so_lock);
2205			strsetrerror(SOTOV(so), 0, 0, sogetrderr);
2206			strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2207			strseteof(SOTOV(so), 1);
2208			freemsg(mp);
2209			/*
2210			 * strseteof takes care of read side wakeups,
2211			 * pollwakeups, and signals.
2212			 */
2213			*wakeups = WSLEEP;
2214			*allmsgsigs = S_OUTPUT;
2215			*pollwakeups = POLLOUT;
2216			return (NULL);
2217		}
2218
2219		/*
2220		 * Save for getpeername.
2221		 */
2222		if (so->so_family != AF_UNIX) {
2223			sti->sti_faddr_len = (socklen_t)addrlen;
2224			ASSERT(sti->sti_faddr_len <= sti->sti_faddr_maxlen);
2225			bcopy(addr, sti->sti_faddr_sa, addrlen);
2226			sti->sti_faddr_valid = 1;
2227		}
2228
2229		if (so->so_peercred != NULL)
2230			crfree(so->so_peercred);
2231		so->so_peercred = msg_getcred(mp, &so->so_cpid);
2232		if (so->so_peercred != NULL)
2233			crhold(so->so_peercred);
2234
2235		/* Wakeup anybody sleeping in sowaitconnected */
2236		soisconnected(so);
2237		mutex_exit(&so->so_lock);
2238
2239		/*
2240		 * The socket is now available for sending data.
2241		 */
2242		*wakeups = WSLEEP;
2243		*allmsgsigs = S_OUTPUT;
2244		*pollwakeups = POLLOUT;
2245		freemsg(mp);
2246		return (NULL);
2247	}
2248
2249	case T_CONN_IND:
2250		/*
2251		 * Verify the min size and queue the message on
2252		 * the sti_conn_ind_head/tail list.
2253		 */
2254		if (MBLKL(mp) < sizeof (struct T_conn_ind)) {
2255			zcmn_err(getzoneid(), CE_WARN,
2256			    "sockfs: Too short T_CONN_IND. Len = %ld\n",
2257			    (ptrdiff_t)(MBLKL(mp)));
2258			freemsg(mp);
2259			return (NULL);
2260		}
2261
2262		if (auditing)
2263			audit_sock(T_CONN_IND, strvp2wq(vp), mp, 0);
2264		if (!(so->so_state & SS_ACCEPTCONN)) {
2265			zcmn_err(getzoneid(), CE_WARN,
2266			    "sockfs: T_conn_ind on non-listening socket\n");
2267			freemsg(mp);
2268			return (NULL);
2269		}
2270
2271		soqueueconnind(so, mp);
2272		*allmsgsigs = S_INPUT | S_RDNORM;
2273		*pollwakeups = POLLIN | POLLRDNORM;
2274		*wakeups = RSLEEP;
2275		return (NULL);
2276
2277	case T_ORDREL_IND:
2278		if (MBLKL(mp) < sizeof (struct T_ordrel_ind)) {
2279			zcmn_err(getzoneid(), CE_WARN,
2280			    "sockfs: Too short T_ORDREL_IND. Len = %ld\n",
2281			    (ptrdiff_t)(MBLKL(mp)));
2282			freemsg(mp);
2283			return (NULL);
2284		}
2285
2286		/*
2287		 * Some providers send this when not fully connected.
2288		 * SunLink X.25 needs to retrieve disconnect reason after
2289		 * disconnect for compatibility. It uses T_ORDREL_IND
2290		 * instead of T_DISCON_IND so that it may use the
2291		 * endpoint after a connect failure to retrieve the
2292		 * reason using an ioctl. Thus we explicitly clear
2293		 * SS_ISCONNECTING here for SunLink X.25.
2294		 * This is a needed TPI violation.
2295		 */
2296		mutex_enter(&so->so_lock);
2297		so->so_state &= ~SS_ISCONNECTING;
2298		socantrcvmore(so);
2299		mutex_exit(&so->so_lock);
2300		strseteof(SOTOV(so), 1);
2301		/*
2302		 * strseteof takes care of read side wakeups,
2303		 * pollwakeups, and signals.
2304		 */
2305		freemsg(mp);
2306		return (NULL);
2307
2308	case T_DISCON_IND:
2309		if (MBLKL(mp) < sizeof (struct T_discon_ind)) {
2310			zcmn_err(getzoneid(), CE_WARN,
2311			    "sockfs: Too short T_DISCON_IND. Len = %ld\n",
2312			    (ptrdiff_t)(MBLKL(mp)));
2313			freemsg(mp);
2314			return (NULL);
2315		}
2316		if (so->so_state & SS_ACCEPTCONN) {
2317			/*
2318			 * This is a listener. Look for a queued T_CONN_IND
2319			 * with a matching sequence number and remove it
2320			 * from the list.
2321			 * It is normal to not find the sequence number since
2322			 * the soaccept might have already dequeued it
2323			 * (in which case the T_CONN_RES will fail with
2324			 * TBADSEQ).
2325			 */
2326			(void) soflushconnind(so, tpr->discon_ind.SEQ_number);
2327			freemsg(mp);
2328			return (0);
2329		}
2330
2331		/*
2332		 * Not a listener
2333		 *
2334		 * If SS_CANTRCVMORE for AF_UNIX ignore the discon_reason.
2335		 * Such a discon_ind appears when the peer has first done
2336		 * a shutdown() followed by a close() in which case we just
2337		 * want to record socantsendmore.
2338		 * In this case sockfs first receives a T_ORDREL_IND followed
2339		 * by a T_DISCON_IND.
2340		 * Note that for other transports (e.g. TCP) we need to handle
2341		 * the discon_ind in this case since it signals an error.
2342		 */
2343		mutex_enter(&so->so_lock);
2344		if ((so->so_state & SS_CANTRCVMORE) &&
2345		    (so->so_family == AF_UNIX)) {
2346			socantsendmore(so);
2347			sti->sti_faddr_valid = 0;
2348			mutex_exit(&so->so_lock);
2349			strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2350			dprintso(so, 1,
2351			    ("T_DISCON_IND: error %d\n", so->so_error));
2352			freemsg(mp);
2353			/*
2354			 * Set these variables for caller to process them.
2355			 * For the else part where T_DISCON_IND is processed,
2356			 * this will be done in the function being called
2357			 * (strsock_discon_ind())
2358			 */
2359			*wakeups = WSLEEP;
2360			*allmsgsigs = S_OUTPUT;
2361			*pollwakeups = POLLOUT;
2362		} else if (so->so_flag & (SOASYNC_UNBIND | SOLOCKED)) {
2363			/*
2364			 * Deferred processing of T_DISCON_IND
2365			 */
2366			so_save_discon_ind(so, mp, strsock_discon_ind);
2367			mutex_exit(&so->so_lock);
2368		} else {
2369			/*
2370			 * Process T_DISCON_IND now
2371			 */
2372			(void) strsock_discon_ind(so, mp);
2373			mutex_exit(&so->so_lock);
2374		}
2375		return (NULL);
2376
2377	case T_UDERROR_IND: {
2378		struct T_uderror_ind	*tudi = &tpr->uderror_ind;
2379		void			*addr;
2380		t_uscalar_t		addrlen;
2381		int			error;
2382
2383		dprintso(so, 0,
2384		    ("T_UDERROR_IND: error %d\n", tudi->ERROR_type));
2385
2386		if (MBLKL(mp) < sizeof (struct T_uderror_ind)) {
2387			zcmn_err(getzoneid(), CE_WARN,
2388			    "sockfs: Too short T_UDERROR_IND. Len = %ld\n",
2389			    (ptrdiff_t)(MBLKL(mp)));
2390			freemsg(mp);
2391			return (NULL);
2392		}
2393		/* Ignore on connection-oriented transports */
2394		if (so->so_mode & SM_CONNREQUIRED) {
2395			freemsg(mp);
2396			eprintsoline(so, 0);
2397			zcmn_err(getzoneid(), CE_WARN,
2398			    "sockfs: T_uderror_ind on connection-oriented "
2399			    "transport\n");
2400			return (NULL);
2401		}
2402		addrlen = tudi->DEST_length;
2403		addr = sogetoff(mp, tudi->DEST_offset, addrlen, 1);
2404		if (addr == NULL) {
2405			zcmn_err(getzoneid(), CE_WARN,
2406			    "sockfs: T_uderror_ind with invalid "
2407			    "addrlen/offset %u/%d\n",
2408			    addrlen, tudi->DEST_offset);
2409			freemsg(mp);
2410			return (NULL);
2411		}
2412
2413		/* Verify source address for connected socket. */
2414		mutex_enter(&so->so_lock);
2415		if (so->so_state & SS_ISCONNECTED) {
2416			void *faddr;
2417			t_uscalar_t faddr_len;
2418			boolean_t match = B_FALSE;
2419
2420			switch (so->so_family) {
2421			case AF_INET: {
2422				/* Compare just IP address and port */
2423				struct sockaddr_in *sin1, *sin2;
2424
2425				sin1 = (struct sockaddr_in *)sti->sti_faddr_sa;
2426				sin2 = (struct sockaddr_in *)addr;
2427				if (addrlen == sizeof (struct sockaddr_in) &&
2428				    sin1->sin_port == sin2->sin_port &&
2429				    sin1->sin_addr.s_addr ==
2430				    sin2->sin_addr.s_addr)
2431					match = B_TRUE;
2432				break;
2433			}
2434			case AF_INET6: {
2435				/* Compare just IP address and port. Not flow */
2436				struct sockaddr_in6 *sin1, *sin2;
2437
2438				sin1 = (struct sockaddr_in6 *)sti->sti_faddr_sa;
2439				sin2 = (struct sockaddr_in6 *)addr;
2440				if (addrlen == sizeof (struct sockaddr_in6) &&
2441				    sin1->sin6_port == sin2->sin6_port &&
2442				    IN6_ARE_ADDR_EQUAL(&sin1->sin6_addr,
2443				    &sin2->sin6_addr))
2444					match = B_TRUE;
2445				break;
2446			}
2447			case AF_UNIX:
2448				faddr = &sti->sti_ux_faddr;
2449				faddr_len =
2450				    (t_uscalar_t)sizeof (sti->sti_ux_faddr);
2451				if (faddr_len == addrlen &&
2452				    bcmp(addr, faddr, addrlen) == 0)
2453					match = B_TRUE;
2454				break;
2455			default:
2456				faddr = sti->sti_faddr_sa;
2457				faddr_len = (t_uscalar_t)sti->sti_faddr_len;
2458				if (faddr_len == addrlen &&
2459				    bcmp(addr, faddr, addrlen) == 0)
2460					match = B_TRUE;
2461				break;
2462			}
2463
2464			if (!match) {
2465#ifdef DEBUG
2466				dprintso(so, 0,
2467				    ("sockfs: T_UDERR_IND mismatch: %s - ",
2468				    pr_addr(so->so_family,
2469				    (struct sockaddr *)addr, addrlen)));
2470				dprintso(so, 0, ("%s\n",
2471				    pr_addr(so->so_family, sti->sti_faddr_sa,
2472				    sti->sti_faddr_len)));
2473#endif /* DEBUG */
2474				mutex_exit(&so->so_lock);
2475				freemsg(mp);
2476				return (NULL);
2477			}
2478			/*
2479			 * Make the write error nonpersistent. If the error
2480			 * is zero we use ECONNRESET.
2481			 * This assumes that the name space for ERROR_type
2482			 * is the errno name space.
2483			 */
2484			if (tudi->ERROR_type != 0)
2485				error = tudi->ERROR_type;
2486			else
2487				error = ECONNRESET;
2488
2489			soseterror(so, error);
2490			mutex_exit(&so->so_lock);
2491			strsetrerror(SOTOV(so), 0, 0, sogetrderr);
2492			strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2493			*wakeups = RSLEEP | WSLEEP;
2494			*allmsgsigs = S_INPUT | S_RDNORM | S_OUTPUT;
2495			*pollwakeups = POLLIN | POLLRDNORM | POLLOUT;
2496			freemsg(mp);
2497			return (NULL);
2498		}
2499		/*
2500		 * If the application asked for delayed errors
2501		 * record the T_UDERROR_IND sti_eaddr_mp and the reason in
2502		 * sti_delayed_error for delayed error posting. If the reason
2503		 * is zero use ECONNRESET.
2504		 * Note that delayed error indications do not make sense for
2505		 * AF_UNIX sockets since sendto checks that the destination
2506		 * address is valid at the time of the sendto.
2507		 */
2508		if (!(so->so_options & SO_DGRAM_ERRIND)) {
2509			mutex_exit(&so->so_lock);
2510			freemsg(mp);
2511			return (NULL);
2512		}
2513		if (sti->sti_eaddr_mp != NULL)
2514			freemsg(sti->sti_eaddr_mp);
2515
2516		sti->sti_eaddr_mp = mp;
2517		if (tudi->ERROR_type != 0)
2518			error = tudi->ERROR_type;
2519		else
2520			error = ECONNRESET;
2521		sti->sti_delayed_error = (ushort_t)error;
2522		mutex_exit(&so->so_lock);
2523		return (NULL);
2524	}
2525
2526	case T_ERROR_ACK:
2527		dprintso(so, 0,
2528		    ("strsock_proto: T_ERROR_ACK for %d, error %d/%d\n",
2529		    tpr->error_ack.ERROR_prim,
2530		    tpr->error_ack.TLI_error,
2531		    tpr->error_ack.UNIX_error));
2532
2533		if (MBLKL(mp) < sizeof (struct T_error_ack)) {
2534			zcmn_err(getzoneid(), CE_WARN,
2535			    "sockfs: Too short T_ERROR_ACK. Len = %ld\n",
2536			    (ptrdiff_t)(MBLKL(mp)));
2537			freemsg(mp);
2538			return (NULL);
2539		}
2540		/*
2541		 * Check if we were waiting for the async message
2542		 */
2543		mutex_enter(&so->so_lock);
2544		if ((so->so_flag & SOASYNC_UNBIND) &&
2545		    tpr->error_ack.ERROR_prim == T_UNBIND_REQ) {
2546			so_unlock_single(so, SOASYNC_UNBIND);
2547			mutex_exit(&so->so_lock);
2548			freemsg(mp);
2549			return (NULL);
2550		}
2551		mutex_exit(&so->so_lock);
2552		soqueueack(so, mp);
2553		return (NULL);
2554
2555	case T_OK_ACK:
2556		if (MBLKL(mp) < sizeof (struct T_ok_ack)) {
2557			zcmn_err(getzoneid(), CE_WARN,
2558			    "sockfs: Too short T_OK_ACK. Len = %ld\n",
2559			    (ptrdiff_t)(MBLKL(mp)));
2560			freemsg(mp);
2561			return (NULL);
2562		}
2563		/*
2564		 * Check if we were waiting for the async message
2565		 */
2566		mutex_enter(&so->so_lock);
2567		if ((so->so_flag & SOASYNC_UNBIND) &&
2568		    tpr->ok_ack.CORRECT_prim == T_UNBIND_REQ) {
2569			dprintso(so, 1,
2570			    ("strsock_proto: T_OK_ACK async unbind\n"));
2571			so_unlock_single(so, SOASYNC_UNBIND);
2572			mutex_exit(&so->so_lock);
2573			freemsg(mp);
2574			return (NULL);
2575		}
2576		mutex_exit(&so->so_lock);
2577		soqueueack(so, mp);
2578		return (NULL);
2579
2580	case T_INFO_ACK:
2581		if (MBLKL(mp) < sizeof (struct T_info_ack)) {
2582			zcmn_err(getzoneid(), CE_WARN,
2583			    "sockfs: Too short T_INFO_ACK. Len = %ld\n",
2584			    (ptrdiff_t)(MBLKL(mp)));
2585			freemsg(mp);
2586			return (NULL);
2587		}
2588		soqueueack(so, mp);
2589		return (NULL);
2590
2591	case T_CAPABILITY_ACK:
2592		/*
2593		 * A T_capability_ack need only be large enough to hold
2594		 * the PRIM_type and CAP_bits1 fields; checking for anything
2595		 * larger might reject a correct response from an older
2596		 * provider.
2597		 */
2598		if (MBLKL(mp) < 2 * sizeof (t_uscalar_t)) {
2599			zcmn_err(getzoneid(), CE_WARN,
2600			    "sockfs: Too short T_CAPABILITY_ACK. Len = %ld\n",
2601			    (ptrdiff_t)(MBLKL(mp)));
2602			freemsg(mp);
2603			return (NULL);
2604		}
2605		soqueueack(so, mp);
2606		return (NULL);
2607
2608	case T_BIND_ACK:
2609		if (MBLKL(mp) < sizeof (struct T_bind_ack)) {
2610			zcmn_err(getzoneid(), CE_WARN,
2611			    "sockfs: Too short T_BIND_ACK. Len = %ld\n",
2612			    (ptrdiff_t)(MBLKL(mp)));
2613			freemsg(mp);
2614			return (NULL);
2615		}
2616		soqueueack(so, mp);
2617		return (NULL);
2618
2619	case T_OPTMGMT_ACK:
2620		if (MBLKL(mp) < sizeof (struct T_optmgmt_ack)) {
2621			zcmn_err(getzoneid(), CE_WARN,
2622			    "sockfs: Too short T_OPTMGMT_ACK. Len = %ld\n",
2623			    (ptrdiff_t)(MBLKL(mp)));
2624			freemsg(mp);
2625			return (NULL);
2626		}
2627		soqueueack(so, mp);
2628		return (NULL);
2629	default:
2630#ifdef DEBUG
2631		zcmn_err(getzoneid(), CE_WARN,
2632		    "sockfs: unknown TPI primitive %d received\n",
2633		    tpr->type);
2634#endif /* DEBUG */
2635		freemsg(mp);
2636		return (NULL);
2637	}
2638}
2639
2640/*
2641 * This routine is registered with the stream head to receive other
2642 * (non-data, and non-proto) messages.
2643 *
2644 * Returns NULL if the message was consumed.
2645 * Returns an mblk to make that mblk be processed by the stream head.
2646 *
2647 * Sets the return parameters (*wakeups, *firstmsgsigs, *allmsgsigs, and
2648 * *pollwakeups) for the stream head to take action on.
2649 */
2650static mblk_t *
2651strsock_misc(vnode_t *vp, mblk_t *mp,
2652		strwakeup_t *wakeups, strsigset_t *firstmsgsigs,
2653		strsigset_t *allmsgsigs, strpollset_t *pollwakeups)
2654{
2655	struct sonode *so;
2656	sotpi_info_t *sti;
2657
2658	so = VTOSO(vp);
2659	sti = SOTOTPI(so);
2660
2661	dprintso(so, 1, ("strsock_misc(%p, %p, 0x%x)\n",
2662	    (void *)vp, (void *)mp, DB_TYPE(mp)));
2663
2664	/* Set default return values */
2665	*wakeups = *allmsgsigs = *firstmsgsigs = *pollwakeups = 0;
2666
2667	switch (DB_TYPE(mp)) {
2668	case M_PCSIG:
2669		/*
2670		 * This assumes that an M_PCSIG for the urgent data arrives
2671		 * before the corresponding T_EXDATA_IND.
2672		 *
2673		 * Note: Just like in SunOS 4.X and 4.4BSD a poll will be
2674		 * awoken before the urgent data shows up.
2675		 * For OOBINLINE this can result in select returning
2676		 * only exceptions as opposed to except|read.
2677		 */
2678		if (*mp->b_rptr == SIGURG) {
2679			mutex_enter(&so->so_lock);
2680			dprintso(so, 1,
2681			    ("SIGURG(%p): counts %d/%d state %s\n",
2682			    (void *)vp, sti->sti_oobsigcnt, sti->sti_oobcnt,
2683			    pr_state(so->so_state, so->so_mode)));
2684			so_oob_sig(so, 1, allmsgsigs, pollwakeups);
2685			dprintso(so, 1,
2686			    ("after SIGURG(%p): counts %d/%d "
2687			    " poll 0x%x sig 0x%x state %s\n",
2688			    (void *)vp, sti->sti_oobsigcnt, sti->sti_oobcnt,
2689			    *pollwakeups, *allmsgsigs,
2690			    pr_state(so->so_state, so->so_mode)));
2691			mutex_exit(&so->so_lock);
2692		}
2693		freemsg(mp);
2694		return (NULL);
2695
2696	case M_SIG:
2697	case M_HANGUP:
2698	case M_UNHANGUP:
2699	case M_ERROR:
2700		/* M_ERRORs etc are ignored */
2701		freemsg(mp);
2702		return (NULL);
2703
2704	case M_FLUSH:
2705		/*
2706		 * Do not flush read queue. If the M_FLUSH
2707		 * arrives because of an impending T_discon_ind
2708		 * we still have to keep any queued data - this is part of
2709		 * socket semantics.
2710		 */
2711		if (*mp->b_rptr & FLUSHW) {
2712			*mp->b_rptr &= ~FLUSHR;
2713			return (mp);
2714		}
2715		freemsg(mp);
2716		return (NULL);
2717
2718	default:
2719		return (mp);
2720	}
2721}
2722
2723
2724/* Register to receive signals for certain events */
2725int
2726so_set_asyncsigs(vnode_t *vp, pid_t pgrp, int events, int mode, cred_t *cr)
2727{
2728	struct strsigset ss;
2729	int32_t rval;
2730
2731	/*
2732	 * Note that SOLOCKED will be set except for the call from soaccept().
2733	 */
2734	ASSERT(!mutex_owned(&VTOSO(vp)->so_lock));
2735	ss.ss_pid = pgrp;
2736	ss.ss_events = events;
2737	return (strioctl(vp, I_ESETSIG, (intptr_t)&ss, mode, K_TO_K, cr,
2738	    &rval));
2739}
2740
2741
2742/* Register for events matching the SS_ASYNC flag */
2743int
2744so_set_events(struct sonode *so, vnode_t *vp, cred_t *cr)
2745{
2746	int events = so->so_state & SS_ASYNC ?
2747	    S_RDBAND | S_BANDURG | S_RDNORM | S_OUTPUT :
2748	    S_RDBAND | S_BANDURG;
2749
2750	return (so_set_asyncsigs(vp, so->so_pgrp, events, 0, cr));
2751}
2752
2753
2754/* Change the SS_ASYNC flag, and update signal delivery if needed */
2755int
2756so_flip_async(struct sonode *so, vnode_t *vp, int mode, cred_t *cr)
2757{
2758	ASSERT(mutex_owned(&so->so_lock));
2759	if (so->so_pgrp != 0) {
2760		int error;
2761		int events = so->so_state & SS_ASYNC ?		/* Old flag */
2762		    S_RDBAND | S_BANDURG :			/* New sigs */
2763		    S_RDBAND | S_BANDURG | S_RDNORM | S_OUTPUT;
2764
2765		so_lock_single(so);
2766		mutex_exit(&so->so_lock);
2767
2768		error = so_set_asyncsigs(vp, so->so_pgrp, events, mode, cr);
2769
2770		mutex_enter(&so->so_lock);
2771		so_unlock_single(so, SOLOCKED);
2772		if (error)
2773			return (error);
2774	}
2775	so->so_state ^= SS_ASYNC;
2776	return (0);
2777}
2778
2779/*
2780 * Set new pid/pgrp for SIGPOLL (or SIGIO for FIOASYNC mode), replacing
2781 * any existing one.  If passed zero, just clear the existing one.
2782 */
2783int
2784so_set_siggrp(struct sonode *so, vnode_t *vp, pid_t pgrp, int mode, cred_t *cr)
2785{
2786	int events = so->so_state & SS_ASYNC ?
2787	    S_RDBAND | S_BANDURG | S_RDNORM | S_OUTPUT :
2788	    S_RDBAND | S_BANDURG;
2789	int error;
2790
2791	ASSERT(mutex_owned(&so->so_lock));
2792
2793	/*
2794	 * Change socket process (group).
2795	 *
2796	 * strioctl (via so_set_asyncsigs) will perform permission check and
2797	 * also keep a PID_HOLD to prevent the pid from being reused.
2798	 */
2799	so_lock_single(so);
2800	mutex_exit(&so->so_lock);
2801
2802	if (pgrp != 0) {
2803		dprintso(so, 1, ("setown: adding pgrp %d ev 0x%x\n",
2804		    pgrp, events));
2805		error = so_set_asyncsigs(vp, pgrp, events, mode, cr);
2806		if (error != 0) {
2807			eprintsoline(so, error);
2808			goto bad;
2809		}
2810	}
2811	/* Remove the previously registered process/group */
2812	if (so->so_pgrp != 0) {
2813		dprintso(so, 1, ("setown: removing pgrp %d\n", so->so_pgrp));
2814		error = so_set_asyncsigs(vp, so->so_pgrp, 0, mode, cr);
2815		if (error != 0) {
2816			eprintsoline(so, error);
2817			error = 0;
2818		}
2819	}
2820	mutex_enter(&so->so_lock);
2821	so_unlock_single(so, SOLOCKED);
2822	so->so_pgrp = pgrp;
2823	return (0);
2824bad:
2825	mutex_enter(&so->so_lock);
2826	so_unlock_single(so, SOLOCKED);
2827	return (error);
2828}
2829
2830/*
2831 * Wrapper for getmsg. If the socket has been converted to a stream
2832 * pass the request to the stream head.
2833 */
2834int
2835sock_getmsg(
2836	struct vnode *vp,
2837	struct strbuf *mctl,
2838	struct strbuf *mdata,
2839	uchar_t *prip,
2840	int *flagsp,
2841	int fmode,
2842	rval_t *rvp
2843)
2844{
2845	struct sonode *so;
2846
2847	ASSERT(vp->v_type == VSOCK);
2848	/*
2849	 * Use the stream head to find the real socket vnode.
2850	 * This is needed when namefs sits above sockfs.  Some
2851	 * sockets (like SCTP) are not streams.
2852	 */
2853	if (!vp->v_stream) {
2854		return (ENOSTR);
2855	}
2856	ASSERT(vp->v_stream->sd_vnode);
2857	vp = vp->v_stream->sd_vnode;
2858	ASSERT(vn_matchops(vp, socket_vnodeops));
2859	so = VTOSO(vp);
2860
2861	dprintso(so, 1, ("sock_getmsg(%p) %s\n",
2862	    (void *)so, pr_state(so->so_state, so->so_mode)));
2863
2864	if (so->so_version == SOV_STREAM) {
2865		/* The imaginary "sockmod" has been popped - act as a stream */
2866		return (strgetmsg(vp, mctl, mdata, prip, flagsp, fmode, rvp));
2867	}
2868	eprintsoline(so, ENOSTR);
2869	return (ENOSTR);
2870}
2871
2872/*
2873 * Wrapper for putmsg. If the socket has been converted to a stream
2874 * pass the request to the stream head.
2875 *
2876 * Note that a while a regular socket (SOV_SOCKSTREAM) does support the
2877 * streams ioctl set it does not support putmsg and getmsg.
2878 * Allowing putmsg would prevent sockfs from tracking the state of
2879 * the socket/transport and would also invalidate the locking in sockfs.
2880 */
2881int
2882sock_putmsg(
2883	struct vnode *vp,
2884	struct strbuf *mctl,
2885	struct strbuf *mdata,
2886	uchar_t pri,
2887	int flag,
2888	int fmode
2889)
2890{
2891	struct sonode *so;
2892
2893	ASSERT(vp->v_type == VSOCK);
2894	/*
2895	 * Use the stream head to find the real socket vnode.
2896	 * This is needed when namefs sits above sockfs.
2897	 */
2898	if (!vp->v_stream) {
2899		return (ENOSTR);
2900	}
2901	ASSERT(vp->v_stream->sd_vnode);
2902	vp = vp->v_stream->sd_vnode;
2903	ASSERT(vn_matchops(vp, socket_vnodeops));
2904	so = VTOSO(vp);
2905
2906	dprintso(so, 1, ("sock_putmsg(%p) %s\n",
2907	    (void *)so, pr_state(so->so_state, so->so_mode)));
2908
2909	if (so->so_version == SOV_STREAM) {
2910		/* The imaginary "sockmod" has been popped - act as a stream */
2911		return (strputmsg(vp, mctl, mdata, pri, flag, fmode));
2912	}
2913	eprintsoline(so, ENOSTR);
2914	return (ENOSTR);
2915}
2916
2917/*
2918 * Special function called only from f_getfl().
2919 * Returns FASYNC if the SS_ASYNC flag is set on a socket, else 0.
2920 * No locks are acquired here, so it is safe to use while uf_lock is held.
2921 * This exists solely for BSD fcntl() FASYNC compatibility.
2922 */
2923int
2924sock_getfasync(vnode_t *vp)
2925{
2926	struct sonode *so;
2927
2928	ASSERT(vp->v_type == VSOCK);
2929	/*
2930	 * For stream model, v_stream is used; For non-stream, v_stream always
2931	 * equals NULL
2932	 */
2933	if (vp->v_stream != NULL)
2934		so = VTOSO(vp->v_stream->sd_vnode);
2935	else
2936		so = VTOSO(vp);
2937
2938	if (so->so_version == SOV_STREAM || !(so->so_state & SS_ASYNC))
2939		return (0);
2940
2941	return (FASYNC);
2942}
2943