sockcommon_sops.c revision 12198:4db936bda957
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26#include <sys/types.h>
27#include <sys/param.h>
28#include <sys/systm.h>
29#include <sys/sysmacros.h>
30#include <sys/debug.h>
31#include <sys/cmn_err.h>
32
33#include <sys/stropts.h>
34#include <sys/socket.h>
35#include <sys/socketvar.h>
36
37#define	_SUN_TPI_VERSION	2
38#include <sys/tihdr.h>
39#include <sys/sockio.h>
40#include <sys/kmem_impl.h>
41
42#include <sys/strsubr.h>
43#include <sys/strsun.h>
44#include <sys/ddi.h>
45#include <netinet/in.h>
46#include <inet/ip.h>
47
48#include <fs/sockfs/sockcommon.h>
49
50#include <sys/socket_proto.h>
51
52#include <fs/sockfs/socktpi_impl.h>
53#include <fs/sockfs/sodirect.h>
54#include <sys/tihdr.h>
55#include <fs/sockfs/nl7c.h>
56#include <inet/kssl/ksslapi.h>
57
58
59extern int xnet_skip_checks;
60extern int xnet_check_print;
61
62static void so_queue_oob(sock_upper_handle_t, mblk_t *, size_t);
63
64
/*
 * accept() entry point for sockets that do not support accepting
 * connections. All arguments are ignored.
 *
 * Always returns EOPNOTSUPP.
 */
/*ARGSUSED*/
int
so_accept_notsupp(struct sonode *lso, int fflag, struct cred *cr,
    struct sonode **nsop)
{
	return (EOPNOTSUPP);
}
72
/*
 * listen() entry point for sockets that do not support listening.
 * All arguments are ignored.
 *
 * Always returns EOPNOTSUPP.
 */
/*ARGSUSED*/
int
so_listen_notsupp(struct sonode *so,
    int backlog,
    struct cred *cr)
{
	return (EOPNOTSUPP);
}
79
/*
 * getsockname() entry point for sockets that do not support the
 * operation. Neither `sa' nor `len' is touched.
 *
 * Always returns EOPNOTSUPP.
 */
/*ARGSUSED*/
int
so_getsockname_notsupp(struct sonode *so, struct sockaddr *sa, socklen_t *len,
    struct cred *cr)
{
	return (EOPNOTSUPP);
}
87
88/*ARGSUSED*/
89int
90so_getpeername_notsupp(struct sonode *so, struct sockaddr *addr,
91    socklen_t *addrlen, boolean_t accept, struct cred *cr)
92{
93	return (EOPNOTSUPP);
94}
95
/*
 * shutdown() entry point for sockets that do not support the operation.
 * All arguments are ignored.
 *
 * Always returns EOPNOTSUPP.
 */
/*ARGSUSED*/
int
so_shutdown_notsupp(struct sonode *so,
    int how,
    struct cred *cr)
{
	return (EOPNOTSUPP);
}
102
103/*ARGSUSED*/
104int
105so_sendmblk_notsupp(struct sonode *so, struct msghdr *msg, int fflag,
106    struct cred *cr, mblk_t **mpp)
107{
108	return (EOPNOTSUPP);
109}
110
111/*
112 * Generic Socket Ops
113 */
114
/*
 * Generic socket init operation: delegates to socket_init_common().
 *
 * so	 - socket being initialized
 * pso	 - second sonode handed through to socket_init_common()
 * cr	 - credentials of the caller
 * flags - flags handed through to socket_init_common()
 *
 * Returns whatever socket_init_common() returns.
 */
/* ARGSUSED */
int
so_init(struct sonode *so, struct sonode *pso, struct cred *cr, int flags)
{
	int rc;

	rc = socket_init_common(so, pso, flags, cr);
	return (rc);
}
121
122int
123so_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
124    int flags, struct cred *cr)
125{
126	int error;
127
128	SO_BLOCK_FALLBACK(so, SOP_BIND(so, name, namelen, flags, cr));
129
130	ASSERT(flags == _SOBIND_XPG4_2 || flags == _SOBIND_SOCKBSD);
131
132	/* X/Open requires this check */
133	if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
134		if (xnet_check_print) {
135			printf("sockfs: X/Open bind state check "
136			    "caused EINVAL\n");
137		}
138		error = EINVAL;
139		goto done;
140	}
141
142	/*
143	 * a bind to a NULL address is interpreted as unbind. So just
144	 * do the downcall.
145	 */
146	if (name == NULL)
147		goto dobind;
148
149	switch (so->so_family) {
150	case AF_INET:
151		if ((size_t)namelen != sizeof (sin_t)) {
152			error = name->sa_family != so->so_family ?
153			    EAFNOSUPPORT : EINVAL;
154			eprintsoline(so, error);
155			goto done;
156		}
157
158		if ((flags & _SOBIND_XPG4_2) &&
159		    (name->sa_family != so->so_family)) {
160			/*
161			 * This check has to be made for X/Open
162			 * sockets however application failures have
163			 * been observed when it is applied to
164			 * all sockets.
165			 */
166			error = EAFNOSUPPORT;
167			eprintsoline(so, error);
168			goto done;
169		}
170		/*
171		 * Force a zero sa_family to match so_family.
172		 *
173		 * Some programs like inetd(1M) don't set the
174		 * family field. Other programs leave
175		 * sin_family set to garbage - SunOS 4.X does
176		 * not check the family field on a bind.
177		 * We use the family field that
178		 * was passed in to the socket() call.
179		 */
180		name->sa_family = so->so_family;
181		break;
182
183	case AF_INET6: {
184#ifdef DEBUG
185		sin6_t *sin6 = (sin6_t *)name;
186#endif
187		if ((size_t)namelen != sizeof (sin6_t)) {
188			error = name->sa_family != so->so_family ?
189			    EAFNOSUPPORT : EINVAL;
190			eprintsoline(so, error);
191			goto done;
192		}
193
194		if (name->sa_family != so->so_family) {
195			/*
196			 * With IPv6 we require the family to match
197			 * unlike in IPv4.
198			 */
199			error = EAFNOSUPPORT;
200			eprintsoline(so, error);
201			goto done;
202		}
203#ifdef DEBUG
204		/*
205		 * Verify that apps don't forget to clear
206		 * sin6_scope_id etc
207		 */
208		if (sin6->sin6_scope_id != 0 &&
209		    !IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) {
210			zcmn_err(getzoneid(), CE_WARN,
211			    "bind with uninitialized sin6_scope_id "
212			    "(%d) on socket. Pid = %d\n",
213			    (int)sin6->sin6_scope_id,
214			    (int)curproc->p_pid);
215		}
216		if (sin6->__sin6_src_id != 0) {
217			zcmn_err(getzoneid(), CE_WARN,
218			    "bind with uninitialized __sin6_src_id "
219			    "(%d) on socket. Pid = %d\n",
220			    (int)sin6->__sin6_src_id,
221			    (int)curproc->p_pid);
222		}
223#endif /* DEBUG */
224
225		break;
226	}
227	default:
228		/* Just pass the request to the protocol */
229		goto dobind;
230	}
231
232	/*
233	 * First we check if either NCA or KSSL has been enabled for
234	 * the requested address, and if so, we fall back to TPI.
235	 * If neither of those two services are enabled, then we just
236	 * pass the request to the protocol.
237	 *
238	 * Note that KSSL can only be enabled on a socket if NCA is NOT
239	 * enabled for that socket, hence the else-statement below.
240	 */
241	if (nl7c_enabled && ((so->so_family == AF_INET ||
242	    so->so_family == AF_INET6) &&
243	    nl7c_lookup_addr(name, namelen) != NULL)) {
244		/*
245		 * NL7C is not supported in non-global zones,
246		 * we enforce this restriction here.
247		 */
248		if (so->so_zoneid == GLOBAL_ZONEID) {
249			/* NCA should be used, so fall back to TPI */
250			error = so_tpi_fallback(so, cr);
251			SO_UNBLOCK_FALLBACK(so);
252			if (error)
253				return (error);
254			else
255				return (SOP_BIND(so, name, namelen, flags, cr));
256		}
257	} else if (so->so_type == SOCK_STREAM) {
258		/* Check if KSSL has been configured for this address */
259		kssl_ent_t ent;
260		kssl_endpt_type_t type;
261		struct T_bind_req bind_req;
262		mblk_t *mp;
263
264		/*
265		 * TODO: Check with KSSL team if we could add a function call
266		 * that only queries whether KSSL is enabled for the given
267		 * address.
268		 */
269		bind_req.PRIM_type = T_BIND_REQ;
270		bind_req.ADDR_length = namelen;
271		bind_req.ADDR_offset = (t_scalar_t)sizeof (bind_req);
272		mp = soallocproto2(&bind_req, sizeof (bind_req),
273		    name, namelen, 0, _ALLOC_SLEEP, cr);
274
275		type = kssl_check_proxy(mp, so, &ent);
276		freemsg(mp);
277
278		if (type != KSSL_NO_PROXY) {
279			/*
280			 * KSSL has been configured for this address, so
281			 * we must fall back to TPI.
282			 */
283			kssl_release_ent(ent, so, type);
284			error = so_tpi_fallback(so, cr);
285			SO_UNBLOCK_FALLBACK(so);
286			if (error)
287				return (error);
288			else
289				return (SOP_BIND(so, name, namelen, flags, cr));
290		}
291	}
292
293dobind:
294	error = (*so->so_downcalls->sd_bind)
295	    (so->so_proto_handle, name, namelen, cr);
296done:
297	SO_UNBLOCK_FALLBACK(so);
298
299	return (error);
300}
301
302int
303so_listen(struct sonode *so, int backlog, struct cred *cr)
304{
305	int	error = 0;
306
307	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
308	SO_BLOCK_FALLBACK(so, SOP_LISTEN(so, backlog, cr));
309
310	error = (*so->so_downcalls->sd_listen)(so->so_proto_handle, backlog,
311	    cr);
312
313	SO_UNBLOCK_FALLBACK(so);
314
315	return (error);
316}
317
318
319int
320so_connect(struct sonode *so, const struct sockaddr *name,
321    socklen_t namelen, int fflag, int flags, struct cred *cr)
322{
323	int error = 0;
324	sock_connid_t id;
325
326	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
327	SO_BLOCK_FALLBACK(so, SOP_CONNECT(so, name, namelen, fflag, flags, cr));
328
329	/*
330	 * If there is a pending error, return error
331	 * This can happen if a non blocking operation caused an error.
332	 */
333
334	if (so->so_error != 0) {
335		mutex_enter(&so->so_lock);
336		error = sogeterr(so, B_TRUE);
337		mutex_exit(&so->so_lock);
338		if (error != 0)
339			goto done;
340	}
341
342	error = (*so->so_downcalls->sd_connect)(so->so_proto_handle,
343	    name, namelen, &id, cr);
344
345	if (error == EINPROGRESS)
346		error = so_wait_connected(so, fflag & (FNONBLOCK|FNDELAY), id);
347
348done:
349	SO_UNBLOCK_FALLBACK(so);
350	return (error);
351}
352
353/*ARGSUSED*/
354int
355so_accept(struct sonode *so, int fflag, struct cred *cr, struct sonode **nsop)
356{
357	int error = 0;
358	struct sonode *nso;
359
360	*nsop = NULL;
361
362	SO_BLOCK_FALLBACK(so, SOP_ACCEPT(so, fflag, cr, nsop));
363	if ((so->so_state & SS_ACCEPTCONN) == 0) {
364		SO_UNBLOCK_FALLBACK(so);
365		return ((so->so_type == SOCK_DGRAM || so->so_type == SOCK_RAW) ?
366		    EOPNOTSUPP : EINVAL);
367	}
368
369	if ((error = so_acceptq_dequeue(so, (fflag & (FNONBLOCK|FNDELAY)),
370	    &nso)) == 0) {
371		ASSERT(nso != NULL);
372
373		/* finish the accept */
374		error = (*so->so_downcalls->sd_accept)(so->so_proto_handle,
375		    nso->so_proto_handle, (sock_upper_handle_t)nso, cr);
376		if (error != 0) {
377			(void) socket_close(nso, 0, cr);
378			socket_destroy(nso);
379		} else {
380			*nsop = nso;
381		}
382	}
383
384	SO_UNBLOCK_FALLBACK(so);
385	return (error);
386}
387
388int
389so_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
390    struct cred *cr)
391{
392	int error, flags;
393	boolean_t dontblock;
394	ssize_t orig_resid;
395	mblk_t  *mp;
396
397	SO_BLOCK_FALLBACK(so, SOP_SENDMSG(so, msg, uiop, cr));
398
399	flags = msg->msg_flags;
400	error = 0;
401	dontblock = (flags & MSG_DONTWAIT) ||
402	    (uiop->uio_fmode & (FNONBLOCK|FNDELAY));
403
404	if (!(flags & MSG_XPG4_2) && msg->msg_controllen != 0) {
405		/*
406		 * Old way of passing fd's is not supported
407		 */
408		SO_UNBLOCK_FALLBACK(so);
409		return (EOPNOTSUPP);
410	}
411
412	if ((so->so_mode & SM_ATOMIC) &&
413	    uiop->uio_resid > so->so_proto_props.sopp_maxpsz &&
414	    so->so_proto_props.sopp_maxpsz != -1) {
415		SO_UNBLOCK_FALLBACK(so);
416		return (EMSGSIZE);
417	}
418
419	/*
420	 * For atomic sends we will only do one iteration.
421	 */
422	do {
423		if (so->so_state & SS_CANTSENDMORE) {
424			error = EPIPE;
425			break;
426		}
427
428		if (so->so_error != 0) {
429			mutex_enter(&so->so_lock);
430			error = sogeterr(so, B_TRUE);
431			mutex_exit(&so->so_lock);
432			if (error != 0)
433				break;
434		}
435
436		/*
437		 * Send down OOB messages even if the send path is being
438		 * flow controlled (assuming the protocol supports OOB data).
439		 */
440		if (flags & MSG_OOB) {
441			if ((so->so_mode & SM_EXDATA) == 0) {
442				error = EOPNOTSUPP;
443				break;
444			}
445		} else if (so->so_snd_qfull) {
446			/*
447			 * Need to wait until the protocol is ready to receive
448			 * more data for transmission.
449			 */
450			if ((error = so_snd_wait_qnotfull(so, dontblock)) != 0)
451				break;
452		}
453
454		/*
455		 * Time to send data to the protocol. We either copy the
456		 * data into mblks or pass the uio directly to the protocol.
457		 * We decide what to do based on the available down calls.
458		 */
459		if (so->so_downcalls->sd_send_uio != NULL) {
460			error = (*so->so_downcalls->sd_send_uio)
461			    (so->so_proto_handle, uiop, msg, cr);
462			if (error != 0)
463				break;
464		} else {
465			/* save the resid in case of failure */
466			orig_resid = uiop->uio_resid;
467
468			if ((mp = socopyinuio(uiop,
469			    so->so_proto_props.sopp_maxpsz,
470			    so->so_proto_props.sopp_wroff,
471			    so->so_proto_props.sopp_maxblk,
472			    so->so_proto_props.sopp_tail, &error)) == NULL) {
473				break;
474			}
475			ASSERT(uiop->uio_resid >= 0);
476
477			error = (*so->so_downcalls->sd_send)
478			    (so->so_proto_handle, mp, msg, cr);
479			if (error != 0) {
480				/*
481				 * The send failed. We do not have to free the
482				 * mblks, because that is the protocol's
483				 * responsibility. However, uio_resid must
484				 * remain accurate, so adjust that here.
485				 */
486				uiop->uio_resid = orig_resid;
487					break;
488			}
489		}
490	} while (uiop->uio_resid > 0);
491
492	SO_UNBLOCK_FALLBACK(so);
493
494	return (error);
495}
496
497int
498so_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag,
499    struct cred *cr, mblk_t **mpp)
500{
501	int error;
502	boolean_t dontblock;
503	size_t size;
504	mblk_t *mp = *mpp;
505
506	SO_BLOCK_FALLBACK(so, SOP_SENDMBLK(so, msg, fflag, cr, mpp));
507
508	error = 0;
509	dontblock = (msg->msg_flags & MSG_DONTWAIT) ||
510	    (fflag & (FNONBLOCK|FNDELAY));
511	size = msgdsize(mp);
512
513	if ((so->so_mode & SM_SENDFILESUPP) == 0 ||
514	    so->so_downcalls->sd_send == NULL) {
515		SO_UNBLOCK_FALLBACK(so);
516		return (EOPNOTSUPP);
517	}
518
519	if ((so->so_mode & SM_ATOMIC) &&
520	    size > so->so_proto_props.sopp_maxpsz &&
521	    so->so_proto_props.sopp_maxpsz != -1) {
522		SO_UNBLOCK_FALLBACK(so);
523		return (EMSGSIZE);
524	}
525
526	while (mp != NULL) {
527		mblk_t *nmp, *last_mblk;
528		size_t mlen;
529
530		if (so->so_state & SS_CANTSENDMORE) {
531			error = EPIPE;
532			break;
533		}
534		if (so->so_error != 0) {
535			mutex_enter(&so->so_lock);
536			error = sogeterr(so, B_TRUE);
537			mutex_exit(&so->so_lock);
538			if (error != 0)
539				break;
540		}
541		if (so->so_snd_qfull) {
542			/*
543			 * Need to wait until the protocol is ready to receive
544			 * more data for transmission.
545			 */
546			if ((error = so_snd_wait_qnotfull(so, dontblock)) != 0)
547				break;
548		}
549
550		/*
551		 * We only allow so_maxpsz of data to be sent down to
552		 * the protocol at time.
553		 */
554		mlen = MBLKL(mp);
555		nmp = mp->b_cont;
556		last_mblk = mp;
557		while (nmp != NULL) {
558			mlen += MBLKL(nmp);
559			if (mlen > so->so_proto_props.sopp_maxpsz) {
560				last_mblk->b_cont = NULL;
561				break;
562			}
563			last_mblk = nmp;
564			nmp = nmp->b_cont;
565		}
566
567		error = (*so->so_downcalls->sd_send)
568		    (so->so_proto_handle, mp, msg, cr);
569		if (error != 0) {
570			/*
571			 * The send failed. The protocol will free the mblks
572			 * that were sent down. Let the caller deal with the
573			 * rest.
574			 */
575			*mpp = nmp;
576			break;
577		}
578
579		*mpp = mp = nmp;
580	}
581
582	SO_UNBLOCK_FALLBACK(so);
583
584	return (error);
585}
586
587int
588so_shutdown(struct sonode *so, int how, struct cred *cr)
589{
590	int error;
591
592	SO_BLOCK_FALLBACK(so, SOP_SHUTDOWN(so, how, cr));
593
594	/*
595	 * SunOS 4.X has no check for datagram sockets.
596	 * 5.X checks that it is connected (ENOTCONN)
597	 * X/Open requires that we check the connected state.
598	 */
599	if (!(so->so_state & SS_ISCONNECTED)) {
600		if (!xnet_skip_checks) {
601			error = ENOTCONN;
602			if (xnet_check_print) {
603				printf("sockfs: X/Open shutdown check "
604				    "caused ENOTCONN\n");
605			}
606		}
607		goto done;
608	}
609
610	error = ((*so->so_downcalls->sd_shutdown)(so->so_proto_handle,
611	    how, cr));
612
613	/*
614	 * Protocol agreed to shutdown. We need to flush the
615	 * receive buffer if the receive side is being shutdown.
616	 */
617	if (error == 0 && how != SHUT_WR) {
618		mutex_enter(&so->so_lock);
619		/* wait for active reader to finish */
620		(void) so_lock_read(so, 0);
621
622		so_rcv_flush(so);
623
624		so_unlock_read(so);
625		mutex_exit(&so->so_lock);
626	}
627
628done:
629	SO_UNBLOCK_FALLBACK(so);
630	return (error);
631}
632
633int
634so_getsockname(struct sonode *so, struct sockaddr *addr,
635    socklen_t *addrlen, struct cred *cr)
636{
637	int error;
638
639	SO_BLOCK_FALLBACK(so, SOP_GETSOCKNAME(so, addr, addrlen, cr));
640
641	error = (*so->so_downcalls->sd_getsockname)
642	    (so->so_proto_handle, addr, addrlen, cr);
643
644	SO_UNBLOCK_FALLBACK(so);
645	return (error);
646}
647
648int
649so_getpeername(struct sonode *so, struct sockaddr *addr,
650    socklen_t *addrlen, boolean_t accept, struct cred *cr)
651{
652	int error;
653
654	SO_BLOCK_FALLBACK(so, SOP_GETPEERNAME(so, addr, addrlen, accept, cr));
655
656	if (accept) {
657		error = (*so->so_downcalls->sd_getpeername)
658		    (so->so_proto_handle, addr, addrlen, cr);
659	} else if (!(so->so_state & SS_ISCONNECTED)) {
660		error = ENOTCONN;
661	} else if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
662		/* Added this check for X/Open */
663		error = EINVAL;
664		if (xnet_check_print) {
665			printf("sockfs: X/Open getpeername check => EINVAL\n");
666		}
667	} else {
668		error = (*so->so_downcalls->sd_getpeername)
669		    (so->so_proto_handle, addr, addrlen, cr);
670	}
671
672	SO_UNBLOCK_FALLBACK(so);
673	return (error);
674}
675
676int
677so_getsockopt(struct sonode *so, int level, int option_name,
678    void *optval, socklen_t *optlenp, int flags, struct cred *cr)
679{
680	int error = 0;
681
682	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
683	SO_BLOCK_FALLBACK(so,
684	    SOP_GETSOCKOPT(so, level, option_name, optval, optlenp, flags, cr));
685
686	error = socket_getopt_common(so, level, option_name, optval, optlenp,
687	    flags);
688	if (error < 0) {
689		error = (*so->so_downcalls->sd_getsockopt)
690		    (so->so_proto_handle, level, option_name, optval, optlenp,
691		    cr);
692		if (error ==  ENOPROTOOPT) {
693			if (level == SOL_SOCKET) {
694				/*
695				 * If a protocol does not support a particular
696				 * socket option, set can fail (not allowed)
697				 * but get can not fail. This is the previous
698				 * sockfs bahvior.
699				 */
700				switch (option_name) {
701				case SO_LINGER:
702					if (*optlenp < (t_uscalar_t)
703					    sizeof (struct linger)) {
704						error = EINVAL;
705						break;
706					}
707					error = 0;
708					bzero(optval, sizeof (struct linger));
709					*optlenp = sizeof (struct linger);
710					break;
711				case SO_RCVTIMEO:
712				case SO_SNDTIMEO:
713					if (*optlenp < (t_uscalar_t)
714					    sizeof (struct timeval)) {
715						error = EINVAL;
716						break;
717					}
718					error = 0;
719					bzero(optval, sizeof (struct timeval));
720					*optlenp = sizeof (struct timeval);
721					break;
722				case SO_SND_BUFINFO:
723					if (*optlenp < (t_uscalar_t)
724					    sizeof (struct so_snd_bufinfo)) {
725						error = EINVAL;
726						break;
727					}
728					error = 0;
729					bzero(optval,
730					    sizeof (struct so_snd_bufinfo));
731					*optlenp =
732					    sizeof (struct so_snd_bufinfo);
733					break;
734				case SO_DEBUG:
735				case SO_REUSEADDR:
736				case SO_KEEPALIVE:
737				case SO_DONTROUTE:
738				case SO_BROADCAST:
739				case SO_USELOOPBACK:
740				case SO_OOBINLINE:
741				case SO_DGRAM_ERRIND:
742				case SO_SNDBUF:
743				case SO_RCVBUF:
744					error = 0;
745					*((int32_t *)optval) = 0;
746					*optlenp = sizeof (int32_t);
747					break;
748				default:
749					break;
750				}
751			}
752		}
753	}
754
755	SO_UNBLOCK_FALLBACK(so);
756	return (error);
757}
758
759int
760so_setsockopt(struct sonode *so, int level, int option_name,
761    const void *optval, socklen_t optlen, struct cred *cr)
762{
763	int error = 0;
764	struct timeval tl;
765	const void *opt = optval;
766
767	SO_BLOCK_FALLBACK(so,
768	    SOP_SETSOCKOPT(so, level, option_name, optval, optlen, cr));
769
770	/* X/Open requires this check */
771	if (so->so_state & SS_CANTSENDMORE && !xnet_skip_checks) {
772		SO_UNBLOCK_FALLBACK(so);
773		if (xnet_check_print)
774			printf("sockfs: X/Open setsockopt check => EINVAL\n");
775		return (EINVAL);
776	}
777
778	if (level == SOL_SOCKET) {
779		switch (option_name) {
780		case SO_RCVTIMEO:
781		case SO_SNDTIMEO: {
782			/*
783			 * We pass down these two options to protocol in order
784			 * to support some third part protocols which need to
785			 * know them. For those protocols which don't care
786			 * these two options, simply return 0.
787			 */
788			clock_t t_usec;
789
790			if (get_udatamodel() == DATAMODEL_NONE ||
791			    get_udatamodel() == DATAMODEL_NATIVE) {
792				if (optlen != sizeof (struct timeval)) {
793					error = EINVAL;
794					goto done;
795				}
796				bcopy((struct timeval *)optval, &tl,
797				    sizeof (struct timeval));
798			} else {
799				if (optlen != sizeof (struct timeval32)) {
800					error = EINVAL;
801					goto done;
802				}
803				TIMEVAL32_TO_TIMEVAL(&tl,
804				    (struct timeval32 *)optval);
805			}
806			opt = &tl;
807			optlen = sizeof (tl);
808			t_usec = tl.tv_sec * 1000 * 1000 + tl.tv_usec;
809			mutex_enter(&so->so_lock);
810			if (option_name == SO_RCVTIMEO)
811				so->so_rcvtimeo = drv_usectohz(t_usec);
812			else
813				so->so_sndtimeo = drv_usectohz(t_usec);
814			mutex_exit(&so->so_lock);
815			break;
816		}
817		case SO_RCVBUF:
818			/*
819			 * XXX XPG 4.2 applications retrieve SO_RCVBUF from
820			 * sockfs since the transport might adjust the value
821			 * and not return exactly what was set by the
822			 * application.
823			 */
824			so->so_xpg_rcvbuf = *(int32_t *)optval;
825			break;
826		}
827	}
828	error = (*so->so_downcalls->sd_setsockopt)
829	    (so->so_proto_handle, level, option_name, opt, optlen, cr);
830done:
831	SO_UNBLOCK_FALLBACK(so);
832	return (error);
833}
834
835int
836so_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode,
837    struct cred *cr, int32_t *rvalp)
838{
839	int error = 0;
840
841	SO_BLOCK_FALLBACK(so, SOP_IOCTL(so, cmd, arg, mode, cr, rvalp));
842
843	/*
844	 * If there is a pending error, return error
845	 * This can happen if a non blocking operation caused an error.
846	 */
847	if (so->so_error != 0) {
848		mutex_enter(&so->so_lock);
849		error = sogeterr(so, B_TRUE);
850		mutex_exit(&so->so_lock);
851		if (error != 0)
852			goto done;
853	}
854
855	/*
856	 * calling strioc can result in the socket falling back to TPI,
857	 * if that is supported.
858	 */
859	if ((error = socket_ioctl_common(so, cmd, arg, mode, cr, rvalp)) < 0 &&
860	    (error = socket_strioc_common(so, cmd, arg, mode, cr, rvalp)) < 0) {
861		error = (*so->so_downcalls->sd_ioctl)(so->so_proto_handle,
862		    cmd, arg, mode, rvalp, cr);
863	}
864
865done:
866	SO_UNBLOCK_FALLBACK(so);
867
868	return (error);
869}
870
871int
872so_poll(struct sonode *so, short events, int anyyet, short *reventsp,
873    struct pollhead **phpp)
874{
875	int state = so->so_state;
876	*reventsp = 0;
877
878	/*
879	 * In sockets the errors are represented as input/output events
880	 */
881	if (so->so_error != 0 &&
882	    ((POLLIN|POLLRDNORM|POLLOUT) & events) != 0) {
883		*reventsp = (POLLIN|POLLRDNORM|POLLOUT) & events;
884		return (0);
885	}
886
887	/*
888	 * If the socket is in a state where it can send data
889	 * turn on POLLWRBAND and POLLOUT events.
890	 */
891	if ((so->so_mode & SM_CONNREQUIRED) == 0 || (state & SS_ISCONNECTED)) {
892		/*
893		 * out of band data is allowed even if the connection
894		 * is flow controlled
895		 */
896		*reventsp |= POLLWRBAND & events;
897		if (!so->so_snd_qfull) {
898			/*
899			 * As long as there is buffer to send data
900			 * turn on POLLOUT events
901			 */
902			*reventsp |= POLLOUT & events;
903		}
904	}
905
906	/*
907	 * Turn on POLLIN whenever there is data on the receive queue,
908	 * or the socket is in a state where no more data will be received.
909	 * Also, if the socket is accepting connections, flip the bit if
910	 * there is something on the queue.
911	 *
912	 * We do an initial check for events without holding locks. However,
913	 * if there are no event available, then we redo the check for POLLIN
914	 * events under the lock.
915	 */
916
917	/* Pending connections */
918	if (so->so_acceptq_len > 0)
919		*reventsp |= (POLLIN|POLLRDNORM) & events;
920
921	/* Data */
922	/* so_downcalls is null for sctp */
923	if (so->so_downcalls != NULL && so->so_downcalls->sd_poll != NULL) {
924		*reventsp |= (*so->so_downcalls->sd_poll)
925		    (so->so_proto_handle, events & SO_PROTO_POLLEV, anyyet,
926		    CRED()) & events;
927		ASSERT((*reventsp & ~events) == 0);
928		/* do not recheck events */
929		events &= ~SO_PROTO_POLLEV;
930	} else {
931		if (SO_HAVE_DATA(so))
932			*reventsp |= (POLLIN|POLLRDNORM) & events;
933
934		/* Urgent data */
935		if ((state & SS_OOBPEND) != 0) {
936			*reventsp |= (POLLRDBAND | POLLPRI) & events;
937		}
938	}
939
940	if (!*reventsp && !anyyet) {
941		/* Check for read events again, but this time under lock */
942		if (events & (POLLIN|POLLRDNORM)) {
943			mutex_enter(&so->so_lock);
944			if (SO_HAVE_DATA(so) || so->so_acceptq_len > 0) {
945				mutex_exit(&so->so_lock);
946				*reventsp |= (POLLIN|POLLRDNORM) & events;
947				return (0);
948			} else {
949				so->so_pollev |= SO_POLLEV_IN;
950				mutex_exit(&so->so_lock);
951			}
952		}
953		*phpp = &so->so_poll_list;
954	}
955	return (0);
956}
957
958/*
959 * Generic Upcalls
960 */
961void
962so_connected(sock_upper_handle_t sock_handle, sock_connid_t id,
963    cred_t *peer_cred, pid_t peer_cpid)
964{
965	struct sonode *so = (struct sonode *)sock_handle;
966
967	mutex_enter(&so->so_lock);
968	ASSERT(so->so_proto_handle != NULL);
969
970	if (peer_cred != NULL) {
971		if (so->so_peercred != NULL)
972			crfree(so->so_peercred);
973		crhold(peer_cred);
974		so->so_peercred = peer_cred;
975		so->so_cpid = peer_cpid;
976	}
977
978	so->so_proto_connid = id;
979	soisconnected(so);
980	/*
981	 * Wake ones who're waiting for conn to become established.
982	 */
983	so_notify_connected(so);
984}
985
986int
987so_disconnected(sock_upper_handle_t sock_handle, sock_connid_t id, int error)
988{
989	struct sonode *so = (struct sonode *)sock_handle;
990
991	mutex_enter(&so->so_lock);
992
993	so->so_proto_connid = id;
994	soisdisconnected(so, error);
995	so_notify_disconnected(so, error);
996
997	return (0);
998}
999
1000void
1001so_opctl(sock_upper_handle_t sock_handle, sock_opctl_action_t action,
1002    uintptr_t arg)
1003{
1004	struct sonode *so = (struct sonode *)sock_handle;
1005
1006	switch (action) {
1007	case SOCK_OPCTL_SHUT_SEND:
1008		mutex_enter(&so->so_lock);
1009		socantsendmore(so);
1010		so_notify_disconnecting(so);
1011		break;
1012	case SOCK_OPCTL_SHUT_RECV: {
1013		mutex_enter(&so->so_lock);
1014		socantrcvmore(so);
1015		so_notify_eof(so);
1016		break;
1017	}
1018	case SOCK_OPCTL_ENAB_ACCEPT:
1019		mutex_enter(&so->so_lock);
1020		so->so_state |= SS_ACCEPTCONN;
1021		so->so_backlog = (unsigned int)arg;
1022		mutex_exit(&so->so_lock);
1023		break;
1024	default:
1025		ASSERT(0);
1026		break;
1027	}
1028}
1029
1030void
1031so_txq_full(sock_upper_handle_t sock_handle, boolean_t qfull)
1032{
1033	struct sonode *so = (struct sonode *)sock_handle;
1034
1035	if (qfull) {
1036		so_snd_qfull(so);
1037	} else {
1038		so_snd_qnotfull(so);
1039		mutex_enter(&so->so_lock);
1040		so_notify_writable(so);
1041	}
1042}
1043
1044sock_upper_handle_t
1045so_newconn(sock_upper_handle_t parenthandle,
1046    sock_lower_handle_t proto_handle, sock_downcalls_t *sock_downcalls,
1047    struct cred *peer_cred, pid_t peer_cpid, sock_upcalls_t **sock_upcallsp)
1048{
1049	struct sonode	*so = (struct sonode *)parenthandle;
1050	struct sonode	*nso;
1051	int error;
1052
1053	ASSERT(proto_handle != NULL);
1054
1055	if ((so->so_state & SS_ACCEPTCONN) == 0 ||
1056	    so->so_acceptq_len >= so->so_backlog)
1057		return (NULL);
1058
1059	nso = socket_newconn(so, proto_handle, sock_downcalls, SOCKET_NOSLEEP,
1060	    &error);
1061	if (nso == NULL)
1062		return (NULL);
1063
1064	if (peer_cred != NULL) {
1065		crhold(peer_cred);
1066		nso->so_peercred = peer_cred;
1067		nso->so_cpid = peer_cpid;
1068	}
1069
1070	/*
1071	 * The new socket (nso), proto_handle and sock_upcallsp are all
1072	 * valid at this point. But as soon as nso is placed in the accept
1073	 * queue that can no longer be assumed (since an accept() thread may
1074	 * pull it off the queue and close the socket).
1075	 */
1076	*sock_upcallsp = &so_upcalls;
1077
1078	(void) so_acceptq_enqueue(so, nso);
1079
1080	mutex_enter(&so->so_lock);
1081	so_notify_newconn(so);
1082
1083	return ((sock_upper_handle_t)nso);
1084}
1085
1086void
1087so_set_prop(sock_upper_handle_t sock_handle, struct sock_proto_props *soppp)
1088{
1089	struct sonode *so;
1090
1091	so = (struct sonode *)sock_handle;
1092
1093	mutex_enter(&so->so_lock);
1094
1095	if (soppp->sopp_flags & SOCKOPT_MAXBLK)
1096		so->so_proto_props.sopp_maxblk = soppp->sopp_maxblk;
1097	if (soppp->sopp_flags & SOCKOPT_WROFF)
1098		so->so_proto_props.sopp_wroff = soppp->sopp_wroff;
1099	if (soppp->sopp_flags & SOCKOPT_TAIL)
1100		so->so_proto_props.sopp_tail = soppp->sopp_tail;
1101	if (soppp->sopp_flags & SOCKOPT_RCVHIWAT)
1102		so->so_proto_props.sopp_rxhiwat = soppp->sopp_rxhiwat;
1103	if (soppp->sopp_flags & SOCKOPT_RCVLOWAT)
1104		so->so_proto_props.sopp_rxlowat = soppp->sopp_rxlowat;
1105	if (soppp->sopp_flags & SOCKOPT_MAXPSZ)
1106		so->so_proto_props.sopp_maxpsz = soppp->sopp_maxpsz;
1107	if (soppp->sopp_flags & SOCKOPT_MINPSZ)
1108		so->so_proto_props.sopp_minpsz = soppp->sopp_minpsz;
1109	if (soppp->sopp_flags & SOCKOPT_ZCOPY) {
1110		if (soppp->sopp_zcopyflag & ZCVMSAFE) {
1111			so->so_proto_props.sopp_zcopyflag |= STZCVMSAFE;
1112			so->so_proto_props.sopp_zcopyflag &= ~STZCVMUNSAFE;
1113		} else if (soppp->sopp_zcopyflag & ZCVMUNSAFE) {
1114			so->so_proto_props.sopp_zcopyflag |= STZCVMUNSAFE;
1115			so->so_proto_props.sopp_zcopyflag &= ~STZCVMSAFE;
1116		}
1117
1118		if (soppp->sopp_zcopyflag & COPYCACHED) {
1119			so->so_proto_props.sopp_zcopyflag |= STRCOPYCACHED;
1120		}
1121	}
1122	if (soppp->sopp_flags & SOCKOPT_OOBINLINE)
1123		so->so_proto_props.sopp_oobinline = soppp->sopp_oobinline;
1124	if (soppp->sopp_flags & SOCKOPT_RCVTIMER)
1125		so->so_proto_props.sopp_rcvtimer = soppp->sopp_rcvtimer;
1126	if (soppp->sopp_flags & SOCKOPT_RCVTHRESH)
1127		so->so_proto_props.sopp_rcvthresh = soppp->sopp_rcvthresh;
1128	if (soppp->sopp_flags & SOCKOPT_MAXADDRLEN)
1129		so->so_proto_props.sopp_maxaddrlen = soppp->sopp_maxaddrlen;
1130	if (soppp->sopp_flags & SOCKOPT_LOOPBACK)
1131		so->so_proto_props.sopp_loopback = soppp->sopp_loopback;
1132
1133	mutex_exit(&so->so_lock);
1134
1135#ifdef DEBUG
1136	soppp->sopp_flags &= ~(SOCKOPT_MAXBLK | SOCKOPT_WROFF | SOCKOPT_TAIL |
1137	    SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT | SOCKOPT_MAXPSZ |
1138	    SOCKOPT_ZCOPY | SOCKOPT_OOBINLINE | SOCKOPT_RCVTIMER |
1139	    SOCKOPT_RCVTHRESH | SOCKOPT_MAXADDRLEN | SOCKOPT_MINPSZ |
1140	    SOCKOPT_LOOPBACK);
1141	ASSERT(soppp->sopp_flags == 0);
1142#endif
1143}
1144
1145/* ARGSUSED */
1146ssize_t
1147so_queue_msg(sock_upper_handle_t sock_handle, mblk_t *mp,
1148    size_t msg_size, int flags, int *errorp,  boolean_t *force_pushp)
1149{
1150	struct sonode *so = (struct sonode *)sock_handle;
1151	boolean_t force_push = B_TRUE;
1152	int space_left;
1153	sodirect_t *sodp = so->so_direct;
1154
1155	ASSERT(errorp != NULL);
1156	*errorp = 0;
1157	if (mp == NULL) {
1158		if (so->so_downcalls->sd_recv_uio != NULL) {
1159			mutex_enter(&so->so_lock);
1160			/* the notify functions will drop the lock */
1161			if (flags & MSG_OOB)
1162				so_notify_oobdata(so, IS_SO_OOB_INLINE(so));
1163			else
1164				so_notify_data(so, msg_size);
1165			return (0);
1166		}
1167		ASSERT(msg_size == 0);
1168		/*
1169		 * recv space check
1170		 */
1171		mutex_enter(&so->so_lock);
1172		space_left = so->so_rcvbuf - so->so_rcv_queued;
1173		if (space_left <= 0) {
1174			so->so_flowctrld = B_TRUE;
1175			*errorp = ENOSPC;
1176			space_left = -1;
1177		}
1178		goto done_unlock;
1179	}
1180
1181	ASSERT(mp->b_next == NULL);
1182	ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_PROTO);
1183	ASSERT(msg_size == msgdsize(mp));
1184
1185	if (flags & MSG_OOB) {
1186		so_queue_oob(sock_handle, mp, msg_size);
1187		return (0);
1188	}
1189
1190	if (force_pushp != NULL)
1191		force_push = *force_pushp;
1192
1193	if (DB_TYPE(mp) == M_PROTO && !__TPI_PRIM_ISALIGNED(mp->b_rptr)) {
1194		/* The read pointer is not aligned correctly for TPI */
1195		zcmn_err(getzoneid(), CE_WARN,
1196		    "sockfs: Unaligned TPI message received. rptr = %p\n",
1197		    (void *)mp->b_rptr);
1198		freemsg(mp);
1199		mutex_enter(&so->so_lock);
1200		if (sodp != NULL)
1201			SOD_UIOAFINI(sodp);
1202		mutex_exit(&so->so_lock);
1203
1204		return (so->so_rcvbuf - so->so_rcv_queued);
1205	}
1206
1207	mutex_enter(&so->so_lock);
1208	if (so->so_state & (SS_FALLBACK_DRAIN | SS_FALLBACK_COMP)) {
1209		if (sodp != NULL)
1210			SOD_DISABLE(sodp);
1211		mutex_exit(&so->so_lock);
1212		*errorp = EOPNOTSUPP;
1213		return (-1);
1214	}
1215	if (so->so_state & SS_CANTRCVMORE) {
1216		freemsg(mp);
1217		if (sodp != NULL)
1218			SOD_DISABLE(sodp);
1219		mutex_exit(&so->so_lock);
1220		return (0);
1221	}
1222
1223	/* process the mblk via I/OAT if capable */
1224	if (sodp != NULL && sodp->sod_enabled) {
1225		if (DB_TYPE(mp) == M_DATA) {
1226			sod_uioa_mblk_init(sodp, mp, msg_size);
1227		} else {
1228			SOD_UIOAFINI(sodp);
1229		}
1230	}
1231
1232	if (mp->b_next == NULL) {
1233		so_enqueue_msg(so, mp, msg_size);
1234	} else {
1235		do {
1236			mblk_t *nmp;
1237
1238			if ((nmp = mp->b_next) != NULL) {
1239				mp->b_next = NULL;
1240			}
1241			so_enqueue_msg(so, mp, msgdsize(mp));
1242			mp = nmp;
1243		} while (mp != NULL);
1244	}
1245
1246	space_left = so->so_rcvbuf - so->so_rcv_queued;
1247	if (space_left <= 0) {
1248		so->so_flowctrld = B_TRUE;
1249		*errorp = ENOSPC;
1250		space_left = -1;
1251	}
1252
1253	if (force_push || so->so_rcv_queued >= so->so_rcv_thresh ||
1254	    so->so_rcv_queued >= so->so_rcv_wanted) {
1255		SOCKET_TIMER_CANCEL(so);
1256		/*
1257		 * so_notify_data will release the lock
1258		 */
1259		so_notify_data(so, so->so_rcv_queued);
1260
1261		if (force_pushp != NULL)
1262			*force_pushp = B_TRUE;
1263		goto done;
1264	} else if (so->so_rcv_timer_tid == 0) {
1265		/* Make sure the recv push timer is running */
1266		SOCKET_TIMER_START(so);
1267	}
1268
1269done_unlock:
1270	mutex_exit(&so->so_lock);
1271done:
1272	return (space_left);
1273}
1274
1275/*
1276 * Set the offset of where the oob data is relative to the bytes in
1277 * queued. Also generate SIGURG
1278 */
1279void
1280so_signal_oob(sock_upper_handle_t sock_handle, ssize_t offset)
1281{
1282	struct sonode *so;
1283
1284	ASSERT(offset >= 0);
1285	so = (struct sonode *)sock_handle;
1286	mutex_enter(&so->so_lock);
1287	if (so->so_direct != NULL)
1288		SOD_UIOAFINI(so->so_direct);
1289
1290	/*
1291	 * New urgent data on the way so forget about any old
1292	 * urgent data.
1293	 */
1294	so->so_state &= ~(SS_HAVEOOBDATA|SS_HADOOBDATA);
1295
1296	/*
1297	 * Record that urgent data is pending.
1298	 */
1299	so->so_state |= SS_OOBPEND;
1300
1301	if (so->so_oobmsg != NULL) {
1302		dprintso(so, 1, ("sock: discarding old oob\n"));
1303		freemsg(so->so_oobmsg);
1304		so->so_oobmsg = NULL;
1305	}
1306
1307	/*
1308	 * set the offset where the urgent byte is
1309	 */
1310	so->so_oobmark = so->so_rcv_queued + offset;
1311	if (so->so_oobmark == 0)
1312		so->so_state |= SS_RCVATMARK;
1313	else
1314		so->so_state &= ~SS_RCVATMARK;
1315
1316	so_notify_oobsig(so);
1317}
1318
1319/*
1320 * Queue the OOB byte
1321 */
1322static void
1323so_queue_oob(sock_upper_handle_t sock_handle, mblk_t *mp, size_t len)
1324{
1325	struct sonode *so;
1326
1327	so = (struct sonode *)sock_handle;
1328	mutex_enter(&so->so_lock);
1329	if (so->so_direct != NULL)
1330		SOD_UIOAFINI(so->so_direct);
1331
1332	ASSERT(mp != NULL);
1333	if (!IS_SO_OOB_INLINE(so)) {
1334		so->so_oobmsg = mp;
1335		so->so_state |= SS_HAVEOOBDATA;
1336	} else {
1337		so_enqueue_msg(so, mp, len);
1338	}
1339
1340	so_notify_oobdata(so, IS_SO_OOB_INLINE(so));
1341}
1342
1343int
1344so_close(struct sonode *so, int flag, struct cred *cr)
1345{
1346	int error;
1347
1348	error = (*so->so_downcalls->sd_close)(so->so_proto_handle, flag, cr);
1349
1350	/*
1351	 * At this point there will be no more upcalls from the protocol
1352	 */
1353	mutex_enter(&so->so_lock);
1354
1355	ASSERT(so_verify_oobstate(so));
1356
1357	so_rcv_flush(so);
1358	mutex_exit(&so->so_lock);
1359
1360	return (error);
1361}
1362
1363void
1364so_zcopy_notify(sock_upper_handle_t sock_handle)
1365{
1366	struct sonode *so = (struct sonode *)sock_handle;
1367
1368	mutex_enter(&so->so_lock);
1369	so->so_copyflag |= STZCNOTIFY;
1370	cv_broadcast(&so->so_copy_cv);
1371	mutex_exit(&so->so_lock);
1372}
1373
1374void
1375so_set_error(sock_upper_handle_t sock_handle, int error)
1376{
1377	struct sonode *so = (struct sonode *)sock_handle;
1378
1379	mutex_enter(&so->so_lock);
1380
1381	soseterror(so, error);
1382
1383	so_notify_error(so);
1384}
1385
1386/*
1387 * so_recvmsg - read data from the socket
1388 *
1389 * There are two ways of obtaining data; either we ask the protocol to
1390 * copy directly into the supplied buffer, or we copy data from the
1391 * sonode's receive queue. The decision which one to use depends on
1392 * whether the protocol has a sd_recv_uio down call.
1393 */
1394int
1395so_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
1396    struct cred *cr)
1397{
1398	rval_t 		rval;
1399	int 		flags = 0;
1400	t_uscalar_t	controllen, namelen;
1401	int 		error = 0;
1402	int ret;
1403	mblk_t		*mctlp = NULL;
1404	union T_primitives *tpr;
1405	void		*control;
1406	ssize_t		saved_resid;
1407	struct uio	*suiop;
1408
1409	SO_BLOCK_FALLBACK(so, SOP_RECVMSG(so, msg, uiop, cr));
1410
1411	if ((so->so_state & (SS_ISCONNECTED|SS_CANTRCVMORE)) == 0 &&
1412	    (so->so_mode & SM_CONNREQUIRED)) {
1413		SO_UNBLOCK_FALLBACK(so);
1414		return (ENOTCONN);
1415	}
1416
1417	if (msg->msg_flags & MSG_PEEK)
1418		msg->msg_flags &= ~MSG_WAITALL;
1419
1420	if (so->so_mode & SM_ATOMIC)
1421		msg->msg_flags |= MSG_TRUNC;
1422
1423	if (msg->msg_flags & MSG_OOB) {
1424		if ((so->so_mode & SM_EXDATA) == 0) {
1425			error = EOPNOTSUPP;
1426		} else if (so->so_downcalls->sd_recv_uio != NULL) {
1427			error = (*so->so_downcalls->sd_recv_uio)
1428			    (so->so_proto_handle, uiop, msg, cr);
1429		} else {
1430			error = sorecvoob(so, msg, uiop, msg->msg_flags,
1431			    IS_SO_OOB_INLINE(so));
1432		}
1433		SO_UNBLOCK_FALLBACK(so);
1434		return (error);
1435	}
1436
1437	/*
1438	 * If the protocol has the recv down call, then pass the request
1439	 * down.
1440	 */
1441	if (so->so_downcalls->sd_recv_uio != NULL) {
1442		error = (*so->so_downcalls->sd_recv_uio)
1443		    (so->so_proto_handle, uiop, msg, cr);
1444		SO_UNBLOCK_FALLBACK(so);
1445		return (error);
1446	}
1447
1448	/*
1449	 * Reading data from the socket buffer
1450	 */
1451	flags = msg->msg_flags;
1452	msg->msg_flags = 0;
1453
1454	/*
1455	 * Set msg_controllen and msg_namelen to zero here to make it
1456	 * simpler in the cases that no control or name is returned.
1457	 */
1458	controllen = msg->msg_controllen;
1459	namelen = msg->msg_namelen;
1460	msg->msg_controllen = 0;
1461	msg->msg_namelen = 0;
1462
1463	mutex_enter(&so->so_lock);
1464	/* Set SOREADLOCKED */
1465	error = so_lock_read_intr(so,
1466	    uiop->uio_fmode | ((flags & MSG_DONTWAIT) ? FNONBLOCK : 0));
1467	mutex_exit(&so->so_lock);
1468	if (error) {
1469		SO_UNBLOCK_FALLBACK(so);
1470		return (error);
1471	}
1472
1473	suiop = sod_rcv_init(so, flags, &uiop);
1474retry:
1475	saved_resid = uiop->uio_resid;
1476	error = so_dequeue_msg(so, &mctlp, uiop, &rval, flags);
1477	if (error != 0) {
1478		goto out;
1479	}
1480	/*
1481	 * For datagrams the MOREDATA flag is used to set MSG_TRUNC.
1482	 * For non-datagrams MOREDATA is used to set MSG_EOR.
1483	 */
1484	ASSERT(!(rval.r_val1 & MORECTL));
1485	if ((rval.r_val1 & MOREDATA) && (so->so_mode & SM_ATOMIC))
1486		msg->msg_flags |= MSG_TRUNC;
1487	if (mctlp == NULL) {
1488		dprintso(so, 1, ("so_recvmsg: got M_DATA\n"));
1489
1490		mutex_enter(&so->so_lock);
1491		/* Set MSG_EOR based on MOREDATA */
1492		if (!(rval.r_val1 & MOREDATA)) {
1493			if (so->so_state & SS_SAVEDEOR) {
1494				msg->msg_flags |= MSG_EOR;
1495				so->so_state &= ~SS_SAVEDEOR;
1496			}
1497		}
1498		/*
1499		 * If some data was received (i.e. not EOF) and the
1500		 * read/recv* has not been satisfied wait for some more.
1501		 */
1502		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
1503		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
1504			mutex_exit(&so->so_lock);
1505			flags |= MSG_NOMARK;
1506			goto retry;
1507		}
1508
1509		goto out_locked;
1510	}
1511	/* so_queue_msg has already verified length and alignment */
1512	tpr = (union T_primitives *)mctlp->b_rptr;
1513	dprintso(so, 1, ("so_recvmsg: type %d\n", tpr->type));
1514	switch (tpr->type) {
1515	case T_DATA_IND: {
1516		/*
1517		 * Set msg_flags to MSG_EOR based on
1518		 * MORE_flag and MOREDATA.
1519		 */
1520		mutex_enter(&so->so_lock);
1521		so->so_state &= ~SS_SAVEDEOR;
1522		if (!(tpr->data_ind.MORE_flag & 1)) {
1523			if (!(rval.r_val1 & MOREDATA))
1524				msg->msg_flags |= MSG_EOR;
1525			else
1526				so->so_state |= SS_SAVEDEOR;
1527		}
1528		freemsg(mctlp);
1529		/*
1530		 * If some data was received (i.e. not EOF) and the
1531		 * read/recv* has not been satisfied wait for some more.
1532		 */
1533		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
1534		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
1535			mutex_exit(&so->so_lock);
1536			flags |= MSG_NOMARK;
1537			goto retry;
1538		}
1539		goto out_locked;
1540	}
1541	case T_UNITDATA_IND: {
1542		void *addr;
1543		t_uscalar_t addrlen;
1544		void *abuf;
1545		t_uscalar_t optlen;
1546		void *opt;
1547
1548		if (namelen != 0) {
1549			/* Caller wants source address */
1550			addrlen = tpr->unitdata_ind.SRC_length;
1551			addr = sogetoff(mctlp, tpr->unitdata_ind.SRC_offset,
1552			    addrlen, 1);
1553			if (addr == NULL) {
1554				freemsg(mctlp);
1555				error = EPROTO;
1556				eprintsoline(so, error);
1557				goto out;
1558			}
1559			ASSERT(so->so_family != AF_UNIX);
1560		}
1561		optlen = tpr->unitdata_ind.OPT_length;
1562		if (optlen != 0) {
1563			t_uscalar_t ncontrollen;
1564
1565			/*
1566			 * Extract any source address option.
1567			 * Determine how large cmsg buffer is needed.
1568			 */
1569			opt = sogetoff(mctlp, tpr->unitdata_ind.OPT_offset,
1570			    optlen, __TPI_ALIGN_SIZE);
1571
1572			if (opt == NULL) {
1573				freemsg(mctlp);
1574				error = EPROTO;
1575				eprintsoline(so, error);
1576				goto out;
1577			}
1578			if (so->so_family == AF_UNIX)
1579				so_getopt_srcaddr(opt, optlen, &addr, &addrlen);
1580			ncontrollen = so_cmsglen(mctlp, opt, optlen,
1581			    !(flags & MSG_XPG4_2));
1582			if (controllen != 0)
1583				controllen = ncontrollen;
1584			else if (ncontrollen != 0)
1585				msg->msg_flags |= MSG_CTRUNC;
1586		} else {
1587			controllen = 0;
1588		}
1589
1590		if (namelen != 0) {
1591			/*
1592			 * Return address to caller.
1593			 * Caller handles truncation if length
1594			 * exceeds msg_namelen.
1595			 * NOTE: AF_UNIX NUL termination is ensured by
1596			 * the sender's copyin_name().
1597			 */
1598			abuf = kmem_alloc(addrlen, KM_SLEEP);
1599
1600			bcopy(addr, abuf, addrlen);
1601			msg->msg_name = abuf;
1602			msg->msg_namelen = addrlen;
1603		}
1604
1605		if (controllen != 0) {
1606			/*
1607			 * Return control msg to caller.
1608			 * Caller handles truncation if length
1609			 * exceeds msg_controllen.
1610			 */
1611			control = kmem_zalloc(controllen, KM_SLEEP);
1612
1613			error = so_opt2cmsg(mctlp, opt, optlen,
1614			    !(flags & MSG_XPG4_2), control, controllen);
1615			if (error) {
1616				freemsg(mctlp);
1617				if (msg->msg_namelen != 0)
1618					kmem_free(msg->msg_name,
1619					    msg->msg_namelen);
1620				kmem_free(control, controllen);
1621				eprintsoline(so, error);
1622				goto out;
1623			}
1624			msg->msg_control = control;
1625			msg->msg_controllen = controllen;
1626		}
1627
1628		freemsg(mctlp);
1629		goto out;
1630	}
1631	case T_OPTDATA_IND: {
1632		struct T_optdata_req *tdr;
1633		void *opt;
1634		t_uscalar_t optlen;
1635
1636		tdr = (struct T_optdata_req *)mctlp->b_rptr;
1637		optlen = tdr->OPT_length;
1638		if (optlen != 0) {
1639			t_uscalar_t ncontrollen;
1640			/*
1641			 * Determine how large cmsg buffer is needed.
1642			 */
1643			opt = sogetoff(mctlp,
1644			    tpr->optdata_ind.OPT_offset, optlen,
1645			    __TPI_ALIGN_SIZE);
1646
1647			if (opt == NULL) {
1648				freemsg(mctlp);
1649				error = EPROTO;
1650				eprintsoline(so, error);
1651				goto out;
1652			}
1653
1654			ncontrollen = so_cmsglen(mctlp, opt, optlen,
1655			    !(flags & MSG_XPG4_2));
1656			if (controllen != 0)
1657				controllen = ncontrollen;
1658			else if (ncontrollen != 0)
1659				msg->msg_flags |= MSG_CTRUNC;
1660		} else {
1661			controllen = 0;
1662		}
1663
1664		if (controllen != 0) {
1665			/*
1666			 * Return control msg to caller.
1667			 * Caller handles truncation if length
1668			 * exceeds msg_controllen.
1669			 */
1670			control = kmem_zalloc(controllen, KM_SLEEP);
1671
1672			error = so_opt2cmsg(mctlp, opt, optlen,
1673			    !(flags & MSG_XPG4_2), control, controllen);
1674			if (error) {
1675				freemsg(mctlp);
1676				kmem_free(control, controllen);
1677				eprintsoline(so, error);
1678				goto out;
1679			}
1680			msg->msg_control = control;
1681			msg->msg_controllen = controllen;
1682		}
1683
1684		/*
1685		 * Set msg_flags to MSG_EOR based on
1686		 * DATA_flag and MOREDATA.
1687		 */
1688		mutex_enter(&so->so_lock);
1689		so->so_state &= ~SS_SAVEDEOR;
1690		if (!(tpr->data_ind.MORE_flag & 1)) {
1691			if (!(rval.r_val1 & MOREDATA))
1692				msg->msg_flags |= MSG_EOR;
1693			else
1694				so->so_state |= SS_SAVEDEOR;
1695		}
1696		freemsg(mctlp);
1697		/*
1698		 * If some data was received (i.e. not EOF) and the
1699		 * read/recv* has not been satisfied wait for some more.
1700		 * Not possible to wait if control info was received.
1701		 */
1702		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
1703		    controllen == 0 &&
1704		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
1705			mutex_exit(&so->so_lock);
1706			flags |= MSG_NOMARK;
1707			goto retry;
1708		}
1709		goto out_locked;
1710	}
1711	default:
1712		cmn_err(CE_CONT, "so_recvmsg bad type %x \n",
1713		    tpr->type);
1714		freemsg(mctlp);
1715		error = EPROTO;
1716		ASSERT(0);
1717	}
1718out:
1719	mutex_enter(&so->so_lock);
1720out_locked:
1721	ret = sod_rcv_done(so, suiop, uiop);
1722	if (ret != 0 && error == 0)
1723		error = ret;
1724
1725	so_unlock_read(so);	/* Clear SOREADLOCKED */
1726	mutex_exit(&so->so_lock);
1727
1728	SO_UNBLOCK_FALLBACK(so);
1729
1730	return (error);
1731}
1732
1733sonodeops_t so_sonodeops = {
1734	so_init,		/* sop_init	*/
1735	so_accept,		/* sop_accept   */
1736	so_bind,		/* sop_bind	*/
1737	so_listen,		/* sop_listen   */
1738	so_connect,		/* sop_connect  */
1739	so_recvmsg,		/* sop_recvmsg  */
1740	so_sendmsg,		/* sop_sendmsg  */
1741	so_sendmblk,		/* sop_sendmblk */
1742	so_getpeername,		/* sop_getpeername */
1743	so_getsockname,		/* sop_getsockname */
1744	so_shutdown,		/* sop_shutdown */
1745	so_getsockopt,		/* sop_getsockopt */
1746	so_setsockopt,		/* sop_setsockopt */
1747	so_ioctl,		/* sop_ioctl    */
1748	so_poll,		/* sop_poll	*/
1749	so_close,		/* sop_close */
1750};
1751
1752sock_upcalls_t so_upcalls = {
1753	so_newconn,
1754	so_connected,
1755	so_disconnected,
1756	so_opctl,
1757	so_queue_msg,
1758	so_set_prop,
1759	so_txq_full,
1760	so_signal_oob,
1761	so_zcopy_notify,
1762	so_set_error
1763};
1764