/*	$NetBSD: uipc_socket2.c,v 1.103 2009/07/24 01:09:49 christos Exp $	*/

/*-
 * Copyright (c) 2008 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket2.c	8.2 (Berkeley) 2/14/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uipc_socket2.c,v 1.103 2009/07/24 01:09:49 christos Exp $");

#include "opt_mbuftrace.h"
#include "opt_sb_max.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/buf.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/domain.h>
#include <sys/poll.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/kauth.h>
#include <sys/pool.h>
#include <sys/uidinfo.h>

/*
 * Primitive routines for operating on sockets and socket buffers.
 *
 * Locking rules and assumptions:
 *
 * o socket::so_lock can change on the fly.  The low level routines used
 *   to lock sockets are aware of this.  When so_lock is acquired, the
 *   acquiring routine must check to see if so_lock still points to the
 *   lock that was acquired.  If so_lock has changed in the meantime, the
 *   now irrelevant lock that was acquired must be dropped and the lock
 *   operation retried.  Although not proven here, this is completely safe
 *   on a multiprocessor system, even with relaxed memory ordering, given
 *   the next two rules:
 *
 * o In order to mutate so_lock, the lock pointed to by the current value
 *   of so_lock must be held: i.e., the socket must be held locked by the
 *   changing thread.  The thread must issue membar_exit() to prevent
 *   memory accesses being reordered, and can then set so_lock to the
 *   desired value.  If the lock pointed to by the new value of so_lock is
 *   not held by the changing thread, the socket must then be considered
 *   unlocked.
 *
 * o If so_lock is mutated, and the previous lock referred to by so_lock
 *   could still be visible to other threads in the system (e.g. via file
 *   descriptor or protocol-internal reference), then the old lock must
 *   remain valid until the socket and/or protocol control block has been
 *   torn down.
 *
 * o If a socket has a non-NULL so_head value (i.e. is in the process of
 *   connecting), then locking the socket must also lock the socket pointed
 *   to by so_head: their lock pointers must match.
 *
 * o If a socket has connections in progress (so_q, so_q0 not empty) then
 *   locking the socket must also lock the sockets attached to both queues.
 *   Again, their lock pointers must match.
 *
 * o Beyond the initial lock assignment in socreate(), assigning locks to
 *   sockets is the responsibility of the individual protocols / protocol
 *   domains.
 */
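
/*
 * A minimal sketch of the retry pattern these rules imply, assuming a
 * caller that wants the socket locked (the real entry points live in
 * the socket layer headers; solockretry() is defined later in this
 * file):
 *
 *	lock = so->so_lock;
 *	mutex_enter(lock);
 *	if (__predict_false(lock != so->so_lock))
 *		solockretry(so, lock);
 */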

static pool_cache_t socket_cache;

u_long	sb_max = SB_MAX;	/* maximum socket buffer size */
static u_long sb_max_adj;	/* adjusted sb_max */

/*
 * Procedures to manipulate state flags of socket
 * and do appropriate wakeups.  Normal sequence from the
 * active (originating) side is that soisconnecting() is
 * called during processing of connect() call,
 * resulting in an eventual call to soisconnected() if/when the
 * connection is established.  When the connection is torn down
 * soisdisconnecting() is called during processing of disconnect() call,
 * and soisdisconnected() is called when the connection to the peer
 * is totally severed.  The semantics of these routines are such that
 * connectionless protocols can call soisconnected() and soisdisconnected()
 * only, bypassing the in-progress calls when setting up a ``connection''
 * takes no time.
 *
 * From the passive side, a socket is created with
 * two queues of sockets: so_q0 for connections in progress
 * and so_q for connections already made and awaiting user acceptance.
 * As a protocol is preparing incoming connections, it creates a socket
 * structure queued on so_q0 by calling sonewconn().  When the connection
 * is established, soisconnected() is called, and transfers the
 * socket structure to so_q, making it available to accept().
 *
 * If a socket is closed with sockets on either
 * so_q0 or so_q, these sockets are dropped.
 *
 * If higher level protocols are implemented in
 * the kernel, the wakeups done here will sometimes
 * cause software-interrupt process scheduling.
 */
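
/*
 * For example (sketch only; details vary by protocol), a
 * connection-oriented protocol's input path drives the passive-side
 * sequence described above roughly as:
 *
 *	so = sonewconn(head, 0);	(queue embryo socket on so_q0)
 *	... protocol handshake completes ...
 *	soisconnected(so);		(move socket to so_q for accept())
 */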

void
soisconnecting(struct socket *so)
{

	KASSERT(solocked(so));

	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= SS_ISCONNECTING;
}

void
soisconnected(struct socket *so)
{
	struct socket	*head;

	head = so->so_head;

	KASSERT(solocked(so));
	KASSERT(head == NULL || solocked2(so, head));

	so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
	so->so_state |= SS_ISCONNECTED;
	if (head && so->so_onq == &head->so_q0) {
		if ((so->so_options & SO_ACCEPTFILTER) == 0) {
			soqremque(so, 0);
			soqinsque(head, so, 1);
			sorwakeup(head);
			cv_broadcast(&head->so_cv);
		} else {
			so->so_upcall =
			    head->so_accf->so_accept_filter->accf_callback;
			so->so_upcallarg = head->so_accf->so_accept_filter_arg;
			so->so_rcv.sb_flags |= SB_UPCALL;
			so->so_options &= ~SO_ACCEPTFILTER;
			(*so->so_upcall)(so, so->so_upcallarg, M_DONTWAIT);
		}
	} else {
		cv_broadcast(&so->so_cv);
		sorwakeup(so);
		sowwakeup(so);
	}
}

void
soisdisconnecting(struct socket *so)
{

	KASSERT(solocked(so));

	so->so_state &= ~SS_ISCONNECTING;
	so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE);
	cv_broadcast(&so->so_cv);
	sowwakeup(so);
	sorwakeup(so);
}

void
soisdisconnected(struct socket *so)
{

	KASSERT(solocked(so));

	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_ISDISCONNECTED);
	cv_broadcast(&so->so_cv);
	sowwakeup(so);
	sorwakeup(so);
}

void
soinit2(void)
{

	socket_cache = pool_cache_init(sizeof(struct socket), 0, 0, 0,
	    "socket", NULL, IPL_SOFTNET, NULL, NULL, NULL);
}

/*
 * When an attempt at a new connection is noted on a socket
 * which accepts connections, sonewconn is called.  If the
 * connection is possible (subject to space constraints, etc.)
 * then we allocate a new structure, properly linked into the
 * data structure of the original socket, and return this.
 * Connstatus may be 0, SS_ISCONFIRMING, or SS_ISCONNECTED.
 */
struct socket *
sonewconn(struct socket *head, int connstatus)
{
	struct socket	*so;
	int		soqueue, error;

	KASSERT(connstatus == 0 || connstatus == SS_ISCONFIRMING ||
	    connstatus == SS_ISCONNECTED);
	KASSERT(solocked(head));

	if ((head->so_options & SO_ACCEPTFILTER) != 0)
		connstatus = 0;
	soqueue = connstatus ? 1 : 0;
	if (head->so_qlen + head->so_q0len > 3 * head->so_qlimit / 2)
		return NULL;
	so = soget(false);
	if (so == NULL)
		return NULL;
	mutex_obj_hold(head->so_lock);
	so->so_lock = head->so_lock;
	so->so_type = head->so_type;
	so->so_options = head->so_options &~ SO_ACCEPTCONN;
	so->so_linger = head->so_linger;
	so->so_state = head->so_state | SS_NOFDREF;
	so->so_nbio = head->so_nbio;
	so->so_proto = head->so_proto;
	so->so_timeo = head->so_timeo;
	so->so_pgid = head->so_pgid;
	so->so_send = head->so_send;
	so->so_receive = head->so_receive;
	so->so_uidinfo = head->so_uidinfo;
	so->so_egid = head->so_egid;
	so->so_cpid = head->so_cpid;
#ifdef MBUFTRACE
	so->so_mowner = head->so_mowner;
	so->so_rcv.sb_mowner = head->so_rcv.sb_mowner;
	so->so_snd.sb_mowner = head->so_snd.sb_mowner;
#endif
	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat) != 0)
		goto out;
	so->so_snd.sb_lowat = head->so_snd.sb_lowat;
	so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
	so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
	so->so_snd.sb_timeo = head->so_snd.sb_timeo;
	so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
	so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
	soqinsque(head, so, soqueue);
	error = (*so->so_proto->pr_usrreq)(so, PRU_ATTACH, NULL, NULL,
	    NULL, NULL);
	KASSERT(solocked(so));
	if (error != 0) {
		(void) soqremque(so, soqueue);
out:
		/*
		 * Remove accept filter if one is present.
		 * XXX Is this really needed?
		 */
		if (so->so_accf != NULL)
			(void)accept_filt_clear(so);
		soput(so);
		return NULL;
	}
	if (connstatus) {
		sorwakeup(head);
		cv_broadcast(&head->so_cv);
		so->so_state |= connstatus;
	}
	return so;
}

struct socket *
soget(bool waitok)
{
	struct socket *so;

	so = pool_cache_get(socket_cache, (waitok ? PR_WAITOK : PR_NOWAIT));
	if (__predict_false(so == NULL))
		return (NULL);
	memset(so, 0, sizeof(*so));
	TAILQ_INIT(&so->so_q0);
	TAILQ_INIT(&so->so_q);
	cv_init(&so->so_cv, "socket");
	cv_init(&so->so_rcv.sb_cv, "netio");
	cv_init(&so->so_snd.sb_cv, "netio");
	selinit(&so->so_rcv.sb_sel);
	selinit(&so->so_snd.sb_sel);
	so->so_rcv.sb_so = so;
	so->so_snd.sb_so = so;
	return so;
}

void
soput(struct socket *so)
{

	KASSERT(!cv_has_waiters(&so->so_cv));
	KASSERT(!cv_has_waiters(&so->so_rcv.sb_cv));
	KASSERT(!cv_has_waiters(&so->so_snd.sb_cv));
	seldestroy(&so->so_rcv.sb_sel);
	seldestroy(&so->so_snd.sb_sel);
	mutex_obj_free(so->so_lock);
	cv_destroy(&so->so_cv);
	cv_destroy(&so->so_rcv.sb_cv);
	cv_destroy(&so->so_snd.sb_cv);
	pool_cache_put(socket_cache, so);
}

void
soqinsque(struct socket *head, struct socket *so, int q)
{

	KASSERT(solocked2(head, so));

#ifdef DIAGNOSTIC
	if (so->so_onq != NULL)
		panic("soqinsque");
#endif

	so->so_head = head;
	if (q == 0) {
		head->so_q0len++;
		so->so_onq = &head->so_q0;
	} else {
		head->so_qlen++;
		so->so_onq = &head->so_q;
	}
	TAILQ_INSERT_TAIL(so->so_onq, so, so_qe);
}

int
soqremque(struct socket *so, int q)
{
	struct socket	*head;

	head = so->so_head;

	KASSERT(solocked(so));
	if (q == 0) {
		if (so->so_onq != &head->so_q0)
			return (0);
		head->so_q0len--;
	} else {
		if (so->so_onq != &head->so_q)
			return (0);
		head->so_qlen--;
	}
	KASSERT(solocked2(so, head));
	TAILQ_REMOVE(so->so_onq, so, so_qe);
	so->so_onq = NULL;
	so->so_head = NULL;
	return (1);
}

/*
 * Socantsendmore indicates that no more data will be sent on the
 * socket; it would normally be applied to a socket when the user
 * informs the system that no more data is to be sent, by the protocol
 * code (in case PRU_SHUTDOWN).  Socantrcvmore indicates that no more data
 * will be received, and will normally be applied to the socket by a
 * protocol when it detects that the peer will send no more data.
 * Data queued for reading in the socket may yet be read.
 */

void
socantsendmore(struct socket *so)
{

	KASSERT(solocked(so));

	so->so_state |= SS_CANTSENDMORE;
	sowwakeup(so);
}

void
socantrcvmore(struct socket *so)
{

	KASSERT(solocked(so));

	so->so_state |= SS_CANTRCVMORE;
	sorwakeup(so);
}

/*
 * Wait for data to arrive at/drain from a socket buffer.
 */
int
sbwait(struct sockbuf *sb)
{
	struct socket *so;
	kmutex_t *lock;
	int error;

	so = sb->sb_so;

	KASSERT(solocked(so));

	sb->sb_flags |= SB_NOTIFY;
	lock = so->so_lock;
	if ((sb->sb_flags & SB_NOINTR) != 0)
		error = cv_timedwait(&sb->sb_cv, lock, sb->sb_timeo);
	else
		error = cv_timedwait_sig(&sb->sb_cv, lock, sb->sb_timeo);
	if (__predict_false(lock != so->so_lock))
		solockretry(so, lock);
	return error;
}
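
/*
 * Callers of sbwait() typically loop on the buffer state while holding
 * the socket lock, e.g. (sketch of a receive-side wait):
 *
 *	while (sb->sb_cc < sb->sb_lowat) {
 *		error = sbwait(sb);
 *		if (error != 0)
 *			break;
 *	}
 */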

/*
 * Wakeup processes waiting on a socket buffer.
 * Do asynchronous notification via SIGIO
 * if the socket buffer has the SB_ASYNC flag set.
 */
void
sowakeup(struct socket *so, struct sockbuf *sb, int code)
{
	int band;

	KASSERT(solocked(so));
	KASSERT(sb->sb_so == so);

	if (code == POLL_IN)
		band = POLLIN|POLLRDNORM;
	else
		band = POLLOUT|POLLWRNORM;
	sb->sb_flags &= ~SB_NOTIFY;
	selnotify(&sb->sb_sel, band, NOTE_SUBMIT);
	cv_broadcast(&sb->sb_cv);
	if (sb->sb_flags & SB_ASYNC)
		fownsignal(so->so_pgid, SIGIO, code, band, so);
	if (sb->sb_flags & SB_UPCALL)
		(*so->so_upcall)(so, so->so_upcallarg, M_DONTWAIT);
}

/*
 * Reset a socket's lock pointer.  Wake all threads waiting on the
 * socket's condition variables so that they can restart their waits
 * using the new lock.  The existing lock must be held.
 */
void
solockreset(struct socket *so, kmutex_t *lock)
{

	KASSERT(solocked(so));

	so->so_lock = lock;
	cv_broadcast(&so->so_snd.sb_cv);
	cv_broadcast(&so->so_rcv.sb_cv);
	cv_broadcast(&so->so_cv);
}

/*
 * Socket buffer (struct sockbuf) utility routines.
 *
 * Each socket contains two socket buffers: one for sending data and
 * one for receiving data.  Each buffer contains a queue of mbufs,
 * information about the number of mbufs and amount of data in the
 * queue, and other fields allowing poll() statements and notification
 * on data availability to be implemented.
 *
 * Data stored in a socket buffer is maintained as a list of records.
 * Each record is a list of mbufs chained together with the m_next
 * field.  Records are chained together with the m_nextpkt field. The upper
 * level routine soreceive() expects the following conventions to be
 * observed when placing information in the receive buffer:
 *
 * 1. If the protocol requires each message be preceded by the sender's
 *    name, then a record containing that name must be present before
 *    any associated data (mbuf's must be of type MT_SONAME).
 * 2. If the protocol supports the exchange of ``access rights'' (really
 *    just additional data associated with the message), and there are
 *    ``rights'' to be received, then a record containing this data
 *    should be present (mbuf's must be of type MT_CONTROL).
 * 3. If a name or rights record exists, then it must be followed by
 *    a data record, perhaps of zero length.
 *
 * Before using a new socket structure it is first necessary to reserve
 * buffer space to the socket, by calling sbreserve().  This should commit
 * some of the available buffer space in the system buffer pool for the
 * socket (currently, it does nothing but enforce limits).  The space
 * should be released by calling sbrelease() when the socket is destroyed.
 */
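
/*
 * As an illustration of the conventions above, a datagram queued with
 * sbappendaddr() below forms a single record laid out as:
 *
 *	sb_mb -> MT_SONAME -> MT_CONTROL -> MT_DATA -> ...	(m_next)
 *	   |
 *	   +-> next record					(m_nextpkt)
 */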

int
sb_max_set(u_long new_sbmax)
{
	int s;

	if (new_sbmax < (16 * 1024))
		return (EINVAL);

	s = splsoftnet();
	sb_max = new_sbmax;
	sb_max_adj = (u_quad_t)new_sbmax * MCLBYTES / (MSIZE + MCLBYTES);
	splx(s);

	return (0);
}
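
/*
 * The adjustment above discounts per-mbuf overhead: each MCLBYTES
 * cluster of data also costs one MSIZE mbuf header.  For example, on
 * a port with MSIZE = 256 and MCLBYTES = 2048 (values vary by
 * architecture):
 *
 *	sb_max_adj = sb_max * 2048 / (256 + 2048) = sb_max * 8 / 9
 *
 * i.e. roughly 11% of sb_max is set aside for mbuf headers.
 */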

int
soreserve(struct socket *so, u_long sndcc, u_long rcvcc)
{

	KASSERT(so->so_lock == NULL || solocked(so));

	/*
	 * There's at least one application (the configure script of screen)
	 * which expects a fifo to be writable even if it has "some" bytes
	 * in its buffer.
	 * So we want to make sure (hiwat - lowat) >= (some bytes).
	 *
	 * PIPE_BUF here is an arbitrary value chosen as (some bytes) above.
	 * We expect it's large enough for such applications.
	 */
	u_long  lowat = MAX(sock_loan_thresh, MCLBYTES);
	u_long  hiwat = lowat + PIPE_BUF;

	if (sndcc < hiwat)
		sndcc = hiwat;
	if (sbreserve(&so->so_snd, sndcc, so) == 0)
		goto bad;
	if (sbreserve(&so->so_rcv, rcvcc, so) == 0)
		goto bad2;
	if (so->so_rcv.sb_lowat == 0)
		so->so_rcv.sb_lowat = 1;
	if (so->so_snd.sb_lowat == 0)
		so->so_snd.sb_lowat = lowat;
	if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
		so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
	return (0);
 bad2:
	sbrelease(&so->so_snd, so);
 bad:
	return (ENOBUFS);
}

/*
 * Allot mbufs to a sockbuf.
 * Attempt to scale mbmax so that mbcnt doesn't become limiting
 * if buffering efficiency is near the normal case.
 */
int
sbreserve(struct sockbuf *sb, u_long cc, struct socket *so)
{
	struct lwp *l = curlwp; /* XXX */
	rlim_t maxcc;
	struct uidinfo *uidinfo;

	KASSERT(so->so_lock == NULL || solocked(so));
	KASSERT(sb->sb_so == so);
	KASSERT(sb_max_adj != 0);

	if (cc == 0 || cc > sb_max_adj)
		return (0);

	if (kauth_cred_geteuid(l->l_cred) == so->so_uidinfo->ui_uid)
		maxcc = l->l_proc->p_rlimit[RLIMIT_SBSIZE].rlim_cur;
	else
		maxcc = RLIM_INFINITY;

	uidinfo = so->so_uidinfo;
	if (!chgsbsize(uidinfo, &sb->sb_hiwat, cc, maxcc))
		return 0;
	sb->sb_mbmax = min(cc * 2, sb_max);
	if (sb->sb_lowat > sb->sb_hiwat)
		sb->sb_lowat = sb->sb_hiwat;
	return (1);
}
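
/*
 * Note that sb_mbmax = min(cc * 2, sb_max) permits up to twice the
 * data high-water mark in mbuf storage; for example, a 32 KB
 * reservation tolerates 64 KB of mbuf space, i.e. a buffering
 * efficiency of 50% before mbcnt becomes the limiting factor.
 */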

/*
 * Free mbufs held by a socket, and reserved mbuf space.  We do not assert
 * that the socket is held locked here: see sorflush().
 */
void
sbrelease(struct sockbuf *sb, struct socket *so)
{

	KASSERT(sb->sb_so == so);

	sbflush(sb);
	(void)chgsbsize(so->so_uidinfo, &sb->sb_hiwat, 0, RLIM_INFINITY);
	sb->sb_mbmax = 0;
}

/*
 * Routines to add and remove
 * data from an mbuf queue.
 *
 * The routines sbappend() or sbappendrecord() are normally called to
 * append new mbufs to a socket buffer, after checking that adequate
 * space is available, comparing the function sbspace() with the amount
 * of data to be added.  sbappendrecord() differs from sbappend() in
 * that data supplied is treated as the beginning of a new record.
 * To place a sender's address, optional access rights, and data in a
 * socket receive buffer, sbappendaddr() should be used.  To place
 * access rights and data in a socket receive buffer, sbappendrights()
 * should be used.  In either case, the new data begins a new record.
 * Note that unlike sbappend() and sbappendrecord(), these routines check
 * for the caller that there will be enough space to store the data.
 * Each fails if there is not enough space, or if it cannot find mbufs
 * to store additional information in.
 *
 * Reliable protocols may use the socket send buffer to hold data
 * awaiting acknowledgement.  Data is normally copied from a socket
 * send buffer in a protocol with m_copy for output to a peer, and the
 * data is then removed from the socket buffer with sbdrop() or
 * sbdroprecord() when it is acknowledged by the peer.
 */
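
/*
 * Sketched out, a reliable protocol's use of the send buffer
 * therefore looks roughly like:
 *
 *	sbappend(&so->so_snd, m);			(queue new data)
 *	n = m_copy(so->so_snd.sb_mb, off, len);		(copy for output)
 *	... peer acknowledges 'acked' bytes ...
 *	sbdrop(&so->so_snd, acked);			(drop acked data)
 */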

#ifdef SOCKBUF_DEBUG
void
sblastrecordchk(struct sockbuf *sb, const char *where)
{
	struct mbuf *m = sb->sb_mb;

	KASSERT(solocked(sb->sb_so));

	while (m && m->m_nextpkt)
		m = m->m_nextpkt;

	if (m != sb->sb_lastrecord) {
		printf("sblastrecordchk: sb_mb %p sb_lastrecord %p last %p\n",
		    sb->sb_mb, sb->sb_lastrecord, m);
		printf("packet chain:\n");
		for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt)
			printf("\t%p\n", m);
		panic("sblastrecordchk from %s", where);
	}
}

void
sblastmbufchk(struct sockbuf *sb, const char *where)
{
	struct mbuf *m = sb->sb_mb;
	struct mbuf *n;

	KASSERT(solocked(sb->sb_so));

	while (m && m->m_nextpkt)
		m = m->m_nextpkt;

	while (m && m->m_next)
		m = m->m_next;

	if (m != sb->sb_mbtail) {
		printf("sblastmbufchk: sb_mb %p sb_mbtail %p last %p\n",
		    sb->sb_mb, sb->sb_mbtail, m);
		printf("packet tree:\n");
		for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) {
			printf("\t");
			for (n = m; n != NULL; n = n->m_next)
				printf("%p ", n);
			printf("\n");
		}
		panic("sblastmbufchk from %s", where);
	}
}
#endif /* SOCKBUF_DEBUG */

/*
 * Link a chain of records onto a socket buffer
 */
#define	SBLINKRECORDCHAIN(sb, m0, mlast)				\
do {									\
	if ((sb)->sb_lastrecord != NULL)				\
		(sb)->sb_lastrecord->m_nextpkt = (m0);			\
	else								\
		(sb)->sb_mb = (m0);					\
	(sb)->sb_lastrecord = (mlast);					\
} while (/*CONSTCOND*/0)


#define	SBLINKRECORD(sb, m0)						\
    SBLINKRECORDCHAIN(sb, m0, m0)

/*
 * Append mbuf chain m to the last record in the
 * socket buffer sb.  The additional space associated with
 * the mbuf chain is recorded in sb.  Empty mbufs are
 * discarded and mbufs are compacted where possible.
 */
void
sbappend(struct sockbuf *sb, struct mbuf *m)
{
	struct mbuf	*n;

	KASSERT(solocked(sb->sb_so));

	if (m == 0)
		return;

#ifdef MBUFTRACE
	m_claimm(m, sb->sb_mowner);
#endif

	SBLASTRECORDCHK(sb, "sbappend 1");

	if ((n = sb->sb_lastrecord) != NULL) {
		/*
		 * XXX Would like to simply use sb_mbtail here, but
		 * XXX I need to verify that I won't miss an EOR that
		 * XXX way.
		 */
		do {
			if (n->m_flags & M_EOR) {
				sbappendrecord(sb, m); /* XXXXXX!!!! */
				return;
			}
		} while (n->m_next && (n = n->m_next));
	} else {
		/*
		 * If this is the first record in the socket buffer, it's
		 * also the last record.
		 */
		sb->sb_lastrecord = m;
	}
	sbcompress(sb, m, n);
	SBLASTRECORDCHK(sb, "sbappend 2");
}

/*
 * This version of sbappend() should only be used when the caller
 * absolutely knows that there will never be more than one record
 * in the socket buffer, that is, a stream protocol (such as TCP).
 */
void
sbappendstream(struct sockbuf *sb, struct mbuf *m)
{

	KASSERT(solocked(sb->sb_so));
	KDASSERT(m->m_nextpkt == NULL);
	KASSERT(sb->sb_mb == sb->sb_lastrecord);

	SBLASTMBUFCHK(sb, __func__);

#ifdef MBUFTRACE
	m_claimm(m, sb->sb_mowner);
#endif

	sbcompress(sb, m, sb->sb_mbtail);

	sb->sb_lastrecord = sb->sb_mb;
	SBLASTRECORDCHK(sb, __func__);
}
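
/*
 * For example (sketch), TCP can queue outgoing data with
 *
 *	sbappendstream(&so->so_snd, m);
 *
 * because its send buffer always holds exactly one record.
 */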

#ifdef SOCKBUF_DEBUG
void
sbcheck(struct sockbuf *sb)
{
	struct mbuf	*m, *m2;
	u_long		len, mbcnt;

	KASSERT(solocked(sb->sb_so));

	len = 0;
	mbcnt = 0;
	for (m = sb->sb_mb; m; m = m->m_nextpkt) {
		for (m2 = m; m2 != NULL; m2 = m2->m_next) {
			len += m2->m_len;
			mbcnt += MSIZE;
			if (m2->m_flags & M_EXT)
				mbcnt += m2->m_ext.ext_size;
			if (m2->m_nextpkt != NULL)
				panic("sbcheck nextpkt");
		}
	}
	if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
		printf("cc %lu != %lu || mbcnt %lu != %lu\n", len, sb->sb_cc,
		    mbcnt, sb->sb_mbcnt);
		panic("sbcheck");
	}
}
#endif

/*
 * As above, except the mbuf chain
 * begins a new record.
 */
void
sbappendrecord(struct sockbuf *sb, struct mbuf *m0)
{
	struct mbuf	*m;

	KASSERT(solocked(sb->sb_so));

	if (m0 == 0)
		return;

#ifdef MBUFTRACE
	m_claimm(m0, sb->sb_mowner);
#endif
	/*
	 * Put the first mbuf on the queue.
	 * Note this permits zero length records.
	 */
	sballoc(sb, m0);
	SBLASTRECORDCHK(sb, "sbappendrecord 1");
	SBLINKRECORD(sb, m0);
	m = m0->m_next;
	m0->m_next = 0;
	if (m && (m0->m_flags & M_EOR)) {
		m0->m_flags &= ~M_EOR;
		m->m_flags |= M_EOR;
	}
	sbcompress(sb, m, m0);
	SBLASTRECORDCHK(sb, "sbappendrecord 2");
}

/*
 * As above except that OOB data
 * is inserted at the beginning of the sockbuf,
 * but after any other OOB data.
 */
void
sbinsertoob(struct sockbuf *sb, struct mbuf *m0)
{
	struct mbuf	*m, **mp;

	KASSERT(solocked(sb->sb_so));

	if (m0 == 0)
		return;

	SBLASTRECORDCHK(sb, "sbinsertoob 1");

	for (mp = &sb->sb_mb; (m = *mp) != NULL; mp = &((*mp)->m_nextpkt)) {
	    again:
		switch (m->m_type) {

		case MT_OOBDATA:
			continue;		/* WANT next train */

		case MT_CONTROL:
			if ((m = m->m_next) != NULL)
				goto again;	/* inspect THIS train further */
		}
		break;
	}
	/*
	 * Put the first mbuf on the queue.
	 * Note this permits zero length records.
	 */
	sballoc(sb, m0);
	m0->m_nextpkt = *mp;
	if (*mp == NULL) {
		/* m0 is actually the new tail */
		sb->sb_lastrecord = m0;
	}
	*mp = m0;
	m = m0->m_next;
	m0->m_next = 0;
	if (m && (m0->m_flags & M_EOR)) {
		m0->m_flags &= ~M_EOR;
		m->m_flags |= M_EOR;
	}
	sbcompress(sb, m, m0);
	SBLASTRECORDCHK(sb, "sbinsertoob 2");
}

/*
 * Append address and data, and optionally, control (ancillary) data
 * to the receive queue of a socket.  If present,
 * m0 must include a packet header with total length.
 * Returns 0 if no space in sockbuf or insufficient mbufs.
 */
int
sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0,
	struct mbuf *control)
{
	struct mbuf	*m, *n, *nlast;
	int		space, len;

	KASSERT(solocked(sb->sb_so));

	space = asa->sa_len;

	if (m0 != NULL) {
		if ((m0->m_flags & M_PKTHDR) == 0)
			panic("sbappendaddr");
		space += m0->m_pkthdr.len;
#ifdef MBUFTRACE
		m_claimm(m0, sb->sb_mowner);
#endif
	}
	for (n = control; n; n = n->m_next) {
		space += n->m_len;
		MCLAIM(n, sb->sb_mowner);
		if (n->m_next == 0)	/* keep pointer to last control buf */
			break;
	}
	if (space > sbspace(sb))
		return (0);
	MGET(m, M_DONTWAIT, MT_SONAME);
	if (m == 0)
		return (0);
	MCLAIM(m, sb->sb_mowner);
	/*
	 * XXX avoid 'comparison always true' warning which isn't easily
	 * avoided.
	 */
	len = asa->sa_len;
	if (len > MLEN) {
		MEXTMALLOC(m, asa->sa_len, M_NOWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return (0);
		}
	}
	m->m_len = asa->sa_len;
	memcpy(mtod(m, void *), asa, asa->sa_len);
	if (n)
		n->m_next = m0;		/* concatenate data to control */
	else
		control = m0;
	m->m_next = control;

	SBLASTRECORDCHK(sb, "sbappendaddr 1");

	for (n = m; n->m_next != NULL; n = n->m_next)
		sballoc(sb, n);
	sballoc(sb, n);
	nlast = n;
	SBLINKRECORD(sb, m);

	sb->sb_mbtail = nlast;
	SBLASTMBUFCHK(sb, "sbappendaddr");
	SBLASTRECORDCHK(sb, "sbappendaddr 2");

	return (1);
}

/*
 * Helper for sbappendaddrchain: prepend a struct sockaddr* to
 * an mbuf chain.
 */
static inline struct mbuf *
m_prepend_sockaddr(struct sockbuf *sb, struct mbuf *m0,
		   const struct sockaddr *asa)
{
	struct mbuf *m;
	const int salen = asa->sa_len;

	KASSERT(solocked(sb->sb_so));

	/* only the first in each chain need be a pkthdr */
	MGETHDR(m, M_DONTWAIT, MT_SONAME);
	if (m == 0)
		return (0);
	MCLAIM(m, sb->sb_mowner);
#ifdef notyet
	if (salen > MHLEN) {
		MEXTMALLOC(m, salen, M_NOWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return (0);
		}
	}
#else
	KASSERT(salen <= MHLEN);
#endif
	m->m_len = salen;
	memcpy(mtod(m, void *), asa, salen);
	m->m_next = m0;
	m->m_pkthdr.len = salen + m0->m_pkthdr.len;

	return m;
}

int
sbappendaddrchain(struct sockbuf *sb, const struct sockaddr *asa,
		  struct mbuf *m0, int sbprio)
{
	int space;
	struct mbuf *m, *n, *n0, *nlast;
	int error;

	KASSERT(solocked(sb->sb_so));

	/*
	 * XXX sbprio reserved for encoding priority of this request:
	 *  SB_PRIO_NONE --> honour normal sb limits
	 *  SB_PRIO_ONESHOT_OVERFLOW --> if socket has any space,
	 *	take whole chain.  Intended for large requests
	 *	that should be delivered atomically (all, or none).
	 *  SB_PRIO_OVERDRAFT --> allow a small (2*MLEN) overflow
	 *	over normal socket limits, for messages indicating
	 *	buffer overflow in earlier normal/lower-priority messages.
	 *  SB_PRIO_BESTEFFORT --> ignore limits entirely.
	 *	Intended for kernel-generated messages only.
	 *	Up to the generator to avoid total mbuf resource exhaustion.
	 */
	(void)sbprio;

	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
		panic("sbappendaddrchain");

	space = sbspace(sb);

#ifdef notyet
	/*
	 * Enforce SB_PRIO_* limits as described above.
	 */
#endif

	n0 = NULL;
	nlast = NULL;
	for (m = m0; m; m = m->m_nextpkt) {
		struct mbuf *np;

#ifdef MBUFTRACE
		m_claimm(m, sb->sb_mowner);
#endif

		/* Prepend sockaddr to this record (m) of input chain m0 */
		n = m_prepend_sockaddr(sb, m, asa);
		if (n == NULL) {
			error = ENOBUFS;
			goto bad;
		}

		/* Append record (asa+m) to end of new chain n0 */
		if (n0 == NULL) {
			n0 = n;
		} else {
			nlast->m_nextpkt = n;
		}
		/* Keep track of last record on new chain */
		nlast = n;

		for (np = n; np; np = np->m_next)
			sballoc(sb, np);
	}

	SBLASTRECORDCHK(sb, "sbappendaddrchain 1");

	/* Drop the entire chain of (asa+m) records onto the socket */
	SBLINKRECORDCHAIN(sb, n0, nlast);

	SBLASTRECORDCHK(sb, "sbappendaddrchain 2");

	for (m = nlast; m->m_next; m = m->m_next)
		;
	sb->sb_mbtail = m;
	SBLASTMBUFCHK(sb, "sbappendaddrchain");

	return (1);

bad:
	/*
	 * On error, free the prepended addresses.  For consistency
	 * with sbappendaddr(), leave it to our caller to free
	 * the input record chain passed to us as m0.
	 */
	while ((n = n0) != NULL) {
		struct mbuf *np;

		/* Undo the sballoc() of this record */
		for (np = n; np; np = np->m_next)
			sbfree(sb, np);

		n0 = n->m_nextpkt;	/* iterate at next prepended address */
		MFREE(n, np);		/* free prepended address (not data) */
	}
	return 0;
}


int
sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control)
{
	struct mbuf	*m, *mlast, *n;
	int		space;

	KASSERT(solocked(sb->sb_so));

	space = 0;
	if (control == 0)
		panic("sbappendcontrol");
	for (m = control; ; m = m->m_next) {
		space += m->m_len;
		MCLAIM(m, sb->sb_mowner);
		if (m->m_next == 0)
			break;
	}
	n = m;			/* save pointer to last control buffer */
	for (m = m0; m; m = m->m_next) {
		MCLAIM(m, sb->sb_mowner);
		space += m->m_len;
	}
	if (space > sbspace(sb))
		return (0);
	n->m_next = m0;			/* concatenate data to control */

	SBLASTRECORDCHK(sb, "sbappendcontrol 1");

	for (m = control; m->m_next != NULL; m = m->m_next)
		sballoc(sb, m);
	sballoc(sb, m);
	mlast = m;
	SBLINKRECORD(sb, control);

	sb->sb_mbtail = mlast;
	SBLASTMBUFCHK(sb, "sbappendcontrol");
	SBLASTRECORDCHK(sb, "sbappendcontrol 2");

	return (1);
}

/*
 * Compress mbuf chain m into the socket
 * buffer sb following mbuf n.  If n
 * is null, the buffer is presumed empty.
 */
void
sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n)
{
	int		eor;
	struct mbuf	*o;

	KASSERT(solocked(sb->sb_so));

	eor = 0;
	while (m) {
		eor |= m->m_flags & M_EOR;
		if (m->m_len == 0 &&
		    (eor == 0 ||
		     (((o = m->m_next) || (o = n)) &&
		      o->m_type == m->m_type))) {
			if (sb->sb_lastrecord == m)
				sb->sb_lastrecord = m->m_next;
			m = m_free(m);
			continue;
		}
		if (n && (n->m_flags & M_EOR) == 0 &&
		    /* M_TRAILINGSPACE() checks buffer writeability */
		    m->m_len <= MCLBYTES / 4 && /* XXX Don't copy too much */
		    m->m_len <= M_TRAILINGSPACE(n) &&
		    n->m_type == m->m_type) {
			memcpy(mtod(n, char *) + n->m_len, mtod(m, void *),
			    (unsigned)m->m_len);
			n->m_len += m->m_len;
			sb->sb_cc += m->m_len;
			m = m_free(m);
			continue;
		}
		if (n)
			n->m_next = m;
		else
			sb->sb_mb = m;
		sb->sb_mbtail = m;
		sballoc(sb, m);
		n = m;
		m->m_flags &= ~M_EOR;
		m = m->m_next;
		n->m_next = 0;
	}
	if (eor) {
		if (n)
			n->m_flags |= eor;
		else
			printf("semi-panic: sbcompress\n");
	}
	SBLASTMBUFCHK(sb, __func__);
}

/*
 * Free all mbufs in a sockbuf.
 * Check that all resources are reclaimed.
 */
void
sbflush(struct sockbuf *sb)
{

	KASSERT(solocked(sb->sb_so));
	KASSERT((sb->sb_flags & SB_LOCK) == 0);

	while (sb->sb_mbcnt)
		sbdrop(sb, (int)sb->sb_cc);

	KASSERT(sb->sb_cc == 0);
	KASSERT(sb->sb_mb == NULL);
	KASSERT(sb->sb_mbtail == NULL);
	KASSERT(sb->sb_lastrecord == NULL);
}

/*
 * Drop data from (the front of) a sockbuf.
 */
void
sbdrop(struct sockbuf *sb, int len)
{
	struct mbuf	*m, *mn, *next;

	KASSERT(solocked(sb->sb_so));

	next = (m = sb->sb_mb) ? m->m_nextpkt : 0;
	while (len > 0) {
		if (m == 0) {
			if (next == 0)
				panic("sbdrop");
			m = next;
			next = m->m_nextpkt;
			continue;
		}
		if (m->m_len > len) {
			m->m_len -= len;
			m->m_data += len;
			sb->sb_cc -= len;
			break;
		}
		len -= m->m_len;
		sbfree(sb, m);
		MFREE(m, mn);
		m = mn;
	}
	while (m && m->m_len == 0) {
		sbfree(sb, m);
		MFREE(m, mn);
		m = mn;
	}
	if (m) {
		sb->sb_mb = m;
		m->m_nextpkt = next;
	} else
		sb->sb_mb = next;
	/*
	 * First part is an inline SB_EMPTY_FIXUP().  Second part
	 * makes sure sb_lastrecord is up-to-date if we dropped
	 * part of the last record.
	 */
	m = sb->sb_mb;
	if (m == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (m->m_nextpkt == NULL)
		sb->sb_lastrecord = m;
}

/*
 * Drop a record off the front of a sockbuf
 * and move the next record to the front.
 */
void
sbdroprecord(struct sockbuf *sb)
{
	struct mbuf	*m, *mn;

	KASSERT(solocked(sb->sb_so));

	m = sb->sb_mb;
	if (m) {
		sb->sb_mb = m->m_nextpkt;
		do {
			sbfree(sb, m);
			MFREE(m, mn);
		} while ((m = mn) != NULL);
	}
	SB_EMPTY_FIXUP(sb);
}

/*
 * Create a "control" mbuf containing the specified data
 * with the specified type for presentation on a socket buffer.
 */
struct mbuf *
sbcreatecontrol(void *p, int size, int type, int level)
{
	struct cmsghdr	*cp;
	struct mbuf	*m;

	if (CMSG_SPACE(size) > MCLBYTES) {
		printf("sbcreatecontrol: message too large %d\n", size);
		return NULL;
	}

	if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL)
		return ((struct mbuf *) NULL);
	if (CMSG_SPACE(size) > MLEN) {
		MCLGET(m, M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return NULL;
		}
	}
	cp = mtod(m, struct cmsghdr *);
	memcpy(CMSG_DATA(cp), p, size);
	m->m_len = CMSG_SPACE(size);
	cp->cmsg_len = CMSG_LEN(size);
	cp->cmsg_level = level;
	cp->cmsg_type = type;
	return (m);
}
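
/*
 * For example (sketch), a protocol's input path can use
 * sbcreatecontrol() to pass a received TTL up as ancillary data:
 *
 *	uint8_t ttl = ip->ip_ttl;
 *	m = sbcreatecontrol(&ttl, sizeof(ttl), IP_RECVTTL, IPPROTO_IP);
 *
 * and later hand the resulting chain to sbappendaddr() as 'control'.
 */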

void
solockretry(struct socket *so, kmutex_t *lock)
{

	while (lock != so->so_lock) {
		mutex_exit(lock);
		lock = so->so_lock;
		mutex_enter(lock);
	}
}

bool
solocked(struct socket *so)
{

	return mutex_owned(so->so_lock);
}

bool
solocked2(struct socket *so1, struct socket *so2)
{
	kmutex_t *lock;

	lock = so1->so_lock;
	if (lock != so2->so_lock)
		return false;
	return mutex_owned(lock);
}

/*
 * Assign a default lock to a new socket.  For PRU_ATTACH, and done by
 * protocols that do not have special locking requirements.
 */
void
sosetlock(struct socket *so)
{
	kmutex_t *lock;

	if (so->so_lock == NULL) {
		lock = softnet_lock;
		so->so_lock = lock;
		mutex_obj_hold(lock);
		mutex_enter(lock);
	}

	/* In all cases, lock must be held on return from PRU_ATTACH. */
	KASSERT(solocked(so));
}

/*
 * Set lock on sockbuf sb; sleep if lock is already held.
 * Unless SB_NOINTR is set on sockbuf, sleep is interruptible.
 * Returns error without lock if sleep is interrupted.
 */
int
sblock(struct sockbuf *sb, int wf)
{
	struct socket *so;
	kmutex_t *lock;
	int error;

	KASSERT(solocked(sb->sb_so));

	for (;;) {
		if (__predict_true((sb->sb_flags & SB_LOCK) == 0)) {
			sb->sb_flags |= SB_LOCK;
			return 0;
		}
		if (wf != M_WAITOK)
			return EWOULDBLOCK;
		so = sb->sb_so;
		lock = so->so_lock;
		if ((sb->sb_flags & SB_NOINTR) != 0) {
			cv_wait(&so->so_cv, lock);
			error = 0;
		} else
			error = cv_wait_sig(&so->so_cv, lock);
		if (__predict_false(lock != so->so_lock))
			solockretry(so, lock);
		if (error != 0)
			return error;
	}
}

void
sbunlock(struct sockbuf *sb)
{
	struct socket *so;

	so = sb->sb_so;

	KASSERT(solocked(so));
	KASSERT((sb->sb_flags & SB_LOCK) != 0);

	sb->sb_flags &= ~SB_LOCK;
	cv_broadcast(&so->so_cv);
}
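
/*
 * A typical caller pairs the two while holding the socket lock
 * (sketch):
 *
 *	error = sblock(sb, M_WAITOK);
 *	if (error == 0) {
 *		... exclusive access to the sockbuf ...
 *		sbunlock(sb);
 *	}
 */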

int
sowait(struct socket *so, bool catch, int timo)
{
	kmutex_t *lock;
	int error;

	KASSERT(solocked(so));
	KASSERT(catch || timo != 0);

	lock = so->so_lock;
	if (catch)
		error = cv_timedwait_sig(&so->so_cv, lock, timo);
	else
		error = cv_timedwait(&so->so_cv, lock, timo);
	if (__predict_false(lock != so->so_lock))
		solockretry(so, lock);
	return error;
}