/*	$NetBSD: uipc_socket2.c,v 1.116 2014/05/17 22:52:36 rmind Exp $	*/

/*-
 * Copyright (c) 2008 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket2.c	8.2 (Berkeley) 2/14/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uipc_socket2.c,v 1.116 2014/05/17 22:52:36 rmind Exp $");

#include "opt_mbuftrace.h"
#include "opt_sb_max.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/buf.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/domain.h>
#include <sys/poll.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/kauth.h>
#include <sys/pool.h>
#include <sys/uidinfo.h>

/*
 * Primitive routines for operating on sockets and socket buffers.
 *
 * Connection life-cycle:
 *
 *	Normal sequence from the active (originating) side:
 *
 *	- soisconnecting() is called during processing of connect() call,
 *	- resulting in an eventual call to soisconnected() if/when the
 *	  connection is established.
 *
 *	When the connection is torn down during processing of disconnect():
 *
 *	- soisdisconnecting() is called and,
 *	- soisdisconnected() is called when the connection to the peer
 *	  is totally severed.
 *
 *	The semantics of these routines are such that connectionless protocols
 *	can call soisconnected() and soisdisconnected() only, bypassing the
 *	in-progress calls when setting up a ``connection'' takes no time.
 *
 *	From the passive side, a socket is created with two queues of sockets:
 *
 *	- so_q0 (0) for partial connections (i.e. connections in progress)
 *	- so_q (1) for connections already made and awaiting user acceptance.
 *
 *	As a protocol is preparing incoming connections, it creates a socket
 *	structure queued on so_q0 by calling sonewconn().  When the connection
 *	is established, soisconnected() is called, and transfers the
 *	socket structure to so_q, making it available to accept().
 *
 *	If a socket is closed with sockets on either so_q0 or so_q, these
 *	sockets are dropped.
 *
 * Locking rules and assumptions:
 *
 * o socket::so_lock can change on the fly.  The low level routines used
 *   to lock sockets are aware of this.  When so_lock is acquired, the
 *   routine locking must check to see if so_lock still points to the
 *   lock that was acquired.  If so_lock has changed in the meantime, the
 *   now irrelevant lock that was acquired must be dropped and the lock
 *   operation retried.  Although not proven here, this is completely safe
 *   on a multiprocessor system, even with relaxed memory ordering, given
 *   the next two rules:
 *
 * o In order to mutate so_lock, the lock pointed to by the current value
 *   of so_lock must be held: i.e., the socket must be held locked by the
 *   changing thread.  The thread must issue membar_exit() to prevent
 *   memory accesses being reordered, and can set so_lock to the desired
 *   value.  If the lock pointed to by the new value of so_lock is not
 *   held by the changing thread, the socket must then be considered
 *   unlocked.
 *
 * o If so_lock is mutated, and the previous lock referred to by so_lock
 *   could still be visible to other threads in the system (e.g. via file
 *   descriptor or protocol-internal reference), then the old lock must
 *   remain valid until the socket and/or protocol control block has been
 *   torn down.
 *
 * o If a socket has a non-NULL so_head value (i.e. is in the process of
 *   connecting), then locking the socket must also lock the socket pointed
 *   to by so_head: their lock pointers must match.
 *
 * o If a socket has connections in progress (so_q, so_q0 not empty) then
 *   locking the socket must also lock the sockets attached to both queues.
 *   Again, their lock pointers must match.
 *
 * o Beyond the initial lock assignment in socreate(), assigning locks to
 *   sockets is the responsibility of the individual protocols / protocol
 *   domains.
 */

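/*
 * For illustration, a minimal sketch of the retry protocol described
 * above (solockretry() below implements the same idea): acquire the lock
 * currently named by so_lock, then re-check that so_lock still points at
 * the lock that was taken.
 *
 *	kmutex_t *lock;
 *
 *	for (;;) {
 *		lock = so->so_lock;
 *		mutex_enter(lock);
 *		if (__predict_true(lock == so->so_lock))
 *			break;		// still the right lock: socket locked
 *		mutex_exit(lock);	// lock changed under us: retry
 *	}
 */
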
static pool_cache_t	socket_cache;
u_long			sb_max = SB_MAX;/* maximum socket buffer size */
static u_long		sb_max_adj;	/* adjusted sb_max */

void
soisconnecting(struct socket *so)
{

	KASSERT(solocked(so));

	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= SS_ISCONNECTING;
}

void
soisconnected(struct socket *so)
{
	struct socket	*head;

	head = so->so_head;

	KASSERT(solocked(so));
	KASSERT(head == NULL || solocked2(so, head));

	so->so_state &= ~(SS_ISCONNECTING | SS_ISDISCONNECTING);
	so->so_state |= SS_ISCONNECTED;
	if (head && so->so_onq == &head->so_q0) {
		if ((so->so_options & SO_ACCEPTFILTER) == 0) {
			/*
			 * Re-enqueue and wake up any waiters, e.g.
			 * processes blocking on accept().
			 */
			soqremque(so, 0);
			soqinsque(head, so, 1);
			sorwakeup(head);
			cv_broadcast(&head->so_cv);
		} else {
			so->so_upcall =
			    head->so_accf->so_accept_filter->accf_callback;
			so->so_upcallarg = head->so_accf->so_accept_filter_arg;
			so->so_rcv.sb_flags |= SB_UPCALL;
			so->so_options &= ~SO_ACCEPTFILTER;
			(*so->so_upcall)(so, so->so_upcallarg,
					 POLLIN|POLLRDNORM, M_DONTWAIT);
		}
	} else {
		cv_broadcast(&so->so_cv);
		sorwakeup(so);
		sowwakeup(so);
	}
}

void
soisdisconnecting(struct socket *so)
{

	KASSERT(solocked(so));

	so->so_state &= ~SS_ISCONNECTING;
	so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE);
	cv_broadcast(&so->so_cv);
	sowwakeup(so);
	sorwakeup(so);
}

void
soisdisconnected(struct socket *so)
{

	KASSERT(solocked(so));

	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_ISDISCONNECTED);
	cv_broadcast(&so->so_cv);
	sowwakeup(so);
	sorwakeup(so);
}

void
soinit2(void)
{

	socket_cache = pool_cache_init(sizeof(struct socket), 0, 0, 0,
	    "socket", NULL, IPL_SOFTNET, NULL, NULL, NULL);
}

/*
 * sonewconn: accept a new connection.
 *
 * When an attempt at a new connection is noted on a socket which accepts
 * connections, sonewconn(9) is called.  If the connection is possible
 * (subject to space constraints, etc) then we allocate a new structure,
 * properly linked into the data structure of the original socket.
 *
 * => If 'soready' is true, then socket will become ready for accept() i.e.
 *    inserted into the so_q queue, SS_ISCONNECTED set and waiters awoken.
 * => May be called from soft-interrupt context.
 * => Listening socket should be locked.
 * => Returns the new socket locked.
 */
struct socket *
sonewconn(struct socket *head, bool soready)
{
	struct socket *so;
	int soqueue, error;

	KASSERT(solocked(head));

	if (head->so_qlen + head->so_q0len > 3 * head->so_qlimit / 2) {
		/* Listen queue overflow. */
		return NULL;
	}
	if ((head->so_options & SO_ACCEPTFILTER) != 0) {
		soready = false;
	}
	soqueue = soready ? 1 : 0;

	if ((so = soget(false)) == NULL) {
		return NULL;
	}
	so->so_type = head->so_type;
	so->so_options = head->so_options & ~SO_ACCEPTCONN;
	so->so_linger = head->so_linger;
	so->so_state = head->so_state | SS_NOFDREF;
	so->so_proto = head->so_proto;
	so->so_timeo = head->so_timeo;
	so->so_pgid = head->so_pgid;
	so->so_send = head->so_send;
	so->so_receive = head->so_receive;
	so->so_uidinfo = head->so_uidinfo;
	so->so_cpid = head->so_cpid;
#ifdef MBUFTRACE
	so->so_mowner = head->so_mowner;
	so->so_rcv.sb_mowner = head->so_rcv.sb_mowner;
	so->so_snd.sb_mowner = head->so_snd.sb_mowner;
#endif
	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat) != 0)
		goto out;
	so->so_snd.sb_lowat = head->so_snd.sb_lowat;
	so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
	so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
	so->so_snd.sb_timeo = head->so_snd.sb_timeo;
	so->so_rcv.sb_flags |= head->so_rcv.sb_flags & (SB_AUTOSIZE | SB_ASYNC);
	so->so_snd.sb_flags |= head->so_snd.sb_flags & (SB_AUTOSIZE | SB_ASYNC);

	/*
	 * Share the lock with the listening socket; it may get unshared
	 * once the connection is complete.
	 */
	mutex_obj_hold(head->so_lock);
	so->so_lock = head->so_lock;
	soqinsque(head, so, soqueue);

	error = (*so->so_proto->pr_usrreq)(so, PRU_ATTACH, NULL, NULL,
	    NULL, NULL);
	KASSERT(solocked(so));
	if (error) {
		(void) soqremque(so, soqueue);
out:
		KASSERT(so->so_accf == NULL);
		soput(so);

		/* Note: the listening socket shall stay locked. */
		KASSERT(solocked(head));
		return NULL;
	}

	/*
	 * Update the connection status and wake up any waiters,
	 * e.g. processes blocking on accept().
	 */
	if (soready) {
		so->so_state |= SS_ISCONNECTED;
		sorwakeup(head);
		cv_broadcast(&head->so_cv);
	}
	KASSERT(solocked2(head, so));
	return so;
}
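
/*
 * Illustrative sketch (hypothetical protocol code, not taken from any
 * particular driver): a connection-oriented protocol that completes its
 * handshake asynchronously would queue an embryonic socket first and
 * mark it connected later, roughly as follows.
 *
 *	struct socket *so;
 *
 *	solock(head);
 *	so = sonewconn(head, false);	// goes onto so_q0, returned locked
 *	if (so == NULL)
 *		goto drop;		// queue overflow or no memory
 *	...
 *	// later, once the handshake finishes:
 *	soisconnected(so);		// moves it to so_q, wakes accept()
 */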

struct socket *
soget(bool waitok)
{
	struct socket *so;

	so = pool_cache_get(socket_cache, (waitok ? PR_WAITOK : PR_NOWAIT));
	if (__predict_false(so == NULL))
		return (NULL);
	memset(so, 0, sizeof(*so));
	TAILQ_INIT(&so->so_q0);
	TAILQ_INIT(&so->so_q);
	cv_init(&so->so_cv, "socket");
	cv_init(&so->so_rcv.sb_cv, "netio");
	cv_init(&so->so_snd.sb_cv, "netio");
	selinit(&so->so_rcv.sb_sel);
	selinit(&so->so_snd.sb_sel);
	so->so_rcv.sb_so = so;
	so->so_snd.sb_so = so;
	return so;
}

void
soput(struct socket *so)
{

	KASSERT(!cv_has_waiters(&so->so_cv));
	KASSERT(!cv_has_waiters(&so->so_rcv.sb_cv));
	KASSERT(!cv_has_waiters(&so->so_snd.sb_cv));
	seldestroy(&so->so_rcv.sb_sel);
	seldestroy(&so->so_snd.sb_sel);
	mutex_obj_free(so->so_lock);
	cv_destroy(&so->so_cv);
	cv_destroy(&so->so_rcv.sb_cv);
	cv_destroy(&so->so_snd.sb_cv);
	pool_cache_put(socket_cache, so);
}

/*
 * soqinsque: insert socket of a new connection into the specified
 * accept queue of the listening socket (head).
 *
 *	q = 0: queue of partial connections
 *	q = 1: queue of incoming connections
 */
void
soqinsque(struct socket *head, struct socket *so, int q)
{
	KASSERT(q == 0 || q == 1);
	KASSERT(solocked2(head, so));
	KASSERT(so->so_onq == NULL);
	KASSERT(so->so_head == NULL);

	so->so_head = head;
	if (q == 0) {
		head->so_q0len++;
		so->so_onq = &head->so_q0;
	} else {
		head->so_qlen++;
		so->so_onq = &head->so_q;
	}
	TAILQ_INSERT_TAIL(so->so_onq, so, so_qe);
}

/*
 * soqremque: remove socket from the specified queue.
 *
 * => Returns true if socket was removed from the specified queue.
 * => False if socket was not removed (because it was in other queue).
 */
bool
soqremque(struct socket *so, int q)
{
	struct socket *head = so->so_head;

	KASSERT(q == 0 || q == 1);
	KASSERT(solocked(so));
	KASSERT(so->so_onq != NULL);
	KASSERT(head != NULL);

	if (q == 0) {
		if (so->so_onq != &head->so_q0)
			return false;
		head->so_q0len--;
	} else {
		if (so->so_onq != &head->so_q)
			return false;
		head->so_qlen--;
	}
	KASSERT(solocked2(so, head));
	TAILQ_REMOVE(so->so_onq, so, so_qe);
	so->so_onq = NULL;
	so->so_head = NULL;
	return true;
}

/*
 * socantsendmore: indicates that no more data will be sent on the
 * socket; it is normally applied to the socket by the protocol code
 * when the user informs the system that no more data is to be sent
 * (i.e. in the PRU_SHUTDOWN case).
 */
void
socantsendmore(struct socket *so)
{
	KASSERT(solocked(so));

	so->so_state |= SS_CANTSENDMORE;
	sowwakeup(so);
}

/*
 * socantrcvmore(): indicates that no more data will be received and
 * will normally be applied to the socket by a protocol when it detects
 * that the peer will send no more data.  Data queued for reading in
 * the socket may yet be read.
 */
void
socantrcvmore(struct socket *so)
{
	KASSERT(solocked(so));

	so->so_state |= SS_CANTRCVMORE;
	sorwakeup(so);
}

/*
 * Wait for data to arrive at/drain from a socket buffer.
 */
int
sbwait(struct sockbuf *sb)
{
	struct socket *so;
	kmutex_t *lock;
	int error;

	so = sb->sb_so;

	KASSERT(solocked(so));

	sb->sb_flags |= SB_NOTIFY;
	lock = so->so_lock;
	if ((sb->sb_flags & SB_NOINTR) != 0)
		error = cv_timedwait(&sb->sb_cv, lock, sb->sb_timeo);
	else
		error = cv_timedwait_sig(&sb->sb_cv, lock, sb->sb_timeo);
	if (__predict_false(lock != so->so_lock))
		solockretry(so, lock);
	return error;
}
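
/*
 * Typical use (a minimal sketch, loosely modelled on what soreceive()
 * does): sleep until the receive buffer has data, giving up on error
 * or timeout.
 *
 *	while (so->so_rcv.sb_cc == 0) {
 *		if (so->so_state & SS_CANTRCVMORE)
 *			break;			// EOF: peer sends no more data
 *		error = sbwait(&so->so_rcv);
 *		if (error != 0)
 *			return error;		// interrupted or timed out
 *	}
 */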

/*
 * Wakeup processes waiting on a socket buffer.
 * Do asynchronous notification via SIGIO
 * if the socket buffer has the SB_ASYNC flag set.
 */
void
sowakeup(struct socket *so, struct sockbuf *sb, int code)
{
	int band;

	KASSERT(solocked(so));
	KASSERT(sb->sb_so == so);

	if (code == POLL_IN)
		band = POLLIN|POLLRDNORM;
	else
		band = POLLOUT|POLLWRNORM;
	sb->sb_flags &= ~SB_NOTIFY;
	selnotify(&sb->sb_sel, band, NOTE_SUBMIT);
	cv_broadcast(&sb->sb_cv);
	if (sb->sb_flags & SB_ASYNC)
		fownsignal(so->so_pgid, SIGIO, code, band, so);
	if (sb->sb_flags & SB_UPCALL)
		(*so->so_upcall)(so, so->so_upcallarg, band, M_DONTWAIT);
}

/*
 * Reset a socket's lock pointer.  Wake all threads waiting on the
 * socket's condition variables so that they can restart their waits
 * using the new lock.  The existing lock must be held.
 */
void
solockreset(struct socket *so, kmutex_t *lock)
{

	KASSERT(solocked(so));

	so->so_lock = lock;
	cv_broadcast(&so->so_snd.sb_cv);
	cv_broadcast(&so->so_rcv.sb_cv);
	cv_broadcast(&so->so_cv);
}

/*
 * Socket buffer (struct sockbuf) utility routines.
 *
 * Each socket contains two socket buffers: one for sending data and
 * one for receiving data.  Each buffer contains a queue of mbufs,
 * information about the number of mbufs and amount of data in the
 * queue, and other fields allowing poll() statements and notification
 * on data availability to be implemented.
 *
 * Data stored in a socket buffer is maintained as a list of records.
 * Each record is a list of mbufs chained together with the m_next
 * field.  Records are chained together with the m_nextpkt field. The upper
 * level routine soreceive() expects the following conventions to be
 * observed when placing information in the receive buffer:
 *
 * 1. If the protocol requires each message be preceded by the sender's
 *    name, then a record containing that name must be present before
 *    any associated data (mbuf's must be of type MT_SONAME).
 * 2. If the protocol supports the exchange of ``access rights'' (really
 *    just additional data associated with the message), and there are
 *    ``rights'' to be received, then a record containing this data
 *    should be present (mbuf's must be of type MT_CONTROL).
 * 3. If a name or rights record exists, then it must be followed by
 *    a data record, perhaps of zero length.
 *
 * Before using a new socket structure it is first necessary to reserve
 * buffer space for the socket by calling sbreserve().  This should commit
 * some of the available buffer space in the system buffer pool for the
 * socket (currently, it does nothing but enforce limits).  The space
 * should be released by calling sbrelease() when the socket is destroyed.
 */

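/*
 * For example (an illustrative sketch, not lifted from any particular
 * protocol): a datagram protocol's input path would typically queue one
 * record per packet, with the sender's name first, using sbappendaddr()
 * below, and then wake any readers:
 *
 *	if (sbappendaddr(&so->so_rcv, sa, m, opts) == 0) {
 *		m_freem(m);		// no buffer space: drop the datagram
 *		m_freem(opts);
 *	} else
 *		sorwakeup(so);
 */
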
int
sb_max_set(u_long new_sbmax)
{
	int s;

	if (new_sbmax < (16 * 1024))
		return (EINVAL);

	s = splsoftnet();
	sb_max = new_sbmax;
	sb_max_adj = (u_quad_t)new_sbmax * MCLBYTES / (MSIZE + MCLBYTES);
	splx(s);

	return (0);
}

int
soreserve(struct socket *so, u_long sndcc, u_long rcvcc)
{
	KASSERT(so->so_pcb == NULL || solocked(so));

	/*
	 * There is at least one application (the configure script of screen)
	 * which expects a fifo to be writable even if it already has "some"
	 * bytes in its buffer, so we want to make sure that
	 * (hiwat - lowat) >= (some bytes).
	 *
	 * PIPE_BUF here is an arbitrary value chosen as (some bytes) above;
	 * we expect it to be large enough for such applications.
	 */
	u_long  lowat = MAX(sock_loan_thresh, MCLBYTES);
	u_long  hiwat = lowat + PIPE_BUF;

	if (sndcc < hiwat)
		sndcc = hiwat;
	if (sbreserve(&so->so_snd, sndcc, so) == 0)
		goto bad;
	if (sbreserve(&so->so_rcv, rcvcc, so) == 0)
		goto bad2;
	if (so->so_rcv.sb_lowat == 0)
		so->so_rcv.sb_lowat = 1;
	if (so->so_snd.sb_lowat == 0)
		so->so_snd.sb_lowat = lowat;
	if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
		so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
	return (0);
 bad2:
	sbrelease(&so->so_snd, so);
 bad:
	return (ENOBUFS);
}
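
/*
 * A protocol's attach routine typically reserves space with soreserve();
 * a minimal sketch, with 'sendspace' and 'recvspace' standing in for the
 * protocol's own tunables:
 *
 *	error = soreserve(so, sendspace, recvspace);
 *	if (error != 0)
 *		return error;	// ENOBUFS: the limits could not be granted
 */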

/*
 * Allot mbufs to a sockbuf.
 * Attempt to scale mbmax so that mbcnt doesn't become limiting
 * if buffering efficiency is near the normal case.
 */
int
sbreserve(struct sockbuf *sb, u_long cc, struct socket *so)
{
	struct lwp *l = curlwp; /* XXX */
	rlim_t maxcc;
	struct uidinfo *uidinfo;

	KASSERT(so->so_pcb == NULL || solocked(so));
	KASSERT(sb->sb_so == so);
	KASSERT(sb_max_adj != 0);

	if (cc == 0 || cc > sb_max_adj)
		return (0);

	maxcc = l->l_proc->p_rlimit[RLIMIT_SBSIZE].rlim_cur;

	uidinfo = so->so_uidinfo;
	if (!chgsbsize(uidinfo, &sb->sb_hiwat, cc, maxcc))
		return 0;
	sb->sb_mbmax = min(cc * 2, sb_max);
	if (sb->sb_lowat > sb->sb_hiwat)
		sb->sb_lowat = sb->sb_hiwat;
	return (1);
}

/*
 * Free mbufs held by a socket, and reserved mbuf space.  We do not assert
 * that the socket is held locked here: see sorflush().
 */
void
sbrelease(struct sockbuf *sb, struct socket *so)
{

	KASSERT(sb->sb_so == so);

	sbflush(sb);
	(void)chgsbsize(so->so_uidinfo, &sb->sb_hiwat, 0, RLIM_INFINITY);
	sb->sb_mbmax = 0;
}

/*
 * Routines to add data to and remove data from an mbuf queue.
 *
 * The routines sbappend() or sbappendrecord() are normally called to
 * append new mbufs to a socket buffer, after checking that adequate
 * space is available, comparing the function sbspace() with the amount
 * of data to be added.  sbappendrecord() differs from sbappend() in
 * that data supplied is treated as the beginning of a new record.
 * To place a sender's address, optional access rights, and data in a
 * socket receive buffer, sbappendaddr() should be used.  To place
 * access rights and data in a socket receive buffer, sbappendrights()
 * should be used.  In either case, the new data begins a new record.
 * Note that unlike sbappend() and sbappendrecord(), these routines check
 * for the caller that there will be enough space to store the data.
 * Each fails if there is not enough space, or if it cannot find mbufs
 * to store additional information in.
 *
 * Reliable protocols may use the socket send buffer to hold data
 * awaiting acknowledgement.  Data is normally copied from a socket
 * send buffer in a protocol with m_copy for output to a peer, and is
 * then removed from the socket buffer with sbdrop() or sbdroprecord()
 * when the data is acknowledged by the peer.
 */

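/*
 * Illustrative sketch of the reliable-protocol pattern just described
 * ('off', 'len' and 'acked' are hypothetical values from the protocol's
 * own state):
 *
 *	m = m_copy(so->so_snd.sb_mb, off, len);	// copy for transmission
 *	... hand 'm' to the output path ...
 *	// later, when the peer acknowledges 'acked' bytes:
 *	sbdrop(&so->so_snd, acked);
 *	sowwakeup(so);				// writers may now proceed
 */
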
#ifdef SOCKBUF_DEBUG
void
sblastrecordchk(struct sockbuf *sb, const char *where)
{
	struct mbuf *m = sb->sb_mb;

	KASSERT(solocked(sb->sb_so));

	while (m && m->m_nextpkt)
		m = m->m_nextpkt;

	if (m != sb->sb_lastrecord) {
		printf("sblastrecordchk: sb_mb %p sb_lastrecord %p last %p\n",
		    sb->sb_mb, sb->sb_lastrecord, m);
		printf("packet chain:\n");
		for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt)
			printf("\t%p\n", m);
		panic("sblastrecordchk from %s", where);
	}
}

void
sblastmbufchk(struct sockbuf *sb, const char *where)
{
	struct mbuf *m = sb->sb_mb;
	struct mbuf *n;

	KASSERT(solocked(sb->sb_so));

	while (m && m->m_nextpkt)
		m = m->m_nextpkt;

	while (m && m->m_next)
		m = m->m_next;

	if (m != sb->sb_mbtail) {
		printf("sblastmbufchk: sb_mb %p sb_mbtail %p last %p\n",
		    sb->sb_mb, sb->sb_mbtail, m);
		printf("packet tree:\n");
		for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) {
			printf("\t");
			for (n = m; n != NULL; n = n->m_next)
				printf("%p ", n);
			printf("\n");
		}
		panic("sblastmbufchk from %s", where);
	}
}
#endif /* SOCKBUF_DEBUG */

/*
 * Link a chain of records onto a socket buffer
 */
#define	SBLINKRECORDCHAIN(sb, m0, mlast)				\
do {									\
	if ((sb)->sb_lastrecord != NULL)				\
		(sb)->sb_lastrecord->m_nextpkt = (m0);			\
	else								\
		(sb)->sb_mb = (m0);					\
	(sb)->sb_lastrecord = (mlast);					\
} while (/*CONSTCOND*/0)


#define	SBLINKRECORD(sb, m0)						\
    SBLINKRECORDCHAIN(sb, m0, m0)

/*
 * Append mbuf chain m to the last record in the
748 * Append mbuf chain m to the last record in the
749 * socket buffer sb.  The additional space associated
 * discarded and mbufs are compacted where possible.
 */
void
sbappend(struct sockbuf *sb, struct mbuf *m)
{
	struct mbuf	*n;

	KASSERT(solocked(sb->sb_so));

	if (m == NULL)
		return;

#ifdef MBUFTRACE
	m_claimm(m, sb->sb_mowner);
#endif

	SBLASTRECORDCHK(sb, "sbappend 1");

	if ((n = sb->sb_lastrecord) != NULL) {
		/*
		 * XXX Would like to simply use sb_mbtail here, but
		 * XXX I need to verify that I won't miss an EOR that
		 * XXX way.
		 */
		do {
			if (n->m_flags & M_EOR) {
				sbappendrecord(sb, m); /* XXXXXX!!!! */
				return;
			}
		} while (n->m_next && (n = n->m_next));
	} else {
		/*
		 * If this is the first record in the socket buffer, it's
		 * also the last record.
		 */
		sb->sb_lastrecord = m;
	}
	sbcompress(sb, m, n);
	SBLASTRECORDCHK(sb, "sbappend 2");
}

/*
 * This version of sbappend() should only be used when the caller
 * absolutely knows that there will never be more than one record
 * in the socket buffer, that is, a stream protocol (such as TCP).
 */
void
sbappendstream(struct sockbuf *sb, struct mbuf *m)
{

	KASSERT(solocked(sb->sb_so));
	KDASSERT(m->m_nextpkt == NULL);
	KASSERT(sb->sb_mb == sb->sb_lastrecord);

	SBLASTMBUFCHK(sb, __func__);

#ifdef MBUFTRACE
	m_claimm(m, sb->sb_mowner);
#endif

	sbcompress(sb, m, sb->sb_mbtail);

	sb->sb_lastrecord = sb->sb_mb;
	SBLASTRECORDCHK(sb, __func__);
}

#ifdef SOCKBUF_DEBUG
void
sbcheck(struct sockbuf *sb)
{
	struct mbuf	*m, *m2;
	u_long		len, mbcnt;

	KASSERT(solocked(sb->sb_so));

	len = 0;
	mbcnt = 0;
	for (m = sb->sb_mb; m; m = m->m_nextpkt) {
		for (m2 = m; m2 != NULL; m2 = m2->m_next) {
			len += m2->m_len;
			mbcnt += MSIZE;
			if (m2->m_flags & M_EXT)
				mbcnt += m2->m_ext.ext_size;
			if (m2->m_nextpkt != NULL)
				panic("sbcheck nextpkt");
		}
	}
	if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
		printf("cc %lu != %lu || mbcnt %lu != %lu\n", len, sb->sb_cc,
		    mbcnt, sb->sb_mbcnt);
		panic("sbcheck");
	}
}
#endif

845
846/*
847 * As above, except the mbuf chain
848 * begins a new record.
849 */
850void
851sbappendrecord(struct sockbuf *sb, struct mbuf *m0)
852{
853	struct mbuf	*m;
854
855	KASSERT(solocked(sb->sb_so));
856
857	if (m0 == NULL)
858		return;
859
860#ifdef MBUFTRACE
861	m_claimm(m0, sb->sb_mowner);
862#endif
863	/*
864	 * Put the first mbuf on the queue.
865	 * Note this permits zero length records.
866	 */
867	sballoc(sb, m0);
868	SBLASTRECORDCHK(sb, "sbappendrecord 1");
869	SBLINKRECORD(sb, m0);
870	m = m0->m_next;
871	m0->m_next = 0;
872	if (m && (m0->m_flags & M_EOR)) {
873		m0->m_flags &= ~M_EOR;
874		m->m_flags |= M_EOR;
875	}
876	sbcompress(sb, m, m0);
877	SBLASTRECORDCHK(sb, "sbappendrecord 2");
878}
879
880/*
881 * As above except that OOB data
882 * is inserted at the beginning of the sockbuf,
883 * but after any other OOB data.
884 */
885void
886sbinsertoob(struct sockbuf *sb, struct mbuf *m0)
887{
888	struct mbuf	*m, **mp;
889
890	KASSERT(solocked(sb->sb_so));
891
892	if (m0 == NULL)
893		return;
894
895	SBLASTRECORDCHK(sb, "sbinsertoob 1");
896
897	for (mp = &sb->sb_mb; (m = *mp) != NULL; mp = &((*mp)->m_nextpkt)) {
898	    again:
899		switch (m->m_type) {
900
901		case MT_OOBDATA:
902			continue;		/* WANT next train */
903
904		case MT_CONTROL:
905			if ((m = m->m_next) != NULL)
906				goto again;	/* inspect THIS train further */
907		}
908		break;
909	}
910	/*
911	 * Put the first mbuf on the queue.
912	 * Note this permits zero length records.
913	 */
914	sballoc(sb, m0);
915	m0->m_nextpkt = *mp;
916	if (*mp == NULL) {
917		/* m0 is actually the new tail */
918		sb->sb_lastrecord = m0;
919	}
920	*mp = m0;
921	m = m0->m_next;
922	m0->m_next = 0;
923	if (m && (m0->m_flags & M_EOR)) {
924		m0->m_flags &= ~M_EOR;
925		m->m_flags |= M_EOR;
926	}
927	sbcompress(sb, m, m0);
928	SBLASTRECORDCHK(sb, "sbinsertoob 2");
929}
930
931/*
932 * Append address and data, and optionally, control (ancillary) data
933 * to the receive queue of a socket.  If present,
934 * m0 must include a packet header with total length.
935 * Returns 0 if no space in sockbuf or insufficient mbufs.
936 */
937int
938sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0,
939	struct mbuf *control)
940{
941	struct mbuf	*m, *n, *nlast;
942	int		space, len;
943
944	KASSERT(solocked(sb->sb_so));
945
946	space = asa->sa_len;
947
948	if (m0 != NULL) {
949		if ((m0->m_flags & M_PKTHDR) == 0)
950			panic("sbappendaddr");
951		space += m0->m_pkthdr.len;
952#ifdef MBUFTRACE
953		m_claimm(m0, sb->sb_mowner);
954#endif
955	}
956	for (n = control; n; n = n->m_next) {
957		space += n->m_len;
958		MCLAIM(n, sb->sb_mowner);
959		if (n->m_next == NULL)	/* keep pointer to last control buf */
960			break;
961	}
962	if (space > sbspace(sb))
963		return (0);
964	m = m_get(M_DONTWAIT, MT_SONAME);
965	if (m == NULL)
966		return (0);
967	MCLAIM(m, sb->sb_mowner);
968	/*
969	 * XXX avoid 'comparison always true' warning which isn't easily
970	 * avoided.
971	 */
972	len = asa->sa_len;
973	if (len > MLEN) {
974		MEXTMALLOC(m, asa->sa_len, M_NOWAIT);
975		if ((m->m_flags & M_EXT) == 0) {
976			m_free(m);
977			return (0);
978		}
979	}
980	m->m_len = asa->sa_len;
981	memcpy(mtod(m, void *), asa, asa->sa_len);
982	if (n)
983		n->m_next = m0;		/* concatenate data to control */
984	else
985		control = m0;
986	m->m_next = control;
987
988	SBLASTRECORDCHK(sb, "sbappendaddr 1");
989
990	for (n = m; n->m_next != NULL; n = n->m_next)
991		sballoc(sb, n);
992	sballoc(sb, n);
993	nlast = n;
994	SBLINKRECORD(sb, m);
995
996	sb->sb_mbtail = nlast;
997	SBLASTMBUFCHK(sb, "sbappendaddr");
998	SBLASTRECORDCHK(sb, "sbappendaddr 2");
999
1000	return (1);
1001}
1002
/*
 * Helper for sbappendaddrchain: prepend a struct sockaddr* to
 * an mbuf chain.
 */
static inline struct mbuf *
m_prepend_sockaddr(struct sockbuf *sb, struct mbuf *m0,
		   const struct sockaddr *asa)
{
	struct mbuf *m;
	const int salen = asa->sa_len;

	KASSERT(solocked(sb->sb_so));

	/* only the first in each chain need be a pkthdr */
	m = m_gethdr(M_DONTWAIT, MT_SONAME);
	if (m == NULL)
		return NULL;
	MCLAIM(m, sb->sb_mowner);
#ifdef notyet
	if (salen > MHLEN) {
		MEXTMALLOC(m, salen, M_NOWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return NULL;
		}
	}
#else
	KASSERT(salen <= MHLEN);
#endif
	m->m_len = salen;
	memcpy(mtod(m, void *), asa, salen);
	m->m_next = m0;
	m->m_pkthdr.len = salen + m0->m_pkthdr.len;

	return m;
}

int
sbappendaddrchain(struct sockbuf *sb, const struct sockaddr *asa,
		  struct mbuf *m0, int sbprio)
{
	struct mbuf *m, *n, *n0, *nlast;
	int error;

	KASSERT(solocked(sb->sb_so));

	/*
	 * XXX sbprio reserved for encoding priority of this request:
	 *  SB_PRIO_NONE --> honour normal sb limits
	 *  SB_PRIO_ONESHOT_OVERFLOW --> if socket has any space,
	 *	take whole chain.  Intended for large requests
	 *	that should be delivered atomically (all, or none).
	 *  SB_PRIO_OVERDRAFT --> allow a small (2*MLEN) overflow
	 *	over normal socket limits, for messages indicating
	 *	buffer overflow in earlier normal/lower-priority messages.
	 *  SB_PRIO_BESTEFFORT --> ignore limits entirely.
	 *	Intended for kernel-generated messages only.
	 *	Up to the generator to avoid total mbuf resource exhaustion.
	 */
	(void)sbprio;

	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
		panic("sbappendaddrchain");

#ifdef notyet
	space = sbspace(sb);

	/*
	 * Enforce SB_PRIO_* limits as described above.
	 */
#endif

	n0 = NULL;
	nlast = NULL;
	for (m = m0; m; m = m->m_nextpkt) {
		struct mbuf *np;

#ifdef MBUFTRACE
		m_claimm(m, sb->sb_mowner);
#endif

		/* Prepend sockaddr to this record (m) of input chain m0 */
		n = m_prepend_sockaddr(sb, m, asa);
		if (n == NULL) {
			error = ENOBUFS;
			goto bad;
		}

		/* Append record (asa+m) to end of new chain n0 */
		if (n0 == NULL) {
			n0 = n;
		} else {
			nlast->m_nextpkt = n;
		}
		/* Keep track of last record on new chain */
		nlast = n;

		for (np = n; np; np = np->m_next)
			sballoc(sb, np);
	}

	SBLASTRECORDCHK(sb, "sbappendaddrchain 1");

	/* Drop the entire chain of (asa+m) records onto the socket */
	SBLINKRECORDCHAIN(sb, n0, nlast);

	SBLASTRECORDCHK(sb, "sbappendaddrchain 2");

	for (m = nlast; m->m_next; m = m->m_next)
		;
	sb->sb_mbtail = m;
	SBLASTMBUFCHK(sb, "sbappendaddrchain");

	return (1);

bad:
	/*
	 * On error, free the prepended addresses.  For consistency
	 * with sbappendaddr(), leave it to our caller to free
	 * the input record chain passed to us as m0.
	 */
	while ((n = n0) != NULL) {
		struct mbuf *np;

		/* Undo the sballoc() of this record */
		for (np = n; np; np = np->m_next)
			sbfree(sb, np);

		n0 = n->m_nextpkt;	/* iterate at next prepended address */
		MFREE(n, np);		/* free prepended address (not data) */
	}
	return error;
}


int
sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control)
{
	struct mbuf	*m, *mlast, *n;
	int		space;

	KASSERT(solocked(sb->sb_so));

	space = 0;
	if (control == NULL)
		panic("sbappendcontrol");
	for (m = control; ; m = m->m_next) {
		space += m->m_len;
		MCLAIM(m, sb->sb_mowner);
		if (m->m_next == NULL)
			break;
	}
	n = m;			/* save pointer to last control buffer */
	for (m = m0; m; m = m->m_next) {
		MCLAIM(m, sb->sb_mowner);
		space += m->m_len;
	}
	if (space > sbspace(sb))
		return (0);
	n->m_next = m0;			/* concatenate data to control */

	SBLASTRECORDCHK(sb, "sbappendcontrol 1");

	for (m = control; m->m_next != NULL; m = m->m_next)
		sballoc(sb, m);
	sballoc(sb, m);
	mlast = m;
	SBLINKRECORD(sb, control);

	sb->sb_mbtail = mlast;
	SBLASTMBUFCHK(sb, "sbappendcontrol");
	SBLASTRECORDCHK(sb, "sbappendcontrol 2");

	return (1);
}

/*
 * Compress mbuf chain m into the socket
 * buffer sb following mbuf n.  If n
 * is null, the buffer is presumed empty.
 */
void
sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n)
{
	int		eor;
	struct mbuf	*o;

	KASSERT(solocked(sb->sb_so));

	eor = 0;
	while (m) {
		eor |= m->m_flags & M_EOR;
		if (m->m_len == 0 &&
		    (eor == 0 ||
		     (((o = m->m_next) || (o = n)) &&
		      o->m_type == m->m_type))) {
			if (sb->sb_lastrecord == m)
				sb->sb_lastrecord = m->m_next;
			m = m_free(m);
			continue;
		}
		if (n && (n->m_flags & M_EOR) == 0 &&
		    /* M_TRAILINGSPACE() checks buffer writeability */
		    m->m_len <= MCLBYTES / 4 && /* XXX Don't copy too much */
		    m->m_len <= M_TRAILINGSPACE(n) &&
		    n->m_type == m->m_type) {
			memcpy(mtod(n, char *) + n->m_len, mtod(m, void *),
			    (unsigned)m->m_len);
			n->m_len += m->m_len;
			sb->sb_cc += m->m_len;
			m = m_free(m);
			continue;
		}
		if (n)
			n->m_next = m;
		else
			sb->sb_mb = m;
		sb->sb_mbtail = m;
		sballoc(sb, m);
		n = m;
		m->m_flags &= ~M_EOR;
		m = m->m_next;
		n->m_next = 0;
	}
	if (eor) {
		if (n)
			n->m_flags |= eor;
		else
			printf("semi-panic: sbcompress\n");
	}
	SBLASTMBUFCHK(sb, __func__);
}

/*
 * Free all mbufs in a sockbuf.
 * Check that all resources are reclaimed.
 */
void
sbflush(struct sockbuf *sb)
{

	KASSERT(solocked(sb->sb_so));
	KASSERT((sb->sb_flags & SB_LOCK) == 0);

	while (sb->sb_mbcnt)
		sbdrop(sb, (int)sb->sb_cc);

	KASSERT(sb->sb_cc == 0);
	KASSERT(sb->sb_mb == NULL);
	KASSERT(sb->sb_mbtail == NULL);
	KASSERT(sb->sb_lastrecord == NULL);
}

/*
 * Drop data from (the front of) a sockbuf.
 */
void
sbdrop(struct sockbuf *sb, int len)
{
	struct mbuf	*m, *mn, *next;

	KASSERT(solocked(sb->sb_so));

	next = (m = sb->sb_mb) ? m->m_nextpkt : NULL;
	while (len > 0) {
		if (m == NULL) {
			if (next == NULL)
				panic("sbdrop(%p,%d): cc=%lu",
				    sb, len, sb->sb_cc);
			m = next;
			next = m->m_nextpkt;
			continue;
		}
		if (m->m_len > len) {
			m->m_len -= len;
			m->m_data += len;
			sb->sb_cc -= len;
			break;
		}
		len -= m->m_len;
		sbfree(sb, m);
		MFREE(m, mn);
		m = mn;
	}
	while (m && m->m_len == 0) {
		sbfree(sb, m);
		MFREE(m, mn);
		m = mn;
	}
	if (m) {
		sb->sb_mb = m;
		m->m_nextpkt = next;
	} else
		sb->sb_mb = next;
	/*
	 * First part is an inline SB_EMPTY_FIXUP().  Second part
	 * makes sure sb_lastrecord is up-to-date if we dropped
	 * part of the last record.
	 */
	m = sb->sb_mb;
	if (m == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (m->m_nextpkt == NULL)
		sb->sb_lastrecord = m;
}

/*
 * Drop a record off the front of a sockbuf
 * and move the next record to the front.
 */
void
sbdroprecord(struct sockbuf *sb)
{
	struct mbuf	*m, *mn;

	KASSERT(solocked(sb->sb_so));

	m = sb->sb_mb;
	if (m) {
		sb->sb_mb = m->m_nextpkt;
		do {
			sbfree(sb, m);
			MFREE(m, mn);
		} while ((m = mn) != NULL);
	}
	SB_EMPTY_FIXUP(sb);
}

/*
 * Create a "control" mbuf containing the specified data
 * with the specified type for presentation on a socket buffer.
 */
struct mbuf *
sbcreatecontrol1(void **p, int size, int type, int level, int flags)
{
	struct cmsghdr	*cp;
	struct mbuf	*m;
	int space = CMSG_SPACE(size);

	if ((flags & M_DONTWAIT) && space > MCLBYTES) {
		printf("%s: message too large %d\n", __func__, space);
		return NULL;
	}

	if ((m = m_get(flags, MT_CONTROL)) == NULL)
		return NULL;
	if (space > MLEN) {
		if (space > MCLBYTES)
			MEXTMALLOC(m, space, M_WAITOK);
		else
			MCLGET(m, flags);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return NULL;
		}
	}
	cp = mtod(m, struct cmsghdr *);
	*p = CMSG_DATA(cp);
	m->m_len = space;
	cp->cmsg_len = CMSG_LEN(size);
	cp->cmsg_level = level;
	cp->cmsg_type = type;
	return m;
}

struct mbuf *
sbcreatecontrol(void *p, int size, int type, int level)
{
	struct mbuf *m;
	void *v;

	m = sbcreatecontrol1(&v, size, type, level, M_DONTWAIT);
	if (m == NULL)
		return NULL;
	memcpy(v, p, size);
	return m;
}
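
/*
 * Illustrative sketch (hypothetical values, not taken from this file):
 * building a control message that carries a struct timeval, e.g. for a
 * timestamp option, and handing it to the receive queue as the control
 * argument of sbappendaddr():
 *
 *	struct timeval tv;
 *	struct mbuf *opts;
 *
 *	microtime(&tv);
 *	opts = sbcreatecontrol(&tv, sizeof(tv), SCM_TIMESTAMP, SOL_SOCKET);
 *	if (opts == NULL)
 *		... fall back to delivering the data without control info ...
 */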

void
solockretry(struct socket *so, kmutex_t *lock)
{

	while (lock != so->so_lock) {
		mutex_exit(lock);
		lock = so->so_lock;
		mutex_enter(lock);
	}
}

bool
solocked(struct socket *so)
{

	return mutex_owned(so->so_lock);
}

bool
solocked2(struct socket *so1, struct socket *so2)
{
	kmutex_t *lock;

	lock = so1->so_lock;
	if (lock != so2->so_lock)
		return false;
	return mutex_owned(lock);
}

/*
 * sosetlock: assign a default lock to a new socket.
 */
void
sosetlock(struct socket *so)
{
	if (so->so_lock == NULL) {
		kmutex_t *lock = softnet_lock;

		so->so_lock = lock;
		mutex_obj_hold(lock);
		mutex_enter(lock);
	}

	/* In all cases, lock must be held on return from PRU_ATTACH. */
	KASSERT(solocked(so));
}
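
/*
 * A minimal sketch of how a protocol's PRU_ATTACH handler might start
 * ('sendspace' and 'recvspace' stand in for the protocol's own tunables):
 *
 *	sosetlock(so);				// give the socket a lock, held
 *	error = soreserve(so, sendspace, recvspace);
 *	if (error != 0)
 *		return error;
 *	... allocate and link the protocol control block ...
 */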

/*
 * Set lock on sockbuf sb; sleep if lock is already held.
 * Unless SB_NOINTR is set on sockbuf, sleep is interruptible.
 * Returns error without lock if sleep is interrupted.
 */
int
sblock(struct sockbuf *sb, int wf)
{
	struct socket *so;
	kmutex_t *lock;
	int error;

	KASSERT(solocked(sb->sb_so));

	for (;;) {
		if (__predict_true((sb->sb_flags & SB_LOCK) == 0)) {
			sb->sb_flags |= SB_LOCK;
			return 0;
		}
		if (wf != M_WAITOK)
			return EWOULDBLOCK;
		so = sb->sb_so;
		lock = so->so_lock;
		if ((sb->sb_flags & SB_NOINTR) != 0) {
			cv_wait(&so->so_cv, lock);
			error = 0;
		} else
			error = cv_wait_sig(&so->so_cv, lock);
		if (__predict_false(lock != so->so_lock))
			solockretry(so, lock);
		if (error != 0)
			return error;
	}
}

void
sbunlock(struct sockbuf *sb)
{
	struct socket *so;

	so = sb->sb_so;

	KASSERT(solocked(so));
	KASSERT((sb->sb_flags & SB_LOCK) != 0);

	sb->sb_flags &= ~SB_LOCK;
	cv_broadcast(&so->so_cv);
}
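
/*
 * Typical pattern (a minimal sketch): take the sockbuf lock around a
 * receive operation so that concurrent readers do not interleave, then
 * release it and wake any other waiters.
 *
 *	solock(so);
 *	error = sblock(&so->so_rcv, M_WAITOK);
 *	if (error != 0) {
 *		sounlock(so);
 *		return error;
 *	}
 *	... consume data from so->so_rcv ...
 *	sbunlock(&so->so_rcv);
 *	sounlock(so);
 */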

int
sowait(struct socket *so, bool catch, int timo)
{
	kmutex_t *lock;
	int error;

	KASSERT(solocked(so));
	KASSERT(catch || timo != 0);

	lock = so->so_lock;
	if (catch)
		error = cv_timedwait_sig(&so->so_cv, lock, timo);
	else
		error = cv_timedwait(&so->so_cv, lock, timo);
	if (__predict_false(lock != so->so_lock))
		solockretry(so, lock);
	return error;
}