/*-
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket2.c	8.1 (Berkeley) 6/10/93
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/uipc_sockbuf.c 160621 2006-07-24 16:21:31Z rwatson $");

#include "opt_param.h"

#include <sys/param.h>
#include <sys/aio.h> /* for aio_swake proto */
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>

/*
 * Function pointer set by the AIO routines so that the socket buffer code
 * can call back into the AIO module if it is loaded.
 */
void	(*aio_swake)(struct socket *, struct sockbuf *);

/*
 * Primitive routines for operating on socket buffers
 */

u_long	sb_max = SB_MAX;
static	u_long sb_max_adj =
    SB_MAX * MCLBYTES / (MSIZE + MCLBYTES); /* adjusted sb_max */

static	u_long sb_efficiency = 8;	/* parameter for sbreserve() */
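
/*
 * For a sense of scale (illustrative figures only; the real constants are
 * machine-dependent): with the common i386/amd64 values MSIZE = 256 and
 * MCLBYTES = 2048, the adjustment above works out to
 *
 *	sb_max_adj = sb_max * 2048 / (256 + 2048) = sb_max * 8/9,
 *
 * i.e. roughly 89% of sb_max, reflecting that each cluster of payload also
 * costs one mbuf header of bookkeeping space.
 */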

/*
 * Socantsendmore indicates that no more data will be sent on the socket; it
 * is normally applied to a socket by the protocol code (in the PRU_SHUTDOWN
 * case) when the user informs the system that no more data is to be sent.
 * Socantrcvmore indicates that no more data will be received, and is
 * normally applied to a socket by a protocol when it detects that the peer
 * will send no more data.  Data queued for reading in the socket may yet be
 * read.
 */
void
socantsendmore_locked(struct socket *so)
{

	SOCKBUF_LOCK_ASSERT(&so->so_snd);

	so->so_snd.sb_state |= SBS_CANTSENDMORE;
	sowwakeup_locked(so);
	mtx_assert(SOCKBUF_MTX(&so->so_snd), MA_NOTOWNED);
}

void
socantsendmore(struct socket *so)
{

	SOCKBUF_LOCK(&so->so_snd);
	socantsendmore_locked(so);
	mtx_assert(SOCKBUF_MTX(&so->so_snd), MA_NOTOWNED);
}

void
socantrcvmore_locked(struct socket *so)
{

	SOCKBUF_LOCK_ASSERT(&so->so_rcv);

	so->so_rcv.sb_state |= SBS_CANTRCVMORE;
	sorwakeup_locked(so);
	mtx_assert(SOCKBUF_MTX(&so->so_rcv), MA_NOTOWNED);
}

void
socantrcvmore(struct socket *so)
{

	SOCKBUF_LOCK(&so->so_rcv);
	socantrcvmore_locked(so);
	mtx_assert(SOCKBUF_MTX(&so->so_rcv), MA_NOTOWNED);
}
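
/*
 * Usage sketch (illustrative, not part of the original file): a TCP-like
 * protocol that has just consumed a FIN from its peer would mark the
 * receive side closed roughly as follows, "so" being the socket in
 * question:
 *
 *	socantrcvmore(so);	/- no further data will arrive -/
 *
 * after which readers drain whatever is already queued and then see EOF.
 */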

/*
 * Wait for data to arrive at/drain from a socket buffer.
 */
int
sbwait(struct sockbuf *sb)
{

	SOCKBUF_LOCK_ASSERT(sb);

	sb->sb_flags |= SB_WAIT;
	return (msleep(&sb->sb_cc, &sb->sb_mtx,
	    (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sbwait",
	    sb->sb_timeo));
}
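
/*
 * Usage sketch (illustrative only): callers such as soreceive() re-test
 * their wakeup condition in a loop around sbwait(), since a wakeup is
 * only a hint that the state may have changed:
 *
 *	SOCKBUF_LOCK(&so->so_rcv);
 *	while (so->so_rcv.sb_cc == 0 &&
 *	    (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
 *		error = sbwait(&so->so_rcv);
 *		if (error)
 *			break;
 *	}
 *	SOCKBUF_UNLOCK(&so->so_rcv);
 */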

/*
 * Acquire the sleepable lock (SB_LOCK) on a sockbuf, sleeping if it is
 * currently held; return any error returned from msleep() (e.g., EINTR).
 * The sockbuf mutex must be held by the caller.
 */
int
sb_lock(struct sockbuf *sb)
{
	int error;

	SOCKBUF_LOCK_ASSERT(sb);

	while (sb->sb_flags & SB_LOCK) {
		sb->sb_flags |= SB_WANT;
		error = msleep(&sb->sb_flags, &sb->sb_mtx,
		    (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK|PCATCH,
		    "sblock", 0);
		if (error)
			return (error);
	}
	sb->sb_flags |= SB_LOCK;
	return (0);
}

/*
 * Wake up processes waiting on a socket buffer.  Do asynchronous
 * notification via SIGIO if the socket has the SS_ASYNC flag set.
 *
 * Called with the socket buffer lock held; will release the lock by the end
 * of the function.  This allows the caller to acquire the socket buffer lock
 * while testing for the need for various sorts of wakeup and hold it through
 * to the point where it's no longer required.  We currently hold the lock
 * through calls out to other subsystems (with the exception of kqueue), and
 * then release it to avoid lock order issues.  It's not clear that's
 * correct.
 */
void
sowakeup(struct socket *so, struct sockbuf *sb)
{

	SOCKBUF_LOCK_ASSERT(sb);

	selwakeuppri(&sb->sb_sel, PSOCK);
	sb->sb_flags &= ~SB_SEL;
	if (sb->sb_flags & SB_WAIT) {
		sb->sb_flags &= ~SB_WAIT;
		wakeup(&sb->sb_cc);
	}
	KNOTE_LOCKED(&sb->sb_sel.si_note, 0);
	SOCKBUF_UNLOCK(sb);
	if ((so->so_state & SS_ASYNC) && so->so_sigio != NULL)
		pgsigio(&so->so_sigio, SIGIO, 0);
	if (sb->sb_flags & SB_UPCALL)
		(*so->so_upcall)(so, so->so_upcallarg, M_DONTWAIT);
	if (sb->sb_flags & SB_AIO)
		aio_swake(so, sb);
	mtx_assert(SOCKBUF_MTX(sb), MA_NOTOWNED);
}
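
/*
 * Note for readers (added for exposition): protocols rarely call sowakeup()
 * directly; they use the sorwakeup()/sowwakeup() macro families from
 * sys/socketvar.h, e.g. after queueing data with the receive buffer locked:
 *
 *	sbappend_locked(&so->so_rcv, m);
 *	sorwakeup_locked(so);	/- wakes readers via sowakeup(), drops lock -/
 */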

/*
 * Socket buffer (struct sockbuf) utility routines.
 *
 * Each socket contains two socket buffers: one for sending data and
 * one for receiving data.  Each buffer contains a queue of mbufs,
 * information about the number of mbufs and amount of data in the
 * queue, and other fields allowing select() statements and notification
 * on data availability to be implemented.
 *
 * Data stored in a socket buffer is maintained as a list of records.
 * Each record is a list of mbufs chained together with the m_next
 * field.  Records are chained together with the m_nextpkt field.  The upper
 * level routine soreceive() expects the following conventions to be
 * observed when placing information in the receive buffer:
 *
 * 1. If the protocol requires each message be preceded by the sender's
 *    name, then a record containing that name must be present before
 *    any associated data (mbufs must be of type MT_SONAME).
 * 2. If the protocol supports the exchange of ``access rights'' (really
 *    just additional data associated with the message), and there are
 *    ``rights'' to be received, then a record containing this data
 *    should be present (mbufs must be of type MT_RIGHTS).
 * 3. If a name or rights record exists, then it must be followed by
 *    a data record, perhaps of zero length.
 *
 * Before using a new socket structure it is first necessary to reserve
 * buffer space for the socket, by calling sbreserve().  This should commit
 * some of the available buffer space in the system buffer pool for the
 * socket (currently, it does nothing but enforce limits).  The space
 * should be released by calling sbrelease() when the socket is destroyed.
 */
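
/*
 * Illustrative layout (added for exposition, not in the original): a
 * receive buffer holding two records, the first carrying a sender's name
 * followed by data, might be chained as:
 *
 *	sb_mb --> [MT_SONAME] -m_next-> [MT_DATA] -m_next-> [MT_DATA]
 *	              |
 *	          m_nextpkt
 *	              |
 *	              v
 *	          [MT_DATA] -m_next-> [MT_DATA]
 *
 * sb_mbtail points at the last mbuf of the last record, and sb_lastrecord
 * at the first mbuf of that record.
 */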

int
soreserve(struct socket *so, u_long sndcc, u_long rcvcc)
{
	struct thread *td = curthread;

	SOCKBUF_LOCK(&so->so_snd);
	SOCKBUF_LOCK(&so->so_rcv);
	if (sbreserve_locked(&so->so_snd, sndcc, so, td) == 0)
		goto bad;
	if (sbreserve_locked(&so->so_rcv, rcvcc, so, td) == 0)
		goto bad2;
	if (so->so_rcv.sb_lowat == 0)
		so->so_rcv.sb_lowat = 1;
	if (so->so_snd.sb_lowat == 0)
		so->so_snd.sb_lowat = MCLBYTES;
	if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
		so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
	SOCKBUF_UNLOCK(&so->so_rcv);
	SOCKBUF_UNLOCK(&so->so_snd);
	return (0);
bad2:
	sbrelease_locked(&so->so_snd, so);
bad:
	SOCKBUF_UNLOCK(&so->so_rcv);
	SOCKBUF_UNLOCK(&so->so_snd);
	return (ENOBUFS);
}

static int
sysctl_handle_sb_max(SYSCTL_HANDLER_ARGS)
{
	int error = 0;
	u_long old_sb_max = sb_max;

	error = SYSCTL_OUT(req, arg1, sizeof(u_long));
	if (error || !req->newptr)
		return (error);
	error = SYSCTL_IN(req, arg1, sizeof(u_long));
	if (error)
		return (error);
	if (sb_max < MSIZE + MCLBYTES) {
		sb_max = old_sb_max;
		return (EINVAL);
	}
	sb_max_adj = (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES);
	return (0);
}

/*
 * Allot mbufs to a sockbuf.
 * Attempt to scale mbmax so that mbcnt doesn't become limiting
 * if buffering efficiency is near the normal case.
 */
int
sbreserve_locked(struct sockbuf *sb, u_long cc, struct socket *so,
    struct thread *td)
{
	rlim_t sbsize_limit;

	SOCKBUF_LOCK_ASSERT(sb);

	/*
	 * td will only be NULL when we're in an interrupt
	 * (e.g. in tcp_input()).
	 */
	if (cc > sb_max_adj)
		return (0);
	if (td != NULL) {
		PROC_LOCK(td->td_proc);
		sbsize_limit = lim_cur(td->td_proc, RLIMIT_SBSIZE);
		PROC_UNLOCK(td->td_proc);
	} else
		sbsize_limit = RLIM_INFINITY;
	if (!chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, cc,
	    sbsize_limit))
		return (0);
	sb->sb_mbmax = min(cc * sb_efficiency, sb_max);
	if (sb->sb_lowat > sb->sb_hiwat)
		sb->sb_lowat = sb->sb_hiwat;
	return (1);
}
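
/*
 * Worked example (illustrative figures, not from the original): reserving
 * cc = 64KB of data space with the default sb_efficiency of 8 yields
 *
 *	sb_mbmax = min(64KB * 8, sb_max) = min(512KB, sb_max),
 *
 * so mbuf bookkeeping (sb_mbcnt) only becomes the binding limit if less
 * than about one byte in eight of allocated mbuf storage is actual payload.
 */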

int
sbreserve(struct sockbuf *sb, u_long cc, struct socket *so,
    struct thread *td)
{
	int error;

	SOCKBUF_LOCK(sb);
	error = sbreserve_locked(sb, cc, so, td);
	SOCKBUF_UNLOCK(sb);
	return (error);
}

/*
 * Free mbufs held by a socket, and reserved mbuf space.
 */
void
sbrelease_locked(struct sockbuf *sb, struct socket *so)
{

	SOCKBUF_LOCK_ASSERT(sb);

	sbflush_locked(sb);
	(void)chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, 0,
	    RLIM_INFINITY);
	sb->sb_mbmax = 0;
}

void
sbrelease(struct sockbuf *sb, struct socket *so)
{

	SOCKBUF_LOCK(sb);
	sbrelease_locked(sb, so);
	SOCKBUF_UNLOCK(sb);
}

/*
 * Routines to add and remove
 * data from an mbuf queue.
 *
 * The routines sbappend() or sbappendrecord() are normally called to
 * append new mbufs to a socket buffer, after checking that adequate
 * space is available, by comparing the value returned by sbspace() with
 * the amount of data to be added.  sbappendrecord() differs from sbappend()
 * in that the data supplied is treated as the beginning of a new record.
 * To place a sender's address, optional access rights, and data in a
 * socket receive buffer, sbappendaddr() should be used.  To place
 * access rights and data in a socket receive buffer, sbappendrights()
 * should be used.  In either case, the new data begins a new record.
 * Note that unlike sbappend() and sbappendrecord(), these routines check
 * for the caller that there will be enough space to store the data.
 * Each fails if there is not enough space, or if it cannot find mbufs
 * to store additional information in.
 *
 * Reliable protocols may use the socket send buffer to hold data
 * awaiting acknowledgement.  Data is normally copied from a socket
 * send buffer in a protocol with m_copy for output to a peer, and is
 * then removed from the socket buffer with sbdrop() or sbdroprecord()
 * when the data is acknowledged by the peer.
 */
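
/*
 * Sketch of that send-buffer lifecycle for a TCP-like protocol (added for
 * exposition; "off", "len", and "acked" are hypothetical locals):
 *
 *	sbappend(&so->so_snd, m);		/- user data queued -/
 *	m_copy(so->so_snd.sb_mb, off, len);	/- copied for (re)transmit -/
 *	...					/- peer ACKs "acked" bytes -/
 *	sbdrop(&so->so_snd, acked);		/- reclaim acknowledged data -/
 */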

#ifdef SOCKBUF_DEBUG
void
sblastrecordchk(struct sockbuf *sb, const char *file, int line)
{
	struct mbuf *m = sb->sb_mb;

	SOCKBUF_LOCK_ASSERT(sb);

	while (m && m->m_nextpkt)
		m = m->m_nextpkt;

	if (m != sb->sb_lastrecord) {
		printf("%s: sb_mb %p sb_lastrecord %p last %p\n",
			__func__, sb->sb_mb, sb->sb_lastrecord, m);
		printf("packet chain:\n");
		for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt)
			printf("\t%p\n", m);
		panic("%s from %s:%u", __func__, file, line);
	}
}

void
sblastmbufchk(struct sockbuf *sb, const char *file, int line)
{
	struct mbuf *m = sb->sb_mb;
	struct mbuf *n;

	SOCKBUF_LOCK_ASSERT(sb);

	while (m && m->m_nextpkt)
		m = m->m_nextpkt;

	while (m && m->m_next)
		m = m->m_next;

	if (m != sb->sb_mbtail) {
		printf("%s: sb_mb %p sb_mbtail %p last %p\n",
			__func__, sb->sb_mb, sb->sb_mbtail, m);
		printf("packet tree:\n");
		for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) {
			printf("\t");
			for (n = m; n != NULL; n = n->m_next)
				printf("%p ", n);
			printf("\n");
		}
		panic("%s from %s:%u", __func__, file, line);
	}
}
#endif /* SOCKBUF_DEBUG */

#define SBLINKRECORD(sb, m0) do {					\
	SOCKBUF_LOCK_ASSERT(sb);					\
	if ((sb)->sb_lastrecord != NULL)				\
		(sb)->sb_lastrecord->m_nextpkt = (m0);			\
	else								\
		(sb)->sb_mb = (m0);					\
	(sb)->sb_lastrecord = (m0);					\
} while (/*CONSTCOND*/0)

/*
 * Append mbuf chain m to the last record in the
 * socket buffer sb.  The additional space associated with
 * the mbuf chain is recorded in sb.  Empty mbufs are
 * discarded and mbufs are compacted where possible.
 */
void
sbappend_locked(struct sockbuf *sb, struct mbuf *m)
{
	struct mbuf *n;

	SOCKBUF_LOCK_ASSERT(sb);

	if (m == 0)
		return;

	SBLASTRECORDCHK(sb);
	n = sb->sb_mb;
	if (n) {
		while (n->m_nextpkt)
			n = n->m_nextpkt;
		do {
			if (n->m_flags & M_EOR) {
				sbappendrecord_locked(sb, m); /* XXXXXX!!!! */
				return;
			}
		} while (n->m_next && (n = n->m_next));
	} else {
		/*
		 * XXX Would like to simply use sb_mbtail here, but
		 * XXX I need to verify that I won't miss an EOR that
		 * XXX way.
		 */
		if ((n = sb->sb_lastrecord) != NULL) {
			do {
				if (n->m_flags & M_EOR) {
					sbappendrecord_locked(sb, m); /* XXXXXX!!!! */
					return;
				}
			} while (n->m_next && (n = n->m_next));
		} else {
			/*
			 * If this is the first record in the socket buffer,
			 * it's also the last record.
			 */
			sb->sb_lastrecord = m;
		}
	}
	sbcompress(sb, m, n);
	SBLASTRECORDCHK(sb);
}

/*
 * Append mbuf chain m to the last record in the
 * socket buffer sb.  The additional space associated with
 * the mbuf chain is recorded in sb.  Empty mbufs are
 * discarded and mbufs are compacted where possible.
 */
void
sbappend(struct sockbuf *sb, struct mbuf *m)
{

	SOCKBUF_LOCK(sb);
	sbappend_locked(sb, m);
	SOCKBUF_UNLOCK(sb);
}

/*
 * This version of sbappend() should only be used when the caller
 * absolutely knows that there will never be more than one record
 * in the socket buffer, that is, a stream protocol (such as TCP).
 */
void
sbappendstream_locked(struct sockbuf *sb, struct mbuf *m)
{
	SOCKBUF_LOCK_ASSERT(sb);

	KASSERT(m->m_nextpkt == NULL, ("sbappendstream 0"));
	KASSERT(sb->sb_mb == sb->sb_lastrecord, ("sbappendstream 1"));

	SBLASTMBUFCHK(sb);

	sbcompress(sb, m, sb->sb_mbtail);

	sb->sb_lastrecord = sb->sb_mb;
	SBLASTRECORDCHK(sb);
}

/*
 * This version of sbappend() should only be used when the caller
 * absolutely knows that there will never be more than one record
 * in the socket buffer, that is, a stream protocol (such as TCP).
 */
void
sbappendstream(struct sockbuf *sb, struct mbuf *m)
{

	SOCKBUF_LOCK(sb);
	sbappendstream_locked(sb, m);
	SOCKBUF_UNLOCK(sb);
}
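
/*
 * Usage sketch (illustrative, not from the original): a stream protocol's
 * input path can queue in-order segment data and wake readers in one
 * locked section:
 *
 *	SOCKBUF_LOCK(&so->so_rcv);
 *	sbappendstream_locked(&so->so_rcv, m);
 *	sorwakeup_locked(so);	/- drops the receive buffer lock -/
 */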

#ifdef SOCKBUF_DEBUG
void
sbcheck(struct sockbuf *sb)
{
	struct mbuf *m;
	struct mbuf *n = 0;
	u_long len = 0, mbcnt = 0;

	SOCKBUF_LOCK_ASSERT(sb);

	for (m = sb->sb_mb; m; m = n) {
		n = m->m_nextpkt;
		for (; m; m = m->m_next) {
			len += m->m_len;
			mbcnt += MSIZE;
			/* XXX: pretty sure this is bogus. */
			if (m->m_flags & M_EXT)
				mbcnt += m->m_ext.ext_size;
		}
	}
	if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
		printf("cc %lu != %u || mbcnt %lu != %u\n", len, sb->sb_cc,
		    mbcnt, sb->sb_mbcnt);
		panic("sbcheck");
	}
}
#endif

/*
 * As above, except the mbuf chain
 * begins a new record.
 */
void
sbappendrecord_locked(struct sockbuf *sb, struct mbuf *m0)
{
	struct mbuf *m;

	SOCKBUF_LOCK_ASSERT(sb);

	if (m0 == 0)
		return;
	m = sb->sb_mb;
	if (m)
		while (m->m_nextpkt)
			m = m->m_nextpkt;
	/*
	 * Put the first mbuf on the queue.
	 * Note this permits zero length records.
	 */
	sballoc(sb, m0);
	SBLASTRECORDCHK(sb);
	SBLINKRECORD(sb, m0);
	if (m)
		m->m_nextpkt = m0;
	else
		sb->sb_mb = m0;
	m = m0->m_next;
	m0->m_next = 0;
	if (m && (m0->m_flags & M_EOR)) {
		m0->m_flags &= ~M_EOR;
		m->m_flags |= M_EOR;
	}
	sbcompress(sb, m, m0);
}

/*
 * As above, except the mbuf chain
 * begins a new record.
 */
void
sbappendrecord(struct sockbuf *sb, struct mbuf *m0)
{

	SOCKBUF_LOCK(sb);
	sbappendrecord_locked(sb, m0);
	SOCKBUF_UNLOCK(sb);
}

/*
 * Append address and data, and optionally, control (ancillary) data
 * to the receive queue of a socket.  If present,
 * m0 must include a packet header with total length.
 * Returns 0 if no space in sockbuf or insufficient mbufs.
 */
int
sbappendaddr_locked(struct sockbuf *sb, const struct sockaddr *asa,
    struct mbuf *m0, struct mbuf *control)
{
	struct mbuf *m, *n, *nlast;
	int space = asa->sa_len;

	SOCKBUF_LOCK_ASSERT(sb);

	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
		panic("sbappendaddr_locked");
	if (m0)
		space += m0->m_pkthdr.len;
	space += m_length(control, &n);

	if (space > sbspace(sb))
		return (0);
#if MSIZE <= 256
	if (asa->sa_len > MLEN)
		return (0);
#endif
	MGET(m, M_DONTWAIT, MT_SONAME);
	if (m == 0)
		return (0);
	m->m_len = asa->sa_len;
	bcopy(asa, mtod(m, caddr_t), asa->sa_len);
	if (n)
		n->m_next = m0;		/* concatenate data to control */
	else
		control = m0;
	m->m_next = control;
	for (n = m; n->m_next != NULL; n = n->m_next)
		sballoc(sb, n);
	sballoc(sb, n);
	nlast = n;
	SBLINKRECORD(sb, m);

	sb->sb_mbtail = nlast;
	SBLASTMBUFCHK(sb);

	SBLASTRECORDCHK(sb);
	return (1);
}

/*
 * Append address and data, and optionally, control (ancillary) data
 * to the receive queue of a socket.  If present,
 * m0 must include a packet header with total length.
 * Returns 0 if no space in sockbuf or insufficient mbufs.
 */
int
sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa,
    struct mbuf *m0, struct mbuf *control)
{
	int retval;

	SOCKBUF_LOCK(sb);
	retval = sbappendaddr_locked(sb, asa, m0, control);
	SOCKBUF_UNLOCK(sb);
	return (retval);
}
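
/*
 * Usage sketch (illustrative; "from" and "m" are hypothetical locals): a
 * datagram protocol's input path delivers a packet together with its
 * sender's address, dropping it when the buffer is full:
 *
 *	if (sbappendaddr(&so->so_rcv, (struct sockaddr *)&from,
 *	    m, NULL) == 0)
 *		m_freem(m);	/- no room: drop the datagram -/
 *	else
 *		sorwakeup(so);
 */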

int
sbappendcontrol_locked(struct sockbuf *sb, struct mbuf *m0,
    struct mbuf *control)
{
	struct mbuf *m, *n, *mlast;
	int space;

	SOCKBUF_LOCK_ASSERT(sb);

	if (control == 0)
		panic("sbappendcontrol_locked");
	space = m_length(control, &n) + m_length(m0, NULL);

	if (space > sbspace(sb))
		return (0);
	n->m_next = m0;			/* concatenate data to control */

	SBLASTRECORDCHK(sb);

	for (m = control; m->m_next; m = m->m_next)
		sballoc(sb, m);
	sballoc(sb, m);
	mlast = m;
	SBLINKRECORD(sb, control);

	sb->sb_mbtail = mlast;
	SBLASTMBUFCHK(sb);

	SBLASTRECORDCHK(sb);
	return (1);
}

int
sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control)
{
	int retval;

	SOCKBUF_LOCK(sb);
	retval = sbappendcontrol_locked(sb, m0, control);
	SOCKBUF_UNLOCK(sb);
	return (retval);
}

/*
 * Append the data in mbuf chain (m) into the socket buffer sb following mbuf
 * (n).  If (n) is NULL, the buffer is presumed empty.
 *
 * When the data is compressed, mbufs in the chain may be handled in one of
 * three ways:
 *
 * (1) The mbuf may simply be dropped, if it contributes nothing (no data, no
 *     record boundary, and no change in data type).
 *
 * (2) The mbuf may be coalesced -- i.e., data in the mbuf may be copied into
 *     an mbuf already in the socket buffer.  This can occur if an
 *     appropriate mbuf exists, there is room, and no merging of data types
 *     will occur.
 *
 * (3) The mbuf may be appended to the end of the existing mbuf chain.
 *
 * If any of the new mbufs is marked as M_EOR, mark the last mbuf appended as
 * end-of-record.
 */
void
sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n)
{
	int eor = 0;
	struct mbuf *o;

	SOCKBUF_LOCK_ASSERT(sb);

	while (m) {
		eor |= m->m_flags & M_EOR;
		if (m->m_len == 0 &&
		    (eor == 0 ||
		     (((o = m->m_next) || (o = n)) &&
		      o->m_type == m->m_type))) {
			if (sb->sb_lastrecord == m)
				sb->sb_lastrecord = m->m_next;
			m = m_free(m);
			continue;
		}
		if (n && (n->m_flags & M_EOR) == 0 &&
		    M_WRITABLE(n) &&
		    m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */
		    m->m_len <= M_TRAILINGSPACE(n) &&
		    n->m_type == m->m_type) {
			bcopy(mtod(m, caddr_t), mtod(n, caddr_t) + n->m_len,
			    (unsigned)m->m_len);
			n->m_len += m->m_len;
			sb->sb_cc += m->m_len;
			if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA)
				/* XXX: Probably don't need. */
				sb->sb_ctl += m->m_len;
			m = m_free(m);
			continue;
		}
		if (n)
			n->m_next = m;
		else
			sb->sb_mb = m;
		sb->sb_mbtail = m;
		sballoc(sb, m);
		n = m;
		m->m_flags &= ~M_EOR;
		m = m->m_next;
		n->m_next = 0;
	}
	if (eor) {
		KASSERT(n != NULL, ("sbcompress: eor && n == NULL"));
		n->m_flags |= eor;
	}
	SBLASTMBUFCHK(sb);
}

/*
 * Free all mbufs in a sockbuf.
 * Check that all resources are reclaimed.
 */
void
sbflush_locked(struct sockbuf *sb)
{

	SOCKBUF_LOCK_ASSERT(sb);

	if (sb->sb_flags & SB_LOCK)
		panic("sbflush_locked: locked");
	while (sb->sb_mbcnt) {
		/*
		 * Don't call sbdrop(sb, 0) if the leading mbuf is non-empty:
		 * we would loop forever.  Panic instead.
		 */
		if (!sb->sb_cc && (sb->sb_mb == NULL || sb->sb_mb->m_len))
			break;
		sbdrop_locked(sb, (int)sb->sb_cc);
	}
	if (sb->sb_cc || sb->sb_mb || sb->sb_mbcnt)
		panic("sbflush_locked: cc %u || mb %p || mbcnt %u",
		    sb->sb_cc, (void *)sb->sb_mb, sb->sb_mbcnt);
}

void
sbflush(struct sockbuf *sb)
{

	SOCKBUF_LOCK(sb);
	sbflush_locked(sb);
	SOCKBUF_UNLOCK(sb);
}

/*
 * Drop data from (the front of) a sockbuf.
 */
void
sbdrop_locked(struct sockbuf *sb, int len)
{
	struct mbuf *m;
	struct mbuf *next;

	SOCKBUF_LOCK_ASSERT(sb);

	next = (m = sb->sb_mb) ? m->m_nextpkt : 0;
	while (len > 0) {
		if (m == 0) {
			if (next == 0)
				panic("sbdrop");
			m = next;
			next = m->m_nextpkt;
			continue;
		}
		if (m->m_len > len) {
			m->m_len -= len;
			m->m_data += len;
			sb->sb_cc -= len;
			if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA)
				sb->sb_ctl -= len;
			break;
		}
		len -= m->m_len;
		sbfree(sb, m);
		m = m_free(m);
	}
	while (m && m->m_len == 0) {
		sbfree(sb, m);
		m = m_free(m);
	}
	if (m) {
		sb->sb_mb = m;
		m->m_nextpkt = next;
	} else
		sb->sb_mb = next;
	/*
	 * First part is an inline SB_EMPTY_FIXUP().  Second part
	 * makes sure sb_lastrecord is up-to-date if we dropped
	 * part of the last record.
	 */
	m = sb->sb_mb;
	if (m == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (m->m_nextpkt == NULL) {
		sb->sb_lastrecord = m;
	}
}

/*
 * Drop data from (the front of) a sockbuf.
 */
void
sbdrop(struct sockbuf *sb, int len)
{

	SOCKBUF_LOCK(sb);
	sbdrop_locked(sb, len);
	SOCKBUF_UNLOCK(sb);
}
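
/*
 * Usage sketch (illustrative; "acked" is a hypothetical local): when a
 * reliable protocol learns that the peer has acknowledged the first
 * "acked" bytes in flight, it reclaims that prefix of the send buffer:
 *
 *	sbdrop(&so->so_snd, acked);
 *	sowwakeup(so);		/- writers may now have space -/
 */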

/*
 * Drop a record off the front of a sockbuf
 * and move the next record to the front.
 */
void
sbdroprecord_locked(struct sockbuf *sb)
{
	struct mbuf *m;

	SOCKBUF_LOCK_ASSERT(sb);

	m = sb->sb_mb;
	if (m) {
		sb->sb_mb = m->m_nextpkt;
		do {
			sbfree(sb, m);
			m = m_free(m);
		} while (m);
	}
	SB_EMPTY_FIXUP(sb);
}

/*
 * Drop a record off the front of a sockbuf
 * and move the next record to the front.
 */
void
sbdroprecord(struct sockbuf *sb)
{

	SOCKBUF_LOCK(sb);
	sbdroprecord_locked(sb);
	SOCKBUF_UNLOCK(sb);
}

/* This takes the place of kern.maxsockbuf, which moved to kern.ipc. */
static int dummy;
SYSCTL_INT(_kern, KERN_DUMMY, dummy, CTLFLAG_RW, &dummy, 0, "");
SYSCTL_OID(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf, CTLTYPE_ULONG|CTLFLAG_RW,
    &sb_max, 0, sysctl_handle_sb_max, "LU", "Maximum socket buffer size");
SYSCTL_ULONG(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW,
    &sb_efficiency, 0, "");
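
/*
 * Administration sketch (illustrative): the limit enforced through
 * sysctl_handle_sb_max() above is tuned from userland as, e.g.:
 *
 *	# sysctl kern.ipc.maxsockbuf=8388608
 *
 * Values smaller than MSIZE + MCLBYTES are rejected with EINVAL, and
 * sb_max_adj is recomputed on every successful change.
 */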
973