1139804Simp/*-
21541Srgrimes * Copyright (c) 1982, 1986, 1988, 1990, 1993
31541Srgrimes *	The Regents of the University of California.  All rights reserved.
41541Srgrimes *
51541Srgrimes * Redistribution and use in source and binary forms, with or without
61541Srgrimes * modification, are permitted provided that the following conditions
71541Srgrimes * are met:
81541Srgrimes * 1. Redistributions of source code must retain the above copyright
91541Srgrimes *    notice, this list of conditions and the following disclaimer.
101541Srgrimes * 2. Redistributions in binary form must reproduce the above copyright
111541Srgrimes *    notice, this list of conditions and the following disclaimer in the
121541Srgrimes *    documentation and/or other materials provided with the distribution.
131541Srgrimes * 4. Neither the name of the University nor the names of its contributors
141541Srgrimes *    may be used to endorse or promote products derived from this software
151541Srgrimes *    without specific prior written permission.
161541Srgrimes *
171541Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
181541Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
191541Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
201541Srgrimes * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
211541Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
221541Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
231541Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
241541Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
251541Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
261541Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
271541Srgrimes * SUCH DAMAGE.
281541Srgrimes *
291541Srgrimes *	@(#)uipc_socket2.c	8.1 (Berkeley) 6/10/93
301541Srgrimes */
311541Srgrimes
32116182Sobrien#include <sys/cdefs.h>
33116182Sobrien__FBSDID("$FreeBSD: releng/11.0/sys/kern/uipc_sockbuf.c 298069 2016-04-15 16:10:11Z pfg $");
34116182Sobrien
3577598Sjesper#include "opt_param.h"
36101013Srwatson
371541Srgrimes#include <sys/param.h>
3895759Stanimura#include <sys/aio.h> /* for aio_swake proto */
3912041Swollman#include <sys/kernel.h>
4076166Smarkm#include <sys/lock.h>
41295126Sglebius#include <sys/malloc.h>
421541Srgrimes#include <sys/mbuf.h>
4395759Stanimura#include <sys/mutex.h>
4476166Smarkm#include <sys/proc.h>
451541Srgrimes#include <sys/protosw.h>
4651381Sgreen#include <sys/resourcevar.h>
4795759Stanimura#include <sys/signalvar.h>
481541Srgrimes#include <sys/socket.h>
491541Srgrimes#include <sys/socketvar.h>
50169236Srwatson#include <sys/sx.h>
5112041Swollman#include <sys/sysctl.h>
521541Srgrimes
53160621Srwatson/*
54160621Srwatson * Function pointer set by the AIO routines so that the socket buffer code
55160621Srwatson * can call back into the AIO module if it is loaded.
56160621Srwatson */
57160621Srwatsonvoid	(*aio_swake)(struct socket *, struct sockbuf *);
5888633Salfred
591541Srgrimes/*
60160621Srwatson * Primitive routines for operating on socket buffers
611541Srgrimes */
621541Srgrimes
63101996Sdgu_long	sb_max = SB_MAX;
64172557Smohansu_long sb_max_adj =
65225169Sbz       (quad_t)SB_MAX * MCLBYTES / (MSIZE + MCLBYTES); /* adjusted sb_max */
661541Srgrimes
6713267Swollmanstatic	u_long sb_efficiency = 8;	/* parameter for sbreserve() */
6813267Swollman
69256185Sglebiusstatic struct mbuf	*sbcut_internal(struct sockbuf *sb, int len);
70160915Srwatsonstatic void	sbflush_internal(struct sockbuf *sb);
71160875Srwatson
721541Srgrimes/*
73293432Sglebius * Our own version of m_clrprotoflags(), that can preserve M_NOTREADY.
74293432Sglebius */
75293432Sglebiusstatic void
76293432Sglebiussbm_clrprotoflags(struct mbuf *m, int flags)
77293432Sglebius{
78293432Sglebius	int mask;
79293432Sglebius
80293432Sglebius	mask = ~M_PROTOFLAGS;
81293432Sglebius	if (flags & PRUS_NOTREADY)
82293432Sglebius		mask |= M_NOTREADY;
83293432Sglebius	while (m) {
84293432Sglebius		m->m_flags &= mask;
85293432Sglebius		m = m->m_next;
86293432Sglebius	}
87293432Sglebius}
88293432Sglebius
/*
 * Mark ready "count" mbufs starting with "m".  Returns 0 if the readied
 * run reached the head of the queue (data became available), EINPROGRESS
 * if an earlier not-ready mbuf still blocks it.
 */
int
sbready(struct sockbuf *sb, struct mbuf *m, int count)
{
	u_int blocker;

	SOCKBUF_LOCK_ASSERT(sb);
	KASSERT(sb->sb_fnrdy != NULL, ("%s: sb %p NULL fnrdy", __func__, sb));

	/*
	 * If "m" is the first not-ready mbuf in the buffer, readying it
	 * may unblock the mbufs queued behind it.
	 */
	blocker = (sb->sb_fnrdy == m) ? M_BLOCKED : 0;

	for (int i = 0; i < count; i++, m = m->m_next) {
		KASSERT(m->m_flags & M_NOTREADY,
		    ("%s: m %p !M_NOTREADY", __func__, m));
		m->m_flags &= ~(M_NOTREADY | blocker);
		if (blocker)
			sb->sb_acc += m->m_len;
	}

	if (!blocker)
		return (EINPROGRESS);

	/* This one was blocking all the queue. */
	for (; m && (m->m_flags & M_NOTREADY) == 0; m = m->m_next) {
		KASSERT(m->m_flags & M_BLOCKED,
		    ("%s: m %p !M_BLOCKED", __func__, m));
		m->m_flags &= ~M_BLOCKED;
		sb->sb_acc += m->m_len;
	}

	/* Advance to the next not-ready mbuf (or NULL if none remain). */
	sb->sb_fnrdy = m;

	return (0);
}
125275326Sglebius
/*
 * Adjust sockbuf state reflecting allocation of m.
 */
void
sballoc(struct sockbuf *sb, struct mbuf *m)
{

	SOCKBUF_LOCK_ASSERT(sb);

	/* All appended data, ready or not, counts toward sb_ccc. */
	sb->sb_ccc += m->m_len;

	if (sb->sb_fnrdy == NULL) {
		if (m->m_flags & M_NOTREADY)
			sb->sb_fnrdy = m;	/* first not-ready mbuf */
		else
			sb->sb_acc += m->m_len;	/* immediately available */
	} else
		m->m_flags |= M_BLOCKED;	/* queued behind a not-ready mbuf */

	if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA)
		sb->sb_ctl += m->m_len;

	sb->sb_mbcnt += MSIZE;
	sb->sb_mcnt += 1;

	/* Account for external storage attached to the mbuf, if any. */
	if (m->m_flags & M_EXT) {
		sb->sb_mbcnt += m->m_ext.ext_size;
		sb->sb_ccnt += 1;
	}
}
156275312Sglebius
/*
 * Adjust sockbuf state reflecting freeing of m.
 */
void
sbfree(struct sockbuf *sb, struct mbuf *m)
{

#if 0	/* XXX: not yet: soclose() call path comes here w/o lock. */
	SOCKBUF_LOCK_ASSERT(sb);
#endif

	sb->sb_ccc -= m->m_len;

	/* Only mbufs that were counted as available come off sb_acc. */
	if (!(m->m_flags & M_NOTAVAIL))
		sb->sb_acc -= m->m_len;

	if (m == sb->sb_fnrdy) {
		struct mbuf *n;

		KASSERT(m->m_flags & M_NOTREADY,
		    ("%s: m %p !M_NOTREADY", __func__, m));

		/*
		 * Freeing the first not-ready mbuf: unblock the ready
		 * mbufs that follow it and advance sb_fnrdy to the next
		 * not-ready one (or NULL).
		 */
		n = m->m_next;
		while (n != NULL && !(n->m_flags & M_NOTREADY)) {
			n->m_flags &= ~M_BLOCKED;
			sb->sb_acc += n->m_len;
			n = n->m_next;
		}
		sb->sb_fnrdy = n;
	}

	if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA)
		sb->sb_ctl -= m->m_len;

	sb->sb_mbcnt -= MSIZE;
	sb->sb_mcnt -= 1;
	if (m->m_flags & M_EXT) {
		sb->sb_mbcnt -= m->m_ext.ext_size;
		sb->sb_ccnt -= 1;
	}

	/* Keep the send-pointer cache consistent with the removed mbuf. */
	if (sb->sb_sndptr == m) {
		sb->sb_sndptr = NULL;
		sb->sb_sndptroff = 0;
	}
	if (sb->sb_sndptroff != 0)
		sb->sb_sndptroff -= m->m_len;
}
205275312Sglebius
206275312Sglebius/*
207160915Srwatson * Socantsendmore indicates that no more data will be sent on the socket; it
208160915Srwatson * would normally be applied to a socket when the user informs the system
209160915Srwatson * that no more data is to be sent, by the protocol code (in case
210160915Srwatson * PRU_SHUTDOWN).  Socantrcvmore indicates that no more data will be
211160915Srwatson * received, and will normally be applied to the socket by a protocol when it
212160915Srwatson * detects that the peer will send no more data.  Data queued for reading in
213160915Srwatson * the socket may yet be read.
2141541Srgrimes */
/* Locked variant; drops the send buffer lock via sowwakeup_locked(). */
void
socantsendmore_locked(struct socket *so)
{

	SOCKBUF_LOCK_ASSERT(&so->so_snd);

	so->so_snd.sb_state |= SBS_CANTSENDMORE;
	sowwakeup_locked(so);
	mtx_assert(SOCKBUF_MTX(&so->so_snd), MA_NOTOWNED);
}
225130831Srwatson
/* Unlocked wrapper: the locked variant releases the lock on our behalf. */
void
socantsendmore(struct socket *so)
{

	SOCKBUF_LOCK(&so->so_snd);
	socantsendmore_locked(so);
	mtx_assert(SOCKBUF_MTX(&so->so_snd), MA_NOTOWNED);
}
2341541Srgrimes
/* Locked variant; drops the receive buffer lock via sorwakeup_locked(). */
void
socantrcvmore_locked(struct socket *so)
{

	SOCKBUF_LOCK_ASSERT(&so->so_rcv);

	so->so_rcv.sb_state |= SBS_CANTRCVMORE;
	sorwakeup_locked(so);
	mtx_assert(SOCKBUF_MTX(&so->so_rcv), MA_NOTOWNED);
}
2451541Srgrimes
/* Unlocked wrapper: the locked variant releases the lock on our behalf. */
void
socantrcvmore(struct socket *so)
{

	SOCKBUF_LOCK(&so->so_rcv);
	socantrcvmore_locked(so);
	mtx_assert(SOCKBUF_MTX(&so->so_rcv), MA_NOTOWNED);
}
254130831Srwatson
/*
 * Wait for data to arrive at/drain from a socket buffer.  Sleeps on
 * &sb->sb_acc; returns the msleep_sbt() result (0, or an error such as
 * on signal delivery unless SB_NOINTR is set).
 */
int
sbwait(struct sockbuf *sb)
{

	SOCKBUF_LOCK_ASSERT(sb);

	sb->sb_flags |= SB_WAIT;
	return (msleep_sbt(&sb->sb_acc, &sb->sb_mtx,
	    (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sbwait",
	    sb->sb_timeo, 0, 0));
}
2691541Srgrimes
2701549Srgrimesint
271169236Srwatsonsblock(struct sockbuf *sb, int flags)
2721541Srgrimes{
2731541Srgrimes
274175845Srwatson	KASSERT((flags & SBL_VALID) == flags,
275175845Srwatson	    ("sblock: flags invalid (0x%x)", flags));
276175845Srwatson
277175845Srwatson	if (flags & SBL_WAIT) {
278175845Srwatson		if ((sb->sb_flags & SB_NOINTR) ||
279175845Srwatson		    (flags & SBL_NOINTR)) {
280170151Srwatson			sx_xlock(&sb->sb_sx);
281170151Srwatson			return (0);
282170151Srwatson		}
283170151Srwatson		return (sx_xlock_sig(&sb->sb_sx));
284169236Srwatson	} else {
285169236Srwatson		if (sx_try_xlock(&sb->sb_sx) == 0)
286169236Srwatson			return (EWOULDBLOCK);
287169236Srwatson		return (0);
2881541Srgrimes	}
2891541Srgrimes}
2901541Srgrimes
/* Release the I/O serialization lock taken by sblock(). */
void
sbunlock(struct sockbuf *sb)
{

	sx_xunlock(&sb->sb_sx);
}
297169236Srwatson
/*
 * Wakeup processes waiting on a socket buffer.  Do asynchronous notification
 * via SIGIO if the socket has the SS_ASYNC flag set.
 *
 * Called with the socket buffer lock held; will release the lock by the end
 * of the function.  This allows the caller to acquire the socket buffer lock
 * while testing for the need for various sorts of wakeup and hold it through
 * to the point where it's no longer required.  We currently hold the lock
 * through calls out to other subsystems (with the exception of kqueue), and
 * then release it to avoid lock order issues.  It's not clear that's
 * correct.
 */
void
sowakeup(struct socket *so, struct sockbuf *sb)
{
	int ret;

	SOCKBUF_LOCK_ASSERT(sb);

	selwakeuppri(&sb->sb_sel, PSOCK);
	if (!SEL_WAITING(&sb->sb_sel))
		sb->sb_flags &= ~SB_SEL;
	if (sb->sb_flags & SB_WAIT) {
		sb->sb_flags &= ~SB_WAIT;
		wakeup(&sb->sb_acc);	/* matches the sleep in sbwait() */
	}
	KNOTE_LOCKED(&sb->sb_sel.si_note, 0);
	if (sb->sb_upcall != NULL) {
		ret = sb->sb_upcall(so, sb->sb_upcallarg, M_NOWAIT);
		if (ret == SU_ISCONNECTED) {
			KASSERT(sb == &so->so_rcv,
			    ("SO_SND upcall returned SU_ISCONNECTED"));
			soupcall_clear(so, SO_RCV);
		}
	} else
		ret = SU_OK;
	if (sb->sb_flags & SB_AIO)
		sowakeup_aio(so, sb);
	SOCKBUF_UNLOCK(sb);
	/* soisconnected() must be called only after the lock is dropped. */
	if (ret == SU_ISCONNECTED)
		soisconnected(so);
	if ((so->so_state & SS_ASYNC) && so->so_sigio != NULL)
		pgsigio(&so->so_sigio, SIGIO, 0);
	mtx_assert(SOCKBUF_MTX(sb), MA_NOTOWNED);
}
3431541Srgrimes
3441541Srgrimes/*
3451541Srgrimes * Socket buffer (struct sockbuf) utility routines.
3461541Srgrimes *
347160915Srwatson * Each socket contains two socket buffers: one for sending data and one for
348160915Srwatson * receiving data.  Each buffer contains a queue of mbufs, information about
349160915Srwatson * the number of mbufs and amount of data in the queue, and other fields
350160915Srwatson * allowing select() statements and notification on data availability to be
351160915Srwatson * implemented.
3521541Srgrimes *
353160915Srwatson * Data stored in a socket buffer is maintained as a list of records.  Each
354160915Srwatson * record is a list of mbufs chained together with the m_next field.  Records
355160915Srwatson * are chained together with the m_nextpkt field. The upper level routine
356160915Srwatson * soreceive() expects the following conventions to be observed when placing
357160915Srwatson * information in the receive buffer:
3581541Srgrimes *
359160915Srwatson * 1. If the protocol requires each message be preceded by the sender's name,
360160915Srwatson *    then a record containing that name must be present before any
361160915Srwatson *    associated data (mbuf's must be of type MT_SONAME).
362160915Srwatson * 2. If the protocol supports the exchange of ``access rights'' (really just
363160915Srwatson *    additional data associated with the message), and there are ``rights''
364160915Srwatson *    to be received, then a record containing this data should be present
365160915Srwatson *    (mbuf's must be of type MT_RIGHTS).
366160915Srwatson * 3. If a name or rights record exists, then it must be followed by a data
367160915Srwatson *    record, perhaps of zero length.
3681541Srgrimes *
3691541Srgrimes * Before using a new socket structure it is first necessary to reserve
3701541Srgrimes * buffer space to the socket, by calling sbreserve().  This should commit
3711541Srgrimes * some of the available buffer space in the system buffer pool for the
372160915Srwatson * socket (currently, it does nothing but enforce limits).  The space should
373160915Srwatson * be released by calling sbrelease() when the socket is destroyed.
3741541Srgrimes */
/*
 * Reserve space for both the send and receive buffers of a socket, and
 * establish default low-water marks.  Returns 0 on success or ENOBUFS if
 * either reservation fails (in which case neither reservation is kept).
 */
int
soreserve(struct socket *so, u_long sndcc, u_long rcvcc)
{
	struct thread *td = curthread;

	SOCKBUF_LOCK(&so->so_snd);
	SOCKBUF_LOCK(&so->so_rcv);
	if (sbreserve_locked(&so->so_snd, sndcc, so, td) == 0)
		goto bad;
	if (sbreserve_locked(&so->so_rcv, rcvcc, so, td) == 0)
		goto bad2;
	if (so->so_rcv.sb_lowat == 0)
		so->so_rcv.sb_lowat = 1;
	if (so->so_snd.sb_lowat == 0)
		so->so_snd.sb_lowat = MCLBYTES;
	if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
		so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
	SOCKBUF_UNLOCK(&so->so_rcv);
	SOCKBUF_UNLOCK(&so->so_snd);
	return (0);
bad2:
	/* Undo the send-side reservation made above. */
	sbrelease_locked(&so->so_snd, so);
bad:
	SOCKBUF_UNLOCK(&so->so_rcv);
	SOCKBUF_UNLOCK(&so->so_snd);
	return (ENOBUFS);
}
4021541Srgrimes
/*
 * Sysctl handler for kern.ipc.maxsockbuf: validate the new value and keep
 * the derived sb_max_adj in sync with sb_max.
 */
static int
sysctl_handle_sb_max(SYSCTL_HANDLER_ARGS)
{
	int error = 0;
	u_long tmp_sb_max = sb_max;

	error = sysctl_handle_long(oidp, &tmp_sb_max, arg2, req);
	if (error || !req->newptr)
		return (error);
	/* Must hold at least one mbuf-plus-cluster's worth of data. */
	if (tmp_sb_max < MSIZE + MCLBYTES)
		return (EINVAL);
	sb_max = tmp_sb_max;
	sb_max_adj = (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES);
	return (0);
}
418101996Sdg
/*
 * Allot mbufs to a sockbuf.  Attempt to scale mbmax so that mbcnt doesn't
 * become limiting if buffering efficiency is near the normal case.
 * Returns 1 on success, 0 if the request exceeds the global or per-process
 * limit.
 */
int
sbreserve_locked(struct sockbuf *sb, u_long cc, struct socket *so,
    struct thread *td)
{
	rlim_t sbsize_limit;

	SOCKBUF_LOCK_ASSERT(sb);

	/*
	 * When a thread is passed, we take into account the thread's socket
	 * buffer size limit.  The caller will generally pass curthread, but
	 * in the TCP input path, NULL will be passed to indicate that no
	 * appropriate thread resource limits are available.  In that case,
	 * we don't apply a process limit.
	 */
	if (cc > sb_max_adj)
		return (0);
	if (td != NULL) {
		sbsize_limit = lim_cur(td, RLIMIT_SBSIZE);
	} else
		sbsize_limit = RLIM_INFINITY;
	/* chgsbsize() charges the owner's uidinfo and updates sb_hiwat. */
	if (!chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, cc,
	    sbsize_limit))
		return (0);
	sb->sb_mbmax = min(cc * sb_efficiency, sb_max);
	if (sb->sb_lowat > sb->sb_hiwat)
		sb->sb_lowat = sb->sb_hiwat;
	return (1);
}
4521541Srgrimes
/* Unlocked wrapper around sbreserve_locked(). */
int
sbreserve(struct sockbuf *sb, u_long cc, struct socket *so,
    struct thread *td)
{
	int error;

	SOCKBUF_LOCK(sb);
	error = sbreserve_locked(sb, cc, so, td);
	SOCKBUF_UNLOCK(sb);
	return (error);
}
464131006Srwatson
/*
 * Free mbufs held by a socket, and reserved mbuf space.  Also returns the
 * buffer's sbsize charge to the owner's uidinfo.
 */
void
sbrelease_internal(struct sockbuf *sb, struct socket *so)
{

	sbflush_internal(sb);
	/* Zero the hiwat charge; RLIM_INFINITY means no limit check needed. */
	(void)chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, 0,
	    RLIM_INFINITY);
	sb->sb_mbmax = 0;
}
477160875Srwatson
/* Locked variant: caller holds the sockbuf lock. */
void
sbrelease_locked(struct sockbuf *sb, struct socket *so)
{

	SOCKBUF_LOCK_ASSERT(sb);

	sbrelease_internal(sb, so);
}
4861541Srgrimes
/* Unlocked wrapper around sbrelease_locked(). */
void
sbrelease(struct sockbuf *sb, struct socket *so)
{

	SOCKBUF_LOCK(sb);
	sbrelease_locked(sb, so);
	SOCKBUF_UNLOCK(sb);
}
495160875Srwatson
/* Tear down a sockbuf without taking its lock (socket destruction path). */
void
sbdestroy(struct sockbuf *sb, struct socket *so)
{

	sbrelease_internal(sb, so);
}
502160875Srwatson
5031541Srgrimes/*
504160915Srwatson * Routines to add and remove data from an mbuf queue.
5051541Srgrimes *
506160915Srwatson * The routines sbappend() or sbappendrecord() are normally called to append
507160915Srwatson * new mbufs to a socket buffer, after checking that adequate space is
508160915Srwatson * available, comparing the function sbspace() with the amount of data to be
509160915Srwatson * added.  sbappendrecord() differs from sbappend() in that data supplied is
510160915Srwatson * treated as the beginning of a new record.  To place a sender's address,
511160915Srwatson * optional access rights, and data in a socket receive buffer,
512160915Srwatson * sbappendaddr() should be used.  To place access rights and data in a
513160915Srwatson * socket receive buffer, sbappendrights() should be used.  In either case,
514160915Srwatson * the new data begins a new record.  Note that unlike sbappend() and
515160915Srwatson * sbappendrecord(), these routines check for the caller that there will be
516160915Srwatson * enough space to store the data.  Each fails if there is not enough space,
517160915Srwatson * or if it cannot find mbufs to store additional information in.
5181541Srgrimes *
519160915Srwatson * Reliable protocols may use the socket send buffer to hold data awaiting
520160915Srwatson * acknowledgement.  Data is normally copied from a socket send buffer in a
521160915Srwatson * protocol with m_copy for output to a peer, and then removing the data from
522160915Srwatson * the socket buffer with sbdrop() or sbdroprecord() when the data is
523160915Srwatson * acknowledged by the peer.
5241541Srgrimes */
525121628Ssam#ifdef SOCKBUF_DEBUG
/* Debug check: verify sb_lastrecord points at the final record in sb_mb. */
void
sblastrecordchk(struct sockbuf *sb, const char *file, int line)
{
	struct mbuf *m = sb->sb_mb;

	SOCKBUF_LOCK_ASSERT(sb);

	while (m && m->m_nextpkt)
		m = m->m_nextpkt;

	if (m != sb->sb_lastrecord) {
		printf("%s: sb_mb %p sb_lastrecord %p last %p\n",
			__func__, sb->sb_mb, sb->sb_lastrecord, m);
		printf("packet chain:\n");
		for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt)
			printf("\t%p\n", m);
		panic("%s from %s:%u", __func__, file, line);
	}
}
545121628Ssam
/* Debug check: verify sb_mbtail points at the final mbuf of the last record. */
void
sblastmbufchk(struct sockbuf *sb, const char *file, int line)
{
	struct mbuf *m = sb->sb_mb;
	struct mbuf *n;

	SOCKBUF_LOCK_ASSERT(sb);

	while (m && m->m_nextpkt)
		m = m->m_nextpkt;

	while (m && m->m_next)
		m = m->m_next;

	if (m != sb->sb_mbtail) {
		printf("%s: sb_mb %p sb_mbtail %p last %p\n",
			__func__, sb->sb_mb, sb->sb_mbtail, m);
		printf("packet tree:\n");
		for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) {
			printf("\t");
			for (n = m; n != NULL; n = n->m_next)
				printf("%p ", n);
			printf("\n");
		}
		panic("%s from %s:%u", __func__, file, line);
	}
}
573121628Ssam#endif /* SOCKBUF_DEBUG */
574121628Ssam
/*
 * Link record m0 onto the end of the sockbuf's record chain, updating
 * sb_lastrecord.  The caller must hold the sockbuf lock (asserted).
 */
#define SBLINKRECORD(sb, m0) do {					\
	SOCKBUF_LOCK_ASSERT(sb);					\
	if ((sb)->sb_lastrecord != NULL)				\
		(sb)->sb_lastrecord->m_nextpkt = (m0);			\
	else								\
		(sb)->sb_mb = (m0);					\
	(sb)->sb_lastrecord = (m0);					\
} while (/*CONSTCOND*/0)
583121628Ssam
/*
 * Append mbuf chain m to the last record in the socket buffer sb.  The
 * additional space associated the mbuf chain is recorded in sb.  Empty mbufs
 * are discarded and mbufs are compacted where possible.
 */
void
sbappend_locked(struct sockbuf *sb, struct mbuf *m, int flags)
{
	struct mbuf *n;

	SOCKBUF_LOCK_ASSERT(sb);

	if (m == NULL)
		return;
	sbm_clrprotoflags(m, flags);
	SBLASTRECORDCHK(sb);
	n = sb->sb_mb;
	if (n) {
		/* Walk to the last record, then scan it for an M_EOR mark. */
		while (n->m_nextpkt)
			n = n->m_nextpkt;
		do {
			if (n->m_flags & M_EOR) {
				/* Record already ended: start a new one. */
				sbappendrecord_locked(sb, m); /* XXXXXX!!!! */
				return;
			}
		} while (n->m_next && (n = n->m_next));
	} else {
		/*
		 * XXX Would like to simply use sb_mbtail here, but
		 * XXX I need to verify that I won't miss an EOR that
		 * XXX way.
		 */
		if ((n = sb->sb_lastrecord) != NULL) {
			do {
				if (n->m_flags & M_EOR) {
					sbappendrecord_locked(sb, m); /* XXXXXX!!!! */
					return;
				}
			} while (n->m_next && (n = n->m_next));
		} else {
			/*
			 * If this is the first record in the socket buffer,
			 * it's also the last record.
			 */
			sb->sb_lastrecord = m;
		}
	}
	sbcompress(sb, m, n);
	SBLASTRECORDCHK(sb);
}
6341541Srgrimes
/*
 * Append mbuf chain m to the last record in the socket buffer sb.  The
 * additional space associated the mbuf chain is recorded in sb.  Empty mbufs
 * are discarded and mbufs are compacted where possible.
 */
void
sbappend(struct sockbuf *sb, struct mbuf *m, int flags)
{

	SOCKBUF_LOCK(sb);
	sbappend_locked(sb, m, flags);
	SOCKBUF_UNLOCK(sb);
}
648130831Srwatson
/*
 * This version of sbappend() should only be used when the caller absolutely
 * knows that there will never be more than one record in the socket buffer,
 * that is, a stream protocol (such as TCP).
 */
void
sbappendstream_locked(struct sockbuf *sb, struct mbuf *m, int flags)
{
	SOCKBUF_LOCK_ASSERT(sb);

	KASSERT(m->m_nextpkt == NULL,("sbappendstream 0"));
	KASSERT(sb->sb_mb == sb->sb_lastrecord,("sbappendstream 1"));

	SBLASTMBUFCHK(sb);

	/* Remove all packet headers and mbuf tags to get a pure data chain. */
	m_demote(m, 1, flags & PRUS_NOTREADY ? M_NOTREADY : 0);

	sbcompress(sb, m, sb->sb_mbtail);

	/* A stream buffer has exactly one record. */
	sb->sb_lastrecord = sb->sb_mb;
	SBLASTRECORDCHK(sb);
}
672121628Ssam
/*
 * This version of sbappend() should only be used when the caller absolutely
 * knows that there will never be more than one record in the socket buffer,
 * that is, a stream protocol (such as TCP).
 */
void
sbappendstream(struct sockbuf *sb, struct mbuf *m, int flags)
{

	SOCKBUF_LOCK(sb);
	sbappendstream_locked(sb, m, flags);
	SOCKBUF_UNLOCK(sb);
}
686130831Srwatson
6871541Srgrimes#ifdef SOCKBUF_DEBUG
/*
 * Debug consistency check: walk every mbuf of every record and verify the
 * cached counters (sb_acc, sb_ccc, sb_mbcnt) and the sb_fnrdy pointer agree
 * with the chain.  Panics with file/line on any mismatch.
 */
void
sbcheck(struct sockbuf *sb, const char *file, int line)
{
	struct mbuf *m, *n, *fnrdy;
	u_long acc, ccc, mbcnt;

	SOCKBUF_LOCK_ASSERT(sb);

	acc = ccc = mbcnt = 0;
	fnrdy = NULL;

	for (m = sb->sb_mb; m; m = n) {
	    n = m->m_nextpkt;
	    for (; m; m = m->m_next) {
		if (m->m_len == 0) {
			printf("sb %p empty mbuf %p\n", sb, m);
			goto fail;
		}
		/* First not-ready mbuf seen must match the cached sb_fnrdy. */
		if ((m->m_flags & M_NOTREADY) && fnrdy == NULL) {
			if (m != sb->sb_fnrdy) {
				printf("sb %p: fnrdy %p != m %p\n",
				    sb, sb->sb_fnrdy, m);
				goto fail;
			}
			fnrdy = m;
		}
		if (fnrdy) {
			/* Everything after sb_fnrdy must be unavailable. */
			if (!(m->m_flags & M_NOTAVAIL)) {
				printf("sb %p: fnrdy %p, m %p is avail\n",
				    sb, sb->sb_fnrdy, m);
				goto fail;
			}
		} else
			acc += m->m_len;
		ccc += m->m_len;
		mbcnt += MSIZE;
		if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */
			mbcnt += m->m_ext.ext_size;
	    }
	}
	if (acc != sb->sb_acc || ccc != sb->sb_ccc || mbcnt != sb->sb_mbcnt) {
		printf("acc %ld/%u ccc %ld/%u mbcnt %ld/%u\n",
		    acc, sb->sb_acc, ccc, sb->sb_ccc, mbcnt, sb->sb_mbcnt);
		goto fail;
	}
	return;
fail:
	panic("%s from %s:%u", __func__, file, line);
}
7371541Srgrimes#endif
7381541Srgrimes
/*
 * As above, except the mbuf chain begins a new record.
 */
void
sbappendrecord_locked(struct sockbuf *sb, struct mbuf *m0)
{
	struct mbuf *m;

	SOCKBUF_LOCK_ASSERT(sb);

	if (m0 == NULL)
		return;
	m_clrprotoflags(m0);
	/*
	 * Put the first mbuf on the queue.  Note this permits zero length
	 * records.
	 */
	sballoc(sb, m0);
	SBLASTRECORDCHK(sb);
	SBLINKRECORD(sb, m0);
	sb->sb_mbtail = m0;
	/* Detach the rest of the chain; sbcompress() re-appends it. */
	m = m0->m_next;
	m0->m_next = 0;
	if (m && (m0->m_flags & M_EOR)) {
		/* Move the end-of-record mark to the tail of the chain. */
		m0->m_flags &= ~M_EOR;
		m->m_flags |= M_EOR;
	}
	/* always call sbcompress() so it can do SBLASTMBUFCHK() */
	sbcompress(sb, m, m0);
}
7691541Srgrimes
7701541Srgrimes/*
771160915Srwatson * As above, except the mbuf chain begins a new record.
772130831Srwatson */
773130831Srwatsonvoid
774160915Srwatsonsbappendrecord(struct sockbuf *sb, struct mbuf *m0)
775130831Srwatson{
776130831Srwatson
777130831Srwatson	SOCKBUF_LOCK(sb);
778130831Srwatson	sbappendrecord_locked(sb, m0);
779130831Srwatson	SOCKBUF_UNLOCK(sb);
780130831Srwatson}
781130831Srwatson
782262867Sasomers/* Helper routine that appends data, control, and address to a sockbuf. */
783262867Sasomersstatic int
784262867Sasomerssbappendaddr_locked_internal(struct sockbuf *sb, const struct sockaddr *asa,
785262867Sasomers    struct mbuf *m0, struct mbuf *control, struct mbuf *ctrl_last)
7861541Srgrimes{
787121628Ssam	struct mbuf *m, *n, *nlast;
788118045Sscottl#if MSIZE <= 256
7891541Srgrimes	if (asa->sa_len > MLEN)
7901541Srgrimes		return (0);
791118045Sscottl#endif
792248318Sglebius	m = m_get(M_NOWAIT, MT_SONAME);
793248318Sglebius	if (m == NULL)
7941541Srgrimes		return (0);
7951541Srgrimes	m->m_len = asa->sa_len;
79698998Salfred	bcopy(asa, mtod(m, caddr_t), asa->sa_len);
797276058Sglebius	if (m0)
798276058Sglebius		m_clrprotoflags(m0);
799262867Sasomers	if (ctrl_last)
800262867Sasomers		ctrl_last->m_next = m0;	/* concatenate data to control */
8011541Srgrimes	else
8021541Srgrimes		control = m0;
8031541Srgrimes	m->m_next = control;
804121628Ssam	for (n = m; n->m_next != NULL; n = n->m_next)
8051541Srgrimes		sballoc(sb, n);
806121628Ssam	sballoc(sb, n);
807121628Ssam	nlast = n;
808121628Ssam	SBLINKRECORD(sb, m);
809121628Ssam
810121628Ssam	sb->sb_mbtail = nlast;
811121628Ssam	SBLASTMBUFCHK(sb);
812121628Ssam
813121628Ssam	SBLASTRECORDCHK(sb);
8141541Srgrimes	return (1);
8151541Srgrimes}
8161541Srgrimes
817130831Srwatson/*
818160915Srwatson * Append address and data, and optionally, control (ancillary) data to the
819160915Srwatson * receive queue of a socket.  If present, m0 must include a packet header
820160915Srwatson * with total length.  Returns 0 if no space in sockbuf or insufficient
821160915Srwatson * mbufs.
822130831Srwatson */
8231549Srgrimesint
824262867Sasomerssbappendaddr_locked(struct sockbuf *sb, const struct sockaddr *asa,
825262867Sasomers    struct mbuf *m0, struct mbuf *control)
826262867Sasomers{
827262867Sasomers	struct mbuf *ctrl_last;
828262867Sasomers	int space = asa->sa_len;
829262867Sasomers
830262867Sasomers	SOCKBUF_LOCK_ASSERT(sb);
831262867Sasomers
832262867Sasomers	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
833262867Sasomers		panic("sbappendaddr_locked");
834262867Sasomers	if (m0)
835262867Sasomers		space += m0->m_pkthdr.len;
836262867Sasomers	space += m_length(control, &ctrl_last);
837262867Sasomers
838262867Sasomers	if (space > sbspace(sb))
839262867Sasomers		return (0);
840262867Sasomers	return (sbappendaddr_locked_internal(sb, asa, m0, control, ctrl_last));
841262867Sasomers}
842262867Sasomers
843262867Sasomers/*
844262867Sasomers * Append address and data, and optionally, control (ancillary) data to the
845262867Sasomers * receive queue of a socket.  If present, m0 must include a packet header
846262867Sasomers * with total length.  Returns 0 if insufficient mbufs.  Does not validate space
847262867Sasomers * on the receiving sockbuf.
848262867Sasomers */
849262867Sasomersint
850262867Sasomerssbappendaddr_nospacecheck_locked(struct sockbuf *sb, const struct sockaddr *asa,
851262867Sasomers    struct mbuf *m0, struct mbuf *control)
852262867Sasomers{
853262867Sasomers	struct mbuf *ctrl_last;
854262867Sasomers
855262867Sasomers	SOCKBUF_LOCK_ASSERT(sb);
856262867Sasomers
857262867Sasomers	ctrl_last = (control == NULL) ? NULL : m_last(control);
858262867Sasomers	return (sbappendaddr_locked_internal(sb, asa, m0, control, ctrl_last));
859262867Sasomers}
860262867Sasomers
861262867Sasomers/*
862262867Sasomers * Append address and data, and optionally, control (ancillary) data to the
863262867Sasomers * receive queue of a socket.  If present, m0 must include a packet header
864262867Sasomers * with total length.  Returns 0 if no space in sockbuf or insufficient
865262867Sasomers * mbufs.
866262867Sasomers */
867262867Sasomersint
868160915Srwatsonsbappendaddr(struct sockbuf *sb, const struct sockaddr *asa,
869160915Srwatson    struct mbuf *m0, struct mbuf *control)
870130831Srwatson{
871130831Srwatson	int retval;
872130831Srwatson
873130831Srwatson	SOCKBUF_LOCK(sb);
874130831Srwatson	retval = sbappendaddr_locked(sb, asa, m0, control);
875130831Srwatson	SOCKBUF_UNLOCK(sb);
876130831Srwatson	return (retval);
877130831Srwatson}
878130831Srwatson
879130831Srwatsonint
880160915Srwatsonsbappendcontrol_locked(struct sockbuf *sb, struct mbuf *m0,
881160915Srwatson    struct mbuf *control)
8821541Srgrimes{
883121628Ssam	struct mbuf *m, *n, *mlast;
884103554Sphk	int space;
8851541Srgrimes
886130831Srwatson	SOCKBUF_LOCK_ASSERT(sb);
887130831Srwatson
888298069Spfg	if (control == NULL)
889130831Srwatson		panic("sbappendcontrol_locked");
890103554Sphk	space = m_length(control, &n) + m_length(m0, NULL);
891130831Srwatson
8921541Srgrimes	if (space > sbspace(sb))
8931541Srgrimes		return (0);
894276058Sglebius	m_clrprotoflags(m0);
8951541Srgrimes	n->m_next = m0;			/* concatenate data to control */
896121628Ssam
897121628Ssam	SBLASTRECORDCHK(sb);
898121628Ssam
899121628Ssam	for (m = control; m->m_next; m = m->m_next)
9001541Srgrimes		sballoc(sb, m);
901121628Ssam	sballoc(sb, m);
902121628Ssam	mlast = m;
903121628Ssam	SBLINKRECORD(sb, control);
904121628Ssam
905121628Ssam	sb->sb_mbtail = mlast;
906121628Ssam	SBLASTMBUFCHK(sb);
907121628Ssam
908121628Ssam	SBLASTRECORDCHK(sb);
9091541Srgrimes	return (1);
9101541Srgrimes}
9111541Srgrimes
912130831Srwatsonint
913160915Srwatsonsbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control)
914130831Srwatson{
915130831Srwatson	int retval;
916130831Srwatson
917130831Srwatson	SOCKBUF_LOCK(sb);
918130831Srwatson	retval = sbappendcontrol_locked(sb, m0, control);
919130831Srwatson	SOCKBUF_UNLOCK(sb);
920130831Srwatson	return (retval);
921130831Srwatson}
922130831Srwatson
9231541Srgrimes/*
924150280Srwatson * Append the data in mbuf chain (m) into the socket buffer sb following mbuf
925150280Srwatson * (n).  If (n) is NULL, the buffer is presumed empty.
926150280Srwatson *
927150280Srwatson * When the data is compressed, mbufs in the chain may be handled in one of
928150280Srwatson * three ways:
929150280Srwatson *
930150280Srwatson * (1) The mbuf may simply be dropped, if it contributes nothing (no data, no
931150280Srwatson *     record boundary, and no change in data type).
932150280Srwatson *
933150280Srwatson * (2) The mbuf may be coalesced -- i.e., data in the mbuf may be copied into
934150280Srwatson *     an mbuf already in the socket buffer.  This can occur if an
935275326Sglebius *     appropriate mbuf exists, there is room, both mbufs are not marked as
936275326Sglebius *     not ready, and no merging of data types will occur.
937150280Srwatson *
938150280Srwatson * (3) The mbuf may be appended to the end of the existing mbuf chain.
939150280Srwatson *
940150280Srwatson * If any of the new mbufs is marked as M_EOR, mark the last mbuf appended as
941150280Srwatson * end-of-record.
9421541Srgrimes */
9431549Srgrimesvoid
944160915Srwatsonsbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n)
9451541Srgrimes{
946160915Srwatson	int eor = 0;
947160915Srwatson	struct mbuf *o;
9481541Srgrimes
949130831Srwatson	SOCKBUF_LOCK_ASSERT(sb);
950130831Srwatson
9511541Srgrimes	while (m) {
9521541Srgrimes		eor |= m->m_flags & M_EOR;
9531541Srgrimes		if (m->m_len == 0 &&
9541541Srgrimes		    (eor == 0 ||
9551541Srgrimes		     (((o = m->m_next) || (o = n)) &&
9561541Srgrimes		      o->m_type == m->m_type))) {
957121628Ssam			if (sb->sb_lastrecord == m)
958121628Ssam				sb->sb_lastrecord = m->m_next;
9591541Srgrimes			m = m_free(m);
9601541Srgrimes			continue;
9611541Srgrimes		}
96268918Sdwmalone		if (n && (n->m_flags & M_EOR) == 0 &&
96368918Sdwmalone		    M_WRITABLE(n) &&
964174711Skmacy		    ((sb->sb_flags & SB_NOCOALESCE) == 0) &&
965275326Sglebius		    !(m->m_flags & M_NOTREADY) &&
966275326Sglebius		    !(n->m_flags & M_NOTREADY) &&
96768918Sdwmalone		    m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */
96868918Sdwmalone		    m->m_len <= M_TRAILINGSPACE(n) &&
9691541Srgrimes		    n->m_type == m->m_type) {
9701541Srgrimes			bcopy(mtod(m, caddr_t), mtod(n, caddr_t) + n->m_len,
9711541Srgrimes			    (unsigned)m->m_len);
9721541Srgrimes			n->m_len += m->m_len;
973275326Sglebius			sb->sb_ccc += m->m_len;
974275326Sglebius			if (sb->sb_fnrdy == NULL)
975275326Sglebius				sb->sb_acc += m->m_len;
976151967Sandre			if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA)
977109098Stjr				/* XXX: Probably don't need.*/
978106473Skbyanc				sb->sb_ctl += m->m_len;
9791541Srgrimes			m = m_free(m);
9801541Srgrimes			continue;
9811541Srgrimes		}
9821541Srgrimes		if (n)
9831541Srgrimes			n->m_next = m;
9841541Srgrimes		else
9851541Srgrimes			sb->sb_mb = m;
986121628Ssam		sb->sb_mbtail = m;
9871541Srgrimes		sballoc(sb, m);
9881541Srgrimes		n = m;
9891541Srgrimes		m->m_flags &= ~M_EOR;
9901541Srgrimes		m = m->m_next;
9911541Srgrimes		n->m_next = 0;
9921541Srgrimes	}
9931541Srgrimes	if (eor) {
994150280Srwatson		KASSERT(n != NULL, ("sbcompress: eor && n == NULL"));
995150280Srwatson		n->m_flags |= eor;
9961541Srgrimes	}
997121628Ssam	SBLASTMBUFCHK(sb);
9981541Srgrimes}
9991541Srgrimes
10001541Srgrimes/*
1001160915Srwatson * Free all mbufs in a sockbuf.  Check that all resources are reclaimed.
10021541Srgrimes */
1003160875Srwatsonstatic void
1004160915Srwatsonsbflush_internal(struct sockbuf *sb)
10051541Srgrimes{
10061541Srgrimes
100751757Spb	while (sb->sb_mbcnt) {
100851757Spb		/*
1009260819Sglebius		 * Don't call sbcut(sb, 0) if the leading mbuf is non-empty:
101051757Spb		 * we would loop forever. Panic instead.
101151757Spb		 */
1012275326Sglebius		if (sb->sb_ccc == 0 && (sb->sb_mb == NULL || sb->sb_mb->m_len))
101351757Spb			break;
1014275326Sglebius		m_freem(sbcut_internal(sb, (int)sb->sb_ccc));
101551757Spb	}
1016275326Sglebius	KASSERT(sb->sb_ccc == 0 && sb->sb_mb == 0 && sb->sb_mbcnt == 0,
1017275326Sglebius	    ("%s: ccc %u mb %p mbcnt %u", __func__,
1018275326Sglebius	    sb->sb_ccc, (void *)sb->sb_mb, sb->sb_mbcnt));
10191541Srgrimes}
10201541Srgrimes
1021130831Srwatsonvoid
1022160915Srwatsonsbflush_locked(struct sockbuf *sb)
1023160875Srwatson{
1024160875Srwatson
1025160875Srwatson	SOCKBUF_LOCK_ASSERT(sb);
1026160875Srwatson	sbflush_internal(sb);
1027160875Srwatson}
1028160875Srwatson
1029160875Srwatsonvoid
1030160915Srwatsonsbflush(struct sockbuf *sb)
1031130831Srwatson{
1032130831Srwatson
1033130831Srwatson	SOCKBUF_LOCK(sb);
1034130831Srwatson	sbflush_locked(sb);
1035130831Srwatson	SOCKBUF_UNLOCK(sb);
1036130831Srwatson}
1037130831Srwatson
10381541Srgrimes/*
1039256185Sglebius * Cut data from (the front of) a sockbuf.
10401541Srgrimes */
1041256185Sglebiusstatic struct mbuf *
1042256185Sglebiussbcut_internal(struct sockbuf *sb, int len)
10431541Srgrimes{
1044275326Sglebius	struct mbuf *m, *next, *mfree;
10451541Srgrimes
10461541Srgrimes	next = (m = sb->sb_mb) ? m->m_nextpkt : 0;
1047256185Sglebius	mfree = NULL;
1048256185Sglebius
10491541Srgrimes	while (len > 0) {
1050274509Sglebius		if (m == NULL) {
1051274509Sglebius			KASSERT(next, ("%s: no next, len %d", __func__, len));
10521541Srgrimes			m = next;
10531541Srgrimes			next = m->m_nextpkt;
10541541Srgrimes		}
10551541Srgrimes		if (m->m_len > len) {
1056275326Sglebius			KASSERT(!(m->m_flags & M_NOTAVAIL),
1057275326Sglebius			    ("%s: m %p M_NOTAVAIL", __func__, m));
10581541Srgrimes			m->m_len -= len;
10591541Srgrimes			m->m_data += len;
1060275326Sglebius			sb->sb_ccc -= len;
1061275326Sglebius			sb->sb_acc -= len;
1062167715Sandre			if (sb->sb_sndptroff != 0)
1063167715Sandre				sb->sb_sndptroff -= len;
1064151967Sandre			if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA)
1065106473Skbyanc				sb->sb_ctl -= len;
10661541Srgrimes			break;
10671541Srgrimes		}
10681541Srgrimes		len -= m->m_len;
10691541Srgrimes		sbfree(sb, m);
1070275326Sglebius		/*
1071275326Sglebius		 * Do not put M_NOTREADY buffers to the free list, they
1072275326Sglebius		 * are referenced from outside.
1073275326Sglebius		 */
1074275326Sglebius		if (m->m_flags & M_NOTREADY)
1075275326Sglebius			m = m->m_next;
1076275326Sglebius		else {
1077275326Sglebius			struct mbuf *n;
1078275326Sglebius
1079275326Sglebius			n = m->m_next;
1080275326Sglebius			m->m_next = mfree;
1081275326Sglebius			mfree = m;
1082275326Sglebius			m = n;
1083275326Sglebius		}
10841541Srgrimes	}
1085275968Sglebius	/*
1086275968Sglebius	 * Free any zero-length mbufs from the buffer.
1087275968Sglebius	 * For SOCK_DGRAM sockets such mbufs represent empty records.
1088275968Sglebius	 * XXX: For SOCK_STREAM sockets such mbufs can appear in the buffer,
1089275968Sglebius	 * when sosend_generic() needs to send only control data.
1090275968Sglebius	 */
1091275968Sglebius	while (m && m->m_len == 0) {
1092275968Sglebius		struct mbuf *n;
1093275968Sglebius
1094275968Sglebius		sbfree(sb, m);
1095275968Sglebius		n = m->m_next;
1096275968Sglebius		m->m_next = mfree;
1097275968Sglebius		mfree = m;
1098275968Sglebius		m = n;
1099275968Sglebius	}
11001541Srgrimes	if (m) {
11011541Srgrimes		sb->sb_mb = m;
11021541Srgrimes		m->m_nextpkt = next;
11031541Srgrimes	} else
11041541Srgrimes		sb->sb_mb = next;
1105121628Ssam	/*
1106160915Srwatson	 * First part is an inline SB_EMPTY_FIXUP().  Second part makes sure
1107160915Srwatson	 * sb_lastrecord is up-to-date if we dropped part of the last record.
1108121628Ssam	 */
1109121628Ssam	m = sb->sb_mb;
1110121628Ssam	if (m == NULL) {
1111121628Ssam		sb->sb_mbtail = NULL;
1112121628Ssam		sb->sb_lastrecord = NULL;
1113121628Ssam	} else if (m->m_nextpkt == NULL) {
1114121628Ssam		sb->sb_lastrecord = m;
1115121628Ssam	}
1116256185Sglebius
1117256185Sglebius	return (mfree);
11181541Srgrimes}
11191541Srgrimes
11201541Srgrimes/*
1121130831Srwatson * Drop data from (the front of) a sockbuf.
1122130831Srwatson */
1123130831Srwatsonvoid
1124160915Srwatsonsbdrop_locked(struct sockbuf *sb, int len)
1125160875Srwatson{
1126160875Srwatson
1127160875Srwatson	SOCKBUF_LOCK_ASSERT(sb);
1128256185Sglebius	m_freem(sbcut_internal(sb, len));
1129256185Sglebius}
1130160875Srwatson
1131256185Sglebius/*
1132256185Sglebius * Drop data from (the front of) a sockbuf,
1133256185Sglebius * and return it to caller.
1134256185Sglebius */
1135256185Sglebiusstruct mbuf *
1136256185Sglebiussbcut_locked(struct sockbuf *sb, int len)
1137256185Sglebius{
1138256185Sglebius
1139256185Sglebius	SOCKBUF_LOCK_ASSERT(sb);
1140256185Sglebius	return (sbcut_internal(sb, len));
1141160875Srwatson}
1142160875Srwatson
1143160875Srwatsonvoid
1144160915Srwatsonsbdrop(struct sockbuf *sb, int len)
1145130831Srwatson{
1146256185Sglebius	struct mbuf *mfree;
1147130831Srwatson
1148130831Srwatson	SOCKBUF_LOCK(sb);
1149256185Sglebius	mfree = sbcut_internal(sb, len);
1150130831Srwatson	SOCKBUF_UNLOCK(sb);
1151256185Sglebius
1152256185Sglebius	m_freem(mfree);
1153130831Srwatson}
1154130831Srwatson
1155130831Srwatson/*
1156167715Sandre * Maintain a pointer and offset pair into the socket buffer mbuf chain to
1157167715Sandre * avoid traversal of the entire socket buffer for larger offsets.
1158167715Sandre */
1159167715Sandrestruct mbuf *
1160167715Sandresbsndptr(struct sockbuf *sb, u_int off, u_int len, u_int *moff)
1161167715Sandre{
1162167715Sandre	struct mbuf *m, *ret;
1163167715Sandre
1164167715Sandre	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb is NULL", __func__));
1165275326Sglebius	KASSERT(off + len <= sb->sb_acc, ("%s: beyond sb", __func__));
1166275326Sglebius	KASSERT(sb->sb_sndptroff <= sb->sb_acc, ("%s: sndptroff broken", __func__));
1167167715Sandre
1168167715Sandre	/*
1169167715Sandre	 * Is off below stored offset? Happens on retransmits.
1170167715Sandre	 * Just return, we can't help here.
1171167715Sandre	 */
1172167715Sandre	if (sb->sb_sndptroff > off) {
1173167715Sandre		*moff = off;
1174167715Sandre		return (sb->sb_mb);
1175167715Sandre	}
1176167715Sandre
1177167715Sandre	/* Return closest mbuf in chain for current offset. */
1178167715Sandre	*moff = off - sb->sb_sndptroff;
1179167715Sandre	m = ret = sb->sb_sndptr ? sb->sb_sndptr : sb->sb_mb;
1180251984Slstewart	if (*moff == m->m_len) {
1181251984Slstewart		*moff = 0;
1182251984Slstewart		sb->sb_sndptroff += m->m_len;
1183251984Slstewart		m = ret = m->m_next;
1184251984Slstewart		KASSERT(ret->m_len > 0,
1185251984Slstewart		    ("mbuf %p in sockbuf %p chain has no valid data", ret, sb));
1186251984Slstewart	}
1187167715Sandre
1188167715Sandre	/* Advance by len to be as close as possible for the next transmit. */
1189167715Sandre	for (off = off - sb->sb_sndptroff + len - 1;
1190182842Sbz	     off > 0 && m != NULL && off >= m->m_len;
1191167715Sandre	     m = m->m_next) {
1192167715Sandre		sb->sb_sndptroff += m->m_len;
1193167715Sandre		off -= m->m_len;
1194167715Sandre	}
1195182842Sbz	if (off > 0 && m == NULL)
1196182842Sbz		panic("%s: sockbuf %p and mbuf %p clashing", __func__, sb, ret);
1197167715Sandre	sb->sb_sndptr = m;
1198167715Sandre
1199167715Sandre	return (ret);
1200167715Sandre}
1201167715Sandre
1202167715Sandre/*
1203271946Shselasky * Return the first mbuf and the mbuf data offset for the provided
1204271946Shselasky * send offset without changing the "sb_sndptroff" field.
1205271946Shselasky */
1206271946Shselaskystruct mbuf *
1207271946Shselaskysbsndmbuf(struct sockbuf *sb, u_int off, u_int *moff)
1208271946Shselasky{
1209271946Shselasky	struct mbuf *m;
1210271946Shselasky
1211271946Shselasky	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb is NULL", __func__));
1212271946Shselasky
1213271946Shselasky	/*
1214271946Shselasky	 * If the "off" is below the stored offset, which happens on
1215271946Shselasky	 * retransmits, just use "sb_mb":
1216271946Shselasky	 */
1217271946Shselasky	if (sb->sb_sndptr == NULL || sb->sb_sndptroff > off) {
1218271946Shselasky		m = sb->sb_mb;
1219271946Shselasky	} else {
1220271946Shselasky		m = sb->sb_sndptr;
1221271946Shselasky		off -= sb->sb_sndptroff;
1222271946Shselasky	}
1223271946Shselasky	while (off > 0 && m != NULL) {
1224271946Shselasky		if (off < m->m_len)
1225271946Shselasky			break;
1226271946Shselasky		off -= m->m_len;
1227271946Shselasky		m = m->m_next;
1228271946Shselasky	}
1229271946Shselasky	*moff = off;
1230271946Shselasky	return (m);
1231271946Shselasky}
1232271946Shselasky
1233271946Shselasky/*
1234160915Srwatson * Drop a record off the front of a sockbuf and move the next record to the
1235160915Srwatson * front.
12361541Srgrimes */
12371549Srgrimesvoid
1238160915Srwatsonsbdroprecord_locked(struct sockbuf *sb)
12391541Srgrimes{
1240160915Srwatson	struct mbuf *m;
12411541Srgrimes
1242130831Srwatson	SOCKBUF_LOCK_ASSERT(sb);
1243130831Srwatson
12441541Srgrimes	m = sb->sb_mb;
12451541Srgrimes	if (m) {
12461541Srgrimes		sb->sb_mb = m->m_nextpkt;
12471541Srgrimes		do {
12481541Srgrimes			sbfree(sb, m);
124990227Sdillon			m = m_free(m);
12503308Sphk		} while (m);
12511541Srgrimes	}
1252121628Ssam	SB_EMPTY_FIXUP(sb);
12531541Srgrimes}
125417047Swollman
125519622Sfenner/*
1256160915Srwatson * Drop a record off the front of a sockbuf and move the next record to the
1257160915Srwatson * front.
1258130831Srwatson */
1259130831Srwatsonvoid
1260160915Srwatsonsbdroprecord(struct sockbuf *sb)
1261130831Srwatson{
1262130831Srwatson
1263130831Srwatson	SOCKBUF_LOCK(sb);
1264130831Srwatson	sbdroprecord_locked(sb);
1265130831Srwatson	SOCKBUF_UNLOCK(sb);
1266130831Srwatson}
1267130831Srwatson
1268167895Srwatson/*
1269167902Srwatson * Create a "control" mbuf containing the specified data with the specified
1270167902Srwatson * type for presentation on a socket buffer.
1271167895Srwatson */
1272167895Srwatsonstruct mbuf *
1273169624Srwatsonsbcreatecontrol(caddr_t p, int size, int type, int level)
1274167895Srwatson{
1275169624Srwatson	struct cmsghdr *cp;
1276167895Srwatson	struct mbuf *m;
1277167895Srwatson
1278167895Srwatson	if (CMSG_SPACE((u_int)size) > MCLBYTES)
1279167895Srwatson		return ((struct mbuf *) NULL);
1280167895Srwatson	if (CMSG_SPACE((u_int)size) > MLEN)
1281243882Sglebius		m = m_getcl(M_NOWAIT, MT_CONTROL, 0);
1282167895Srwatson	else
1283243882Sglebius		m = m_get(M_NOWAIT, MT_CONTROL);
1284167895Srwatson	if (m == NULL)
1285167895Srwatson		return ((struct mbuf *) NULL);
1286167895Srwatson	cp = mtod(m, struct cmsghdr *);
1287167895Srwatson	m->m_len = 0;
1288167895Srwatson	KASSERT(CMSG_SPACE((u_int)size) <= M_TRAILINGSPACE(m),
1289167895Srwatson	    ("sbcreatecontrol: short mbuf"));
1290268430Sdelphij	/*
1291268430Sdelphij	 * Don't leave the padding between the msg header and the
1292268430Sdelphij	 * cmsg data and the padding after the cmsg data un-initialized.
1293268430Sdelphij	 */
1294268430Sdelphij	bzero(cp, CMSG_SPACE((u_int)size));
1295167895Srwatson	if (p != NULL)
1296167895Srwatson		(void)memcpy(CMSG_DATA(cp), p, size);
1297167895Srwatson	m->m_len = CMSG_SPACE(size);
1298167895Srwatson	cp->cmsg_len = CMSG_LEN(size);
1299167895Srwatson	cp->cmsg_level = level;
1300167895Srwatson	cp->cmsg_type = type;
1301167895Srwatson	return (m);
1302167895Srwatson}
1303167895Srwatson
1304167895Srwatson/*
1305167902Srwatson * This does the same for socket buffers that sotoxsocket does for sockets:
1306167902Srwatson * generate an user-format data structure describing the socket buffer.  Note
1307167902Srwatson * that the xsockbuf structure, since it is always embedded in a socket, does
1308167902Srwatson * not include a self pointer nor a length.  We make this entry point public
1309167902Srwatson * in case some other mechanism needs it.
1310167895Srwatson */
1311167895Srwatsonvoid
1312167895Srwatsonsbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb)
1313167895Srwatson{
1314169624Srwatson
1315275326Sglebius	xsb->sb_cc = sb->sb_ccc;
1316167895Srwatson	xsb->sb_hiwat = sb->sb_hiwat;
1317167895Srwatson	xsb->sb_mbcnt = sb->sb_mbcnt;
1318179027Sgnn	xsb->sb_mcnt = sb->sb_mcnt;
1319179027Sgnn	xsb->sb_ccnt = sb->sb_ccnt;
1320167895Srwatson	xsb->sb_mbmax = sb->sb_mbmax;
1321167895Srwatson	xsb->sb_lowat = sb->sb_lowat;
1322167895Srwatson	xsb->sb_flags = sb->sb_flags;
1323167895Srwatson	xsb->sb_timeo = sb->sb_timeo;
1324167895Srwatson}
1325167895Srwatson
132623081Swollman/* This takes the place of kern.maxsockbuf, which moved to kern.ipc. */
132723081Swollmanstatic int dummy;
132823081SwollmanSYSCTL_INT(_kern, KERN_DUMMY, dummy, CTLFLAG_RW, &dummy, 0, "");
1329160621SrwatsonSYSCTL_OID(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf, CTLTYPE_ULONG|CTLFLAG_RW,
1330110268Sharti    &sb_max, 0, sysctl_handle_sb_max, "LU", "Maximum socket buffer size");
1331110268ShartiSYSCTL_ULONG(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW,
1332228449Seadler    &sb_efficiency, 0, "Socket buffer size waste factor");
1333