/*-
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket2.c	8.1 (Berkeley) 6/10/93
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/11/sys/kern/uipc_sockbuf.c 337975 2018-08-17 16:04:20Z markj $");

#include "opt_param.h"

#include <sys/param.h>
#include <sys/aio.h> /* for aio_swake proto */
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sx.h>
#include <sys/sysctl.h>

/*
 * Function pointer set by the AIO routines so that the socket buffer code
 * can call back into the AIO module if it is loaded.
 */
void	(*aio_swake)(struct socket *, struct sockbuf *);

/*
 * Primitive routines for operating on socket buffers
 */

u_long	sb_max = SB_MAX;
u_long sb_max_adj =
       (quad_t)SB_MAX * MCLBYTES / (MSIZE + MCLBYTES); /* adjusted sb_max */
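
/*
 * Worked example (illustrative; values are configuration-dependent): with
 * MSIZE 256, MCLBYTES 2048 and an SB_MAX of 2MB, sb_max_adj is
 * 2097152 * 2048 / 2304 = 1864135 bytes -- the share of sb_max that can
 * actually hold data once per-mbuf bookkeeping overhead is charged.
 */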

static	u_long sb_efficiency = 8;	/* parameter for sbreserve() */

static struct mbuf	*sbcut_internal(struct sockbuf *sb, int len);
static void	sbflush_internal(struct sockbuf *sb);

/*
 * Our own version of m_clrprotoflags(), which can preserve M_NOTREADY.
 */
static void
sbm_clrprotoflags(struct mbuf *m, int flags)
{
	int mask;

	mask = ~M_PROTOFLAGS;
	if (flags & PRUS_NOTREADY)
		mask |= M_NOTREADY;
	while (m) {
		m->m_flags &= mask;
		m = m->m_next;
	}
}

/*
 * Mark ready "count" mbufs starting with "m".
 */
int
sbready(struct sockbuf *sb, struct mbuf *m, int count)
{
	u_int blocker;

	SOCKBUF_LOCK_ASSERT(sb);
	KASSERT(sb->sb_fnrdy != NULL, ("%s: sb %p NULL fnrdy", __func__, sb));

	blocker = (sb->sb_fnrdy == m) ? M_BLOCKED : 0;

	for (int i = 0; i < count; i++, m = m->m_next) {
		KASSERT(m->m_flags & M_NOTREADY,
		    ("%s: m %p !M_NOTREADY", __func__, m));
		m->m_flags &= ~(M_NOTREADY | blocker);
		if (blocker)
			sb->sb_acc += m->m_len;
	}

	if (!blocker)
		return (EINPROGRESS);

	/* This one was blocking all the queue. */
	for (; m && (m->m_flags & M_NOTREADY) == 0; m = m->m_next) {
		KASSERT(m->m_flags & M_BLOCKED,
		    ("%s: m %p !M_BLOCKED", __func__, m));
		m->m_flags &= ~M_BLOCKED;
		sb->sb_acc += m->m_len;
	}

	sb->sb_fnrdy = m;

	return (0);
}
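
/*
 * Illustrative sketch of a caller (modeled on the sendfile(2) I/O
 * completion path; not part of this file): mbufs queued earlier with
 * PRUS_NOTREADY are marked ready once the backing read finishes:
 *
 *	SOCKBUF_LOCK(sb);
 *	error = sbready(sb, m, count);
 *	SOCKBUF_UNLOCK(sb);
 *
 * A return of 0 means data up to the new sb_fnrdy became transmittable;
 * EINPROGRESS means an earlier not-ready mbuf still blocks the queue.
 */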

/*
 * Adjust sockbuf state reflecting allocation of m.
 */
void
sballoc(struct sockbuf *sb, struct mbuf *m)
{

	SOCKBUF_LOCK_ASSERT(sb);

	sb->sb_ccc += m->m_len;

	if (sb->sb_fnrdy == NULL) {
		if (m->m_flags & M_NOTREADY)
			sb->sb_fnrdy = m;
		else
			sb->sb_acc += m->m_len;
	} else
		m->m_flags |= M_BLOCKED;

	if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA)
		sb->sb_ctl += m->m_len;

	sb->sb_mbcnt += MSIZE;
	sb->sb_mcnt += 1;

	if (m->m_flags & M_EXT) {
		sb->sb_mbcnt += m->m_ext.ext_size;
		sb->sb_ccnt += 1;
	}
}

/*
 * Adjust sockbuf state reflecting freeing of m.
 */
void
sbfree(struct sockbuf *sb, struct mbuf *m)
{

#if 0	/* XXX: not yet: soclose() call path comes here w/o lock. */
	SOCKBUF_LOCK_ASSERT(sb);
#endif

	sb->sb_ccc -= m->m_len;

	if (!(m->m_flags & M_NOTAVAIL))
		sb->sb_acc -= m->m_len;

	if (m == sb->sb_fnrdy) {
		struct mbuf *n;

		KASSERT(m->m_flags & M_NOTREADY,
		    ("%s: m %p !M_NOTREADY", __func__, m));

		n = m->m_next;
		while (n != NULL && !(n->m_flags & M_NOTREADY)) {
			n->m_flags &= ~M_BLOCKED;
			sb->sb_acc += n->m_len;
			n = n->m_next;
		}
		sb->sb_fnrdy = n;
	}

	if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA)
		sb->sb_ctl -= m->m_len;

	sb->sb_mbcnt -= MSIZE;
	sb->sb_mcnt -= 1;
	if (m->m_flags & M_EXT) {
		sb->sb_mbcnt -= m->m_ext.ext_size;
		sb->sb_ccnt -= 1;
	}

	if (sb->sb_sndptr == m) {
		sb->sb_sndptr = NULL;
		sb->sb_sndptroff = 0;
	}
	if (sb->sb_sndptroff != 0)
		sb->sb_sndptroff -= m->m_len;
}

/*
 * Socantsendmore indicates that no more data will be sent on the socket; it
 * is applied to the socket by the protocol code when the user informs the
 * system that no more data is to be sent (e.g., in PRU_SHUTDOWN).
 * Socantrcvmore indicates that no more data will be received, and will
 * normally be applied to the socket by a protocol when it detects that the
 * peer will send no more data.  Data queued for reading in the socket may
 * yet be read.
 */
void
socantsendmore_locked(struct socket *so)
{

	SOCKBUF_LOCK_ASSERT(&so->so_snd);

	so->so_snd.sb_state |= SBS_CANTSENDMORE;
	sowwakeup_locked(so);
	mtx_assert(SOCKBUF_MTX(&so->so_snd), MA_NOTOWNED);
}

void
socantsendmore(struct socket *so)
{

	SOCKBUF_LOCK(&so->so_snd);
	socantsendmore_locked(so);
	mtx_assert(SOCKBUF_MTX(&so->so_snd), MA_NOTOWNED);
}

void
socantrcvmore_locked(struct socket *so)
{

	SOCKBUF_LOCK_ASSERT(&so->so_rcv);

	so->so_rcv.sb_state |= SBS_CANTRCVMORE;
	sorwakeup_locked(so);
	mtx_assert(SOCKBUF_MTX(&so->so_rcv), MA_NOTOWNED);
}

void
socantrcvmore(struct socket *so)
{

	SOCKBUF_LOCK(&so->so_rcv);
	socantrcvmore_locked(so);
	mtx_assert(SOCKBUF_MTX(&so->so_rcv), MA_NOTOWNED);
}

/*
 * Wait for data to arrive at/drain from a socket buffer.
 */
int
sbwait(struct sockbuf *sb)
{

	SOCKBUF_LOCK_ASSERT(sb);

	sb->sb_flags |= SB_WAIT;
	return (msleep_sbt(&sb->sb_acc, &sb->sb_mtx,
	    (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sbwait",
	    sb->sb_timeo, 0, 0));
}
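
/*
 * Typical caller pattern (a sketch of what sosend_generic() does; the
 * space test and error handling are simplified here):
 *
 *	SOCKBUF_LOCK(&so->so_snd);
 *	while (sbspace(&so->so_snd) < resid) {
 *		error = sbwait(&so->so_snd);
 *		if (error)
 *			break;
 *	}
 *	SOCKBUF_UNLOCK(&so->so_snd);
 */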

int
sblock(struct sockbuf *sb, int flags)
{

	KASSERT((flags & SBL_VALID) == flags,
	    ("sblock: flags invalid (0x%x)", flags));

	if (flags & SBL_WAIT) {
		if ((sb->sb_flags & SB_NOINTR) ||
		    (flags & SBL_NOINTR)) {
			sx_xlock(&sb->sb_sx);
			return (0);
		}
		return (sx_xlock_sig(&sb->sb_sx));
	} else {
		if (sx_try_xlock(&sb->sb_sx) == 0)
			return (EWOULDBLOCK);
		return (0);
	}
}

void
sbunlock(struct sockbuf *sb)
{

	sx_xunlock(&sb->sb_sx);
}
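
/*
 * Sketch of the usual sblock()/sbunlock() pairing (cf. soreceive_generic();
 * SBLOCKWAIT() maps MSG_DONTWAIT onto a non-sleeping lock attempt):
 *
 *	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
 *	if (error)
 *		return (error);
 *	...dequeue data from so->so_rcv...
 *	sbunlock(&so->so_rcv);
 */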

/*
 * Wake up processes waiting on a socket buffer.  Do asynchronous
 * notification via SIGIO if the socket has the SS_ASYNC flag set.
 *
 * Called with the socket buffer lock held; will release the lock by the end
 * of the function.  This allows the caller to acquire the socket buffer lock
 * while testing for the need for various sorts of wakeup and hold it through
 * to the point where it's no longer required.  We currently hold the lock
 * through calls out to other subsystems (with the exception of kqueue), and
 * then release it to avoid lock order issues.  It's not clear that's
 * correct.
 */
void
sowakeup(struct socket *so, struct sockbuf *sb)
{
	int ret;

	SOCKBUF_LOCK_ASSERT(sb);

	selwakeuppri(&sb->sb_sel, PSOCK);
	if (!SEL_WAITING(&sb->sb_sel))
		sb->sb_flags &= ~SB_SEL;
	if (sb->sb_flags & SB_WAIT) {
		sb->sb_flags &= ~SB_WAIT;
		wakeup(&sb->sb_acc);
	}
	KNOTE_LOCKED(&sb->sb_sel.si_note, 0);
	if (sb->sb_upcall != NULL) {
		ret = sb->sb_upcall(so, sb->sb_upcallarg, M_NOWAIT);
		if (ret == SU_ISCONNECTED) {
			KASSERT(sb == &so->so_rcv,
			    ("SO_SND upcall returned SU_ISCONNECTED"));
			soupcall_clear(so, SO_RCV);
		}
	} else
		ret = SU_OK;
	if (sb->sb_flags & SB_AIO)
		sowakeup_aio(so, sb);
	SOCKBUF_UNLOCK(sb);
	if (ret == SU_ISCONNECTED)
		soisconnected(so);
	if ((so->so_state & SS_ASYNC) && so->so_sigio != NULL)
		pgsigio(&so->so_sigio, SIGIO, 0);
	mtx_assert(SOCKBUF_MTX(sb), MA_NOTOWNED);
}
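
/*
 * Note that protocols normally do not call sowakeup() directly; they use
 * the sorwakeup()/sowwakeup() and *_locked() macros from sys/socketvar.h,
 * which call sowakeup() only when sb_notify() reports a waiter and
 * otherwise just drop the socket buffer lock.
 */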

/*
 * Socket buffer (struct sockbuf) utility routines.
 *
 * Each socket contains two socket buffers: one for sending data and one for
 * receiving data.  Each buffer contains a queue of mbufs, information about
 * the number of mbufs and amount of data in the queue, and other fields
 * allowing select() statements and notification on data availability to be
 * implemented.
 *
 * Data stored in a socket buffer is maintained as a list of records.  Each
 * record is a list of mbufs chained together with the m_next field.  Records
 * are chained together with the m_nextpkt field.  The upper level routine
 * soreceive() expects the following conventions to be observed when placing
 * information in the receive buffer:
 *
 * 1. If the protocol requires each message be preceded by the sender's name,
 *    then a record containing that name must be present before any
 *    associated data (mbufs must be of type MT_SONAME).
 * 2. If the protocol supports the exchange of ``access rights'' (really just
 *    additional data associated with the message), and there are ``rights''
 *    to be received, then a record containing this data should be present
 *    (mbufs must be of type MT_RIGHTS).
 * 3. If a name or rights record exists, then it must be followed by a data
 *    record, perhaps of zero length.
 *
 * Before using a new socket structure it is first necessary to reserve
 * buffer space for the socket, by calling sbreserve().  This should commit
 * some of the available buffer space in the system buffer pool for the
 * socket (currently, it does nothing but enforce limits).  The space should
 * be released by calling sbrelease() when the socket is destroyed.
 */
int
soreserve(struct socket *so, u_long sndcc, u_long rcvcc)
{
	struct thread *td = curthread;

	SOCKBUF_LOCK(&so->so_snd);
	SOCKBUF_LOCK(&so->so_rcv);
	if (sbreserve_locked(&so->so_snd, sndcc, so, td) == 0)
		goto bad;
	if (sbreserve_locked(&so->so_rcv, rcvcc, so, td) == 0)
		goto bad2;
	if (so->so_rcv.sb_lowat == 0)
		so->so_rcv.sb_lowat = 1;
	if (so->so_snd.sb_lowat == 0)
		so->so_snd.sb_lowat = MCLBYTES;
	if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
		so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
	SOCKBUF_UNLOCK(&so->so_rcv);
	SOCKBUF_UNLOCK(&so->so_snd);
	return (0);
bad2:
	sbrelease_locked(&so->so_snd, so);
bad:
	SOCKBUF_UNLOCK(&so->so_rcv);
	SOCKBUF_UNLOCK(&so->so_snd);
	return (ENOBUFS);
}
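
/*
 * Example caller (a sketch of a protocol attach routine; udp_attach()
 * does essentially this with its sendspace/recvspace tunables):
 *
 *	error = soreserve(so, udp_sendspace, udp_recvspace);
 *	if (error)
 *		return (error);
 */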

static int
sysctl_handle_sb_max(SYSCTL_HANDLER_ARGS)
{
	int error = 0;
	u_long tmp_sb_max = sb_max;

	error = sysctl_handle_long(oidp, &tmp_sb_max, arg2, req);
	if (error || !req->newptr)
		return (error);
	if (tmp_sb_max < MSIZE + MCLBYTES)
		return (EINVAL);
	sb_max = tmp_sb_max;
	sb_max_adj = (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES);
	return (0);
}

/*
 * Allot mbufs to a sockbuf.  Attempt to scale mbmax so that mbcnt doesn't
 * become limiting if buffering efficiency is near the normal case.
 */
int
sbreserve_locked(struct sockbuf *sb, u_long cc, struct socket *so,
    struct thread *td)
{
	rlim_t sbsize_limit;

	SOCKBUF_LOCK_ASSERT(sb);

	/*
	 * When a thread is passed, we take into account the thread's socket
	 * buffer size limit.  The caller will generally pass curthread, but
	 * in the TCP input path, NULL will be passed to indicate that no
	 * appropriate thread resource limits are available.  In that case,
	 * we don't apply a process limit.
	 */
	if (cc > sb_max_adj)
		return (0);
	if (td != NULL) {
		sbsize_limit = lim_cur(td, RLIMIT_SBSIZE);
	} else
		sbsize_limit = RLIM_INFINITY;
	if (!chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, cc,
	    sbsize_limit))
		return (0);
	sb->sb_mbmax = min(cc * sb_efficiency, sb_max);
	if (sb->sb_lowat > sb->sb_hiwat)
		sb->sb_lowat = sb->sb_hiwat;
	return (1);
}

int
sbreserve(struct sockbuf *sb, u_long cc, struct socket *so,
    struct thread *td)
{
	int error;

	SOCKBUF_LOCK(sb);
	error = sbreserve_locked(sb, cc, so, td);
	SOCKBUF_UNLOCK(sb);
	return (error);
}

/*
 * Free mbufs held by a socket, and reserved mbuf space.
 */
void
sbrelease_internal(struct sockbuf *sb, struct socket *so)
{

	sbflush_internal(sb);
	(void)chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, 0,
	    RLIM_INFINITY);
	sb->sb_mbmax = 0;
}

void
sbrelease_locked(struct sockbuf *sb, struct socket *so)
{

	SOCKBUF_LOCK_ASSERT(sb);

	sbrelease_internal(sb, so);
}

void
sbrelease(struct sockbuf *sb, struct socket *so)
{

	SOCKBUF_LOCK(sb);
	sbrelease_locked(sb, so);
	SOCKBUF_UNLOCK(sb);
}

void
sbdestroy(struct sockbuf *sb, struct socket *so)
{

	sbrelease_internal(sb, so);
}

/*
 * Routines to add and remove data from an mbuf queue.
 *
 * The routines sbappend() or sbappendrecord() are normally called to append
 * new mbufs to a socket buffer, after checking that adequate space is
 * available, comparing the function sbspace() with the amount of data to be
 * added.  sbappendrecord() differs from sbappend() in that data supplied is
 * treated as the beginning of a new record.  To place a sender's address,
 * optional access rights, and data in a socket receive buffer,
 * sbappendaddr() should be used.  To place access rights and data in a
 * socket receive buffer, sbappendrights() should be used.  In either case,
 * the new data begins a new record.  Note that unlike sbappend() and
 * sbappendrecord(), these routines check for the caller that there will be
 * enough space to store the data.  Each fails if there is not enough space,
 * or if it cannot find mbufs to store additional information in.
 *
 * Reliable protocols may use the socket send buffer to hold data awaiting
 * acknowledgement.  Data is normally copied from a socket send buffer with
 * m_copy for output to a peer, and then removed from the socket buffer with
 * sbdrop() or sbdroprecord() when the data is acknowledged by the peer.
 */
#ifdef SOCKBUF_DEBUG
void
sblastrecordchk(struct sockbuf *sb, const char *file, int line)
{
	struct mbuf *m = sb->sb_mb;

	SOCKBUF_LOCK_ASSERT(sb);

	while (m && m->m_nextpkt)
		m = m->m_nextpkt;

	if (m != sb->sb_lastrecord) {
		printf("%s: sb_mb %p sb_lastrecord %p last %p\n",
			__func__, sb->sb_mb, sb->sb_lastrecord, m);
		printf("packet chain:\n");
		for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt)
			printf("\t%p\n", m);
		panic("%s from %s:%u", __func__, file, line);
	}
}

void
sblastmbufchk(struct sockbuf *sb, const char *file, int line)
{
	struct mbuf *m = sb->sb_mb;
	struct mbuf *n;

	SOCKBUF_LOCK_ASSERT(sb);

	while (m && m->m_nextpkt)
		m = m->m_nextpkt;

	while (m && m->m_next)
		m = m->m_next;

	if (m != sb->sb_mbtail) {
		printf("%s: sb_mb %p sb_mbtail %p last %p\n",
			__func__, sb->sb_mb, sb->sb_mbtail, m);
		printf("packet tree:\n");
		for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) {
			printf("\t");
			for (n = m; n != NULL; n = n->m_next)
				printf("%p ", n);
			printf("\n");
		}
		panic("%s from %s:%u", __func__, file, line);
	}
}
#endif /* SOCKBUF_DEBUG */

#define SBLINKRECORD(sb, m0) do {					\
	SOCKBUF_LOCK_ASSERT(sb);					\
	if ((sb)->sb_lastrecord != NULL)				\
		(sb)->sb_lastrecord->m_nextpkt = (m0);			\
	else								\
		(sb)->sb_mb = (m0);					\
	(sb)->sb_lastrecord = (m0);					\
} while (/*CONSTCOND*/0)

/*
 * Append mbuf chain m to the last record in the socket buffer sb.  The
 * additional space associated with the mbuf chain is recorded in sb.  Empty
 * mbufs are discarded and mbufs are compacted where possible.
 */
void
sbappend_locked(struct sockbuf *sb, struct mbuf *m, int flags)
{
	struct mbuf *n;

	SOCKBUF_LOCK_ASSERT(sb);

	if (m == NULL)
		return;
	sbm_clrprotoflags(m, flags);
	SBLASTRECORDCHK(sb);
	n = sb->sb_mb;
	if (n) {
		while (n->m_nextpkt)
			n = n->m_nextpkt;
		do {
			if (n->m_flags & M_EOR) {
				sbappendrecord_locked(sb, m); /* XXXXXX!!!! */
				return;
			}
		} while (n->m_next && (n = n->m_next));
	} else {
		/*
		 * XXX Would like to simply use sb_mbtail here, but
		 * XXX I need to verify that I won't miss an EOR that
		 * XXX way.
		 */
		if ((n = sb->sb_lastrecord) != NULL) {
			do {
				if (n->m_flags & M_EOR) {
					sbappendrecord_locked(sb, m); /* XXXXXX!!!! */
					return;
				}
			} while (n->m_next && (n = n->m_next));
		} else {
			/*
			 * If this is the first record in the socket buffer,
			 * it's also the last record.
			 */
			sb->sb_lastrecord = m;
		}
	}
	sbcompress(sb, m, n);
	SBLASTRECORDCHK(sb);
}

/*
 * Append mbuf chain m to the last record in the socket buffer sb.  The
 * additional space associated with the mbuf chain is recorded in sb.  Empty
 * mbufs are discarded and mbufs are compacted where possible.
 */
void
sbappend(struct sockbuf *sb, struct mbuf *m, int flags)
{

	SOCKBUF_LOCK(sb);
	sbappend_locked(sb, m, flags);
	SOCKBUF_UNLOCK(sb);
}

/*
 * This version of sbappend() should only be used when the caller absolutely
 * knows that there will never be more than one record in the socket buffer,
 * that is, a stream protocol (such as TCP).
 */
void
sbappendstream_locked(struct sockbuf *sb, struct mbuf *m, int flags)
{
	SOCKBUF_LOCK_ASSERT(sb);

	KASSERT(m->m_nextpkt == NULL, ("sbappendstream 0"));
	KASSERT(sb->sb_mb == sb->sb_lastrecord, ("sbappendstream 1"));

	SBLASTMBUFCHK(sb);

	/* Remove all packet headers and mbuf tags to get a pure data chain. */
	m_demote(m, 1, flags & PRUS_NOTREADY ? M_NOTREADY : 0);

	sbcompress(sb, m, sb->sb_mbtail);

	sb->sb_lastrecord = sb->sb_mb;
	SBLASTRECORDCHK(sb);
}

/*
 * This version of sbappend() should only be used when the caller absolutely
 * knows that there will never be more than one record in the socket buffer,
 * that is, a stream protocol (such as TCP).
 */
void
sbappendstream(struct sockbuf *sb, struct mbuf *m, int flags)
{

	SOCKBUF_LOCK(sb);
	sbappendstream_locked(sb, m, flags);
	SOCKBUF_UNLOCK(sb);
}
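
/*
 * Sketch of a stream-protocol send path (roughly what tcp_usr_send()
 * does, tp being the protocol control block; connection setup and error
 * handling omitted):
 *
 *	sbappendstream(&so->so_snd, m, flags);
 *	error = tcp_output(tp);
 */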

#ifdef SOCKBUF_DEBUG
void
sbcheck(struct sockbuf *sb, const char *file, int line)
{
	struct mbuf *m, *n, *fnrdy;
	u_long acc, ccc, mbcnt;

	SOCKBUF_LOCK_ASSERT(sb);

	acc = ccc = mbcnt = 0;
	fnrdy = NULL;

	for (m = sb->sb_mb; m; m = n) {
	    n = m->m_nextpkt;
	    for (; m; m = m->m_next) {
		if (m->m_len == 0) {
			printf("sb %p empty mbuf %p\n", sb, m);
			goto fail;
		}
		if ((m->m_flags & M_NOTREADY) && fnrdy == NULL) {
			if (m != sb->sb_fnrdy) {
				printf("sb %p: fnrdy %p != m %p\n",
				    sb, sb->sb_fnrdy, m);
				goto fail;
			}
			fnrdy = m;
		}
		if (fnrdy) {
			if (!(m->m_flags & M_NOTAVAIL)) {
				printf("sb %p: fnrdy %p, m %p is avail\n",
				    sb, sb->sb_fnrdy, m);
				goto fail;
			}
		} else
			acc += m->m_len;
		ccc += m->m_len;
		mbcnt += MSIZE;
		if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */
			mbcnt += m->m_ext.ext_size;
	    }
	}
	if (acc != sb->sb_acc || ccc != sb->sb_ccc || mbcnt != sb->sb_mbcnt) {
		printf("acc %ld/%u ccc %ld/%u mbcnt %ld/%u\n",
		    acc, sb->sb_acc, ccc, sb->sb_ccc, mbcnt, sb->sb_mbcnt);
		goto fail;
	}
	return;
fail:
	panic("%s from %s:%u", __func__, file, line);
}
#endif

/*
 * As above, except the mbuf chain begins a new record.
 */
void
sbappendrecord_locked(struct sockbuf *sb, struct mbuf *m0)
{
	struct mbuf *m;

	SOCKBUF_LOCK_ASSERT(sb);

	if (m0 == NULL)
		return;
	m_clrprotoflags(m0);
	/*
	 * Put the first mbuf on the queue.  Note this permits zero length
	 * records.
	 */
	sballoc(sb, m0);
	SBLASTRECORDCHK(sb);
	SBLINKRECORD(sb, m0);
	sb->sb_mbtail = m0;
	m = m0->m_next;
	m0->m_next = NULL;
	if (m && (m0->m_flags & M_EOR)) {
		m0->m_flags &= ~M_EOR;
		m->m_flags |= M_EOR;
	}
	/* always call sbcompress() so it can do SBLASTMBUFCHK() */
	sbcompress(sb, m, m0);
}

/*
 * As above, except the mbuf chain begins a new record.
 */
void
sbappendrecord(struct sockbuf *sb, struct mbuf *m0)
{

	SOCKBUF_LOCK(sb);
	sbappendrecord_locked(sb, m0);
	SOCKBUF_UNLOCK(sb);
}

/* Helper routine that appends data, control, and address to a sockbuf. */
static int
sbappendaddr_locked_internal(struct sockbuf *sb, const struct sockaddr *asa,
    struct mbuf *m0, struct mbuf *control, struct mbuf *ctrl_last)
{
	struct mbuf *m, *n, *nlast;
#if MSIZE <= 256
	if (asa->sa_len > MLEN)
		return (0);
#endif
	m = m_get(M_NOWAIT, MT_SONAME);
	if (m == NULL)
		return (0);
	m->m_len = asa->sa_len;
	bcopy(asa, mtod(m, caddr_t), asa->sa_len);
	if (m0) {
		m_clrprotoflags(m0);
		m_tag_delete_chain(m0, NULL);
		/*
		 * Clear some persistent info from pkthdr.
		 * We don't use m_demote(), because some netgraph consumers
		 * expect M_PKTHDR presence.
		 */
		m0->m_pkthdr.rcvif = NULL;
		m0->m_pkthdr.flowid = 0;
		m0->m_pkthdr.csum_flags = 0;
		m0->m_pkthdr.fibnum = 0;
		m0->m_pkthdr.rsstype = 0;
	}
	if (ctrl_last)
		ctrl_last->m_next = m0;	/* concatenate data to control */
	else
		control = m0;
	m->m_next = control;
	for (n = m; n->m_next != NULL; n = n->m_next)
		sballoc(sb, n);
	sballoc(sb, n);
	nlast = n;
	SBLINKRECORD(sb, m);

	sb->sb_mbtail = nlast;
	SBLASTMBUFCHK(sb);

	SBLASTRECORDCHK(sb);
	return (1);
}

/*
 * Append address and data, and optionally, control (ancillary) data to the
 * receive queue of a socket.  If present, m0 must include a packet header
 * with total length.  Returns 0 if no space in sockbuf or insufficient
 * mbufs.
 */
int
sbappendaddr_locked(struct sockbuf *sb, const struct sockaddr *asa,
    struct mbuf *m0, struct mbuf *control)
{
	struct mbuf *ctrl_last;
	int space = asa->sa_len;

	SOCKBUF_LOCK_ASSERT(sb);

	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
		panic("sbappendaddr_locked");
	if (m0)
		space += m0->m_pkthdr.len;
	space += m_length(control, &ctrl_last);

	if (space > sbspace(sb))
		return (0);
	return (sbappendaddr_locked_internal(sb, asa, m0, control, ctrl_last));
}

/*
 * Append address and data, and optionally, control (ancillary) data to the
 * receive queue of a socket.  If present, m0 must include a packet header
 * with total length.  Returns 0 if insufficient mbufs.  Does not validate
 * space on the receiving sockbuf.
 */
int
sbappendaddr_nospacecheck_locked(struct sockbuf *sb, const struct sockaddr *asa,
    struct mbuf *m0, struct mbuf *control)
{
	struct mbuf *ctrl_last;

	SOCKBUF_LOCK_ASSERT(sb);

	ctrl_last = (control == NULL) ? NULL : m_last(control);
	return (sbappendaddr_locked_internal(sb, asa, m0, control, ctrl_last));
}

/*
 * Append address and data, and optionally, control (ancillary) data to the
 * receive queue of a socket.  If present, m0 must include a packet header
 * with total length.  Returns 0 if no space in sockbuf or insufficient
 * mbufs.
 */
int
sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa,
    struct mbuf *m0, struct mbuf *control)
{
	int retval;

	SOCKBUF_LOCK(sb);
	retval = sbappendaddr_locked(sb, asa, m0, control);
	SOCKBUF_UNLOCK(sb);
	return (retval);
}
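
/*
 * Sketch of a datagram delivery path (modeled on UDP input; "from" and
 * "opts" are illustrative names).  m_freem(NULL) is a no-op, so a NULL
 * opts chain is safe here:
 *
 *	SOCKBUF_LOCK(&so->so_rcv);
 *	if (sbappendaddr_locked(&so->so_rcv, from, m, opts) == 0) {
 *		SOCKBUF_UNLOCK(&so->so_rcv);
 *		m_freem(m);
 *		m_freem(opts);
 *	} else
 *		sorwakeup_locked(so);
 */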

void
sbappendcontrol_locked(struct sockbuf *sb, struct mbuf *m0,
    struct mbuf *control)
{
	struct mbuf *m, *mlast;

	m_clrprotoflags(m0);
	m_last(control)->m_next = m0;

	SBLASTRECORDCHK(sb);

	for (m = control; m->m_next; m = m->m_next)
		sballoc(sb, m);
	sballoc(sb, m);
	mlast = m;
	SBLINKRECORD(sb, control);

	sb->sb_mbtail = mlast;
	SBLASTMBUFCHK(sb);

	SBLASTRECORDCHK(sb);
}

void
sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control)
{

	SOCKBUF_LOCK(sb);
	sbappendcontrol_locked(sb, m0, control);
	SOCKBUF_UNLOCK(sb);
}

/*
 * Append the data in mbuf chain (m) into the socket buffer sb following mbuf
 * (n).  If (n) is NULL, the buffer is presumed empty.
 *
 * When the data is compressed, mbufs in the chain may be handled in one of
 * three ways:
 *
 * (1) The mbuf may simply be dropped, if it contributes nothing (no data, no
 *     record boundary, and no change in data type).
 *
 * (2) The mbuf may be coalesced -- i.e., data in the mbuf may be copied into
 *     an mbuf already in the socket buffer.  This can occur if an
 *     appropriate mbuf exists, there is room, neither mbuf is marked
 *     M_NOTREADY, and no merging of data types will occur.
 *
 * (3) The mbuf may be appended to the end of the existing mbuf chain.
 *
 * If any of the new mbufs is marked as M_EOR, mark the last mbuf appended as
 * end-of-record.
 */
void
sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n)
{
	int eor = 0;
	struct mbuf *o;

	SOCKBUF_LOCK_ASSERT(sb);

	while (m) {
		eor |= m->m_flags & M_EOR;
		if (m->m_len == 0 &&
		    (eor == 0 ||
		     (((o = m->m_next) || (o = n)) &&
		      o->m_type == m->m_type))) {
			if (sb->sb_lastrecord == m)
				sb->sb_lastrecord = m->m_next;
			m = m_free(m);
			continue;
		}
		if (n && (n->m_flags & M_EOR) == 0 &&
		    M_WRITABLE(n) &&
		    ((sb->sb_flags & SB_NOCOALESCE) == 0) &&
		    !(m->m_flags & M_NOTREADY) &&
		    !(n->m_flags & M_NOTREADY) &&
		    m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */
		    m->m_len <= M_TRAILINGSPACE(n) &&
		    n->m_type == m->m_type) {
			bcopy(mtod(m, caddr_t), mtod(n, caddr_t) + n->m_len,
			    (unsigned)m->m_len);
			n->m_len += m->m_len;
			sb->sb_ccc += m->m_len;
			if (sb->sb_fnrdy == NULL)
				sb->sb_acc += m->m_len;
			if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA)
				/* XXX: Probably don't need. */
				sb->sb_ctl += m->m_len;
			m = m_free(m);
			continue;
		}
		if (n)
			n->m_next = m;
		else
			sb->sb_mb = m;
		sb->sb_mbtail = m;
		sballoc(sb, m);
		n = m;
		m->m_flags &= ~M_EOR;
		m = m->m_next;
		n->m_next = NULL;
	}
	if (eor) {
		KASSERT(n != NULL, ("sbcompress: eor && n == NULL"));
		n->m_flags |= eor;
	}
	SBLASTMBUFCHK(sb);
}

/*
 * Free all mbufs in a sockbuf.  Check that all resources are reclaimed.
 */
static void
sbflush_internal(struct sockbuf *sb)
{

	while (sb->sb_mbcnt) {
		/*
		 * Don't call sbcut(sb, 0) if the leading mbuf is non-empty:
		 * we would loop forever.  Panic instead.
		 */
		if (sb->sb_ccc == 0 && (sb->sb_mb == NULL || sb->sb_mb->m_len))
			break;
		m_freem(sbcut_internal(sb, (int)sb->sb_ccc));
	}
	KASSERT(sb->sb_ccc == 0 && sb->sb_mb == NULL && sb->sb_mbcnt == 0,
	    ("%s: ccc %u mb %p mbcnt %u", __func__,
	    sb->sb_ccc, (void *)sb->sb_mb, sb->sb_mbcnt));
}

void
sbflush_locked(struct sockbuf *sb)
{

	SOCKBUF_LOCK_ASSERT(sb);
	sbflush_internal(sb);
}

void
sbflush(struct sockbuf *sb)
{

	SOCKBUF_LOCK(sb);
	sbflush_locked(sb);
	SOCKBUF_UNLOCK(sb);
}

/*
 * Cut data from (the front of) a sockbuf.
 */
static struct mbuf *
sbcut_internal(struct sockbuf *sb, int len)
{
	struct mbuf *m, *next, *mfree;

	next = (m = sb->sb_mb) ? m->m_nextpkt : NULL;
	mfree = NULL;

	while (len > 0) {
		if (m == NULL) {
			KASSERT(next, ("%s: no next, len %d", __func__, len));
			m = next;
			next = m->m_nextpkt;
		}
		if (m->m_len > len) {
			KASSERT(!(m->m_flags & M_NOTAVAIL),
			    ("%s: m %p M_NOTAVAIL", __func__, m));
			m->m_len -= len;
			m->m_data += len;
			sb->sb_ccc -= len;
			sb->sb_acc -= len;
			if (sb->sb_sndptroff != 0)
				sb->sb_sndptroff -= len;
			if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA)
				sb->sb_ctl -= len;
			break;
		}
		len -= m->m_len;
		sbfree(sb, m);
		/*
		 * Do not put M_NOTREADY buffers to the free list, they
		 * are referenced from outside.
		 */
		if (m->m_flags & M_NOTREADY)
			m = m->m_next;
		else {
			struct mbuf *n;

			n = m->m_next;
			m->m_next = mfree;
			mfree = m;
			m = n;
		}
	}
	/*
	 * Free any zero-length mbufs from the buffer.
	 * For SOCK_DGRAM sockets such mbufs represent empty records.
	 * XXX: For SOCK_STREAM sockets such mbufs can appear in the buffer,
	 * when sosend_generic() needs to send only control data.
	 */
	while (m && m->m_len == 0) {
		struct mbuf *n;

		sbfree(sb, m);
		n = m->m_next;
		m->m_next = mfree;
		mfree = m;
		m = n;
	}
	if (m) {
		sb->sb_mb = m;
		m->m_nextpkt = next;
	} else
		sb->sb_mb = next;
	/*
	 * First part is an inline SB_EMPTY_FIXUP().  Second part makes sure
	 * sb_lastrecord is up-to-date if we dropped part of the last record.
	 */
	m = sb->sb_mb;
	if (m == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (m->m_nextpkt == NULL) {
		sb->sb_lastrecord = m;
	}

	return (mfree);
}

/*
 * Drop data from (the front of) a sockbuf.
 */
void
sbdrop_locked(struct sockbuf *sb, int len)
{

	SOCKBUF_LOCK_ASSERT(sb);
	m_freem(sbcut_internal(sb, len));
}

/*
 * Drop data from (the front of) a sockbuf,
 * and return it to caller.
 */
struct mbuf *
sbcut_locked(struct sockbuf *sb, int len)
{

	SOCKBUF_LOCK_ASSERT(sb);
	return (sbcut_internal(sb, len));
}
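
/*
 * Sketch of deferring the free until the lock is dropped (the pattern
 * TCP ACK processing uses so that m_freem() does not run under the
 * sockbuf mutex):
 *
 *	SOCKBUF_LOCK(&so->so_snd);
 *	mfree = sbcut_locked(&so->so_snd, acked);
 *	SOCKBUF_UNLOCK(&so->so_snd);
 *	m_freem(mfree);
 */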

void
sbdrop(struct sockbuf *sb, int len)
{
	struct mbuf *mfree;

	SOCKBUF_LOCK(sb);
	mfree = sbcut_internal(sb, len);
	SOCKBUF_UNLOCK(sb);

	m_freem(mfree);
}

/*
 * Maintain a pointer and offset pair into the socket buffer mbuf chain to
 * avoid traversal of the entire socket buffer for larger offsets.
 */
struct mbuf *
sbsndptr(struct sockbuf *sb, u_int off, u_int len, u_int *moff)
{
	struct mbuf *m, *ret;

	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb is NULL", __func__));
	KASSERT(off + len <= sb->sb_acc, ("%s: beyond sb", __func__));
	KASSERT(sb->sb_sndptroff <= sb->sb_acc, ("%s: sndptroff broken", __func__));

	/*
	 * Is off below stored offset? Happens on retransmits.
	 * Just return, we can't help here.
	 */
	if (sb->sb_sndptroff > off) {
		*moff = off;
		return (sb->sb_mb);
	}

	/* Return closest mbuf in chain for current offset. */
	*moff = off - sb->sb_sndptroff;
	m = ret = sb->sb_sndptr ? sb->sb_sndptr : sb->sb_mb;
	if (*moff == m->m_len) {
		*moff = 0;
		sb->sb_sndptroff += m->m_len;
		m = ret = m->m_next;
		KASSERT(ret->m_len > 0,
		    ("mbuf %p in sockbuf %p chain has no valid data", ret, sb));
	}

	/* Advance by len to be as close as possible for the next transmit. */
	for (off = off - sb->sb_sndptroff + len - 1;
	     off > 0 && m != NULL && off >= m->m_len;
	     m = m->m_next) {
		sb->sb_sndptroff += m->m_len;
		off -= m->m_len;
	}
	if (off > 0 && m == NULL)
		panic("%s: sockbuf %p and mbuf %p clashing", __func__, sb, ret);
	sb->sb_sndptr = m;

	return (ret);
}
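
/*
 * Sketch of a consumer (tcp_output()-style; "dst" is a placeholder
 * destination buffer).  moff receives the starting offset within the
 * returned mbuf:
 *
 *	mb = sbsndptr(&so->so_snd, off, len, &moff);
 *	m_copydata(mb, moff, len, dst);
 */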

/*
 * Return the first mbuf and the mbuf data offset for the provided
 * send offset without changing the "sb_sndptroff" field.
 */
struct mbuf *
sbsndmbuf(struct sockbuf *sb, u_int off, u_int *moff)
{
	struct mbuf *m;

	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb is NULL", __func__));

	/*
	 * If the "off" is below the stored offset, which happens on
	 * retransmits, just use "sb_mb":
	 */
	if (sb->sb_sndptr == NULL || sb->sb_sndptroff > off) {
		m = sb->sb_mb;
	} else {
		m = sb->sb_sndptr;
		off -= sb->sb_sndptroff;
	}
	while (off > 0 && m != NULL) {
		if (off < m->m_len)
			break;
		off -= m->m_len;
		m = m->m_next;
	}
	*moff = off;
	return (m);
}

/*
 * Drop a record off the front of a sockbuf and move the next record to the
 * front.
 */
void
sbdroprecord_locked(struct sockbuf *sb)
{
	struct mbuf *m;

	SOCKBUF_LOCK_ASSERT(sb);

	m = sb->sb_mb;
	if (m) {
		sb->sb_mb = m->m_nextpkt;
		do {
			sbfree(sb, m);
			m = m_free(m);
		} while (m);
	}
	SB_EMPTY_FIXUP(sb);
}

/*
 * Drop a record off the front of a sockbuf and move the next record to the
 * front.
 */
void
sbdroprecord(struct sockbuf *sb)
{

	SOCKBUF_LOCK(sb);
	sbdroprecord_locked(sb);
	SOCKBUF_UNLOCK(sb);
}

/*
 * Create a "control" mbuf containing the specified data with the specified
 * type for presentation on a socket buffer.
 */
struct mbuf *
sbcreatecontrol(caddr_t p, int size, int type, int level)
{
	struct cmsghdr *cp;
	struct mbuf *m;

	if (CMSG_SPACE((u_int)size) > MCLBYTES)
		return ((struct mbuf *) NULL);
	if (CMSG_SPACE((u_int)size) > MLEN)
		m = m_getcl(M_NOWAIT, MT_CONTROL, 0);
	else
		m = m_get(M_NOWAIT, MT_CONTROL);
	if (m == NULL)
		return ((struct mbuf *) NULL);
	cp = mtod(m, struct cmsghdr *);
	m->m_len = 0;
	KASSERT(CMSG_SPACE((u_int)size) <= M_TRAILINGSPACE(m),
	    ("sbcreatecontrol: short mbuf"));
	/*
	 * Don't leave the padding between the msg header and the
	 * cmsg data, or the padding after the cmsg data, uninitialized.
	 */
	bzero(cp, CMSG_SPACE((u_int)size));
	if (p != NULL)
		(void)memcpy(CMSG_DATA(cp), p, size);
	m->m_len = CMSG_SPACE(size);
	cp->cmsg_len = CMSG_LEN(size);
	cp->cmsg_level = level;
	cp->cmsg_type = type;
	return (m);
}
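
/*
 * Example (the pattern ip_savecontrol() uses to pass a packet timestamp
 * up with a datagram):
 *
 *	control = sbcreatecontrol((caddr_t)&tv, sizeof(tv),
 *	    SCM_TIMESTAMP, SOL_SOCKET);
 *
 * A NULL return means no mbuf could be allocated (or the payload would
 * not fit), and the option is simply skipped.
 */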

/*
 * This does the same for socket buffers that sotoxsocket does for sockets:
 * generate a user-format data structure describing the socket buffer.  Note
 * that the xsockbuf structure, since it is always embedded in a socket, does
 * not include a self pointer or a length.  We make this entry point public
 * in case some other mechanism needs it.
 */
void
sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb)
{

	xsb->sb_cc = sb->sb_ccc;
	xsb->sb_hiwat = sb->sb_hiwat;
	xsb->sb_mbcnt = sb->sb_mbcnt;
	xsb->sb_mcnt = sb->sb_mcnt;
	xsb->sb_ccnt = sb->sb_ccnt;
	xsb->sb_mbmax = sb->sb_mbmax;
	xsb->sb_lowat = sb->sb_lowat;
	xsb->sb_flags = sb->sb_flags;
	xsb->sb_timeo = sb->sb_timeo;
}

/* This takes the place of kern.maxsockbuf, which moved to kern.ipc. */
static int dummy;
SYSCTL_INT(_kern, KERN_DUMMY, dummy, CTLFLAG_RW, &dummy, 0, "");
SYSCTL_OID(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf, CTLTYPE_ULONG|CTLFLAG_RW,
    &sb_max, 0, sysctl_handle_sb_max, "LU", "Maximum socket buffer size");
SYSCTL_ULONG(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW,
    &sb_efficiency, 0, "Socket buffer size waste factor");