1/*	$OpenBSD: uipc_socket2.c,v 1.155 2024/05/17 19:11:14 mvs Exp $	*/
2/*	$NetBSD: uipc_socket2.c,v 1.11 1996/02/04 02:17:55 christos Exp $	*/
3
4/*
5 * Copyright (c) 1982, 1986, 1988, 1990, 1993
6 *	The Regents of the University of California.  All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 * 3. Neither the name of the University nor the names of its contributors
17 *    may be used to endorse or promote products derived from this software
18 *    without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 *
32 *	@(#)uipc_socket2.c	8.1 (Berkeley) 6/10/93
33 */
34
35#include <sys/param.h>
36#include <sys/systm.h>
37#include <sys/malloc.h>
38#include <sys/mbuf.h>
39#include <sys/protosw.h>
40#include <sys/domain.h>
41#include <sys/socket.h>
42#include <sys/socketvar.h>
43#include <sys/signalvar.h>
44#include <sys/pool.h>
45
46/*
47 * Primitive routines for operating on sockets and socket buffers
48 */
49
50u_long	sb_max = SB_MAX;		/* patchable */
51
52extern struct pool mclpools[];
53extern struct pool mbpool;
54
55/*
56 * Procedures to manipulate state flags of socket
57 * and do appropriate wakeups.  Normal sequence from the
58 * active (originating) side is that soisconnecting() is
59 * called during processing of connect() call,
60 * resulting in an eventual call to soisconnected() if/when the
61 * connection is established.  When the connection is torn down
62 * soisdisconnecting() is called during processing of disconnect() call,
63 * and soisdisconnected() is called when the connection to the peer
64 * is totally severed.  The semantics of these routines are such that
65 * connectionless protocols can call soisconnected() and soisdisconnected()
66 * only, bypassing the in-progress calls when setting up a ``connection''
67 * takes no time.
68 *
69 * From the passive side, a socket is created with
70 * two queues of sockets: so_q0 for connections in progress
71 * and so_q for connections already made and awaiting user acceptance.
72 * As a protocol is preparing incoming connections, it creates a socket
73 * structure queued on so_q0 by calling sonewconn().  When the connection
74 * is established, soisconnected() is called, and transfers the
75 * socket structure to so_q, making it available to accept().
76 *
77 * If a socket is closed with sockets on either
78 * so_q0 or so_q, these sockets are dropped.
79 *
80 * If higher level protocols are implemented in
81 * the kernel, the wakeups done here will sometimes
82 * cause software-interrupt process scheduling.
83 */
84
85void
86soisconnecting(struct socket *so)
87{
88	soassertlocked(so);
89	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
90	so->so_state |= SS_ISCONNECTING;
91}
92
93void
94soisconnected(struct socket *so)
95{
96	struct socket *head = so->so_head;
97
98	soassertlocked(so);
99	so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING);
100	so->so_state |= SS_ISCONNECTED;
101
102	if (head != NULL && so->so_onq == &head->so_q0) {
103		int persocket = solock_persocket(so);
104
105		if (persocket) {
106			soref(so);
107			soref(head);
108
109			sounlock(so);
110			solock(head);
111			solock(so);
112
113			if (so->so_onq != &head->so_q0) {
114				sounlock(head);
115				sorele(head);
116				sorele(so);
117
118				return;
119			}
120
121			sorele(head);
122			sorele(so);
123		}
124
125		soqremque(so, 0);
126		soqinsque(head, so, 1);
127		sorwakeup(head);
128		wakeup_one(&head->so_timeo);
129
130		if (persocket)
131			sounlock(head);
132	} else {
133		wakeup(&so->so_timeo);
134		sorwakeup(so);
135		sowwakeup(so);
136	}
137}
138
139void
140soisdisconnecting(struct socket *so)
141{
142	soassertlocked(so);
143	so->so_state &= ~SS_ISCONNECTING;
144	so->so_state |= SS_ISDISCONNECTING;
145
146	mtx_enter(&so->so_rcv.sb_mtx);
147	so->so_rcv.sb_state |= SS_CANTRCVMORE;
148	mtx_leave(&so->so_rcv.sb_mtx);
149
150	mtx_enter(&so->so_snd.sb_mtx);
151	so->so_snd.sb_state |= SS_CANTSENDMORE;
152	mtx_leave(&so->so_snd.sb_mtx);
153
154	wakeup(&so->so_timeo);
155	sowwakeup(so);
156	sorwakeup(so);
157}
158
159void
160soisdisconnected(struct socket *so)
161{
162	soassertlocked(so);
163	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
164	so->so_state |= SS_ISDISCONNECTED;
165
166	mtx_enter(&so->so_rcv.sb_mtx);
167	so->so_rcv.sb_state |= SS_CANTRCVMORE;
168	mtx_leave(&so->so_rcv.sb_mtx);
169
170	mtx_enter(&so->so_snd.sb_mtx);
171	so->so_snd.sb_state |= SS_CANTSENDMORE;
172	mtx_leave(&so->so_snd.sb_mtx);
173
174	wakeup(&so->so_timeo);
175	sowwakeup(so);
176	sorwakeup(so);
177}
178
179/*
180 * When an attempt at a new connection is noted on a socket
181 * which accepts connections, sonewconn is called.  If the
182 * connection is possible (subject to space constraints, etc.)
183 * then we allocate a new structure, properly linked into the
184 * data structure of the original socket, and return this.
185 * Connstatus may be 0 or SS_ISCONNECTED.
186 */
187struct socket *
188sonewconn(struct socket *head, int connstatus, int wait)
189{
190	struct socket *so;
191	int persocket = solock_persocket(head);
192	int soqueue = connstatus ? 1 : 0;
193
194	/*
195	 * XXXSMP as long as `so' and `head' share the same lock, we
196	 * can call soreserve() and pr_attach() below w/o explicitly
197	 * locking `so'.
198	 */
199	soassertlocked(head);
200
201	if (m_pool_used() > 95)
202		return (NULL);
203	if (head->so_qlen + head->so_q0len > head->so_qlimit * 3)
204		return (NULL);
205	so = soalloc(head->so_proto, wait);
206	if (so == NULL)
207		return (NULL);
208	so->so_type = head->so_type;
209	so->so_options = head->so_options &~ SO_ACCEPTCONN;
210	so->so_linger = head->so_linger;
211	so->so_state = head->so_state | SS_NOFDREF;
212	so->so_proto = head->so_proto;
213	so->so_timeo = head->so_timeo;
214	so->so_euid = head->so_euid;
215	so->so_ruid = head->so_ruid;
216	so->so_egid = head->so_egid;
217	so->so_rgid = head->so_rgid;
218	so->so_cpid = head->so_cpid;
219
220	/*
221	 * Lock order will be `head' -> `so' while these sockets are linked.
222	 */
223	if (persocket)
224		solock(so);
225
226	/*
227	 * Inherit watermarks but those may get clamped in low mem situations.
228	 */
229	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat))
230		goto fail;
231
232	mtx_enter(&head->so_snd.sb_mtx);
233	so->so_snd.sb_wat = head->so_snd.sb_wat;
234	so->so_snd.sb_lowat = head->so_snd.sb_lowat;
235	so->so_snd.sb_timeo_nsecs = head->so_snd.sb_timeo_nsecs;
236	mtx_leave(&head->so_snd.sb_mtx);
237
238	mtx_enter(&head->so_rcv.sb_mtx);
239	so->so_rcv.sb_wat = head->so_rcv.sb_wat;
240	so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
241	so->so_rcv.sb_timeo_nsecs = head->so_rcv.sb_timeo_nsecs;
242	mtx_leave(&head->so_rcv.sb_mtx);
243
244	sigio_copy(&so->so_sigio, &head->so_sigio);
245
246	soqinsque(head, so, soqueue);
247	if (pru_attach(so, 0, wait) != 0) {
248		soqremque(so, soqueue);
249		goto fail;
250	}
251	if (connstatus) {
252		so->so_state |= connstatus;
253		sorwakeup(head);
254		wakeup(&head->so_timeo);
255	}
256
257	if (persocket)
258		sounlock(so);
259
260	return (so);
261
262fail:
263	if (persocket)
264		sounlock(so);
265	sigio_free(&so->so_sigio);
266	klist_free(&so->so_rcv.sb_klist);
267	klist_free(&so->so_snd.sb_klist);
268	pool_put(&socket_pool, so);
269
270	return (NULL);
271}
272
273void
274soqinsque(struct socket *head, struct socket *so, int q)
275{
276	soassertlocked(head);
277	soassertlocked(so);
278
279	KASSERT(so->so_onq == NULL);
280
281	so->so_head = head;
282	if (q == 0) {
283		head->so_q0len++;
284		so->so_onq = &head->so_q0;
285	} else {
286		head->so_qlen++;
287		so->so_onq = &head->so_q;
288	}
289	TAILQ_INSERT_TAIL(so->so_onq, so, so_qe);
290}
291
292int
293soqremque(struct socket *so, int q)
294{
295	struct socket *head = so->so_head;
296
297	soassertlocked(so);
298	soassertlocked(head);
299
300	if (q == 0) {
301		if (so->so_onq != &head->so_q0)
302			return (0);
303		head->so_q0len--;
304	} else {
305		if (so->so_onq != &head->so_q)
306			return (0);
307		head->so_qlen--;
308	}
309	TAILQ_REMOVE(so->so_onq, so, so_qe);
310	so->so_onq = NULL;
311	so->so_head = NULL;
312	return (1);
313}
314
315/*
316 * Socantsendmore indicates that no more data will be sent on the
317 * socket; it would normally be applied to a socket when the user
318 * informs the system that no more data is to be sent, by the protocol
319 * code (in case PRU_SHUTDOWN).  Socantrcvmore indicates that no more data
320 * will be received, and will normally be applied to the socket by a
321 * protocol when it detects that the peer will send no more data.
322 * Data queued for reading in the socket may yet be read.
323 */
324
325void
326socantsendmore(struct socket *so)
327{
328	soassertlocked(so);
329	mtx_enter(&so->so_snd.sb_mtx);
330	so->so_snd.sb_state |= SS_CANTSENDMORE;
331	mtx_leave(&so->so_snd.sb_mtx);
332	sowwakeup(so);
333}
334
335void
336socantrcvmore(struct socket *so)
337{
338	if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0)
339		soassertlocked(so);
340
341	mtx_enter(&so->so_rcv.sb_mtx);
342	so->so_rcv.sb_state |= SS_CANTRCVMORE;
343	mtx_leave(&so->so_rcv.sb_mtx);
344	sorwakeup(so);
345}
346
347void
348solock(struct socket *so)
349{
350	switch (so->so_proto->pr_domain->dom_family) {
351	case PF_INET:
352	case PF_INET6:
353		NET_LOCK();
354		break;
355	default:
356		rw_enter_write(&so->so_lock);
357		break;
358	}
359}
360
361void
362solock_shared(struct socket *so)
363{
364	switch (so->so_proto->pr_domain->dom_family) {
365	case PF_INET:
366	case PF_INET6:
367		if (so->so_proto->pr_usrreqs->pru_lock != NULL) {
368			NET_LOCK_SHARED();
369			rw_enter_write(&so->so_lock);
370		} else
371			NET_LOCK();
372		break;
373	default:
374		rw_enter_write(&so->so_lock);
375		break;
376	}
377}
378
379int
380solock_persocket(struct socket *so)
381{
382	switch (so->so_proto->pr_domain->dom_family) {
383	case PF_INET:
384	case PF_INET6:
385		return 0;
386	default:
387		return 1;
388	}
389}
390
391void
392solock_pair(struct socket *so1, struct socket *so2)
393{
394	KASSERT(so1 != so2);
395	KASSERT(so1->so_type == so2->so_type);
396	KASSERT(solock_persocket(so1));
397
398	if (so1 < so2) {
399		solock(so1);
400		solock(so2);
401	} else {
402		solock(so2);
403		solock(so1);
404	}
405}
406
407void
408sounlock(struct socket *so)
409{
410	switch (so->so_proto->pr_domain->dom_family) {
411	case PF_INET:
412	case PF_INET6:
413		NET_UNLOCK();
414		break;
415	default:
416		rw_exit_write(&so->so_lock);
417		break;
418	}
419}
420
421void
422sounlock_shared(struct socket *so)
423{
424	switch (so->so_proto->pr_domain->dom_family) {
425	case PF_INET:
426	case PF_INET6:
427		if (so->so_proto->pr_usrreqs->pru_unlock != NULL) {
428			rw_exit_write(&so->so_lock);
429			NET_UNLOCK_SHARED();
430		} else
431			NET_UNLOCK();
432		break;
433	default:
434		rw_exit_write(&so->so_lock);
435		break;
436	}
437}
438
439void
440soassertlocked_readonly(struct socket *so)
441{
442	switch (so->so_proto->pr_domain->dom_family) {
443	case PF_INET:
444	case PF_INET6:
445		NET_ASSERT_LOCKED();
446		break;
447	default:
448		rw_assert_wrlock(&so->so_lock);
449		break;
450	}
451}
452
453void
454soassertlocked(struct socket *so)
455{
456	switch (so->so_proto->pr_domain->dom_family) {
457	case PF_INET:
458	case PF_INET6:
459		if (rw_status(&netlock) == RW_READ) {
460			NET_ASSERT_LOCKED();
461
462			if (splassert_ctl > 0 && pru_locked(so) == 0 &&
463			    rw_status(&so->so_lock) != RW_WRITE)
464				splassert_fail(0, RW_WRITE, __func__);
465		} else
466			NET_ASSERT_LOCKED_EXCLUSIVE();
467		break;
468	default:
469		rw_assert_wrlock(&so->so_lock);
470		break;
471	}
472}
473
474int
475sosleep_nsec(struct socket *so, void *ident, int prio, const char *wmesg,
476    uint64_t nsecs)
477{
478	int ret;
479
480	switch (so->so_proto->pr_domain->dom_family) {
481	case PF_INET:
482	case PF_INET6:
483		if (so->so_proto->pr_usrreqs->pru_unlock != NULL &&
484		    rw_status(&netlock) == RW_READ) {
485			rw_exit_write(&so->so_lock);
486		}
487		ret = rwsleep_nsec(ident, &netlock, prio, wmesg, nsecs);
488		if (so->so_proto->pr_usrreqs->pru_lock != NULL &&
489		    rw_status(&netlock) == RW_READ) {
490			rw_enter_write(&so->so_lock);
491		}
492		break;
493	default:
494		ret = rwsleep_nsec(ident, &so->so_lock, prio, wmesg, nsecs);
495		break;
496	}
497
498	return ret;
499}
500
501void
502sbmtxassertlocked(struct socket *so, struct sockbuf *sb)
503{
504	if (sb->sb_flags & SB_MTXLOCK) {
505		if (splassert_ctl > 0 && mtx_owned(&sb->sb_mtx) == 0)
506			splassert_fail(0, RW_WRITE, __func__);
507	} else
508		soassertlocked(so);
509}
510
511/*
512 * Wait for data to arrive at/drain from a socket buffer.
513 */
514int
515sbwait(struct socket *so, struct sockbuf *sb)
516{
517	uint64_t timeo_nsecs;
518	int prio = (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH;
519
520	if (sb->sb_flags & SB_MTXLOCK) {
521		MUTEX_ASSERT_LOCKED(&sb->sb_mtx);
522
523		sb->sb_flags |= SB_WAIT;
524		return msleep_nsec(&sb->sb_cc, &sb->sb_mtx, prio, "sbwait",
525		    sb->sb_timeo_nsecs);
526	}
527
528	soassertlocked(so);
529
530	mtx_enter(&sb->sb_mtx);
531	timeo_nsecs = sb->sb_timeo_nsecs;
532	sb->sb_flags |= SB_WAIT;
533	mtx_leave(&sb->sb_mtx);
534
535	return sosleep_nsec(so, &sb->sb_cc, prio, "netio", timeo_nsecs);
536}
537
538int
539sblock(struct sockbuf *sb, int flags)
540{
541	int rwflags = RW_WRITE, error;
542
543	if (!(flags & SBL_NOINTR || sb->sb_flags & SB_NOINTR))
544		rwflags |= RW_INTR;
545	if (!(flags & SBL_WAIT))
546		rwflags |= RW_NOSLEEP;
547
548	error = rw_enter(&sb->sb_lock, rwflags);
549	if (error == EBUSY)
550		error = EWOULDBLOCK;
551
552	return error;
553}
554
555void
556sbunlock(struct sockbuf *sb)
557{
558	rw_exit(&sb->sb_lock);
559}
560
561/*
562 * Wakeup processes waiting on a socket buffer.
563 * Do asynchronous notification via SIGIO
564 * if the socket buffer has the SB_ASYNC flag set.
565 */
566void
567sowakeup(struct socket *so, struct sockbuf *sb)
568{
569	int dowakeup = 0, dopgsigio = 0;
570
571	mtx_enter(&sb->sb_mtx);
572	if (sb->sb_flags & SB_WAIT) {
573		sb->sb_flags &= ~SB_WAIT;
574		dowakeup = 1;
575	}
576	if (sb->sb_flags & SB_ASYNC)
577		dopgsigio = 1;
578
579	knote_locked(&sb->sb_klist, 0);
580	mtx_leave(&sb->sb_mtx);
581
582	if (dowakeup)
583		wakeup(&sb->sb_cc);
584
585	if (dopgsigio)
586		pgsigio(&so->so_sigio, SIGIO, 0);
587}
588
589/*
590 * Socket buffer (struct sockbuf) utility routines.
591 *
592 * Each socket contains two socket buffers: one for sending data and
593 * one for receiving data.  Each buffer contains a queue of mbufs,
594 * information about the number of mbufs and amount of data in the
595 * queue, and other fields allowing select() statements and notification
596 * on data availability to be implemented.
597 *
598 * Data stored in a socket buffer is maintained as a list of records.
599 * Each record is a list of mbufs chained together with the m_next
600 * field.  Records are chained together with the m_nextpkt field. The upper
601 * level routine soreceive() expects the following conventions to be
602 * observed when placing information in the receive buffer:
603 *
604 * 1. If the protocol requires each message be preceded by the sender's
605 *    name, then a record containing that name must be present before
606 *    any associated data (mbuf's must be of type MT_SONAME).
607 * 2. If the protocol supports the exchange of ``access rights'' (really
608 *    just additional data associated with the message), and there are
609 *    ``rights'' to be received, then a record containing this data
610 *    should be present (mbuf's must be of type MT_CONTROL).
611 * 3. If a name or rights record exists, then it must be followed by
612 *    a data record, perhaps of zero length.
613 *
614 * Before using a new socket structure it is first necessary to reserve
615 * buffer space to the socket, by calling sbreserve().  This should commit
616 * some of the available buffer space in the system buffer pool for the
617 * socket (currently, it does nothing but enforce limits).  The space
618 * should be released by calling sbrelease() when the socket is destroyed.
619 */
620
621int
622soreserve(struct socket *so, u_long sndcc, u_long rcvcc)
623{
624	soassertlocked(so);
625
626	mtx_enter(&so->so_rcv.sb_mtx);
627	mtx_enter(&so->so_snd.sb_mtx);
628	if (sbreserve(so, &so->so_snd, sndcc))
629		goto bad;
630	so->so_snd.sb_wat = sndcc;
631	if (so->so_snd.sb_lowat == 0)
632		so->so_snd.sb_lowat = MCLBYTES;
633	if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
634		so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
635	if (sbreserve(so, &so->so_rcv, rcvcc))
636		goto bad2;
637	so->so_rcv.sb_wat = rcvcc;
638	if (so->so_rcv.sb_lowat == 0)
639		so->so_rcv.sb_lowat = 1;
640	mtx_leave(&so->so_snd.sb_mtx);
641	mtx_leave(&so->so_rcv.sb_mtx);
642
643	return (0);
644bad2:
645	sbrelease(so, &so->so_snd);
646bad:
647	mtx_leave(&so->so_snd.sb_mtx);
648	mtx_leave(&so->so_rcv.sb_mtx);
649	return (ENOBUFS);
650}
651
652/*
653 * Allot mbufs to a sockbuf.
654 * Attempt to scale mbmax so that mbcnt doesn't become limiting
655 * if buffering efficiency is near the normal case.
656 */
657int
658sbreserve(struct socket *so, struct sockbuf *sb, u_long cc)
659{
660	sbmtxassertlocked(so, sb);
661
662	if (cc == 0 || cc > sb_max)
663		return (1);
664	sb->sb_hiwat = cc;
665	sb->sb_mbmax = max(3 * MAXMCLBYTES, cc * 8);
666	if (sb->sb_lowat > sb->sb_hiwat)
667		sb->sb_lowat = sb->sb_hiwat;
668	return (0);
669}
670
671/*
672 * In low memory situation, do not accept any greater than normal request.
673 */
674int
675sbcheckreserve(u_long cnt, u_long defcnt)
676{
677	if (cnt > defcnt && sbchecklowmem())
678		return (ENOBUFS);
679	return (0);
680}
681
/*
 * Report whether the system is considered low on mbuf memory.
 *
 * The answer is kept in a static flag: it is set once m_pool_used()
 * reports more than 80 and cleared once it reports less than 60; for
 * values in between the previous answer is returned.
 */
int
sbchecklowmem(void)
{
	static int sblowmem;
	unsigned int used = m_pool_used();

	if (used > 80)
		sblowmem = 1;
	else if (used < 60)
		sblowmem = 0;

	return (sblowmem);
}
695
696/*
697 * Free mbufs held by a socket, and reserved mbuf space.
698 */
699void
700sbrelease(struct socket *so, struct sockbuf *sb)
701{
702
703	sbflush(so, sb);
704	sb->sb_hiwat = sb->sb_mbmax = 0;
705}
706
707/*
708 * Routines to add and remove
709 * data from an mbuf queue.
710 *
711 * The routines sbappend() or sbappendrecord() are normally called to
712 * append new mbufs to a socket buffer, after checking that adequate
713 * space is available, comparing the function sbspace() with the amount
714 * of data to be added.  sbappendrecord() differs from sbappend() in
715 * that data supplied is treated as the beginning of a new record.
716 * To place a sender's address, optional access rights, and data in a
717 * socket receive buffer, sbappendaddr() should be used.  To place
718 * access rights and data in a socket receive buffer, sbappendrights()
719 * should be used.  In either case, the new data begins a new record.
720 * Note that unlike sbappend() and sbappendrecord(), these routines check
721 * for the caller that there will be enough space to store the data.
722 * Each fails if there is not enough space, or if it cannot find mbufs
723 * to store additional information in.
724 *
725 * Reliable protocols may use the socket send buffer to hold data
726 * awaiting acknowledgement.  Data is normally copied from a socket
727 * send buffer in a protocol with m_copym for output to a peer,
728 * and then removing the data from the socket buffer with sbdrop()
729 * or sbdroprecord() when the data is acknowledged by the peer.
730 */
731
732#ifdef SOCKBUF_DEBUG
733void
734sblastrecordchk(struct sockbuf *sb, const char *where)
735{
736	struct mbuf *m = sb->sb_mb;
737
738	while (m && m->m_nextpkt)
739		m = m->m_nextpkt;
740
741	if (m != sb->sb_lastrecord) {
742		printf("sblastrecordchk: sb_mb %p sb_lastrecord %p last %p\n",
743		    sb->sb_mb, sb->sb_lastrecord, m);
744		printf("packet chain:\n");
745		for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt)
746			printf("\t%p\n", m);
747		panic("sblastrecordchk from %s", where);
748	}
749}
750
751void
752sblastmbufchk(struct sockbuf *sb, const char *where)
753{
754	struct mbuf *m = sb->sb_mb;
755	struct mbuf *n;
756
757	while (m && m->m_nextpkt)
758		m = m->m_nextpkt;
759
760	while (m && m->m_next)
761		m = m->m_next;
762
763	if (m != sb->sb_mbtail) {
764		printf("sblastmbufchk: sb_mb %p sb_mbtail %p last %p\n",
765		    sb->sb_mb, sb->sb_mbtail, m);
766		printf("packet tree:\n");
767		for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) {
768			printf("\t");
769			for (n = m; n != NULL; n = n->m_next)
770				printf("%p ", n);
771			printf("\n");
772		}
773		panic("sblastmbufchk from %s", where);
774	}
775}
776#endif /* SOCKBUF_DEBUG */
777
/*
 * Link record m0 at the end of sb's record list: it becomes the first
 * record when there is no last record, otherwise it is chained after
 * the current last record.  In both cases it becomes sb_lastrecord.
 */
#define	SBLINKRECORD(sb, m0)						\
do {									\
	if ((sb)->sb_lastrecord == NULL)				\
		(sb)->sb_mb = (m0);					\
	else								\
		(sb)->sb_lastrecord->m_nextpkt = (m0);			\
	(sb)->sb_lastrecord = (m0);					\
} while (/*CONSTCOND*/0)
786
787/*
788 * Append mbuf chain m to the last record in the
789 * socket buffer sb.  The additional space associated
790 * the mbuf chain is recorded in sb.  Empty mbufs are
791 * discarded and mbufs are compacted where possible.
792 */
793void
794sbappend(struct socket *so, struct sockbuf *sb, struct mbuf *m)
795{
796	struct mbuf *n;
797
798	if (m == NULL)
799		return;
800
801	sbmtxassertlocked(so, sb);
802	SBLASTRECORDCHK(sb, "sbappend 1");
803
804	if ((n = sb->sb_lastrecord) != NULL) {
805		/*
806		 * XXX Would like to simply use sb_mbtail here, but
807		 * XXX I need to verify that I won't miss an EOR that
808		 * XXX way.
809		 */
810		do {
811			if (n->m_flags & M_EOR) {
812				sbappendrecord(so, sb, m); /* XXXXXX!!!! */
813				return;
814			}
815		} while (n->m_next && (n = n->m_next));
816	} else {
817		/*
818		 * If this is the first record in the socket buffer, it's
819		 * also the last record.
820		 */
821		sb->sb_lastrecord = m;
822	}
823	sbcompress(so, sb, m, n);
824	SBLASTRECORDCHK(sb, "sbappend 2");
825}
826
827/*
828 * This version of sbappend() should only be used when the caller
829 * absolutely knows that there will never be more than one record
830 * in the socket buffer, that is, a stream protocol (such as TCP).
831 */
832void
833sbappendstream(struct socket *so, struct sockbuf *sb, struct mbuf *m)
834{
835	KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
836	soassertlocked(so);
837	KDASSERT(m->m_nextpkt == NULL);
838	KASSERT(sb->sb_mb == sb->sb_lastrecord);
839
840	SBLASTMBUFCHK(sb, __func__);
841
842	sbcompress(so, sb, m, sb->sb_mbtail);
843
844	sb->sb_lastrecord = sb->sb_mb;
845	SBLASTRECORDCHK(sb, __func__);
846}
847
848#ifdef SOCKBUF_DEBUG
849void
850sbcheck(struct socket *so, struct sockbuf *sb)
851{
852	struct mbuf *m, *n;
853	u_long len = 0, mbcnt = 0;
854
855	for (m = sb->sb_mb; m; m = m->m_nextpkt) {
856		for (n = m; n; n = n->m_next) {
857			len += n->m_len;
858			mbcnt += MSIZE;
859			if (n->m_flags & M_EXT)
860				mbcnt += n->m_ext.ext_size;
861			if (m != n && n->m_nextpkt)
862				panic("sbcheck nextpkt");
863		}
864	}
865	if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
866		printf("cc %lu != %lu || mbcnt %lu != %lu\n", len, sb->sb_cc,
867		    mbcnt, sb->sb_mbcnt);
868		panic("sbcheck");
869	}
870}
871#endif
872
873/*
874 * As above, except the mbuf chain
875 * begins a new record.
876 */
877void
878sbappendrecord(struct socket *so, struct sockbuf *sb, struct mbuf *m0)
879{
880	struct mbuf *m;
881
882	sbmtxassertlocked(so, sb);
883
884	if (m0 == NULL)
885		return;
886
887	/*
888	 * Put the first mbuf on the queue.
889	 * Note this permits zero length records.
890	 */
891	sballoc(so, sb, m0);
892	SBLASTRECORDCHK(sb, "sbappendrecord 1");
893	SBLINKRECORD(sb, m0);
894	m = m0->m_next;
895	m0->m_next = NULL;
896	if (m && (m0->m_flags & M_EOR)) {
897		m0->m_flags &= ~M_EOR;
898		m->m_flags |= M_EOR;
899	}
900	sbcompress(so, sb, m, m0);
901	SBLASTRECORDCHK(sb, "sbappendrecord 2");
902}
903
904/*
905 * Append address and data, and optionally, control (ancillary) data
906 * to the receive queue of a socket.  If present,
907 * m0 must include a packet header with total length.
908 * Returns 0 if no space in sockbuf or insufficient mbufs.
909 */
910int
911sbappendaddr(struct socket *so, struct sockbuf *sb, const struct sockaddr *asa,
912    struct mbuf *m0, struct mbuf *control)
913{
914	struct mbuf *m, *n, *nlast;
915	int space = asa->sa_len;
916
917	sbmtxassertlocked(so, sb);
918
919	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
920		panic("sbappendaddr");
921	if (m0)
922		space += m0->m_pkthdr.len;
923	for (n = control; n; n = n->m_next) {
924		space += n->m_len;
925		if (n->m_next == NULL)	/* keep pointer to last control buf */
926			break;
927	}
928	if (space > sbspace(so, sb))
929		return (0);
930	if (asa->sa_len > MLEN)
931		return (0);
932	MGET(m, M_DONTWAIT, MT_SONAME);
933	if (m == NULL)
934		return (0);
935	m->m_len = asa->sa_len;
936	memcpy(mtod(m, caddr_t), asa, asa->sa_len);
937	if (n)
938		n->m_next = m0;		/* concatenate data to control */
939	else
940		control = m0;
941	m->m_next = control;
942
943	SBLASTRECORDCHK(sb, "sbappendaddr 1");
944
945	for (n = m; n->m_next != NULL; n = n->m_next)
946		sballoc(so, sb, n);
947	sballoc(so, sb, n);
948	nlast = n;
949	SBLINKRECORD(sb, m);
950
951	sb->sb_mbtail = nlast;
952	SBLASTMBUFCHK(sb, "sbappendaddr");
953
954	SBLASTRECORDCHK(sb, "sbappendaddr 2");
955
956	return (1);
957}
958
959int
960sbappendcontrol(struct socket *so, struct sockbuf *sb, struct mbuf *m0,
961    struct mbuf *control)
962{
963	struct mbuf *m, *mlast, *n;
964	int eor = 0, space = 0;
965
966	sbmtxassertlocked(so, sb);
967
968	if (control == NULL)
969		panic("sbappendcontrol");
970	for (m = control; ; m = m->m_next) {
971		space += m->m_len;
972		if (m->m_next == NULL)
973			break;
974	}
975	n = m;			/* save pointer to last control buffer */
976	for (m = m0; m; m = m->m_next) {
977		space += m->m_len;
978		eor |= m->m_flags & M_EOR;
979		if (eor) {
980			if (m->m_next == NULL)
981				m->m_flags |= M_EOR;
982			else
983				m->m_flags &= ~M_EOR;
984		}
985	}
986	if (space > sbspace(so, sb))
987		return (0);
988	n->m_next = m0;			/* concatenate data to control */
989
990	SBLASTRECORDCHK(sb, "sbappendcontrol 1");
991
992	for (m = control; m->m_next != NULL; m = m->m_next)
993		sballoc(so, sb, m);
994	sballoc(so, sb, m);
995	mlast = m;
996	SBLINKRECORD(sb, control);
997
998	sb->sb_mbtail = mlast;
999	SBLASTMBUFCHK(sb, "sbappendcontrol");
1000
1001	SBLASTRECORDCHK(sb, "sbappendcontrol 2");
1002
1003	return (1);
1004}
1005
1006/*
1007 * Compress mbuf chain m into the socket
1008 * buffer sb following mbuf n.  If n
1009 * is null, the buffer is presumed empty.
1010 */
1011void
1012sbcompress(struct socket *so, struct sockbuf *sb, struct mbuf *m,
1013    struct mbuf *n)
1014{
1015	int eor = 0;
1016	struct mbuf *o;
1017
1018	while (m) {
1019		eor |= m->m_flags & M_EOR;
1020		if (m->m_len == 0 &&
1021		    (eor == 0 ||
1022		    (((o = m->m_next) || (o = n)) &&
1023		    o->m_type == m->m_type))) {
1024			if (sb->sb_lastrecord == m)
1025				sb->sb_lastrecord = m->m_next;
1026			m = m_free(m);
1027			continue;
1028		}
1029		if (n && (n->m_flags & M_EOR) == 0 &&
1030		    /* m_trailingspace() checks buffer writeability */
1031		    m->m_len <= ((n->m_flags & M_EXT)? n->m_ext.ext_size :
1032		       MCLBYTES) / 4 && /* XXX Don't copy too much */
1033		    m->m_len <= m_trailingspace(n) &&
1034		    n->m_type == m->m_type) {
1035			memcpy(mtod(n, caddr_t) + n->m_len, mtod(m, caddr_t),
1036			    m->m_len);
1037			n->m_len += m->m_len;
1038			sb->sb_cc += m->m_len;
1039			if (m->m_type != MT_CONTROL && m->m_type != MT_SONAME)
1040				sb->sb_datacc += m->m_len;
1041			m = m_free(m);
1042			continue;
1043		}
1044		if (n)
1045			n->m_next = m;
1046		else
1047			sb->sb_mb = m;
1048		sb->sb_mbtail = m;
1049		sballoc(so, sb, m);
1050		n = m;
1051		m->m_flags &= ~M_EOR;
1052		m = m->m_next;
1053		n->m_next = NULL;
1054	}
1055	if (eor) {
1056		if (n)
1057			n->m_flags |= eor;
1058		else
1059			printf("semi-panic: sbcompress");
1060	}
1061	SBLASTMBUFCHK(sb, __func__);
1062}
1063
1064/*
1065 * Free all mbufs in a sockbuf.
1066 * Check that all resources are reclaimed.
1067 */
1068void
1069sbflush(struct socket *so, struct sockbuf *sb)
1070{
1071	KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
1072	rw_assert_unlocked(&sb->sb_lock);
1073
1074	while (sb->sb_mbcnt)
1075		sbdrop(so, sb, (int)sb->sb_cc);
1076
1077	KASSERT(sb->sb_cc == 0);
1078	KASSERT(sb->sb_datacc == 0);
1079	KASSERT(sb->sb_mb == NULL);
1080	KASSERT(sb->sb_mbtail == NULL);
1081	KASSERT(sb->sb_lastrecord == NULL);
1082}
1083
1084/*
1085 * Drop data from (the front of) a sockbuf.
1086 */
1087void
1088sbdrop(struct socket *so, struct sockbuf *sb, int len)
1089{
1090	struct mbuf *m, *mn;
1091	struct mbuf *next;
1092
1093	sbmtxassertlocked(so, sb);
1094
1095	next = (m = sb->sb_mb) ? m->m_nextpkt : NULL;
1096	while (len > 0) {
1097		if (m == NULL) {
1098			if (next == NULL)
1099				panic("sbdrop");
1100			m = next;
1101			next = m->m_nextpkt;
1102			continue;
1103		}
1104		if (m->m_len > len) {
1105			m->m_len -= len;
1106			m->m_data += len;
1107			sb->sb_cc -= len;
1108			if (m->m_type != MT_CONTROL && m->m_type != MT_SONAME)
1109				sb->sb_datacc -= len;
1110			break;
1111		}
1112		len -= m->m_len;
1113		sbfree(so, sb, m);
1114		mn = m_free(m);
1115		m = mn;
1116	}
1117	while (m && m->m_len == 0) {
1118		sbfree(so, sb, m);
1119		mn = m_free(m);
1120		m = mn;
1121	}
1122	if (m) {
1123		sb->sb_mb = m;
1124		m->m_nextpkt = next;
1125	} else
1126		sb->sb_mb = next;
1127	/*
1128	 * First part is an inline SB_EMPTY_FIXUP().  Second part
1129	 * makes sure sb_lastrecord is up-to-date if we dropped
1130	 * part of the last record.
1131	 */
1132	m = sb->sb_mb;
1133	if (m == NULL) {
1134		sb->sb_mbtail = NULL;
1135		sb->sb_lastrecord = NULL;
1136	} else if (m->m_nextpkt == NULL)
1137		sb->sb_lastrecord = m;
1138}
1139
1140/*
1141 * Drop a record off the front of a sockbuf
1142 * and move the next record to the front.
1143 */
1144void
1145sbdroprecord(struct socket *so, struct sockbuf *sb)
1146{
1147	struct mbuf *m, *mn;
1148
1149	m = sb->sb_mb;
1150	if (m) {
1151		sb->sb_mb = m->m_nextpkt;
1152		do {
1153			sbfree(so, sb, m);
1154			mn = m_free(m);
1155		} while ((m = mn) != NULL);
1156	}
1157	SB_EMPTY_FIXUP(sb);
1158}
1159
1160/*
1161 * Create a "control" mbuf containing the specified data
1162 * with the specified type for presentation on a socket buffer.
1163 */
1164struct mbuf *
1165sbcreatecontrol(const void *p, size_t size, int type, int level)
1166{
1167	struct cmsghdr *cp;
1168	struct mbuf *m;
1169
1170	if (CMSG_SPACE(size) > MCLBYTES) {
1171		printf("sbcreatecontrol: message too large %zu\n", size);
1172		return (NULL);
1173	}
1174
1175	if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL)
1176		return (NULL);
1177	if (CMSG_SPACE(size) > MLEN) {
1178		MCLGET(m, M_DONTWAIT);
1179		if ((m->m_flags & M_EXT) == 0) {
1180			m_free(m);
1181			return NULL;
1182		}
1183	}
1184	cp = mtod(m, struct cmsghdr *);
1185	memset(cp, 0, CMSG_SPACE(size));
1186	memcpy(CMSG_DATA(cp), p, size);
1187	m->m_len = CMSG_SPACE(size);
1188	cp->cmsg_len = CMSG_LEN(size);
1189	cp->cmsg_level = level;
1190	cp->cmsg_type = type;
1191	return (m);
1192}
1193