uipc_mbuf.c revision 75105
/*
 * Copyright (c) 1982, 1986, 1988, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_mbuf.c	8.2 (Berkeley) 1/4/94
 * $FreeBSD: head/sys/kern/uipc_mbuf.c 75105 2001-04-03 03:15:11Z alfred $
 */

#include "opt_param.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <vm/vm.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>

static void mbinit(void *);
SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbinit, NULL)

struct mbuf *mbutl;
struct mbstat mbstat;
u_long	mbtypes[MT_NTYPES];
int	max_linkhdr;
int	max_protohdr;
int	max_hdr;
int	max_datalen;
int	nmbclusters;
int	nmbufs;
int	nmbcnt;
u_long	m_mballoc_wid = 0;
u_long	m_clalloc_wid = 0;

/*
 * freelist header structures...
 * mbffree_lst, mclfree_lst, mcntfree_lst
 */
struct mbffree_lst mmbfree;
struct mclfree_lst mclfree;
struct mcntfree_lst mcntfree;
struct mtx	mbuf_mtx;

/*
 * sysctl(8) exported objects
 */
SYSCTL_DECL(_kern_ipc);
SYSCTL_INT(_kern_ipc, KIPC_MAX_LINKHDR, max_linkhdr, CTLFLAG_RW,
	   &max_linkhdr, 0, "");
SYSCTL_INT(_kern_ipc, KIPC_MAX_PROTOHDR, max_protohdr, CTLFLAG_RW,
	   &max_protohdr, 0, "");
SYSCTL_INT(_kern_ipc, KIPC_MAX_HDR, max_hdr, CTLFLAG_RW, &max_hdr, 0, "");
SYSCTL_INT(_kern_ipc, KIPC_MAX_DATALEN, max_datalen, CTLFLAG_RW,
	   &max_datalen, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, mbuf_wait, CTLFLAG_RW,
	   &mbuf_wait, 0, "");
SYSCTL_STRUCT(_kern_ipc, KIPC_MBSTAT, mbstat, CTLFLAG_RD, &mbstat, mbstat, "");
SYSCTL_OPAQUE(_kern_ipc, OID_AUTO, mbtypes, CTLFLAG_RD, mbtypes,
	   sizeof(mbtypes), "LU", "");
SYSCTL_INT(_kern_ipc, KIPC_NMBCLUSTERS, nmbclusters, CTLFLAG_RD,
	   &nmbclusters, 0, "Maximum number of mbuf clusters available");
SYSCTL_INT(_kern_ipc, OID_AUTO, nmbufs, CTLFLAG_RD, &nmbufs, 0,
	   "Maximum number of mbufs available");
SYSCTL_INT(_kern_ipc, OID_AUTO, nmbcnt, CTLFLAG_RD, &nmbcnt, 0,
	   "Maximum number of ext_buf counters available");
#ifndef NMBCLUSTERS
#define NMBCLUSTERS	(512 + MAXUSERS * 16)
#endif
TUNABLE_INT_DECL("kern.ipc.nmbclusters", NMBCLUSTERS, nmbclusters);
TUNABLE_INT_DECL("kern.ipc.nmbufs", NMBCLUSTERS * 4, nmbufs);
TUNABLE_INT_DECL("kern.ipc.nmbcnt", EXT_COUNTERS, nmbcnt);

static void	m_reclaim(void);

/* Initial allocation numbers */
#define NCL_INIT	2
#define NMB_INIT	16
#define REF_INIT	NMBCLUSTERS

/*
 * Full mbuf subsystem initialization done here.
 *
 * XXX: If ever we have system specific map setups to do, then move them to
 *      machdep.c - for now, there is no reason for this stuff to go there.
 */
static void
mbinit(void *dummy)
{
	vm_offset_t maxaddr, mb_map_size;

	/*
	 * Setup the mb_map, allocate requested VM space.
	 */
	mb_map_size = nmbufs * MSIZE + nmbclusters * MCLBYTES + nmbcnt
	    * sizeof(union mext_refcnt);
	mb_map_size = roundup2(mb_map_size, PAGE_SIZE);
	mb_map = kmem_suballoc(kmem_map, (vm_offset_t *)&mbutl, &maxaddr,
	    mb_map_size);
	/* XXX XXX XXX: mb_map->system_map = 1; */

	/*
	 * Initialize the free list headers, and setup locks for lists.
	 */
	mmbfree.m_head = NULL;
	mclfree.m_head = NULL;
	mcntfree.m_head = NULL;
	mtx_init(&mbuf_mtx, "mbuf free list lock", MTX_DEF);

	/*
	 * Initialize mbuf subsystem (sysctl exported) statistics structure.
	 */
	mbstat.m_msize = MSIZE;
	mbstat.m_mclbytes = MCLBYTES;
	mbstat.m_minclsize = MINCLSIZE;
	mbstat.m_mlen = MLEN;
	mbstat.m_mhlen = MHLEN;

	/*
	 * Perform some initial allocations.
	 */
	mtx_lock(&mbuf_mtx);
	if (m_alloc_ref(REF_INIT, M_DONTWAIT) == 0)
		goto bad;
	if (m_mballoc(NMB_INIT, M_DONTWAIT) == 0)
		goto bad;
	if (m_clalloc(NCL_INIT, M_DONTWAIT) == 0)
		goto bad;
	mtx_unlock(&mbuf_mtx);

	return;
bad:
	panic("mbinit: failed to initialize mbuf subsystem!");
}

/*
 * Allocate at least nmb reference count structs and place them
 * on the ref cnt free list.
 *
 * Must be called with the mcntfree lock held.
 */
int
m_alloc_ref(u_int nmb, int how)
{
	caddr_t p;
	u_int nbytes;
	int i;

	/*
	 * We don't cap the amount of memory that can be used
	 * by the reference counters, like we do for mbufs and
	 * mbuf clusters. In fact, we're absolutely sure that we
	 * won't ever be going over our allocated space. We keep enough
	 * space in mb_map to accommodate maximum values of allocatable
	 * external buffers including, but not limited to, clusters.
	 * (That's also why we won't have to have wait routines for
	 * counters).
	 *
	 * If we're in here, we're absolutely certain to be returning
	 * successfully, as long as there is physical memory to accommodate
	 * us. And if there isn't, but we're willing to wait, then
	 * kmem_malloc() will do the only waiting needed.
	 */

	nbytes = round_page(nmb * sizeof(union mext_refcnt));
	if (1 /* XXX: how == M_TRYWAIT */)
		mtx_unlock(&mbuf_mtx);
	if ((p = (caddr_t)kmem_malloc(mb_map, nbytes, how == M_TRYWAIT ?
	    M_WAITOK : M_NOWAIT)) == NULL) {
		if (1 /* XXX: how == M_TRYWAIT */)
			mtx_lock(&mbuf_mtx);
		return (0);
	}
	nmb = nbytes / sizeof(union mext_refcnt);

	/*
	 * We don't let go of the mutex in order to avoid a race.
	 * It is up to the caller to let go of the mutex.
	 */
	if (1 /* XXX: how == M_TRYWAIT */)
		mtx_lock(&mbuf_mtx);
	for (i = 0; i < nmb; i++) {
		((union mext_refcnt *)p)->next_ref = mcntfree.m_head;
		mcntfree.m_head = (union mext_refcnt *)p;
		p += sizeof(union mext_refcnt);
		mbstat.m_refree++;
	}
	mbstat.m_refcnt += nmb;

	return (1);
}

/*
 * Allocate at least nmb mbufs and place on mbuf free list.
 *
 * Must be called with the mmbfree lock held.
 */
int
m_mballoc(int nmb, int how)
{
	caddr_t p;
	int i;
	int nbytes;

	nbytes = round_page(nmb * MSIZE);
	nmb = nbytes / MSIZE;

	/*
	 * If we've hit the mbuf limit, stop allocating from mb_map.
	 * Also, once we run out of map space, it will be impossible to
	 * get any more (nothing is ever freed back to the map).
	 */
	if (mb_map_full || ((nmb + mbstat.m_mbufs) > nmbufs))
		return (0);

	if (1 /* XXX: how == M_TRYWAIT */)
		mtx_unlock(&mbuf_mtx);
	p = (caddr_t)kmem_malloc(mb_map, nbytes, how == M_TRYWAIT ?
		M_WAITOK : M_NOWAIT);
	if (1 /* XXX: how == M_TRYWAIT */) {
		mtx_lock(&mbuf_mtx);
		if (p == NULL)
			mbstat.m_wait++;
	}

	/*
	 * Either the map is now full, or `how' is M_DONTWAIT and there
	 * are no pages left.
	 */
	if (p == NULL)
		return (0);

	/*
	 * We don't let go of the mutex in order to avoid a race.
	 * It is up to the caller to let go of the mutex when done
	 * with grabbing the mbuf from the free list.
	 */
	for (i = 0; i < nmb; i++) {
		((struct mbuf *)p)->m_next = mmbfree.m_head;
		mmbfree.m_head = (struct mbuf *)p;
		p += MSIZE;
	}
	mbstat.m_mbufs += nmb;
	mbtypes[MT_FREE] += nmb;
	return (1);
}

/*
 * Once the mb_map has been exhausted and if the call to the allocation macros
 * (or, in some cases, functions) is with M_TRYWAIT, then it is necessary to
 * rely solely on reclaimed mbufs.
 *
 * Here we request that the protocols free up some resources and, if we
 * still cannot get anything, then we wait for an mbuf to be freed for a
 * designated (mbuf_wait) time.
 *
 * Must be called with the mmbfree mutex held.
 */
struct mbuf *
m_mballoc_wait(void)
{
	struct mbuf *p = NULL;

	/*
	 * See if we can drain some resources out of the protocols.
	 * We drop the mmbfree mutex to avoid recursing into it in some of
	 * the drain routines. Clearly, we're faced with a race here because
	 * once something is freed during the drain, it may be grabbed right
	 * from under us by some other thread. But we accept this possibility
	 * in order to avoid a potentially large lock recursion and, more
	 * importantly, to avoid a potential lock order reversal which may
	 * result in deadlock (See comment above m_reclaim()).
	 */
	mtx_unlock(&mbuf_mtx);
	m_reclaim();

	mtx_lock(&mbuf_mtx);
	_MGET(p, M_DONTWAIT);

	if (p == NULL) {
		m_mballoc_wid++;
		msleep(&m_mballoc_wid, &mbuf_mtx, PVM, "mballc",
		    mbuf_wait);
		m_mballoc_wid--;

		/*
		 * Try again (one last time).
		 *
		 * We retry to fetch _even_ if the sleep timed out. This
		 * is left this way, purposely, in the [unlikely] case
		 * that an mbuf was freed but the sleep was not awoken
		 * in time.
		 *
		 * If the sleep didn't time out (i.e. we got woken up) then
		 * we have the lock so we just grab an mbuf, hopefully.
		 */
		_MGET(p, M_DONTWAIT);
	}

	/* If we waited and got something... */
	if (p != NULL) {
		mbstat.m_wait++;
		if (mmbfree.m_head != NULL)
			MBWAKEUP(m_mballoc_wid);
	}

	return (p);
}

/*
 * Allocate some number of mbuf clusters
 * and place on cluster free list.
 *
 * Must be called with the mclfree lock held.
 */
int
m_clalloc(int ncl, int how)
{
	caddr_t p;
	int i;
	int npg_sz;

	npg_sz = round_page(ncl * MCLBYTES);
	ncl = npg_sz / MCLBYTES;

	/*
	 * If we've hit the mbuf cluster limit, stop allocating from
	 * mb_map.  Also, once the map is full, nothing will ever be
	 * freed back to it.
	 */
	if (mb_map_full || ((ncl + mbstat.m_clusters) > nmbclusters))
		return (0);

	if (1 /* XXX: how == M_TRYWAIT */)
		mtx_unlock(&mbuf_mtx);
	p = (caddr_t)kmem_malloc(mb_map, npg_sz,
				 how == M_TRYWAIT ? M_WAITOK : M_NOWAIT);
	if (1 /* XXX: how == M_TRYWAIT */)
		mtx_lock(&mbuf_mtx);

	/*
	 * Either the map is now full, or `how' is M_DONTWAIT and there
	 * are no pages left.
	 */
	if (p == NULL)
		return (0);

	for (i = 0; i < ncl; i++) {
		((union mcluster *)p)->mcl_next = mclfree.m_head;
		mclfree.m_head = (union mcluster *)p;
		p += MCLBYTES;
		mbstat.m_clfree++;
	}
	mbstat.m_clusters += ncl;
	return (1);
}

/*
 * Once the mb_map submap has been exhausted and the allocation is called with
 * M_TRYWAIT, we rely on the mclfree list. If nothing is free, we will
 * sleep for a designated amount of time (mbuf_wait) or until we're woken up
 * due to sudden mcluster availability.
 *
 * Must be called with the mclfree lock held.
 */
caddr_t
m_clalloc_wait(void)
{
	caddr_t p = NULL;

	m_clalloc_wid++;
	msleep(&m_clalloc_wid, &mbuf_mtx, PVM, "mclalc", mbuf_wait);
	m_clalloc_wid--;

	/*
	 * Now that we (think) we've got something, try again.
	 */
	_MCLALLOC(p, M_DONTWAIT);

	/* If we waited and got something ... */
	if (p != NULL) {
		mbstat.m_wait++;
		if (mclfree.m_head != NULL)
			MBWAKEUP(m_clalloc_wid);
	}

	return (p);
}

/*
 * m_reclaim: drain protocols in the hope of freeing up some resources...
 *
 * XXX: No locks should be held going in here. The drain routines have
 * to presently acquire some locks, which raises the possibility of a lock
 * order violation if we're holding any mutex that is acquired in reverse
 * order relative to one of the locks in the drain routines.
 */
static void
m_reclaim(void)
{
	struct domain *dp;
	struct protosw *pr;

#ifdef WITNESS
	KASSERT(witness_list(CURPROC) == 0,
	    ("m_reclaim called with locks held"));
#endif

	for (dp = domains; dp; dp = dp->dom_next)
		for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
			if (pr->pr_drain)
				(*pr->pr_drain)();
	mbstat.m_drain++;
}

/*
 * Space allocation routines.
 * Some of these are also available as macros
 * for critical paths.
 */
struct mbuf *
m_get(int how, int type)
{
	struct mbuf *m;

	MGET(m, how, type);
	return (m);
}

struct mbuf *
m_gethdr(int how, int type)
{
	struct mbuf *m;

	MGETHDR(m, how, type);
	return (m);
}

struct mbuf *
m_getclr(int how, int type)
{
	struct mbuf *m;

	MGET(m, how, type);
	if (m != NULL)
		bzero(mtod(m, caddr_t), MLEN);
	return (m);
}

struct mbuf *
m_free(struct mbuf *m)
{
	struct mbuf *n;

	MFREE(m, n);
	return (n);
}

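/*
 * Usage sketch for the routines above (illustrative only, compiled out):
 * grab a single MT_DATA mbuf without sleeping, store a few bytes in it,
 * and release it.  The payload below is a made-up example.
 */
#if 0
static void
example_mbuf_get_free(void)
{
	struct mbuf *m;
	static const char payload[] = "example";

	m = m_get(M_DONTWAIT, MT_DATA);		/* may return NULL */
	if (m == NULL)
		return;
	bcopy(payload, mtod(m, caddr_t), sizeof(payload));
	m->m_len = sizeof(payload);
	/* ... hand the mbuf off somewhere, or just drop it again ... */
	(void)m_free(m);
}
#endif
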
/*
 * struct mbuf *
 * m_getm(m, len, how, type)
 *
 * This will allocate len-worth of mbufs and/or mbuf clusters (whatever fits
 * best) and return a pointer to the top of the allocated chain. If m is
 * non-null, then we assume that it is a single mbuf or an mbuf chain to
 * which we want len bytes worth of mbufs and/or clusters attached, and so
 * if we succeed in allocating it, we will just return a pointer to m.
 *
 * If we happen to fail at any point during the allocation, we will free
 * up everything we have already allocated and return NULL.
 *
 */
struct mbuf *
m_getm(struct mbuf *m, int len, int how, int type)
{
	struct mbuf *top, *tail, *mp, *mtail = NULL;

	KASSERT(len >= 0, ("len is < 0 in m_getm"));

	MGET(mp, how, type);
	if (mp == NULL)
		return (NULL);
	else if (len > MINCLSIZE) {
		MCLGET(mp, how);
		if ((mp->m_flags & M_EXT) == 0) {
			m_free(mp);
			return (NULL);
		}
	}
	mp->m_len = 0;
	len -= M_TRAILINGSPACE(mp);

	if (m != NULL)
		for (mtail = m; mtail->m_next != NULL; mtail = mtail->m_next);
	else
		m = mp;

	top = tail = mp;
	while (len > 0) {
		MGET(mp, how, type);
		if (mp == NULL)
			goto failed;

		tail->m_next = mp;
		tail = mp;
		if (len > MINCLSIZE) {
			MCLGET(mp, how);
			if ((mp->m_flags & M_EXT) == 0)
				goto failed;
		}

		mp->m_len = 0;
		len -= M_TRAILINGSPACE(mp);
	}

	if (mtail != NULL)
		mtail->m_next = top;
	return (m);

failed:
	m_freem(top);
	return (NULL);
}

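/*
 * Sketch of how a caller might use m_getm() (illustrative only, compiled
 * out): ask for enough mbufs and/or clusters to hold `want' bytes,
 * appending to an existing chain when one is passed in.  The length is
 * arbitrary.
 */
#if 0
static struct mbuf *
example_getm(struct mbuf *m, int want)
{
	struct mbuf *chain;

	chain = m_getm(m, want, M_TRYWAIT, MT_DATA);
	if (chain == NULL)
		return (NULL);	/* nothing new was allocated; `m' is intact */
	return (chain);		/* equals `m' when `m' was non-NULL */
}
#endif
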
void
m_freem(struct mbuf *m)
{
	struct mbuf *n;

	if (m == NULL)
		return;
	do {
		/*
		 * We do need to check non-first mbufs, since some existing
		 * code does not call M_PREPEND properly.
		 * (example: call to bpf_mtap from drivers)
		 */
		if ((m->m_flags & M_PKTHDR) != 0 && m->m_pkthdr.aux) {
			m_freem(m->m_pkthdr.aux);
			m->m_pkthdr.aux = NULL;
		}
		MFREE(m, n);
		m = n;
	} while (m);
}

/*
 * Lesser-used path for M_PREPEND:
 * allocate new mbuf to prepend to chain,
 * copy junk along.
 */
struct mbuf *
m_prepend(struct mbuf *m, int len, int how)
{
	struct mbuf *mn;

	MGET(mn, how, m->m_type);
	if (mn == NULL) {
		m_freem(m);
		return (NULL);
	}
	if (m->m_flags & M_PKTHDR) {
		M_COPY_PKTHDR(mn, m);
		m->m_flags &= ~M_PKTHDR;
	}
	mn->m_next = m;
	m = mn;
	if (len < MHLEN)
		MH_ALIGN(m, len);
	m->m_len = len;
	return (m);
}

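/*
 * Sketch of the common M_PREPEND() idiom that falls back on m_prepend()
 * above (illustrative only, compiled out).  `struct example_hdr' is a
 * hypothetical protocol header.
 */
#if 0
static struct mbuf *
example_prepend(struct mbuf *m)
{
	struct example_hdr { u_int32_t eh_field; } *eh;

	M_PREPEND(m, sizeof(*eh), M_DONTWAIT);
	if (m == NULL)		/* the chain was freed on failure */
		return (NULL);
	eh = mtod(m, struct example_hdr *);
	eh->eh_field = 0;
	return (m);
}
#endif
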
/*
 * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
 * continuing for "len" bytes.  If len is M_COPYALL, copy to end of mbuf.
 * The wait parameter is a choice of M_TRYWAIT/M_DONTWAIT from caller.
 * Note that the copy is read-only, because clusters are not copied,
 * only their reference counts are incremented.
 */
struct mbuf *
m_copym(struct mbuf *m, int off0, int len, int wait)
{
	struct mbuf *n, **np;
	int off = off0;
	struct mbuf *top;
	int copyhdr = 0;

	KASSERT(off >= 0, ("m_copym, negative off %d", off));
	KASSERT(len >= 0, ("m_copym, negative len %d", len));
	if (off == 0 && m->m_flags & M_PKTHDR)
		copyhdr = 1;
	while (off > 0) {
		KASSERT(m != NULL, ("m_copym, offset > size of mbuf chain"));
		if (off < m->m_len)
			break;
		off -= m->m_len;
		m = m->m_next;
	}
	np = &top;
	top = 0;
	while (len > 0) {
		if (m == NULL) {
			KASSERT(len == M_COPYALL,
			    ("m_copym, length > size of mbuf chain"));
			break;
		}
		MGET(n, wait, m->m_type);
		*np = n;
		if (n == NULL)
			goto nospace;
		if (copyhdr) {
			M_COPY_PKTHDR(n, m);
			if (len == M_COPYALL)
				n->m_pkthdr.len -= off0;
			else
				n->m_pkthdr.len = len;
			copyhdr = 0;
		}
		n->m_len = min(len, m->m_len - off);
		if (m->m_flags & M_EXT) {
			n->m_data = m->m_data + off;
			n->m_ext = m->m_ext;
			n->m_flags |= M_EXT;
			MEXT_ADD_REF(m);
		} else
			bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t),
			    (unsigned)n->m_len);
		if (len != M_COPYALL)
			len -= n->m_len;
		off = 0;
		m = m->m_next;
		np = &n->m_next;
	}
	if (top == NULL) {
		mtx_lock(&mbuf_mtx);
		mbstat.m_mcfail++;
		mtx_unlock(&mbuf_mtx);
	}
	return (top);
nospace:
	m_freem(top);
	mtx_lock(&mbuf_mtx);
	mbstat.m_mcfail++;
	mtx_unlock(&mbuf_mtx);
	return (NULL);
}

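/*
 * Sketch of a read-only packet copy via m_copym() (illustrative only,
 * compiled out).  Cluster data is shared by reference, so the copy must
 * not be written to.
 */
#if 0
static struct mbuf *
example_copym(struct mbuf *m)
{
	struct mbuf *n;

	n = m_copym(m, 0, M_COPYALL, M_DONTWAIT);
	/* `n' (if non-NULL) shares clusters with `m'; treat it as read-only. */
	return (n);
}
#endif
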
/*
 * Copy an entire packet, including header (which must be present).
 * An optimization of the common case `m_copym(m, 0, M_COPYALL, how)'.
 * Note that the copy is read-only, because clusters are not copied,
 * only their reference counts are incremented.
 * Preserve alignment of the first mbuf so if the creator has left
 * some room at the beginning (e.g. for inserting protocol headers)
 * the copies still have the room available.
 */
struct mbuf *
m_copypacket(struct mbuf *m, int how)
{
	struct mbuf *top, *n, *o;

	MGET(n, how, m->m_type);
	top = n;
	if (n == NULL)
		goto nospace;

	M_COPY_PKTHDR(n, m);
	n->m_len = m->m_len;
	if (m->m_flags & M_EXT) {
		n->m_data = m->m_data;
		n->m_ext = m->m_ext;
		n->m_flags |= M_EXT;
		MEXT_ADD_REF(m);
	} else {
		n->m_data = n->m_pktdat + (m->m_data - m->m_pktdat);
		bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
	}

	m = m->m_next;
	while (m) {
		MGET(o, how, m->m_type);
		if (o == NULL)
			goto nospace;

		n->m_next = o;
		n = n->m_next;

		n->m_len = m->m_len;
		if (m->m_flags & M_EXT) {
			n->m_data = m->m_data;
			n->m_ext = m->m_ext;
			n->m_flags |= M_EXT;
			MEXT_ADD_REF(m);
		} else {
			bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
		}

		m = m->m_next;
	}
	return (top);
nospace:
	m_freem(top);
	mtx_lock(&mbuf_mtx);
	mbstat.m_mcfail++;
	mtx_unlock(&mbuf_mtx);
	return (NULL);
}

/*
 * Copy data from an mbuf chain starting "off" bytes from the beginning,
 * continuing for "len" bytes, into the indicated buffer.
 */
void
m_copydata(struct mbuf *m, int off, int len, caddr_t cp)
{
	unsigned count;

	KASSERT(off >= 0, ("m_copydata, negative off %d", off));
	KASSERT(len >= 0, ("m_copydata, negative len %d", len));
	while (off > 0) {
		KASSERT(m != NULL, ("m_copydata, offset > size of mbuf chain"));
		if (off < m->m_len)
			break;
		off -= m->m_len;
		m = m->m_next;
	}
	while (len > 0) {
		KASSERT(m != NULL, ("m_copydata, length > size of mbuf chain"));
		count = min(m->m_len - off, len);
		bcopy(mtod(m, caddr_t) + off, cp, count);
		len -= count;
		cp += count;
		off = 0;
		m = m->m_next;
	}
}

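/*
 * Sketch of pulling a fixed-size header out of a chain into a flat buffer
 * with m_copydata() (illustrative only, compiled out).  The header layout
 * is hypothetical and the chain is assumed to be at least that long.
 */
#if 0
static u_int32_t
example_copydata(struct mbuf *m)
{
	struct example_hdr { u_int32_t eh_field; } hdr;

	m_copydata(m, 0, sizeof(hdr), (caddr_t)&hdr);
	return (hdr.eh_field);
}
#endif
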
/*
 * Copy a packet header mbuf chain into a completely new chain, including
 * copying any mbuf clusters.  Use this instead of m_copypacket() when
 * you need a writable copy of an mbuf chain.
 */
struct mbuf *
m_dup(struct mbuf *m, int how)
{
	struct mbuf **p, *top = NULL;
	int remain, moff, nsize;

	/* Sanity check */
	if (m == NULL)
		return (NULL);
	KASSERT((m->m_flags & M_PKTHDR) != 0, ("%s: !PKTHDR", __FUNCTION__));

	/* While there's more data, get a new mbuf, tack it on, and fill it */
	remain = m->m_pkthdr.len;
	moff = 0;
	p = &top;
	while (remain > 0 || top == NULL) {	/* allow m->m_pkthdr.len == 0 */
		struct mbuf *n;

		/* Get the next new mbuf */
		MGET(n, how, m->m_type);
		if (n == NULL)
			goto nospace;
		if (top == NULL) {		/* first one, must be PKTHDR */
			M_COPY_PKTHDR(n, m);
			nsize = MHLEN;
		} else				/* not the first one */
			nsize = MLEN;
		if (remain >= MINCLSIZE) {
			MCLGET(n, how);
			if ((n->m_flags & M_EXT) == 0) {
				(void)m_free(n);
				goto nospace;
			}
			nsize = MCLBYTES;
		}
		n->m_len = 0;

		/* Link it into the new chain */
		*p = n;
		p = &n->m_next;

		/* Copy data from original mbuf(s) into new mbuf */
		while (n->m_len < nsize && m != NULL) {
			int chunk = min(nsize - n->m_len, m->m_len - moff);

			bcopy(m->m_data + moff, n->m_data + n->m_len, chunk);
			moff += chunk;
			n->m_len += chunk;
			remain -= chunk;
			if (moff == m->m_len) {
				m = m->m_next;
				moff = 0;
			}
		}

		/* Check correct total mbuf length */
		KASSERT((remain > 0 && m != NULL) || (remain == 0 && m == NULL),
		    ("%s: bogus m_pkthdr.len", __FUNCTION__));
	}
	return (top);

nospace:
	m_freem(top);
	mtx_lock(&mbuf_mtx);
	mbstat.m_mcfail++;
	mtx_unlock(&mbuf_mtx);
	return (NULL);
}

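/*
 * Sketch contrasting m_dup() with the reference-counted copies above
 * (illustrative only, compiled out): the duplicate owns its own storage,
 * so it is safe to modify.
 */
#if 0
static struct mbuf *
example_dup(struct mbuf *m)
{
	struct mbuf *n;

	n = m_dup(m, M_DONTWAIT);	/* NULL if an allocation fails */
	if (n != NULL && n->m_len > 0)
		*mtod(n, u_char *) ^= 0xff;	/* private storage; OK to write */
	return (n);
}
#endif
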
/*
 * Concatenate mbuf chain n to m.
 * Both chains must be of the same type (e.g. MT_DATA).
 * Any m_pkthdr is not updated.
 */
void
m_cat(struct mbuf *m, struct mbuf *n)
{
	while (m->m_next)
		m = m->m_next;
	while (n) {
		if (m->m_flags & M_EXT ||
		    m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) {
			/* just join the two chains */
			m->m_next = n;
			return;
		}
		/* splat the data from one into the other */
		bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
		    (u_int)n->m_len);
		m->m_len += n->m_len;
		n = m_free(n);
	}
}

void
m_adj(struct mbuf *mp, int req_len)
{
	int len = req_len;
	struct mbuf *m;
	int count;

	if ((m = mp) == NULL)
		return;
	if (len >= 0) {
		/*
		 * Trim from head.
		 */
		while (m != NULL && len > 0) {
			if (m->m_len <= len) {
				len -= m->m_len;
				m->m_len = 0;
				m = m->m_next;
			} else {
				m->m_len -= len;
				m->m_data += len;
				len = 0;
			}
		}
		m = mp;
		if (mp->m_flags & M_PKTHDR)
			m->m_pkthdr.len -= (req_len - len);
	} else {
		/*
		 * Trim from tail.  Scan the mbuf chain,
		 * calculating its length and finding the last mbuf.
		 * If the adjustment only affects this mbuf, then just
		 * adjust and return.  Otherwise, rescan and truncate
		 * after the remaining size.
		 */
		len = -len;
		count = 0;
		for (;;) {
			count += m->m_len;
			if (m->m_next == (struct mbuf *)0)
				break;
			m = m->m_next;
		}
		if (m->m_len >= len) {
			m->m_len -= len;
			if (mp->m_flags & M_PKTHDR)
				mp->m_pkthdr.len -= len;
			return;
		}
		count -= len;
		if (count < 0)
			count = 0;
		/*
		 * Correct length for chain is "count".
		 * Find the mbuf with last data, adjust its length,
		 * and toss data from remaining mbufs on chain.
		 */
		m = mp;
		if (m->m_flags & M_PKTHDR)
			m->m_pkthdr.len = count;
		for (; m; m = m->m_next) {
			if (m->m_len >= count) {
				m->m_len = count;
				break;
			}
			count -= m->m_len;
		}
		while (m->m_next)
			(m = m->m_next)->m_len = 0;
	}
}

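/*
 * Sketch of trimming with m_adj() (illustrative only, compiled out): a
 * positive count trims from the head, a negative count from the tail.
 * The 14- and 4-byte figures are arbitrary stand-ins for a link-level
 * header and trailer.
 */
#if 0
static void
example_adj(struct mbuf *m)
{
	m_adj(m, 14);		/* drop 14 bytes from the front */
	m_adj(m, -4);		/* drop 4 bytes from the end */
}
#endif
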
/*
 * Rearrange an mbuf chain so that len bytes are contiguous
 * and in the data area of an mbuf (so that mtod and dtom
 * will work for a structure of size len).  Returns the resulting
 * mbuf chain on success, frees it and returns NULL on failure.
 * If there is room, it will add up to max_protohdr-len extra bytes to the
 * contiguous region in an attempt to avoid being called next time.
 */
struct mbuf *
m_pullup(struct mbuf *n, int len)
{
	struct mbuf *m;
	int count;
	int space;

	/*
	 * If first mbuf has no cluster, and has room for len bytes
	 * without shifting current data, pullup into it,
	 * otherwise allocate a new mbuf to prepend to the chain.
	 */
	if ((n->m_flags & M_EXT) == 0 &&
	    n->m_data + len < &n->m_dat[MLEN] && n->m_next) {
		if (n->m_len >= len)
			return (n);
		m = n;
		n = n->m_next;
		len -= m->m_len;
	} else {
		if (len > MHLEN)
			goto bad;
		MGET(m, M_DONTWAIT, n->m_type);
		if (m == NULL)
			goto bad;
		m->m_len = 0;
		if (n->m_flags & M_PKTHDR) {
			M_COPY_PKTHDR(m, n);
			n->m_flags &= ~M_PKTHDR;
		}
	}
	space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
	do {
		count = min(min(max(len, max_protohdr), space), n->m_len);
		bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
		    (unsigned)count);
		len -= count;
		m->m_len += count;
		n->m_len -= count;
		space -= count;
		if (n->m_len)
			n->m_data += count;
		else
			n = m_free(n);
	} while (len > 0 && n);
	if (len > 0) {
		(void) m_free(m);
		goto bad;
	}
	m->m_next = n;
	return (m);
bad:
	m_freem(n);
	mtx_lock(&mbuf_mtx);
	mbstat.m_mcfail++;
	mtx_unlock(&mbuf_mtx);
	return (NULL);
}

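/*
 * Sketch of the classical m_pullup() idiom (illustrative only, compiled
 * out): make sure a header is contiguous before casting the data pointer.
 * `struct example_hdr' is hypothetical.
 */
#if 0
static struct mbuf *
example_pullup(struct mbuf *m)
{
	struct example_hdr { u_int32_t eh_field; } *eh;

	if (m->m_len < sizeof(*eh) &&
	    (m = m_pullup(m, sizeof(*eh))) == NULL)
		return (NULL);		/* m_pullup() already freed the chain */
	eh = mtod(m, struct example_hdr *);
	(void)eh->eh_field;
	return (m);
}
#endif
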
/*
 * Partition an mbuf chain in two pieces, returning the tail --
 * all but the first len0 bytes.  In case of failure, it returns NULL and
 * attempts to restore the chain to its original state.
 */
struct mbuf *
m_split(struct mbuf *m0, int len0, int wait)
{
	struct mbuf *m, *n;
	unsigned len = len0, remain;

	for (m = m0; m && len > m->m_len; m = m->m_next)
		len -= m->m_len;
	if (m == NULL)
		return (NULL);
	remain = m->m_len - len;
	if (m0->m_flags & M_PKTHDR) {
		MGETHDR(n, wait, m0->m_type);
		if (n == NULL)
			return (NULL);
		n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
		n->m_pkthdr.len = m0->m_pkthdr.len - len0;
		m0->m_pkthdr.len = len0;
		if (m->m_flags & M_EXT)
			goto extpacket;
		if (remain > MHLEN) {
			/* m can't be the lead packet */
			MH_ALIGN(n, 0);
			n->m_next = m_split(m, len, wait);
			if (n->m_next == NULL) {
				(void) m_free(n);
				return (NULL);
			} else
				return (n);
		} else
			MH_ALIGN(n, remain);
	} else if (remain == 0) {
		n = m->m_next;
		m->m_next = NULL;
		return (n);
	} else {
		MGET(n, wait, m->m_type);
		if (n == NULL)
			return (NULL);
		M_ALIGN(n, remain);
	}
extpacket:
	if (m->m_flags & M_EXT) {
		n->m_flags |= M_EXT;
		n->m_ext = m->m_ext;
		MEXT_ADD_REF(m);
		m->m_ext.ext_size = 0; /* For Accounting XXXXXX danger */
		n->m_data = m->m_data + len;
	} else {
		bcopy(mtod(m, caddr_t) + len, mtod(n, caddr_t), remain);
	}
	n->m_len = remain;
	m->m_len = len;
	n->m_next = m->m_next;
	m->m_next = NULL;
	return (n);
}
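
/*
 * Sketch of splitting a packet with m_split() (illustrative only, compiled
 * out): the original chain keeps the first `hdrlen' bytes and the returned
 * chain carries the rest.  `hdrlen' is whatever boundary the caller needs.
 */
#if 0
static struct mbuf *
example_split(struct mbuf *m, int hdrlen)
{
	struct mbuf *tail;

	tail = m_split(m, hdrlen, M_DONTWAIT);
	/* On failure tail is NULL; see the restore note above m_split(). */
	return (tail);
}
#endif
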
/*
 * Routine to copy from device local memory into mbufs.
 */
struct mbuf *
m_devget(char *buf, int totlen, int off0, struct ifnet *ifp,
	 void (*copy)(char *from, caddr_t to, u_int len))
{
	struct mbuf *m;
	struct mbuf *top = 0, **mp = &top;
	int off = off0, len;
	char *cp;
	char *epkt;

	cp = buf;
	epkt = cp + totlen;
	if (off) {
		cp += off + 2 * sizeof(u_short);
		totlen -= 2 * sizeof(u_short);
	}
	MGETHDR(m, M_DONTWAIT, MT_DATA);
	if (m == NULL)
		return (NULL);
	m->m_pkthdr.rcvif = ifp;
	m->m_pkthdr.len = totlen;
	m->m_len = MHLEN;

	while (totlen > 0) {
		if (top) {
			MGET(m, M_DONTWAIT, MT_DATA);
			if (m == NULL) {
				m_freem(top);
				return (NULL);
			}
			m->m_len = MLEN;
		}
		len = min(totlen, epkt - cp);
		if (len >= MINCLSIZE) {
			MCLGET(m, M_DONTWAIT);
			if (m->m_flags & M_EXT)
				m->m_len = len = min(len, MCLBYTES);
			else
				len = m->m_len;
		} else {
			/*
			 * Place initial small packet/header at end of mbuf.
			 */
			if (len < m->m_len) {
				if (top == NULL && len +
				    max_linkhdr <= m->m_len)
					m->m_data += max_linkhdr;
				m->m_len = len;
			} else
				len = m->m_len;
		}
		if (copy)
			copy(cp, mtod(m, caddr_t), (unsigned)len);
		else
			bcopy(cp, mtod(m, caddr_t), (unsigned)len);
		cp += len;
		*mp = m;
		mp = &m->m_next;
		totlen -= len;
		if (cp == epkt)
			cp = buf;
	}
	return (top);
}

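/*
 * Sketch of a driver receive path using m_devget() (illustrative only,
 * compiled out).  The softc layout, `sc_rxbuf' and `framelen' are
 * hypothetical; a real driver would then pass the chain up the stack.
 */
#if 0
struct example_softc { char *sc_rxbuf; struct ifnet *sc_ifp; };

static struct mbuf *
example_devget(struct example_softc *sc, int framelen)
{
	struct mbuf *m;

	m = m_devget(sc->sc_rxbuf, framelen, 0, sc->sc_ifp, NULL);
	if (m == NULL)
		return (NULL);		/* out of mbufs; drop the frame */
	return (m);
}
#endif
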
/*
 * Copy data from a buffer back into the indicated mbuf chain,
 * starting "off" bytes from the beginning, extending the mbuf
 * chain if necessary.
 */
void
m_copyback(struct mbuf *m0, int off, int len, caddr_t cp)
{
	int mlen;
	struct mbuf *m = m0, *n;
	int totlen = 0;

	if (m0 == NULL)
		return;
	while (off > (mlen = m->m_len)) {
		off -= mlen;
		totlen += mlen;
		if (m->m_next == NULL) {
			n = m_getclr(M_DONTWAIT, m->m_type);
			if (n == NULL)
				goto out;
			n->m_len = min(MLEN, len + off);
			m->m_next = n;
		}
		m = m->m_next;
	}
	while (len > 0) {
		mlen = min(m->m_len - off, len);
		bcopy(cp, off + mtod(m, caddr_t), (unsigned)mlen);
		cp += mlen;
		len -= mlen;
		mlen += off;
		off = 0;
		totlen += mlen;
		if (len == 0)
			break;
		if (m->m_next == NULL) {
			n = m_get(M_DONTWAIT, m->m_type);
			if (n == NULL)
				break;
			n->m_len = min(MLEN, len);
			m->m_next = n;
		}
		m = m->m_next;
	}
out:	if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen))
		m->m_pkthdr.len = totlen;
}

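/*
 * Sketch of patching bytes in place with m_copyback() (illustrative only,
 * compiled out): overwrite four bytes at a caller-chosen offset, letting
 * m_copyback() extend the chain if it is too short.
 */
#if 0
static void
example_copyback(struct mbuf *m, int off)
{
	u_int32_t val = 0;

	m_copyback(m, off, sizeof(val), (caddr_t)&val);
}
#endif
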
void
m_print(const struct mbuf *m)
{
	int len;
	const struct mbuf *m2;

	len = m->m_pkthdr.len;
	m2 = m;
	while (len) {
		printf("%p %*D\n", m2, m2->m_len, (u_char *)m2->m_data, "-");
		len -= m2->m_len;
		m2 = m2->m_next;
	}
	return;
}
