uipc_mbuf.c revision 78064
1/*
2 * Copyright (c) 1982, 1986, 1988, 1991, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 3. All advertising materials mentioning features or use of this software
14 *    must display the following acknowledgement:
15 *	This product includes software developed by the University of
16 *	California, Berkeley and its contributors.
17 * 4. Neither the name of the University nor the names of its contributors
18 *    may be used to endorse or promote products derived from this software
19 *    without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 *
33 *	@(#)uipc_mbuf.c	8.2 (Berkeley) 1/4/94
34 * $FreeBSD: head/sys/kern/uipc_mbuf.c 78064 2001-06-11 12:39:29Z ume $
35 */
36
37#include "opt_param.h"
38#include <sys/param.h>
39#include <sys/systm.h>
40#include <sys/condvar.h>
41#include <sys/kernel.h>
42#include <sys/lock.h>
43#include <sys/malloc.h>
44#include <sys/mbuf.h>
45#include <sys/mutex.h>
46#include <sys/sysctl.h>
47#include <sys/domain.h>
48#include <sys/protosw.h>
49
50#include <vm/vm.h>
51#include <vm/vm_kern.h>
52#include <vm/vm_extern.h>
53
54#ifndef NMBCLUSTERS
55#define NMBCLUSTERS	(512 + MAXUSERS * 16)
56#endif
57
58static void mbinit(void *);
59SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbinit, NULL)
60
61struct mbuf *mbutl;
62struct mbstat mbstat;
63u_long	mbtypes[MT_NTYPES];
64int	max_linkhdr;
65int	max_protohdr;
66int	max_hdr;
67int	max_datalen;
68int	nmbclusters = NMBCLUSTERS;
69int	nmbufs = NMBCLUSTERS * 4;
70int	nmbcnt;
71u_long	m_mballoc_wid = 0;
72u_long	m_clalloc_wid = 0;
73
74/*
75 * freelist header structures...
76 * mbffree_lst, mclfree_lst, mcntfree_lst
77 */
78struct mbffree_lst mmbfree;
79struct mclfree_lst mclfree;
80struct mcntfree_lst mcntfree;
81struct mtx	mbuf_mtx;
82
83/*
84 * sysctl(8) exported objects
85 */
86SYSCTL_DECL(_kern_ipc);
87SYSCTL_INT(_kern_ipc, KIPC_MAX_LINKHDR, max_linkhdr, CTLFLAG_RW,
88	   &max_linkhdr, 0, "");
89SYSCTL_INT(_kern_ipc, KIPC_MAX_PROTOHDR, max_protohdr, CTLFLAG_RW,
90	   &max_protohdr, 0, "");
91SYSCTL_INT(_kern_ipc, KIPC_MAX_HDR, max_hdr, CTLFLAG_RW, &max_hdr, 0, "");
92SYSCTL_INT(_kern_ipc, KIPC_MAX_DATALEN, max_datalen, CTLFLAG_RW,
93	   &max_datalen, 0, "");
94SYSCTL_INT(_kern_ipc, OID_AUTO, mbuf_wait, CTLFLAG_RW,
95	   &mbuf_wait, 0, "");
96SYSCTL_STRUCT(_kern_ipc, KIPC_MBSTAT, mbstat, CTLFLAG_RD, &mbstat, mbstat, "");
97SYSCTL_OPAQUE(_kern_ipc, OID_AUTO, mbtypes, CTLFLAG_RD, mbtypes,
98	   sizeof(mbtypes), "LU", "");
99SYSCTL_INT(_kern_ipc, KIPC_NMBCLUSTERS, nmbclusters, CTLFLAG_RD,
100	   &nmbclusters, 0, "Maximum number of mbuf clusters available");
101SYSCTL_INT(_kern_ipc, OID_AUTO, nmbufs, CTLFLAG_RD, &nmbufs, 0,
102	   "Maximum number of mbufs available");
103SYSCTL_INT(_kern_ipc, OID_AUTO, nmbcnt, CTLFLAG_RD, &nmbcnt, 0,
104	   "Maximum number of ext_buf counters available");
105
106TUNABLE_INT("kern.ipc.nmbclusters", &nmbclusters);
107TUNABLE_INT("kern.ipc.nmbufs", &nmbufs);
108TUNABLE_INT("kern.ipc.nmbcnt", &nmbcnt);
109
110static void	m_reclaim(void);
111
112/* Initial allocation numbers */
113#define NCL_INIT	2
114#define NMB_INIT	16
115#define REF_INIT	NMBCLUSTERS
116
117/*
118 * Full mbuf subsystem initialization done here.
119 *
120 * XXX: If ever we have system specific map setups to do, then move them to
121 *      machdep.c - for now, there is no reason for this stuff to go there.
122 */
123static void
124mbinit(void *dummy)
125{
126	vm_offset_t maxaddr;
127	vm_size_t mb_map_size;
128
129	/* Sanity checks and pre-initialization for non-constants */
130	if (nmbufs < nmbclusters * 2)
131		nmbufs = nmbclusters * 2;
132	if (nmbcnt == 0)
133		nmbcnt = EXT_COUNTERS;
134
135	/*
136	 * Set up the mb_map and allocate the requested VM space.
137	 */
138	mb_map_size = (vm_size_t)(nmbufs * MSIZE + nmbclusters * MCLBYTES +
139	    nmbcnt * sizeof(union mext_refcnt));
140	mb_map_size = rounddown(mb_map_size, PAGE_SIZE);
141	mb_map = kmem_suballoc(kmem_map, (vm_offset_t *)&mbutl, &maxaddr,
142	    mb_map_size);
143	/* XXX XXX XXX: mb_map->system_map = 1; */
144
145	/*
146	 * Initialize the free list headers and set up the locks for the lists.
147	 */
148	mmbfree.m_head = NULL;
149	mclfree.m_head = NULL;
150	mcntfree.m_head = NULL;
151	mtx_init(&mbuf_mtx, "mbuf free list lock", MTX_DEF);
152	cv_init(&mmbfree.m_starved, "mbuf free list starved cv");
153	cv_init(&mclfree.m_starved, "mbuf cluster free list starved cv");
154
155	/*
156	 * Initialize mbuf subsystem (sysctl exported) statistics structure.
157	 */
158	mbstat.m_msize = MSIZE;
159	mbstat.m_mclbytes = MCLBYTES;
160	mbstat.m_minclsize = MINCLSIZE;
161	mbstat.m_mlen = MLEN;
162	mbstat.m_mhlen = MHLEN;
163
164	/*
165	 * Perform some initial allocations.
166	 */
167	mtx_lock(&mbuf_mtx);
168	if (m_alloc_ref(REF_INIT, M_DONTWAIT) == 0)
169		goto bad;
170	if (m_mballoc(NMB_INIT, M_DONTWAIT) == 0)
171		goto bad;
172	if (m_clalloc(NCL_INIT, M_DONTWAIT) == 0)
173		goto bad;
174	mtx_unlock(&mbuf_mtx);
175
176	return;
177bad:
178	panic("mbinit: failed to initialize mbuf subsystem!");
179}
180
181/*
182 * Allocate at least nmb reference count structs and place them
183 * on the ref cnt free list.
184 *
185 * Must be called with the mcntfree lock held.
186 */
187int
188m_alloc_ref(u_int nmb, int how)
189{
190	caddr_t p;
191	u_int nbytes;
192	int i;
193
194	/*
195	 * We don't cap the amount of memory that can be used
196	 * by the reference counters, like we do for mbufs and
197	 * mbuf clusters. In fact, we're absolutely sure that we
198	 * won't ever be going over our allocated space. We keep enough
199	 * space in mb_map to accommodate maximum values of allocatable
200	 * external buffers including, but not limited to, clusters.
201	 * (That's also why we won't have to have wait routines for
202	 * counters).
203	 *
204	 * If we're in here, we're absolutely certain to be returning
205	 * successfully, as long as there is physical memory to accommodate
206	 * us. And if there isn't, but we're willing to wait, then
207	 * kmem_malloc() will do the only waiting needed.
208	 */
209
210	nbytes = round_page(nmb * sizeof(union mext_refcnt));
211	if (1 /* XXX: how == M_TRYWAIT */)
212		mtx_unlock(&mbuf_mtx);
213	if ((p = (caddr_t)kmem_malloc(mb_map, nbytes, how == M_TRYWAIT ?
214	    M_WAITOK : M_NOWAIT)) == NULL) {
215		if (1 /* XXX: how == M_TRYWAIT */)
216			mtx_lock(&mbuf_mtx);
217		return (0);
218	}
219	nmb = nbytes / sizeof(union mext_refcnt);
220
221	/*
222	 * We don't let go of the mutex in order to avoid a race.
223	 * It is up to the caller to let go of the mutex.
224	 */
225	if (1 /* XXX: how == M_TRYWAIT */)
226		mtx_lock(&mbuf_mtx);
227	for (i = 0; i < nmb; i++) {
228		((union mext_refcnt *)p)->next_ref = mcntfree.m_head;
229		mcntfree.m_head = (union mext_refcnt *)p;
230		p += sizeof(union mext_refcnt);
231		mbstat.m_refree++;
232	}
233	mbstat.m_refcnt += nmb;
234
235	return (1);
236}
237
238/*
239 * Allocate at least nmb mbufs and place on mbuf free list.
240 *
241 * Must be called with the mmbfree lock held.
242 */
243int
244m_mballoc(int nmb, int how)
245{
246	caddr_t p;
247	int i;
248	int nbytes;
249
250	nbytes = round_page(nmb * MSIZE);
251	nmb = nbytes / MSIZE;
252
253	/*
254	 * If we've hit the mbuf limit, stop allocating from mb_map.
255	 * Also, once we run out of map space, it will be impossible to
256	 * get any more (nothing is ever freed back to the map).
257	 */
258	if (mb_map_full || ((nmb + mbstat.m_mbufs) > nmbufs))
259		return (0);
260
261	if (1 /* XXX: how == M_TRYWAIT */)
262		mtx_unlock(&mbuf_mtx);
263	p = (caddr_t)kmem_malloc(mb_map, nbytes, how == M_TRYWAIT ?
264		M_WAITOK : M_NOWAIT);
265	if (1 /* XXX: how == M_TRYWAIT */) {
266		mtx_lock(&mbuf_mtx);
267		if (p == NULL)
268			mbstat.m_wait++;
269	}
270
271	/*
272	 * Either the map is now full, or `how' is M_DONTWAIT and there
273	 * are no pages left.
274	 */
275	if (p == NULL)
276		return (0);
277
278	/*
279	 * We don't let go of the mutex in order to avoid a race.
280	 * It is up to the caller to let go of the mutex when done
281	 * with grabbing the mbuf from the free list.
282	 */
283	for (i = 0; i < nmb; i++) {
284		((struct mbuf *)p)->m_next = mmbfree.m_head;
285		mmbfree.m_head = (struct mbuf *)p;
286		p += MSIZE;
287	}
288	mbstat.m_mbufs += nmb;
289	mbtypes[MT_FREE] += nmb;
290	return (1);
291}
292
293/*
294 * Once the mb_map has been exhausted and the call to the allocation macros
295 * (or, in some cases, functions) was made with M_TRYWAIT, it is necessary
296 * to rely solely on reclaimed mbufs.
297 *
298 * Here we ask the protocols to free up some resources and, if we still
299 * cannot get anything, we wait for an mbuf to be freed for at most a
300 * designated (mbuf_wait) amount of time.
301 *
302 * Must be called with the mmbfree mutex held.
303 */
304struct mbuf *
305m_mballoc_wait(void)
306{
307	struct mbuf *p = NULL;
308
309	/*
310	 * See if we can drain some resources out of the protocols.
311	 * We drop the mmbfree mutex to avoid recursing into it in some of
312	 * the drain routines. Clearly, we're faced with a race here because
313	 * once something is freed during the drain, it may be grabbed right
314	 * from under us by some other thread. But we accept this possibility
315	 * in order to avoid a potentially large lock recursion and, more
316	 * importantly, to avoid a potential lock order reversal which may
317	 * result in deadlock (See comment above m_reclaim()).
318	 */
319	mtx_unlock(&mbuf_mtx);
320	m_reclaim();
321
322	mtx_lock(&mbuf_mtx);
323	_MGET(p, M_DONTWAIT);
324
325	if (p == NULL) {
326		int retval;
327
328		m_mballoc_wid++;
329		retval = cv_timedwait(&mmbfree.m_starved, &mbuf_mtx,
330		    mbuf_wait);
331		m_mballoc_wid--;
332
333		/*
334		 * If we got signaled (i.e. didn't time out), allocate.
335		 */
336		if (retval == 0)
337			_MGET(p, M_DONTWAIT);
338	}
339
340	if (p != NULL) {
341		mbstat.m_wait++;
342		if (mmbfree.m_head != NULL)
343			MBWAKEUP(m_mballoc_wid, &mmbfree.m_starved);
344	}
345
346	return (p);
347}
348
349/*
350 * Allocate some number of mbuf clusters
351 * and place on cluster free list.
352 *
353 * Must be called with the mclfree lock held.
354 */
355int
356m_clalloc(int ncl, int how)
357{
358	caddr_t p;
359	int i;
360	int npg_sz;
361
362	npg_sz = round_page(ncl * MCLBYTES);
363	ncl = npg_sz / MCLBYTES;
364
365	 * If the map is now full (nothing will ever be freed back to it),
366	 * or if we've hit the mbuf cluster limit, stop allocating from
367	 * mb_map.
368	 * mb_map.
369	 */
370	if (mb_map_full || ((ncl + mbstat.m_clusters) > nmbclusters))
371		return (0);
372
373	if (1 /* XXX: how == M_TRYWAIT */)
374		mtx_unlock(&mbuf_mtx);
375	p = (caddr_t)kmem_malloc(mb_map, npg_sz,
376				 how == M_TRYWAIT ? M_WAITOK : M_NOWAIT);
377	if (1 /* XXX: how == M_TRYWAIT */)
378		mtx_lock(&mbuf_mtx);
379
380	/*
381	 * Either the map is now full, or `how' is M_DONTWAIT and there
382	 * are no pages left.
383	 */
384	if (p == NULL)
385		return (0);
386
387	for (i = 0; i < ncl; i++) {
388		((union mcluster *)p)->mcl_next = mclfree.m_head;
389		mclfree.m_head = (union mcluster *)p;
390		p += MCLBYTES;
391		mbstat.m_clfree++;
392	}
393	mbstat.m_clusters += ncl;
394	return (1);
395}
396
397/*
398 * Once the mb_map submap has been exhausted and the allocation is called with
399 * M_TRYWAIT, we rely on the mclfree list. If nothing is free, we will
400 * block on a cv for a designated amount of time (mbuf_wait) or until we're
401 * signaled due to sudden mcluster availability.
402 *
403 * Must be called with the mclfree lock held.
404 */
405caddr_t
406m_clalloc_wait(void)
407{
408	caddr_t p = NULL;
409	int retval;
410
411	m_clalloc_wid++;
412	retval = cv_timedwait(&mclfree.m_starved, &mbuf_mtx, mbuf_wait);
413	m_clalloc_wid--;
414
415	/*
416	 * Now that we (think) we've got something, try again.
417	 */
418	if (retval == 0)
419		_MCLALLOC(p, M_DONTWAIT);
420
421	if (p != NULL) {
422		mbstat.m_wait++;
423		if (mclfree.m_head != NULL)
424			MBWAKEUP(m_clalloc_wid, &mclfree.m_starved);
425	}
426
427	return (p);
428}
429
430/*
431 * m_reclaim: drain protocols in the hope of freeing up some resources...
432 *
433 * XXX: No locks should be held going in here. The drain routines presently
434 * have to acquire some locks, which raises the possibility of a lock order
435 * violation if we're holding any mutex that is acquired in reverse order
436 * relative to one of the locks in the drain routines.
437 */
438static void
439m_reclaim(void)
440{
441	struct domain *dp;
442	struct protosw *pr;
443
444#ifdef WITNESS
445	KASSERT(witness_list(curproc) == 0,
446	    ("m_reclaim called with locks held"));
447#endif
448
449	for (dp = domains; dp; dp = dp->dom_next)
450		for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
451			if (pr->pr_drain)
452				(*pr->pr_drain)();
453	mbstat.m_drain++;
454}
455
456/*
457 * Space allocation routines.
458 * Some of these are also available as macros
459 * for critical paths.
460 */
461struct mbuf *
462m_get(int how, int type)
463{
464	struct mbuf *m;
465
466	MGET(m, how, type);
467	return (m);
468}
469
470struct mbuf *
471m_gethdr(int how, int type)
472{
473	struct mbuf *m;
474
475	MGETHDR(m, how, type);
476	return (m);
477}
478
479struct mbuf *
480m_getclr(int how, int type)
481{
482	struct mbuf *m;
483
484	MGET(m, how, type);
485	if (m != NULL)
486		bzero(mtod(m, caddr_t), MLEN);
487	return (m);
488}
489
490struct mbuf *
491m_free(struct mbuf *m)
492{
493	struct mbuf *n;
494
495	MFREE(m, n);
496	return (n);
497}
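
/*
 * Illustrative sketch (hypothetical caller, not part of the allocation
 * path): the routines above are the function forms of the MGET/MGETHDR/
 * MFREE macros.  A typical consumer allocates a packet header mbuf,
 * attaches a cluster when more than MHLEN bytes are needed, and releases
 * the whole chain with m_freem() on failure.
 */
static struct mbuf *
example_alloc_packet(int len)
{
	struct mbuf *m;

	m = m_gethdr(M_DONTWAIT, MT_DATA);
	if (m == NULL)
		return (NULL);
	if (len > MHLEN) {
		MCLGET(m, M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_freem(m);
			return (NULL);
		}
	}
	m->m_len = 0;
	m->m_pkthdr.len = 0;
	return (m);
}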
498
499/*
500 * struct mbuf *
501 * m_getm(m, len, how, type)
502 *
503 * This will allocate len-worth of mbufs and/or mbuf clusters (whatever fits
504 * best) and return a pointer to the top of the allocated chain. If m is
505 * non-null, then we assume that it is a single mbuf or an mbuf chain to
506 * which we want len bytes worth of mbufs and/or clusters attached, and so
507 * if we succeed in allocating it, we will just return a pointer to m.
508 *
509 * If we happen to fail at any point during the allocation, we will free
510 * up everything we have already allocated and return NULL.
511 *
512 */
513struct mbuf *
514m_getm(struct mbuf *m, int len, int how, int type)
515{
516	struct mbuf *top, *tail, *mp, *mtail = NULL;
517
518	KASSERT(len >= 0, ("len is < 0 in m_getm"));
519
520	MGET(mp, how, type);
521	if (mp == NULL)
522		return (NULL);
523	else if (len > MINCLSIZE) {
524		MCLGET(mp, how);
525		if ((mp->m_flags & M_EXT) == 0) {
526			m_free(mp);
527			return (NULL);
528		}
529	}
530	mp->m_len = 0;
531	len -= M_TRAILINGSPACE(mp);
532
533	if (m != NULL)
534		for (mtail = m; mtail->m_next != NULL; mtail = mtail->m_next);
535	else
536		m = mp;
537
538	top = tail = mp;
539	while (len > 0) {
540		MGET(mp, how, type);
541		if (mp == NULL)
542			goto failed;
543
544		tail->m_next = mp;
545		tail = mp;
546		if (len > MINCLSIZE) {
547			MCLGET(mp, how);
548			if ((mp->m_flags & M_EXT) == 0)
549				goto failed;
550		}
551
552		mp->m_len = 0;
553		len -= M_TRAILINGSPACE(mp);
554	}
555
556	if (mtail != NULL)
557		mtail->m_next = top;
558	return (m);
559
560failed:
561	m_freem(top);
562	return (NULL);
563}
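
/*
 * Illustrative sketch (hypothetical caller): m_getm() either extends an
 * existing chain by at least len bytes of storage or, when handed a NULL
 * chain, builds a fresh one; on failure the original chain, if any, is
 * left untouched.  The caller still has to copy data in and update the
 * m_len/m_pkthdr.len fields itself.
 */
static struct mbuf *
example_extend_chain(struct mbuf *m, int len)
{
	struct mbuf *top;

	top = m_getm(m, len, M_DONTWAIT, MT_DATA);
	if (top == NULL)
		return (NULL);		/* m, if non-NULL, is still intact */
	return (top);			/* equals m when m was non-NULL */
}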
564
565void
566m_freem(struct mbuf *m)
567{
568	struct mbuf *n;
569
570	if (m == NULL)
571		return;
572	do {
573		MFREE(m, n);
574		m = n;
575	} while (m);
576}
577
578/*
579 * Lesser-used path for M_PREPEND:
580 * allocate new mbuf to prepend to chain,
581 * copy junk along.
582 */
583struct mbuf *
584m_prepend(struct mbuf *m, int len, int how)
585{
586	struct mbuf *mn;
587
588	MGET(mn, how, m->m_type);
589	if (mn == NULL) {
590		m_freem(m);
591		return (NULL);
592	}
593	if (m->m_flags & M_PKTHDR) {
594		M_COPY_PKTHDR(mn, m);
595		m->m_flags &= ~M_PKTHDR;
596	}
597	mn->m_next = m;
598	m = mn;
599	if (len < MHLEN)
600		MH_ALIGN(m, len);
601	m->m_len = len;
602	return (m);
603}
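
/*
 * Illustrative sketch (hypothetical header length): callers normally go
 * through the M_PREPEND() macro, which only falls back to m_prepend()
 * when the first mbuf has no leading space.  Direct use looks like this;
 * note that the chain is freed on failure and that m_pkthdr.len still has
 * to be adjusted by the caller, as the M_PREPEND() macro does.
 */
static struct mbuf *
example_prepend_hdr(struct mbuf *m, int hdrlen)
{
	m = m_prepend(m, hdrlen, M_DONTWAIT);
	if (m != NULL && (m->m_flags & M_PKTHDR))
		m->m_pkthdr.len += hdrlen;
	return (m);
}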
604
605/*
606 * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
607 * continuing for "len" bytes.  If len is M_COPYALL, copy to end of mbuf.
608 * The wait parameter is a choice of M_TRYWAIT/M_DONTWAIT from caller.
609 * Note that the copy is read-only, because clusters are not copied,
610 * only their reference counts are incremented.
611 */
612struct mbuf *
613m_copym(struct mbuf *m, int off0, int len, int wait)
614{
615	struct mbuf *n, **np;
616	int off = off0;
617	struct mbuf *top;
618	int copyhdr = 0;
619
620	KASSERT(off >= 0, ("m_copym, negative off %d", off));
621	KASSERT(len >= 0, ("m_copym, negative len %d", len));
622	if (off == 0 && m->m_flags & M_PKTHDR)
623		copyhdr = 1;
624	while (off > 0) {
625		KASSERT(m != NULL, ("m_copym, offset > size of mbuf chain"));
626		if (off < m->m_len)
627			break;
628		off -= m->m_len;
629		m = m->m_next;
630	}
631	np = &top;
632	top = 0;
633	while (len > 0) {
634		if (m == NULL) {
635			KASSERT(len == M_COPYALL,
636			    ("m_copym, length > size of mbuf chain"));
637			break;
638		}
639		MGET(n, wait, m->m_type);
640		*np = n;
641		if (n == NULL)
642			goto nospace;
643		if (copyhdr) {
644			M_COPY_PKTHDR(n, m);
645			if (len == M_COPYALL)
646				n->m_pkthdr.len -= off0;
647			else
648				n->m_pkthdr.len = len;
649			copyhdr = 0;
650		}
651		n->m_len = min(len, m->m_len - off);
652		if (m->m_flags & M_EXT) {
653			n->m_data = m->m_data + off;
654			n->m_ext = m->m_ext;
655			n->m_flags |= M_EXT;
656			MEXT_ADD_REF(m);
657		} else
658			bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t),
659			    (unsigned)n->m_len);
660		if (len != M_COPYALL)
661			len -= n->m_len;
662		off = 0;
663		m = m->m_next;
664		np = &n->m_next;
665	}
666	if (top == NULL) {
667		mtx_lock(&mbuf_mtx);
668		mbstat.m_mcfail++;
669		mtx_unlock(&mbuf_mtx);
670	}
671	return (top);
672nospace:
673	m_freem(top);
674	mtx_lock(&mbuf_mtx);
675	mbstat.m_mcfail++;
676	mtx_unlock(&mbuf_mtx);
677	return (NULL);
678}
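
/*
 * Illustrative sketch (hypothetical transmit helper): because m_copym()
 * shares clusters by bumping their reference counts rather than copying
 * the data, the result is only suitable for readers, e.g. pulling len
 * bytes at offset off out of a socket-buffer chain for (re)transmission.
 */
static struct mbuf *
example_copy_for_xmit(struct mbuf *chain, int off, int len)
{
	struct mbuf *n;

	n = m_copym(chain, off, len, M_DONTWAIT);
	/* If non-NULL, n must be treated as read-only; see m_dup() below. */
	return (n);
}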
679
680/*
681 * Copy an entire packet, including header (which must be present).
682 * An optimization of the common case `m_copym(m, 0, M_COPYALL, how)'.
683 * Note that the copy is read-only, because clusters are not copied,
684 * only their reference counts are incremented.
685 * Preserve alignment of the first mbuf so if the creator has left
686 * some room at the beginning (e.g. for inserting protocol headers)
687 * the copies still have the room available.
688 */
689struct mbuf *
690m_copypacket(struct mbuf *m, int how)
691{
692	struct mbuf *top, *n, *o;
693
694	MGET(n, how, m->m_type);
695	top = n;
696	if (n == NULL)
697		goto nospace;
698
699	M_COPY_PKTHDR(n, m);
700	n->m_len = m->m_len;
701	if (m->m_flags & M_EXT) {
702		n->m_data = m->m_data;
703		n->m_ext = m->m_ext;
704		n->m_flags |= M_EXT;
705		MEXT_ADD_REF(m);
706	} else {
707		n->m_data = n->m_pktdat + (m->m_data - m->m_pktdat);
708		bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
709	}
710
711	m = m->m_next;
712	while (m) {
713		MGET(o, how, m->m_type);
714		if (o == NULL)
715			goto nospace;
716
717		n->m_next = o;
718		n = n->m_next;
719
720		n->m_len = m->m_len;
721		if (m->m_flags & M_EXT) {
722			n->m_data = m->m_data;
723			n->m_ext = m->m_ext;
724			n->m_flags |= M_EXT;
725			MEXT_ADD_REF(m);
726		} else {
727			bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
728		}
729
730		m = m->m_next;
731	}
732	return top;
733nospace:
734	m_freem(top);
735	mtx_lock(&mbuf_mtx);
736	mbstat.m_mcfail++;
737	mtx_unlock(&mbuf_mtx);
738	return (NULL);
739}
740
741/*
742 * Copy data from an mbuf chain starting "off" bytes from the beginning,
743 * continuing for "len" bytes, into the indicated buffer.
744 */
745void
746m_copydata(struct mbuf *m, int off, int len, caddr_t cp)
747{
748	unsigned count;
749
750	KASSERT(off >= 0, ("m_copydata, negative off %d", off));
751	KASSERT(len >= 0, ("m_copydata, negative len %d", len));
752	while (off > 0) {
753		KASSERT(m != NULL, ("m_copydata, offset > size of mbuf chain"));
754		if (off < m->m_len)
755			break;
756		off -= m->m_len;
757		m = m->m_next;
758	}
759	while (len > 0) {
760		KASSERT(m != NULL, ("m_copydata, length > size of mbuf chain"));
761		count = min(m->m_len - off, len);
762		bcopy(mtod(m, caddr_t) + off, cp, count);
763		len -= count;
764		cp += count;
765		off = 0;
766		m = m->m_next;
767	}
768}
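
/*
 * Illustrative sketch (example_hdr is a hypothetical structure): copying
 * through m_copydata() is the safe way to read a field that may straddle
 * mbuf boundaries without pulling the chain up first.
 */
struct example_hdr {
	u_int32_t	eh_field;		/* assumed layout */
};

static int
example_read_hdr(struct mbuf *m, int off, struct example_hdr *hp)
{
	if (m->m_pkthdr.len < off + (int)sizeof(*hp))
		return (0);
	m_copydata(m, off, sizeof(*hp), (caddr_t)hp);
	return (1);
}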
769
770/*
771 * Copy a packet header mbuf chain into a completely new chain, including
772 * copying any mbuf clusters.  Use this instead of m_copypacket() when
773 * you need a writable copy of an mbuf chain.
774 */
775struct mbuf *
776m_dup(struct mbuf *m, int how)
777{
778	struct mbuf **p, *top = NULL;
779	int remain, moff, nsize;
780
781	/* Sanity check */
782	if (m == NULL)
783		return (NULL);
784	KASSERT((m->m_flags & M_PKTHDR) != 0, ("%s: !PKTHDR", __FUNCTION__));
785
786	/* While there's more data, get a new mbuf, tack it on, and fill it */
787	remain = m->m_pkthdr.len;
788	moff = 0;
789	p = &top;
790	while (remain > 0 || top == NULL) {	/* allow m->m_pkthdr.len == 0 */
791		struct mbuf *n;
792
793		/* Get the next new mbuf */
794		MGET(n, how, m->m_type);
795		if (n == NULL)
796			goto nospace;
797		if (top == NULL) {		/* first one, must be PKTHDR */
798			M_COPY_PKTHDR(n, m);
799			nsize = MHLEN;
800		} else				/* not the first one */
801			nsize = MLEN;
802		if (remain >= MINCLSIZE) {
803			MCLGET(n, how);
804			if ((n->m_flags & M_EXT) == 0) {
805				(void)m_free(n);
806				goto nospace;
807			}
808			nsize = MCLBYTES;
809		}
810		n->m_len = 0;
811
812		/* Link it into the new chain */
813		*p = n;
814		p = &n->m_next;
815
816		/* Copy data from original mbuf(s) into new mbuf */
817		while (n->m_len < nsize && m != NULL) {
818			int chunk = min(nsize - n->m_len, m->m_len - moff);
819
820			bcopy(m->m_data + moff, n->m_data + n->m_len, chunk);
821			moff += chunk;
822			n->m_len += chunk;
823			remain -= chunk;
824			if (moff == m->m_len) {
825				m = m->m_next;
826				moff = 0;
827			}
828		}
829
830		/* Check correct total mbuf length */
831		KASSERT((remain > 0 && m != NULL) || (remain == 0 && m == NULL),
832		    	("%s: bogus m_pkthdr.len", __FUNCTION__));
833	}
834	return (top);
835
836nospace:
837	m_freem(top);
838	mtx_lock(&mbuf_mtx);
839	mbstat.m_mcfail++;
840	mtx_unlock(&mbuf_mtx);
841	return (NULL);
842}
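
/*
 * Illustrative sketch (hypothetical caller): unlike m_copypacket(), m_dup()
 * copies cluster contents as well, so the duplicate may be modified even
 * while the original chain is still referenced elsewhere.
 */
static struct mbuf *
example_writable_copy(struct mbuf *m)
{
	struct mbuf *n;

	n = m_dup(m, M_DONTWAIT);
	if (n != NULL && n->m_len > 0)
		*mtod(n, u_char *) ^= 0xff;	/* safe: private storage */
	return (n);
}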
843
844/*
845 * Concatenate mbuf chain n to m.
846 * Both chains must be of the same type (e.g. MT_DATA).
847 * Any m_pkthdr is not updated.
848 */
849void
850m_cat(struct mbuf *m, struct mbuf *n)
851{
852	while (m->m_next)
853		m = m->m_next;
854	while (n) {
855		if (m->m_flags & M_EXT ||
856		    m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) {
857			/* just join the two chains */
858			m->m_next = n;
859			return;
860		}
861		/* splat the data from one into the other */
862		bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
863		    (u_int)n->m_len);
864		m->m_len += n->m_len;
865		n = m_free(n);
866	}
867}
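
/*
 * Illustrative sketch (assumes both chains carry packet headers): since
 * m_cat() leaves m_pkthdr alone, a caller joining two packets has to fix
 * up the length of the head chain itself.
 */
static void
example_cat_packets(struct mbuf *m, struct mbuf *n)
{
	int tail_len = n->m_pkthdr.len;

	m_cat(m, n);
	m->m_pkthdr.len += tail_len;
}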
868
869void
870m_adj(struct mbuf *mp, int req_len)
871{
872	int len = req_len;
873	struct mbuf *m;
874	int count;
875
876	if ((m = mp) == NULL)
877		return;
878	if (len >= 0) {
879		/*
880		 * Trim from head.
881		 */
882		while (m != NULL && len > 0) {
883			if (m->m_len <= len) {
884				len -= m->m_len;
885				m->m_len = 0;
886				m = m->m_next;
887			} else {
888				m->m_len -= len;
889				m->m_data += len;
890				len = 0;
891			}
892		}
893		m = mp;
894		if (mp->m_flags & M_PKTHDR)
895			m->m_pkthdr.len -= (req_len - len);
896	} else {
897		/*
898		 * Trim from tail.  Scan the mbuf chain,
899		 * calculating its length and finding the last mbuf.
900		 * If the adjustment only affects this mbuf, then just
901		 * adjust and return.  Otherwise, rescan and truncate
902		 * after the remaining size.
903		 */
904		len = -len;
905		count = 0;
906		for (;;) {
907			count += m->m_len;
908			if (m->m_next == (struct mbuf *)0)
909				break;
910			m = m->m_next;
911		}
912		if (m->m_len >= len) {
913			m->m_len -= len;
914			if (mp->m_flags & M_PKTHDR)
915				mp->m_pkthdr.len -= len;
916			return;
917		}
918		count -= len;
919		if (count < 0)
920			count = 0;
921		/*
922		 * Correct length for chain is "count".
923		 * Find the mbuf with last data, adjust its length,
924		 * and toss data from remaining mbufs on chain.
925		 */
926		m = mp;
927		if (m->m_flags & M_PKTHDR)
928			m->m_pkthdr.len = count;
929		for (; m; m = m->m_next) {
930			if (m->m_len >= count) {
931				m->m_len = count;
932				break;
933			}
934			count -= m->m_len;
935		}
936		while (m->m_next)
937			(m = m->m_next)->m_len = 0;
938	}
939}
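
/*
 * Illustrative sketch (hypothetical header/trailer sizes): a positive
 * count trims from the front of the chain, a negative one from the back,
 * with m_pkthdr.len kept in sync when the first mbuf has M_PKTHDR set.
 */
static void
example_strip_encap(struct mbuf *m, int hdrlen, int trailerlen)
{
	m_adj(m, hdrlen);		/* drop encapsulation header */
	m_adj(m, -trailerlen);		/* drop trailer bytes */
}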
940
941/*
942 * Rearrange an mbuf chain so that len bytes are contiguous
943 * and in the data area of an mbuf (so that mtod and dtom
944 * will work for a structure of size len).  Returns the resulting
945 * mbuf chain on success, frees it and returns null on failure.
946 * If there is room, it will add up to max_protohdr-len extra bytes to the
947 * contiguous region in an attempt to avoid being called next time.
948 */
949struct mbuf *
950m_pullup(struct mbuf *n, int len)
951{
952	struct mbuf *m;
953	int count;
954	int space;
955
956	/*
957	 * If first mbuf has no cluster, and has room for len bytes
958	 * without shifting current data, pullup into it,
959	 * otherwise allocate a new mbuf to prepend to the chain.
960	 */
961	if ((n->m_flags & M_EXT) == 0 &&
962	    n->m_data + len < &n->m_dat[MLEN] && n->m_next) {
963		if (n->m_len >= len)
964			return (n);
965		m = n;
966		n = n->m_next;
967		len -= m->m_len;
968	} else {
969		if (len > MHLEN)
970			goto bad;
971		MGET(m, M_DONTWAIT, n->m_type);
972		if (m == NULL)
973			goto bad;
974		m->m_len = 0;
975		if (n->m_flags & M_PKTHDR) {
976			M_COPY_PKTHDR(m, n);
977			n->m_flags &= ~M_PKTHDR;
978		}
979	}
980	space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
981	do {
982		count = min(min(max(len, max_protohdr), space), n->m_len);
983		bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
984		  (unsigned)count);
985		len -= count;
986		m->m_len += count;
987		n->m_len -= count;
988		space -= count;
989		if (n->m_len)
990			n->m_data += count;
991		else
992			n = m_free(n);
993	} while (len > 0 && n);
994	if (len > 0) {
995		(void) m_free(m);
996		goto bad;
997	}
998	m->m_next = n;
999	return (m);
1000bad:
1001	m_freem(n);
1002	mtx_lock(&mbuf_mtx);
1003	mbstat.m_mpfail++;
1004	mtx_unlock(&mbuf_mtx);
1005	return (NULL);
1006}
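
/*
 * Illustrative sketch (hypothetical input path): the classic idiom is to
 * make sure the first hdrlen bytes are contiguous before casting the data
 * pointer with mtod().  The original chain is freed when m_pullup() fails,
 * so the caller's mbuf pointer must be updated either way.
 */
static void *
example_pullup_hdr(struct mbuf **mp, int hdrlen)
{
	struct mbuf *m = *mp;

	if (m->m_len < hdrlen) {
		m = m_pullup(m, hdrlen);
		*mp = m;
		if (m == NULL)
			return (NULL);	/* old chain already freed */
	}
	return (mtod(m, void *));
}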
1007
1008/*
1009 * Partition an mbuf chain in two pieces, returning the tail --
1010 * all but the first len0 bytes.  In case of failure, it returns NULL and
1011 * attempts to restore the chain to its original state.
1012 */
1013struct mbuf *
1014m_split(struct mbuf *m0, int len0, int wait)
1015{
1016	struct mbuf *m, *n;
1017	unsigned len = len0, remain;
1018
1019	for (m = m0; m && len > m->m_len; m = m->m_next)
1020		len -= m->m_len;
1021	if (m == NULL)
1022		return (NULL);
1023	remain = m->m_len - len;
1024	if (m0->m_flags & M_PKTHDR) {
1025		MGETHDR(n, wait, m0->m_type);
1026		if (n == NULL)
1027			return (NULL);
1028		n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
1029		n->m_pkthdr.len = m0->m_pkthdr.len - len0;
1030		m0->m_pkthdr.len = len0;
1031		if (m->m_flags & M_EXT)
1032			goto extpacket;
1033		if (remain > MHLEN) {
1034			/* m can't be the lead packet */
1035			MH_ALIGN(n, 0);
1036			n->m_next = m_split(m, len, wait);
1037			if (n->m_next == NULL) {
1038				(void) m_free(n);
1039				return (NULL);
1040			} else
1041				return (n);
1042		} else
1043			MH_ALIGN(n, remain);
1044	} else if (remain == 0) {
1045		n = m->m_next;
1046		m->m_next = NULL;
1047		return (n);
1048	} else {
1049		MGET(n, wait, m->m_type);
1050		if (n == NULL)
1051			return (NULL);
1052		M_ALIGN(n, remain);
1053	}
1054extpacket:
1055	if (m->m_flags & M_EXT) {
1056		n->m_flags |= M_EXT;
1057		n->m_ext = m->m_ext;
1058		MEXT_ADD_REF(m);
1059		m->m_ext.ext_size = 0; /* For Accounting XXXXXX danger */
1060		n->m_data = m->m_data + len;
1061	} else {
1062		bcopy(mtod(m, caddr_t) + len, mtod(n, caddr_t), remain);
1063	}
1064	n->m_len = remain;
1065	m->m_len = len;
1066	n->m_next = m->m_next;
1067	m->m_next = NULL;
1068	return (n);
1069}
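
/*
 * Illustrative sketch (hypothetical record boundary): m_split() leaves the
 * first reclen bytes in m0 and returns the remainder as a separate chain,
 * or NULL if it cannot allocate the mbufs needed at the split point.
 */
static struct mbuf *
example_split_record(struct mbuf *m0, int reclen)
{
	struct mbuf *tail;

	tail = m_split(m0, reclen, M_DONTWAIT);
	/* On success m0 now holds reclen bytes, tail holds the rest. */
	return (tail);
}
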
1070/*
1071 * Routine to copy from device local memory into mbufs.
1072 */
1073struct mbuf *
1074m_devget(char *buf, int totlen, int off0, struct ifnet *ifp,
1075	 void (*copy)(char *from, caddr_t to, u_int len))
1076{
1077	struct mbuf *m;
1078	struct mbuf *top = 0, **mp = &top;
1079	int off = off0, len;
1080	char *cp;
1081	char *epkt;
1082
1083	cp = buf;
1084	epkt = cp + totlen;
1085	if (off) {
1086		cp += off + 2 * sizeof(u_short);
1087		totlen -= 2 * sizeof(u_short);
1088	}
1089	MGETHDR(m, M_DONTWAIT, MT_DATA);
1090	if (m == NULL)
1091		return (NULL);
1092	m->m_pkthdr.rcvif = ifp;
1093	m->m_pkthdr.len = totlen;
1094	m->m_len = MHLEN;
1095
1096	while (totlen > 0) {
1097		if (top) {
1098			MGET(m, M_DONTWAIT, MT_DATA);
1099			if (m == NULL) {
1100				m_freem(top);
1101				return (NULL);
1102			}
1103			m->m_len = MLEN;
1104		}
1105		len = min(totlen, epkt - cp);
1106		if (len >= MINCLSIZE) {
1107			MCLGET(m, M_DONTWAIT);
1108			if (m->m_flags & M_EXT)
1109				m->m_len = len = min(len, MCLBYTES);
1110			else
1111				len = m->m_len;
1112		} else {
1113			/*
1114			 * Place initial small packet/header at end of mbuf.
1115			 */
1116			if (len < m->m_len) {
1117				if (top == NULL && len +
1118				    max_linkhdr <= m->m_len)
1119					m->m_data += max_linkhdr;
1120				m->m_len = len;
1121			} else
1122				len = m->m_len;
1123		}
1124		if (copy)
1125			copy(cp, mtod(m, caddr_t), (unsigned)len);
1126		else
1127			bcopy(cp, mtod(m, caddr_t), (unsigned)len);
1128		cp += len;
1129		*mp = m;
1130		mp = &m->m_next;
1131		totlen -= len;
1132		if (cp == epkt)
1133			cp = buf;
1134	}
1135	return (top);
1136}
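
/*
 * Illustrative sketch (hypothetical driver receive path): m_devget() is
 * meant for hardware that leaves a whole packet in board memory; it copies
 * totlen bytes into a fresh chain and records ifp as m_pkthdr.rcvif.
 * Passing a NULL copy routine falls back to bcopy().
 */
static struct mbuf *
example_driver_input(char *board_mem, int pktlen, struct ifnet *ifp)
{
	return (m_devget(board_mem, pktlen, 0, ifp, NULL));
}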
1137
1138/*
1139 * Copy data from a buffer back into the indicated mbuf chain,
1140 * starting "off" bytes from the beginning, extending the mbuf
1141 * chain if necessary.
1142 */
1143void
1144m_copyback(struct mbuf *m0, int off, int len, caddr_t cp)
1145{
1146	int mlen;
1147	struct mbuf *m = m0, *n;
1148	int totlen = 0;
1149
1150	if (m0 == NULL)
1151		return;
1152	while (off > (mlen = m->m_len)) {
1153		off -= mlen;
1154		totlen += mlen;
1155		if (m->m_next == NULL) {
1156			n = m_getclr(M_DONTWAIT, m->m_type);
1157			if (n == NULL)
1158				goto out;
1159			n->m_len = min(MLEN, len + off);
1160			m->m_next = n;
1161		}
1162		m = m->m_next;
1163	}
1164	while (len > 0) {
1165		mlen = min (m->m_len - off, len);
1166		bcopy(cp, off + mtod(m, caddr_t), (unsigned)mlen);
1167		cp += mlen;
1168		len -= mlen;
1169		mlen += off;
1170		off = 0;
1171		totlen += mlen;
1172		if (len == 0)
1173			break;
1174		if (m->m_next == NULL) {
1175			n = m_get(M_DONTWAIT, m->m_type);
1176			if (n == NULL)
1177				break;
1178			n->m_len = min(MLEN, len);
1179			m->m_next = n;
1180		}
1181		m = m->m_next;
1182	}
1183out:	if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen))
1184		m->m_pkthdr.len = totlen;
1185}
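
/*
 * Illustrative sketch (hypothetical field patch): m_copyback() overwrites
 * len bytes at offset off, extending the chain (zero-filling any gap) if
 * it is too short, which makes it handy for patching a checksum or length
 * field after a chain has been built.
 */
static void
example_patch_field(struct mbuf *m, int off, u_int16_t val)
{
	m_copyback(m, off, sizeof(val), (caddr_t)&val);
}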
1186
1187void
1188m_print(const struct mbuf *m)
1189{
1190	int len;
1191	const struct mbuf *m2;
1192
1193	len = m->m_pkthdr.len;
1194	m2 = m;
1195	while (len) {
1196		printf("%p %*D\n", m2, m2->m_len, (u_char *)m2->m_data, "-");
1197		len -= m2->m_len;
1198		m2 = m2->m_next;
1199	}
1200	return;
1201}
1202