1/*-
2 * Copyright (c) 1982, 1986, 1988, 1991, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 4. Neither the name of the University nor the names of its contributors
14 *    may be used to endorse or promote products derived from this software
15 *    without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 *
29 *	@(#)uipc_mbuf.c	8.2 (Berkeley) 1/4/94
30 */
31
32#include <sys/cdefs.h>
33__FBSDID("$FreeBSD: head/sys/kern/uipc_mbuf.c 276910 2015-01-10 10:41:23Z rwatson $");
34
35#include "opt_param.h"
36#include "opt_mbuf_stress_test.h"
37#include "opt_mbuf_profiling.h"
38
39#include <sys/param.h>
40#include <sys/systm.h>
41#include <sys/kernel.h>
42#include <sys/limits.h>
43#include <sys/lock.h>
44#include <sys/malloc.h>
45#include <sys/mbuf.h>
46#include <sys/sysctl.h>
47#include <sys/domain.h>
48#include <sys/protosw.h>
49#include <sys/uio.h>
50
51int	max_linkhdr;
52int	max_protohdr;
53int	max_hdr;
54int	max_datalen;
55#ifdef MBUF_STRESS_TEST
56int	m_defragpackets;
57int	m_defragbytes;
58int	m_defraguseless;
59int	m_defragfailure;
60int	m_defragrandomfailures;
61#endif
62
63/*
64 * sysctl(8) exported objects
65 */
66SYSCTL_INT(_kern_ipc, KIPC_MAX_LINKHDR, max_linkhdr, CTLFLAG_RD,
67	   &max_linkhdr, 0, "Size of largest link layer header");
68SYSCTL_INT(_kern_ipc, KIPC_MAX_PROTOHDR, max_protohdr, CTLFLAG_RD,
69	   &max_protohdr, 0, "Size of largest protocol layer header");
70SYSCTL_INT(_kern_ipc, KIPC_MAX_HDR, max_hdr, CTLFLAG_RD,
71	   &max_hdr, 0, "Size of largest link plus protocol header");
72SYSCTL_INT(_kern_ipc, KIPC_MAX_DATALEN, max_datalen, CTLFLAG_RD,
73	   &max_datalen, 0, "Minimum space left in mbuf after max_hdr");
74#ifdef MBUF_STRESS_TEST
75SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragpackets, CTLFLAG_RD,
76	   &m_defragpackets, 0, "");
77SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragbytes, CTLFLAG_RD,
78	   &m_defragbytes, 0, "");
79SYSCTL_INT(_kern_ipc, OID_AUTO, m_defraguseless, CTLFLAG_RD,
80	   &m_defraguseless, 0, "");
81SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragfailure, CTLFLAG_RD,
82	   &m_defragfailure, 0, "");
83SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragrandomfailures, CTLFLAG_RW,
84	   &m_defragrandomfailures, 0, "");
85#endif
86
87/*
88 * Ensure the correct sizes of various mbuf parameters.  They could be off
89 * due to compiler-induced padding and alignment artifacts.
90 */
91CTASSERT(sizeof(struct mbuf) == MSIZE);
92CTASSERT(MSIZE - offsetof(struct mbuf, m_dat) == MLEN);
93CTASSERT(MSIZE - offsetof(struct mbuf, m_pktdat) == MHLEN);
94
95/*
96 * m_get2() allocates the smallest mbuf that will fit the "size" argument.
97 */
98struct mbuf *
99m_get2(int size, int how, short type, int flags)
100{
101	struct mb_args args;
102	struct mbuf *m, *n;
103
104	args.flags = flags;
105	args.type = type;
106
107	if (size <= MHLEN || (size <= MLEN && (flags & M_PKTHDR) == 0))
108		return (uma_zalloc_arg(zone_mbuf, &args, how));
109	if (size <= MCLBYTES)
110		return (uma_zalloc_arg(zone_pack, &args, how));
111
112	if (size > MJUMPAGESIZE)
113		return (NULL);
114
115	m = uma_zalloc_arg(zone_mbuf, &args, how);
116	if (m == NULL)
117		return (NULL);
118
119	n = uma_zalloc_arg(zone_jumbop, m, how);
120	if (n == NULL) {
121		uma_zfree(zone_mbuf, m);
122		return (NULL);
123	}
124
125	return (m);
126}
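
/*
 * Example (editorial sketch, not part of the original file): a caller that
 * needs room for "paylen" bytes of payload plus a packet header can let
 * m_get2() pick the cheapest backing storage; "paylen" and "m" are
 * illustrative names only.
 *
 *	struct mbuf *m;
 *
 *	m = m_get2(paylen, M_NOWAIT, MT_DATA, M_PKTHDR);
 *	if (m == NULL)
 *		return (ENOBUFS);
 *	m->m_len = m->m_pkthdr.len = paylen;
 *
 * Requests larger than MJUMPAGESIZE fail by design; callers needing more
 * space should build a chain with m_getm2() instead.
 */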
127
128/*
129 * m_getjcl() returns an mbuf with a cluster of the specified size attached.
130 * Valid sizes are MCLBYTES, MJUMPAGESIZE, MJUM9BYTES and MJUM16BYTES.
131 */
132struct mbuf *
133m_getjcl(int how, short type, int flags, int size)
134{
135	struct mb_args args;
136	struct mbuf *m, *n;
137	uma_zone_t zone;
138
139	if (size == MCLBYTES)
140		return m_getcl(how, type, flags);
141
142	args.flags = flags;
143	args.type = type;
144
145	m = uma_zalloc_arg(zone_mbuf, &args, how);
146	if (m == NULL)
147		return (NULL);
148
149	zone = m_getzone(size);
150	n = uma_zalloc_arg(zone, m, how);
151	if (n == NULL) {
152		uma_zfree(zone_mbuf, m);
153		return (NULL);
154	}
155	return (m);
156}
157
158/*
159 * Allocate a given length worth of mbufs and/or clusters (whatever fits
160 * best) and return a pointer to the top of the allocated chain.  If an
161 * existing mbuf chain is provided, then we will append the new chain
162 * to the existing one and return a pointer to the head of the provided
163 * chain.
164 */
165struct mbuf *
166m_getm2(struct mbuf *m, int len, int how, short type, int flags)
167{
168	struct mbuf *mb, *nm = NULL, *mtail = NULL;
169
170	KASSERT(len >= 0, ("%s: len is < 0", __func__));
171
172	/* Validate flags. */
173	flags &= (M_PKTHDR | M_EOR);
174
175	/* Packet header mbuf must be first in chain. */
176	if ((flags & M_PKTHDR) && m != NULL)
177		flags &= ~M_PKTHDR;
178
179	/* Loop and append maximum sized mbufs to the chain tail. */
180	while (len > 0) {
181		if (len > MCLBYTES)
182			mb = m_getjcl(how, type, (flags & M_PKTHDR),
183			    MJUMPAGESIZE);
184		else if (len >= MINCLSIZE)
185			mb = m_getcl(how, type, (flags & M_PKTHDR));
186		else if (flags & M_PKTHDR)
187			mb = m_gethdr(how, type);
188		else
189			mb = m_get(how, type);
190
191		/* Fail the whole operation if one mbuf can't be allocated. */
192		if (mb == NULL) {
193			if (nm != NULL)
194				m_freem(nm);
195			return (NULL);
196		}
197
198		/* Book keeping. */
199		len -= M_SIZE(mb);
200		if (mtail != NULL)
201			mtail->m_next = mb;
202		else
203			nm = mb;
204		mtail = mb;
205		flags &= ~M_PKTHDR;	/* Only valid on the first mbuf. */
206	}
207	if (flags & M_EOR)
208		mtail->m_flags |= M_EOR;  /* Only valid on the last mbuf. */
209
210	/* If mbuf was supplied, append new chain to the end of it. */
211	if (m != NULL) {
212		for (mtail = m; mtail->m_next != NULL; mtail = mtail->m_next)
213			;
214		mtail->m_next = nm;
215		mtail->m_flags &= ~M_EOR;
216	} else
217		m = nm;
218
219	return (m);
220}
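
/*
 * Example (editorial sketch, not part of the original file): building a
 * chain large enough for "len" bytes and filling it afterwards with
 * m_copyback() or m_append(); "len" is an illustrative name.
 *
 *	struct mbuf *m;
 *
 *	m = m_getm2(NULL, len, M_NOWAIT, MT_DATA, M_PKTHDR);
 *	if (m == NULL)
 *		return (ENOBUFS);
 *
 * When an existing chain is passed as the first argument, the new mbufs
 * are appended to it and the head of that existing chain is returned.
 */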
221
222/*
223 * Free an entire chain of mbufs and associated external buffers, if
224 * applicable.
225 */
226void
227m_freem(struct mbuf *mb)
228{
229
230	while (mb != NULL)
231		mb = m_free(mb);
232}
233
234/*-
235 * Configure a provided mbuf to refer to the provided external storage
236 * buffer and setup a reference count for said buffer.  If the setting
237 * up of the reference count fails, the M_EXT bit will not be set.  If
238 * successful, the M_EXT bit is set in the mbuf's flags.
239 *
240 * Arguments:
241 *    mb     The existing mbuf to which to attach the provided buffer.
242 *    buf    The address of the provided external storage buffer.
243 *    size   The size of the provided buffer.
244 *    freef  A pointer to a routine that is responsible for freeing the
245 *           provided external storage buffer.
246 *    arg1,  Opaque arguments (of any type) to be passed to the provided
247 *    arg2   freef routine (either may be NULL).
248 *    flags  Any other flags to be passed to the provided mbuf.
249 *    type   The type that the external storage buffer should be
250 *           labeled with.
251 *
252 * Returns:
253 *    0 on success, or ENOMEM if a reference counter could not be allocated.
254 */
255int
256m_extadd(struct mbuf *mb, caddr_t buf, u_int size,
257    void (*freef)(struct mbuf *, void *, void *), void *arg1, void *arg2,
258    int flags, int type, int wait)
259{
260	KASSERT(type != EXT_CLUSTER, ("%s: EXT_CLUSTER not allowed", __func__));
261
262	if (type != EXT_EXTREF)
263		mb->m_ext.ext_cnt = uma_zalloc(zone_ext_refcnt, wait);
264
265	if (mb->m_ext.ext_cnt == NULL)
266		return (ENOMEM);
267
268	*(mb->m_ext.ext_cnt) = 1;
269	mb->m_flags |= (M_EXT | flags);
270	mb->m_ext.ext_buf = buf;
271	mb->m_data = mb->m_ext.ext_buf;
272	mb->m_ext.ext_size = size;
273	mb->m_ext.ext_free = freef;
274	mb->m_ext.ext_arg1 = arg1;
275	mb->m_ext.ext_arg2 = arg2;
276	mb->m_ext.ext_type = type;
277	mb->m_ext.ext_flags = 0;
278
279	return (0);
280}
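
/*
 * Example (editorial sketch, not part of the original file): a driver
 * attaching one of its own receive buffers to an mbuf.  "my_buf",
 * "my_size", "my_free" and "my_arg" are hypothetical driver names.
 *
 *	static void
 *	my_free(struct mbuf *m, void *arg1, void *arg2)
 *	{
 *		// return the buffer described by arg1 to the driver pool
 *	}
 *
 *	if (m_extadd(m, my_buf, my_size, my_free, my_arg, NULL, 0,
 *	    EXT_NET_DRV, M_NOWAIT) != 0)
 *		return (ENOMEM);	// buffer was not attached
 *
 * On success M_EXT is set and m->m_data points at the external buffer.
 */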
281
282/*
283 * Non-directly-exported function to clean up after mbufs with M_EXT
284 * storage attached to them if the reference count hits 1.
285 */
286void
287mb_free_ext(struct mbuf *m)
288{
289	int freembuf;
290
291	KASSERT(m->m_flags & M_EXT, ("%s: M_EXT not set on %p", __func__, m));
292
293	/*
294	 * Check if the header is embedded in the cluster.
295	 */
296	freembuf = (m->m_flags & M_NOFREE) ? 0 : 1;
297
298	switch (m->m_ext.ext_type) {
299	case EXT_SFBUF:
300		sf_ext_free(m->m_ext.ext_arg1, m->m_ext.ext_arg2);
301		break;
302	default:
303		KASSERT(m->m_ext.ext_cnt != NULL,
304		    ("%s: no refcounting pointer on %p", __func__, m));
305		/*
306		 * Free attached storage if this mbuf is the only
307		 * reference to it.
308		 */
309		if (*(m->m_ext.ext_cnt) != 1) {
310			if (atomic_fetchadd_int(m->m_ext.ext_cnt, -1) != 1)
311				break;
312		}
313
314		switch (m->m_ext.ext_type) {
315		case EXT_PACKET:	/* The packet zone is special. */
316			if (*(m->m_ext.ext_cnt) == 0)
317				*(m->m_ext.ext_cnt) = 1;
318			uma_zfree(zone_pack, m);
319			return;		/* Job done. */
320		case EXT_CLUSTER:
321			uma_zfree(zone_clust, m->m_ext.ext_buf);
322			break;
323		case EXT_JUMBOP:
324			uma_zfree(zone_jumbop, m->m_ext.ext_buf);
325			break;
326		case EXT_JUMBO9:
327			uma_zfree(zone_jumbo9, m->m_ext.ext_buf);
328			break;
329		case EXT_JUMBO16:
330			uma_zfree(zone_jumbo16, m->m_ext.ext_buf);
331			break;
332		case EXT_NET_DRV:
333		case EXT_MOD_TYPE:
334		case EXT_DISPOSABLE:
335			*(m->m_ext.ext_cnt) = 0;
336			uma_zfree(zone_ext_refcnt, __DEVOLATILE(u_int *,
337				m->m_ext.ext_cnt));
338			/* FALLTHROUGH */
339		case EXT_EXTREF:
340			KASSERT(m->m_ext.ext_free != NULL,
341				("%s: ext_free not set", __func__));
342			(*(m->m_ext.ext_free))(m, m->m_ext.ext_arg1,
343			    m->m_ext.ext_arg2);
344			break;
345		default:
346			KASSERT(m->m_ext.ext_type == 0,
347				("%s: unknown ext_type", __func__));
348		}
349	}
350
351	if (freembuf)
352		uma_zfree(zone_mbuf, m);
353}
354
355/*
356 * Attach the cluster from *m to *n, set up m_ext in *n
357 * and bump the refcount of the cluster.
358 */
359static void
360mb_dupcl(struct mbuf *n, struct mbuf *m)
361{
362
363	KASSERT(m->m_flags & M_EXT, ("%s: M_EXT not set on %p", __func__, m));
364	KASSERT(!(n->m_flags & M_EXT), ("%s: M_EXT set on %p", __func__, n));
365
366	switch (m->m_ext.ext_type) {
367	case EXT_SFBUF:
368		sf_ext_ref(m->m_ext.ext_arg1, m->m_ext.ext_arg2);
369		break;
370	default:
371		KASSERT(m->m_ext.ext_cnt != NULL,
372		    ("%s: no refcounting pointer on %p", __func__, m));
373		if (*(m->m_ext.ext_cnt) == 1)
374			*(m->m_ext.ext_cnt) += 1;
375		else
376			atomic_add_int(m->m_ext.ext_cnt, 1);
377	}
378
379	n->m_ext = m->m_ext;
380	n->m_flags |= M_EXT;
381	n->m_flags |= m->m_flags & M_RDONLY;
382}
383
384/*
385 * Clean up an mbuf (chain), removing any tags and packet headers.
386 * If "all" is set then the first mbuf in the chain will be
387 * cleaned too.
388 */
389void
390m_demote(struct mbuf *m0, int all, int flags)
391{
392	struct mbuf *m;
393
394	for (m = all ? m0 : m0->m_next; m != NULL; m = m->m_next) {
395		KASSERT(m->m_nextpkt == NULL, ("%s: m_nextpkt in m %p, m0 %p",
396		    __func__, m, m0));
397		if (m->m_flags & M_PKTHDR) {
398			m_tag_delete_chain(m, NULL);
399			m->m_flags &= ~M_PKTHDR;
400			bzero(&m->m_pkthdr, sizeof(struct pkthdr));
401		}
402		m->m_flags = m->m_flags & (M_EXT | M_RDONLY | M_NOFREE | flags);
403	}
404}
405
406/*
407 * Sanity checks on mbuf (chain) for use in KASSERT() and general
408 * debugging.
409 * Returns 0 (or panics) when bad and 1 when all tests pass.
410 * With sanitize set to 0, a failed check runs M_SANITY_ACTION; with 1, the
411 * offending fields are garbled so that later misuse blows up.
412 */
413int
414m_sanity(struct mbuf *m0, int sanitize)
415{
416	struct mbuf *m;
417	caddr_t a, b;
418	int pktlen = 0;
419
420#ifdef INVARIANTS
421#define	M_SANITY_ACTION(s)	panic("mbuf %p: " s, m)
422#else
423#define	M_SANITY_ACTION(s)	printf("mbuf %p: " s, m)
424#endif
425
426	for (m = m0; m != NULL; m = m->m_next) {
427		/*
428		 * Basic pointer checks.  If any of these fails then some
429		 * unrelated kernel memory before or after us is trashed.
430		 * No way to recover from that.
431		 */
432		a = M_START(m);
433		b = a + M_SIZE(m);
434		if ((caddr_t)m->m_data < a)
435			M_SANITY_ACTION("m_data outside mbuf data range left");
436		if ((caddr_t)m->m_data > b)
437			M_SANITY_ACTION("m_data outside mbuf data range right");
438		if ((caddr_t)m->m_data + m->m_len > b)
439			M_SANITY_ACTION("m_data + m_len exceeds mbuf space");
440
441		/* m->m_nextpkt may only be set on first mbuf in chain. */
442		if (m != m0 && m->m_nextpkt != NULL) {
443			if (sanitize) {
444				m_freem(m->m_nextpkt);
445				m->m_nextpkt = (struct mbuf *)0xDEADC0DE;
446			} else
447				M_SANITY_ACTION("m->m_nextpkt on in-chain mbuf");
448		}
449
450		/* packet length (not mbuf length!) calculation */
451		if (m0->m_flags & M_PKTHDR)
452			pktlen += m->m_len;
453
454		/* m_tags may only be attached to first mbuf in chain. */
455		if (m != m0 && m->m_flags & M_PKTHDR &&
456		    !SLIST_EMPTY(&m->m_pkthdr.tags)) {
457			if (sanitize) {
458				m_tag_delete_chain(m, NULL);
459				/* put in 0xDEADC0DE perhaps? */
460			} else
461				M_SANITY_ACTION("m_tags on in-chain mbuf");
462		}
463
464		/* M_PKTHDR may only be set on first mbuf in chain */
465		if (m != m0 && m->m_flags & M_PKTHDR) {
466			if (sanitize) {
467				bzero(&m->m_pkthdr, sizeof(m->m_pkthdr));
468				m->m_flags &= ~M_PKTHDR;
469				/* put in 0xDEADC0DE and leave hdr flag in */
470			} else
471				M_SANITY_ACTION("M_PKTHDR on in-chain mbuf");
472		}
473	}
474	m = m0;
475	if (pktlen && pktlen != m->m_pkthdr.len) {
476		if (sanitize)
477			m->m_pkthdr.len = 0;
478		else
479			M_SANITY_ACTION("m_pkthdr.len != mbuf chain length");
480	}
481	return 1;
482
483#undef	M_SANITY_ACTION
484}
485
486
487/*
488 * "Move" mbuf pkthdr from "from" to "to".
489 * "from" must have M_PKTHDR set, and "to" must be empty.
490 */
491void
492m_move_pkthdr(struct mbuf *to, struct mbuf *from)
493{
494
495#if 0
496	/* see below for why these are not enabled */
497	M_ASSERTPKTHDR(to);
498	/* Note: with MAC, this may not be a good assertion. */
499	KASSERT(SLIST_EMPTY(&to->m_pkthdr.tags),
500	    ("m_move_pkthdr: to has tags"));
501#endif
502#ifdef MAC
503	/*
504	 * XXXMAC: It could be this should also occur for non-MAC?
505	 */
506	if (to->m_flags & M_PKTHDR)
507		m_tag_delete_chain(to, NULL);
508#endif
509	to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
510	if ((to->m_flags & M_EXT) == 0)
511		to->m_data = to->m_pktdat;
512	to->m_pkthdr = from->m_pkthdr;		/* especially tags */
513	SLIST_INIT(&from->m_pkthdr.tags);	/* purge tags from src */
514	from->m_flags &= ~M_PKTHDR;
515}
516
517/*
518 * Duplicate "from"'s mbuf pkthdr in "to".
519 * "from" must have M_PKTHDR set, and "to" must be empty.
520 * In particular, this does a deep copy of the packet tags.
521 */
522int
523m_dup_pkthdr(struct mbuf *to, struct mbuf *from, int how)
524{
525
526#if 0
527	/*
528	 * The mbuf allocator only initializes the pkthdr
529	 * when the mbuf is allocated with m_gethdr(). Many users
530	 * (e.g. m_copy*, m_prepend) use m_get() and then
531	 * smash the pkthdr as needed causing these
532	 * assertions to trip.  For now just disable them.
533	 */
534	M_ASSERTPKTHDR(to);
535	/* Note: with MAC, this may not be a good assertion. */
536	KASSERT(SLIST_EMPTY(&to->m_pkthdr.tags), ("m_dup_pkthdr: to has tags"));
537#endif
538	MBUF_CHECKSLEEP(how);
539#ifdef MAC
540	if (to->m_flags & M_PKTHDR)
541		m_tag_delete_chain(to, NULL);
542#endif
543	to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
544	if ((to->m_flags & M_EXT) == 0)
545		to->m_data = to->m_pktdat;
546	to->m_pkthdr = from->m_pkthdr;
547	SLIST_INIT(&to->m_pkthdr.tags);
548	return (m_tag_copy_chain(to, from, how));
549}
550
551/*
552 * Lesser-used path for M_PREPEND:
553 * allocate new mbuf to prepend to chain,
554 * copy junk along.
555 */
556struct mbuf *
557m_prepend(struct mbuf *m, int len, int how)
558{
559	struct mbuf *mn;
560
561	if (m->m_flags & M_PKTHDR)
562		mn = m_gethdr(how, m->m_type);
563	else
564		mn = m_get(how, m->m_type);
565	if (mn == NULL) {
566		m_freem(m);
567		return (NULL);
568	}
569	if (m->m_flags & M_PKTHDR)
570		m_move_pkthdr(mn, m);
571	mn->m_next = m;
572	m = mn;
573	if (len < M_SIZE(m))
574		M_ALIGN(m, len);
575	m->m_len = len;
576	return (m);
577}
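
/*
 * Example (editorial sketch, not part of the original file): callers rarely
 * invoke m_prepend() directly; the usual idiom is the M_PREPEND() macro,
 * which only falls back to this routine when the chain has no leading
 * space.  "struct ip" stands in for whatever header is being prepended.
 *
 *	M_PREPEND(m, sizeof(struct ip), M_NOWAIT);
 *	if (m == NULL)
 *		return (ENOBUFS);	// the original chain has been freed
 */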
578
579/*
580 * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
581 * continuing for "len" bytes.  If len is M_COPYALL, copy to end of mbuf.
582 * The wait parameter is a choice of M_WAITOK/M_NOWAIT from caller.
583 * Note that the copy is read-only, because clusters are not copied,
584 * only their reference counts are incremented.
585 */
586struct mbuf *
587m_copym(struct mbuf *m, int off0, int len, int wait)
588{
589	struct mbuf *n, **np;
590	int off = off0;
591	struct mbuf *top;
592	int copyhdr = 0;
593
594	KASSERT(off >= 0, ("m_copym, negative off %d", off));
595	KASSERT(len >= 0, ("m_copym, negative len %d", len));
596	MBUF_CHECKSLEEP(wait);
597	if (off == 0 && m->m_flags & M_PKTHDR)
598		copyhdr = 1;
599	while (off > 0) {
600		KASSERT(m != NULL, ("m_copym, offset > size of mbuf chain"));
601		if (off < m->m_len)
602			break;
603		off -= m->m_len;
604		m = m->m_next;
605	}
606	np = &top;
607	top = 0;
608	while (len > 0) {
609		if (m == NULL) {
610			KASSERT(len == M_COPYALL,
611			    ("m_copym, length > size of mbuf chain"));
612			break;
613		}
614		if (copyhdr)
615			n = m_gethdr(wait, m->m_type);
616		else
617			n = m_get(wait, m->m_type);
618		*np = n;
619		if (n == NULL)
620			goto nospace;
621		if (copyhdr) {
622			if (!m_dup_pkthdr(n, m, wait))
623				goto nospace;
624			if (len == M_COPYALL)
625				n->m_pkthdr.len -= off0;
626			else
627				n->m_pkthdr.len = len;
628			copyhdr = 0;
629		}
630		n->m_len = min(len, m->m_len - off);
631		if (m->m_flags & M_EXT) {
632			n->m_data = m->m_data + off;
633			mb_dupcl(n, m);
634		} else
635			bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t),
636			    (u_int)n->m_len);
637		if (len != M_COPYALL)
638			len -= n->m_len;
639		off = 0;
640		m = m->m_next;
641		np = &n->m_next;
642	}
643
644	return (top);
645nospace:
646	m_freem(top);
647	return (NULL);
648}
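
/*
 * Example (editorial sketch, not part of the original file): taking a
 * reference-counted, read-only copy of part of a chain, as a transport
 * protocol might do for a retransmission; "off" and "len" are illustrative.
 *
 *	struct mbuf *n;
 *
 *	n = m_copym(m, off, len, M_NOWAIT);
 *	if (n == NULL)
 *		return (ENOBUFS);
 *
 * Because clusters are shared rather than copied, the result must not be
 * modified; use m_dup() or m_unshare() when a writable copy is required.
 */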
649
650/*
651 * Copy an entire packet, including header (which must be present).
652 * An optimization of the common case `m_copym(m, 0, M_COPYALL, how)'.
653 * Note that the copy is read-only, because clusters are not copied,
654 * only their reference counts are incremented.
655 * Preserve alignment of the first mbuf so if the creator has left
656 * some room at the beginning (e.g. for inserting protocol headers)
657 * the copies still have the room available.
658 */
659struct mbuf *
660m_copypacket(struct mbuf *m, int how)
661{
662	struct mbuf *top, *n, *o;
663
664	MBUF_CHECKSLEEP(how);
665	n = m_get(how, m->m_type);
666	top = n;
667	if (n == NULL)
668		goto nospace;
669
670	if (!m_dup_pkthdr(n, m, how))
671		goto nospace;
672	n->m_len = m->m_len;
673	if (m->m_flags & M_EXT) {
674		n->m_data = m->m_data;
675		mb_dupcl(n, m);
676	} else {
677		n->m_data = n->m_pktdat + (m->m_data - m->m_pktdat);
678		bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
679	}
680
681	m = m->m_next;
682	while (m) {
683		o = m_get(how, m->m_type);
684		if (o == NULL)
685			goto nospace;
686
687		n->m_next = o;
688		n = n->m_next;
689
690		n->m_len = m->m_len;
691		if (m->m_flags & M_EXT) {
692			n->m_data = m->m_data;
693			mb_dupcl(n, m);
694		} else {
695			bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
696		}
697
698		m = m->m_next;
699	}
700	return top;
701nospace:
702	m_freem(top);
703	return (NULL);
704}
705
706/*
707 * Copy data from an mbuf chain starting "off" bytes from the beginning,
708 * continuing for "len" bytes, into the indicated buffer.
709 */
710void
711m_copydata(const struct mbuf *m, int off, int len, caddr_t cp)
712{
713	u_int count;
714
715	KASSERT(off >= 0, ("m_copydata, negative off %d", off));
716	KASSERT(len >= 0, ("m_copydata, negative len %d", len));
717	while (off > 0) {
718		KASSERT(m != NULL, ("m_copydata, offset > size of mbuf chain"));
719		if (off < m->m_len)
720			break;
721		off -= m->m_len;
722		m = m->m_next;
723	}
724	while (len > 0) {
725		KASSERT(m != NULL, ("m_copydata, length > size of mbuf chain"));
726		count = min(m->m_len - off, len);
727		bcopy(mtod(m, caddr_t) + off, cp, count);
728		len -= count;
729		cp += count;
730		off = 0;
731		m = m->m_next;
732	}
733}
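
/*
 * Example (editorial sketch, not part of the original file): extracting a
 * fixed-size header into a stack buffer without caring how the chain is
 * fragmented; "struct ip" is merely a convenient example.
 *
 *	struct ip ip;
 *
 *	if (m->m_pkthdr.len < sizeof(ip))
 *		return (EINVAL);
 *	m_copydata(m, 0, sizeof(ip), (caddr_t)&ip);
 */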
734
735/*
736 * Copy a packet header mbuf chain into a completely new chain, including
737 * copying any mbuf clusters.  Use this instead of m_copypacket() when
738 * you need a writable copy of an mbuf chain.
739 */
740struct mbuf *
741m_dup(struct mbuf *m, int how)
742{
743	struct mbuf **p, *top = NULL;
744	int remain, moff, nsize;
745
746	MBUF_CHECKSLEEP(how);
747	/* Sanity check */
748	if (m == NULL)
749		return (NULL);
750	M_ASSERTPKTHDR(m);
751
752	/* While there's more data, get a new mbuf, tack it on, and fill it */
753	remain = m->m_pkthdr.len;
754	moff = 0;
755	p = &top;
756	while (remain > 0 || top == NULL) {	/* allow m->m_pkthdr.len == 0 */
757		struct mbuf *n;
758
759		/* Get the next new mbuf */
760		if (remain >= MINCLSIZE) {
761			n = m_getcl(how, m->m_type, 0);
762			nsize = MCLBYTES;
763		} else {
764			n = m_get(how, m->m_type);
765			nsize = MLEN;
766		}
767		if (n == NULL)
768			goto nospace;
769
770		if (top == NULL) {		/* First one, must be PKTHDR */
771			if (!m_dup_pkthdr(n, m, how)) {
772				m_free(n);
773				goto nospace;
774			}
775			if ((n->m_flags & M_EXT) == 0)
776				nsize = MHLEN;
777		}
778		n->m_len = 0;
779
780		/* Link it into the new chain */
781		*p = n;
782		p = &n->m_next;
783
784		/* Copy data from original mbuf(s) into new mbuf */
785		while (n->m_len < nsize && m != NULL) {
786			int chunk = min(nsize - n->m_len, m->m_len - moff);
787
788			bcopy(m->m_data + moff, n->m_data + n->m_len, chunk);
789			moff += chunk;
790			n->m_len += chunk;
791			remain -= chunk;
792			if (moff == m->m_len) {
793				m = m->m_next;
794				moff = 0;
795			}
796		}
797
798		/* Check correct total mbuf length */
799		KASSERT((remain > 0 && m != NULL) || (remain == 0 && m == NULL),
800		    	("%s: bogus m_pkthdr.len", __func__));
801	}
802	return (top);
803
804nospace:
805	m_freem(top);
806	return (NULL);
807}
808
809/*
810 * Concatenate mbuf chain n to m.
811 * Both chains must be of the same type (e.g. MT_DATA).
812 * Any m_pkthdr is not updated.
813 */
814void
815m_cat(struct mbuf *m, struct mbuf *n)
816{
817	while (m->m_next)
818		m = m->m_next;
819	while (n) {
820		if (!M_WRITABLE(m) ||
821		    M_TRAILINGSPACE(m) < n->m_len) {
822			/* just join the two chains */
823			m->m_next = n;
824			return;
825		}
826		/* splat the data from one into the other */
827		bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
828		    (u_int)n->m_len);
829		m->m_len += n->m_len;
830		n = m_free(n);
831	}
832}
833
834/*
835 * Concatenate two pkthdr mbuf chains.
836 */
837void
838m_catpkt(struct mbuf *m, struct mbuf *n)
839{
840
841	M_ASSERTPKTHDR(m);
842	M_ASSERTPKTHDR(n);
843
844	m->m_pkthdr.len += n->m_pkthdr.len;
845	m_demote(n, 1, 0);
846
847	m_cat(m, n);
848}
849
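/*
 * Trim req_len bytes from the head of the mbuf chain when req_len is
 * positive, or -req_len bytes from the tail when it is negative, updating
 * m_pkthdr.len if the first mbuf carries a packet header.
 *
 * Example (editorial sketch, not part of the original file), stripping a
 * link-layer header and a trailing checksum:
 *
 *	m_adj(m, ETHER_HDR_LEN);	// drop bytes from the front
 *	m_adj(m, -ETHER_CRC_LEN);	// drop bytes from the tail
 */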
850void
851m_adj(struct mbuf *mp, int req_len)
852{
853	int len = req_len;
854	struct mbuf *m;
855	int count;
856
857	if ((m = mp) == NULL)
858		return;
859	if (len >= 0) {
860		/*
861		 * Trim from head.
862		 */
863		while (m != NULL && len > 0) {
864			if (m->m_len <= len) {
865				len -= m->m_len;
866				m->m_len = 0;
867				m = m->m_next;
868			} else {
869				m->m_len -= len;
870				m->m_data += len;
871				len = 0;
872			}
873		}
874		if (mp->m_flags & M_PKTHDR)
875			mp->m_pkthdr.len -= (req_len - len);
876	} else {
877		/*
878		 * Trim from tail.  Scan the mbuf chain,
879		 * calculating its length and finding the last mbuf.
880		 * If the adjustment only affects this mbuf, then just
881		 * adjust and return.  Otherwise, rescan and truncate
882		 * after the remaining size.
883		 */
884		len = -len;
885		count = 0;
886		for (;;) {
887			count += m->m_len;
888			if (m->m_next == (struct mbuf *)0)
889				break;
890			m = m->m_next;
891		}
892		if (m->m_len >= len) {
893			m->m_len -= len;
894			if (mp->m_flags & M_PKTHDR)
895				mp->m_pkthdr.len -= len;
896			return;
897		}
898		count -= len;
899		if (count < 0)
900			count = 0;
901		/*
902		 * Correct length for chain is "count".
903		 * Find the mbuf with last data, adjust its length,
904		 * and toss data from remaining mbufs on chain.
905		 */
906		m = mp;
907		if (m->m_flags & M_PKTHDR)
908			m->m_pkthdr.len = count;
909		for (; m; m = m->m_next) {
910			if (m->m_len >= count) {
911				m->m_len = count;
912				if (m->m_next != NULL) {
913					m_freem(m->m_next);
914					m->m_next = NULL;
915				}
916				break;
917			}
918			count -= m->m_len;
919		}
920	}
921}
922
923/*
924 * Rearrange an mbuf chain so that len bytes are contiguous
925 * and in the data area of an mbuf (so that mtod will work
926 * for a structure of size len).  Returns the resulting
927 * mbuf chain on success, frees it and returns null on failure.
928 * If there is room, it will add up to max_protohdr-len extra bytes to the
929 * contiguous region in an attempt to avoid being called next time.
930 */
931struct mbuf *
932m_pullup(struct mbuf *n, int len)
933{
934	struct mbuf *m;
935	int count;
936	int space;
937
938	/*
939	 * If first mbuf has no cluster, and has room for len bytes
940	 * without shifting current data, pullup into it,
941	 * otherwise allocate a new mbuf to prepend to the chain.
942	 */
943	if ((n->m_flags & M_EXT) == 0 &&
944	    n->m_data + len < &n->m_dat[MLEN] && n->m_next) {
945		if (n->m_len >= len)
946			return (n);
947		m = n;
948		n = n->m_next;
949		len -= m->m_len;
950	} else {
951		if (len > MHLEN)
952			goto bad;
953		m = m_get(M_NOWAIT, n->m_type);
954		if (m == NULL)
955			goto bad;
956		if (n->m_flags & M_PKTHDR)
957			m_move_pkthdr(m, n);
958	}
959	space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
960	do {
961		count = min(min(max(len, max_protohdr), space), n->m_len);
962		bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
963		  (u_int)count);
964		len -= count;
965		m->m_len += count;
966		n->m_len -= count;
967		space -= count;
968		if (n->m_len)
969			n->m_data += count;
970		else
971			n = m_free(n);
972	} while (len > 0 && n);
973	if (len > 0) {
974		(void) m_free(m);
975		goto bad;
976	}
977	m->m_next = n;
978	return (m);
979bad:
980	m_freem(n);
981	return (NULL);
982}
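
/*
 * Example (editorial sketch, not part of the original file): the classic
 * input-path idiom of making a protocol header contiguous before using
 * mtod(); "struct ip" is illustrative.
 *
 *	if (m->m_len < sizeof(struct ip) &&
 *	    (m = m_pullup(m, sizeof(struct ip))) == NULL)
 *		return;			// chain was freed by m_pullup()
 *	ip = mtod(m, struct ip *);
 */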
983
984/*
985 * Like m_pullup(), except a new mbuf is always allocated, and we allow
986 * the amount of empty space before the data in the new mbuf to be specified
987 * (in the event that the caller expects to prepend later).
988 */
989int MSFail;
990
991struct mbuf *
992m_copyup(struct mbuf *n, int len, int dstoff)
993{
994	struct mbuf *m;
995	int count, space;
996
997	if (len > (MHLEN - dstoff))
998		goto bad;
999	m = m_get(M_NOWAIT, n->m_type);
1000	if (m == NULL)
1001		goto bad;
1002	if (n->m_flags & M_PKTHDR)
1003		m_move_pkthdr(m, n);
1004	m->m_data += dstoff;
1005	space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
1006	do {
1007		count = min(min(max(len, max_protohdr), space), n->m_len);
1008		memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t),
1009		    (unsigned)count);
1010		len -= count;
1011		m->m_len += count;
1012		n->m_len -= count;
1013		space -= count;
1014		if (n->m_len)
1015			n->m_data += count;
1016		else
1017			n = m_free(n);
1018	} while (len > 0 && n);
1019	if (len > 0) {
1020		(void) m_free(m);
1021		goto bad;
1022	}
1023	m->m_next = n;
1024	return (m);
1025 bad:
1026	m_freem(n);
1027	MSFail++;
1028	return (NULL);
1029}
1030
1031/*
1032 * Partition an mbuf chain in two pieces, returning the tail --
1033 * all but the first len0 bytes.  In case of failure, it returns NULL and
1034 * attempts to restore the chain to its original state.
1035 *
1036 * Note that the resulting mbufs might be read-only, because the new
1037 * mbuf can end up sharing an mbuf cluster with the original mbuf if
1038 * the "breaking point" happens to lie within a cluster mbuf. Use the
1039 * M_WRITABLE() macro to check for this case.
1040 */
1041struct mbuf *
1042m_split(struct mbuf *m0, int len0, int wait)
1043{
1044	struct mbuf *m, *n;
1045	u_int len = len0, remain;
1046
1047	MBUF_CHECKSLEEP(wait);
1048	for (m = m0; m && len > m->m_len; m = m->m_next)
1049		len -= m->m_len;
1050	if (m == NULL)
1051		return (NULL);
1052	remain = m->m_len - len;
1053	if (m0->m_flags & M_PKTHDR && remain == 0) {
1054		n = m_gethdr(wait, m0->m_type);
1055		if (n == NULL)
1056			return (NULL);
1057		n->m_next = m->m_next;
1058		m->m_next = NULL;
1059		n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
1060		n->m_pkthdr.len = m0->m_pkthdr.len - len0;
1061		m0->m_pkthdr.len = len0;
1062		return (n);
1063	} else if (m0->m_flags & M_PKTHDR) {
1064		n = m_gethdr(wait, m0->m_type);
1065		if (n == NULL)
1066			return (NULL);
1067		n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
1068		n->m_pkthdr.len = m0->m_pkthdr.len - len0;
1069		m0->m_pkthdr.len = len0;
1070		if (m->m_flags & M_EXT)
1071			goto extpacket;
1072		if (remain > MHLEN) {
1073			/* m can't be the lead packet */
1074			M_ALIGN(n, 0);
1075			n->m_next = m_split(m, len, wait);
1076			if (n->m_next == NULL) {
1077				(void) m_free(n);
1078				return (NULL);
1079			} else {
1080				n->m_len = 0;
1081				return (n);
1082			}
1083		} else
1084			M_ALIGN(n, remain);
1085	} else if (remain == 0) {
1086		n = m->m_next;
1087		m->m_next = NULL;
1088		return (n);
1089	} else {
1090		n = m_get(wait, m->m_type);
1091		if (n == NULL)
1092			return (NULL);
1093		M_ALIGN(n, remain);
1094	}
1095extpacket:
1096	if (m->m_flags & M_EXT) {
1097		n->m_data = m->m_data + len;
1098		mb_dupcl(n, m);
1099	} else {
1100		bcopy(mtod(m, caddr_t) + len, mtod(n, caddr_t), remain);
1101	}
1102	n->m_len = remain;
1103	m->m_len = len;
1104	n->m_next = m->m_next;
1105	m->m_next = NULL;
1106	return (n);
1107}
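
/*
 * Example (editorial sketch, not part of the original file): splitting a
 * packet at a protocol boundary, e.g. separating the payload from the
 * first "hdrlen" bytes; "hdrlen" is an illustrative name.
 *
 *	struct mbuf *tail;
 *
 *	tail = m_split(m, hdrlen, M_NOWAIT);
 *	if (tail == NULL)
 *		return (ENOBUFS);	// m is left intact on failure
 *
 * Either piece may share a cluster with the other and thus be read-only;
 * check with M_WRITABLE() before modifying the data.
 */
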
1108/*
1109 * Routine to copy from device local memory into mbufs.
1110 * Note that the `off' argument is the offset into the first mbuf of the
1111 * target chain at which to begin copying the data.
1112 */
1113struct mbuf *
1114m_devget(char *buf, int totlen, int off, struct ifnet *ifp,
1115    void (*copy)(char *from, caddr_t to, u_int len))
1116{
1117	struct mbuf *m;
1118	struct mbuf *top = NULL, **mp = &top;
1119	int len;
1120
1121	if (off < 0 || off > MHLEN)
1122		return (NULL);
1123
1124	while (totlen > 0) {
1125		if (top == NULL) {	/* First one, must be PKTHDR */
1126			if (totlen + off >= MINCLSIZE) {
1127				m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
1128				len = MCLBYTES;
1129			} else {
1130				m = m_gethdr(M_NOWAIT, MT_DATA);
1131				len = MHLEN;
1132
1133				/* Place initial small packet/header at end of mbuf */
1134				if (m && totlen + off + max_linkhdr <= MLEN) {
1135					m->m_data += max_linkhdr;
1136					len -= max_linkhdr;
1137				}
1138			}
1139			if (m == NULL)
1140				return NULL;
1141			m->m_pkthdr.rcvif = ifp;
1142			m->m_pkthdr.len = totlen;
1143		} else {
1144			if (totlen + off >= MINCLSIZE) {
1145				m = m_getcl(M_NOWAIT, MT_DATA, 0);
1146				len = MCLBYTES;
1147			} else {
1148				m = m_get(M_NOWAIT, MT_DATA);
1149				len = MLEN;
1150			}
1151			if (m == NULL) {
1152				m_freem(top);
1153				return NULL;
1154			}
1155		}
1156		if (off) {
1157			m->m_data += off;
1158			len -= off;
1159			off = 0;
1160		}
1161		m->m_len = len = min(totlen, len);
1162		if (copy)
1163			copy(buf, mtod(m, caddr_t), (u_int)len);
1164		else
1165			bcopy(buf, mtod(m, caddr_t), (u_int)len);
1166		buf += len;
1167		*mp = m;
1168		mp = &m->m_next;
1169		totlen -= len;
1170	}
1171	return (top);
1172}
1173
1174/*
1175 * Copy data from a buffer back into the indicated mbuf chain,
1176 * starting "off" bytes from the beginning, extending the mbuf
1177 * chain if necessary.
1178 */
1179void
1180m_copyback(struct mbuf *m0, int off, int len, c_caddr_t cp)
1181{
1182	int mlen;
1183	struct mbuf *m = m0, *n;
1184	int totlen = 0;
1185
1186	if (m0 == NULL)
1187		return;
1188	while (off > (mlen = m->m_len)) {
1189		off -= mlen;
1190		totlen += mlen;
1191		if (m->m_next == NULL) {
1192			n = m_get(M_NOWAIT, m->m_type);
1193			if (n == NULL)
1194				goto out;
1195			bzero(mtod(n, caddr_t), MLEN);
1196			n->m_len = min(MLEN, len + off);
1197			m->m_next = n;
1198		}
1199		m = m->m_next;
1200	}
1201	while (len > 0) {
1202		if (m->m_next == NULL && (len > m->m_len - off)) {
1203			m->m_len += min(len - (m->m_len - off),
1204			    M_TRAILINGSPACE(m));
1205		}
1206		mlen = min(m->m_len - off, len);
1207		bcopy(cp, off + mtod(m, caddr_t), (u_int)mlen);
1208		cp += mlen;
1209		len -= mlen;
1210		mlen += off;
1211		off = 0;
1212		totlen += mlen;
1213		if (len == 0)
1214			break;
1215		if (m->m_next == NULL) {
1216			n = m_get(M_NOWAIT, m->m_type);
1217			if (n == NULL)
1218				break;
1219			n->m_len = min(MLEN, len);
1220			m->m_next = n;
1221		}
1222		m = m->m_next;
1223	}
1224out:	if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen))
1225		m->m_pkthdr.len = totlen;
1226}
1227
1228/*
1229 * Append the specified data to the indicated mbuf chain,
1230 * extending the mbuf chain if the new data does not fit in
1231 * existing space.
1232 *
1233 * Return 1 if able to complete the job; otherwise 0.
1234 */
1235int
1236m_append(struct mbuf *m0, int len, c_caddr_t cp)
1237{
1238	struct mbuf *m, *n;
1239	int remainder, space;
1240
1241	for (m = m0; m->m_next != NULL; m = m->m_next)
1242		;
1243	remainder = len;
1244	space = M_TRAILINGSPACE(m);
1245	if (space > 0) {
1246		/*
1247		 * Copy into available space.
1248		 */
1249		if (space > remainder)
1250			space = remainder;
1251		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
1252		m->m_len += space;
1253		cp += space, remainder -= space;
1254	}
1255	while (remainder > 0) {
1256		/*
1257		 * Allocate a new mbuf; could check space
1258		 * and allocate a cluster instead.
1259		 */
1260		n = m_get(M_NOWAIT, m->m_type);
1261		if (n == NULL)
1262			break;
1263		n->m_len = min(MLEN, remainder);
1264		bcopy(cp, mtod(n, caddr_t), n->m_len);
1265		cp += n->m_len, remainder -= n->m_len;
1266		m->m_next = n;
1267		m = n;
1268	}
1269	if (m0->m_flags & M_PKTHDR)
1270		m0->m_pkthdr.len += len - remainder;
1271	return (remainder == 0);
1272}
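
/*
 * Example (editorial sketch, not part of the original file): appending a
 * small trailer, letting m_append() allocate extra mbufs if the last one
 * has no trailing space; "trailer" is an illustrative variable.
 *
 *	if (!m_append(m, sizeof(trailer), (c_caddr_t)&trailer))
 *		return (ENOBUFS);	// some data may already be appended
 */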
1273
1274/*
1275 * Apply function f to the data in an mbuf chain starting "off" bytes from
1276 * the beginning, continuing for "len" bytes.
1277 */
1278int
1279m_apply(struct mbuf *m, int off, int len,
1280    int (*f)(void *, void *, u_int), void *arg)
1281{
1282	u_int count;
1283	int rval;
1284
1285	KASSERT(off >= 0, ("m_apply, negative off %d", off));
1286	KASSERT(len >= 0, ("m_apply, negative len %d", len));
1287	while (off > 0) {
1288		KASSERT(m != NULL, ("m_apply, offset > size of mbuf chain"));
1289		if (off < m->m_len)
1290			break;
1291		off -= m->m_len;
1292		m = m->m_next;
1293	}
1294	while (len > 0) {
1295		KASSERT(m != NULL, ("m_apply, offset > size of mbuf chain"));
1296		count = min(m->m_len - off, len);
1297		rval = (*f)(arg, mtod(m, caddr_t) + off, count);
1298		if (rval)
1299			return (rval);
1300		len -= count;
1301		off = 0;
1302		m = m->m_next;
1303	}
1304	return (0);
1305}
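
/*
 * Example (editorial sketch, not part of the original file): walking a
 * region of a chain without copying it, e.g. feeding each contiguous
 * piece to a checksum routine; "sumcb" and "ctx" are hypothetical.
 *
 *	static int
 *	sumcb(void *arg, void *data, u_int len)
 *	{
 *		// fold "len" bytes at "data" into the state at "arg"
 *		return (0);	// a non-zero return aborts the walk
 *	}
 *
 *	error = m_apply(m, off, len, sumcb, ctx);
 */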
1306
1307/*
1308 * Return a pointer to mbuf/offset of location in mbuf chain.
1309 */
1310struct mbuf *
1311m_getptr(struct mbuf *m, int loc, int *off)
1312{
1313
1314	while (loc >= 0) {
1315		/* Normal end of search. */
1316		if (m->m_len > loc) {
1317			*off = loc;
1318			return (m);
1319		} else {
1320			loc -= m->m_len;
1321			if (m->m_next == NULL) {
1322				if (loc == 0) {
1323					/* Point at the end of valid data. */
1324					*off = m->m_len;
1325					return (m);
1326				}
1327				return (NULL);
1328			}
1329			m = m->m_next;
1330		}
1331	}
1332	return (NULL);
1333}
1334
1335void
1336m_print(const struct mbuf *m, int maxlen)
1337{
1338	int len;
1339	int pdata;
1340	const struct mbuf *m2;
1341
1342	if (m == NULL) {
1343		printf("mbuf: %p\n", m);
1344		return;
1345	}
1346
1347	if (m->m_flags & M_PKTHDR)
1348		len = m->m_pkthdr.len;
1349	else
1350		len = -1;
1351	m2 = m;
1352	while (m2 != NULL && (len == -1 || len)) {
1353		pdata = m2->m_len;
1354		if (maxlen != -1 && pdata > maxlen)
1355			pdata = maxlen;
1356		printf("mbuf: %p len: %d, next: %p, %b%s", m2, m2->m_len,
1357		    m2->m_next, m2->m_flags, "\20\20freelist\17skipfw"
1358		    "\11proto5\10proto4\7proto3\6proto2\5proto1\4rdonly"
1359		    "\3eor\2pkthdr\1ext", pdata ? "" : "\n");
1360		if (pdata)
1361			printf(", %*D\n", pdata, (u_char *)m2->m_data, "-");
1362		if (len != -1)
1363			len -= m2->m_len;
1364		m2 = m2->m_next;
1365	}
1366	if (len > 0)
1367		printf("%d bytes unaccounted for.\n", len);
1368	return;
1369}
1370
1371u_int
1372m_fixhdr(struct mbuf *m0)
1373{
1374	u_int len;
1375
1376	len = m_length(m0, NULL);
1377	m0->m_pkthdr.len = len;
1378	return (len);
1379}
1380
1381u_int
1382m_length(struct mbuf *m0, struct mbuf **last)
1383{
1384	struct mbuf *m;
1385	u_int len;
1386
1387	len = 0;
1388	for (m = m0; m != NULL; m = m->m_next) {
1389		len += m->m_len;
1390		if (m->m_next == NULL)
1391			break;
1392	}
1393	if (last != NULL)
1394		*last = m;
1395	return (len);
1396}
1397
1398/*
1399 * Defragment an mbuf chain, returning the shortest possible
1400 * chain of mbufs and clusters.  If allocation fails and
1401 * this cannot be completed, NULL will be returned, but
1402 * the passed in chain will be unchanged.  Upon success,
1403 * the original chain will be freed, and the new chain
1404 * will be returned.
1405 *
1406 * If an mbuf without a packet header is passed in, the
1407 * original mbuf (chain) will be returned unharmed.
1408 */
1409struct mbuf *
1410m_defrag(struct mbuf *m0, int how)
1411{
1412	struct mbuf *m_new = NULL, *m_final = NULL;
1413	int progress = 0, length;
1414
1415	MBUF_CHECKSLEEP(how);
1416	if (!(m0->m_flags & M_PKTHDR))
1417		return (m0);
1418
1419	m_fixhdr(m0); /* Needed sanity check */
1420
1421#ifdef MBUF_STRESS_TEST
1422	if (m_defragrandomfailures) {
1423		int temp = arc4random() & 0xff;
1424		if (temp == 0xba)
1425			goto nospace;
1426	}
1427#endif
1428
1429	if (m0->m_pkthdr.len > MHLEN)
1430		m_final = m_getcl(how, MT_DATA, M_PKTHDR);
1431	else
1432		m_final = m_gethdr(how, MT_DATA);
1433
1434	if (m_final == NULL)
1435		goto nospace;
1436
1437	if (m_dup_pkthdr(m_final, m0, how) == 0)
1438		goto nospace;
1439
1440	m_new = m_final;
1441
1442	while (progress < m0->m_pkthdr.len) {
1443		length = m0->m_pkthdr.len - progress;
1444		if (length > MCLBYTES)
1445			length = MCLBYTES;
1446
1447		if (m_new == NULL) {
1448			if (length > MLEN)
1449				m_new = m_getcl(how, MT_DATA, 0);
1450			else
1451				m_new = m_get(how, MT_DATA);
1452			if (m_new == NULL)
1453				goto nospace;
1454		}
1455
1456		m_copydata(m0, progress, length, mtod(m_new, caddr_t));
1457		progress += length;
1458		m_new->m_len = length;
1459		if (m_new != m_final)
1460			m_cat(m_final, m_new);
1461		m_new = NULL;
1462	}
1463#ifdef MBUF_STRESS_TEST
1464	if (m0->m_next == NULL)
1465		m_defraguseless++;
1466#endif
1467	m_freem(m0);
1468	m0 = m_final;
1469#ifdef MBUF_STRESS_TEST
1470	m_defragpackets++;
1471	m_defragbytes += m0->m_pkthdr.len;
1472#endif
1473	return (m0);
1474nospace:
1475#ifdef MBUF_STRESS_TEST
1476	m_defragfailure++;
1477#endif
1478	if (m_final)
1479		m_freem(m_final);
1480	return (NULL);
1481}
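
/*
 * Example (editorial sketch, not part of the original file): a transmit
 * path that must fit a packet into a limited number of DMA segments can
 * linearize it first; error handling is abbreviated.
 *
 *	struct mbuf *d;
 *
 *	d = m_defrag(m, M_NOWAIT);
 *	if (d == NULL) {
 *		m_freem(m);		// original chain is still intact
 *		return (ENOBUFS);
 *	}
 *	m = d;				// original was freed by m_defrag()
 *
 * m_collapse() below is often cheaper, since it only reduces the chain to
 * at most "maxfrags" pieces rather than fully linearizing it.
 */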
1482
1483/*
1484 * Defragment an mbuf chain, returning at most maxfrags separate
1485 * mbufs+clusters.  If this is not possible NULL is returned and
1486 * the original mbuf chain is left in its present (potentially
1487 * modified) state.  We use two techniques: collapsing consecutive
1488 * mbufs and replacing consecutive mbufs by a cluster.
1489 *
1490 * NB: this should really be named m_defrag but that name is taken
1491 */
1492struct mbuf *
1493m_collapse(struct mbuf *m0, int how, int maxfrags)
1494{
1495	struct mbuf *m, *n, *n2, **prev;
1496	u_int curfrags;
1497
1498	/*
1499	 * Calculate the current number of frags.
1500	 */
1501	curfrags = 0;
1502	for (m = m0; m != NULL; m = m->m_next)
1503		curfrags++;
1504	/*
1505	 * First, try to collapse mbufs.  Note that we always collapse
1506	 * towards the front so we don't need to deal with moving the
1507	 * pkthdr.  This may be suboptimal if the first mbuf has much
1508	 * less data than the following.
1509	 */
1510	m = m0;
1511again:
1512	for (;;) {
1513		n = m->m_next;
1514		if (n == NULL)
1515			break;
1516		if (M_WRITABLE(m) &&
1517		    n->m_len < M_TRAILINGSPACE(m)) {
1518			bcopy(mtod(n, void *), mtod(m, char *) + m->m_len,
1519				n->m_len);
1520			m->m_len += n->m_len;
1521			m->m_next = n->m_next;
1522			m_free(n);
1523			if (--curfrags <= maxfrags)
1524				return m0;
1525		} else
1526			m = n;
1527	}
1528	KASSERT(maxfrags > 1,
1529		("maxfrags %u, but normal collapse failed", maxfrags));
1530	/*
1531	 * Collapse consecutive mbufs to a cluster.
1532	 */
1533	prev = &m0->m_next;		/* NB: not the first mbuf */
1534	while ((n = *prev) != NULL) {
1535		if ((n2 = n->m_next) != NULL &&
1536		    n->m_len + n2->m_len < MCLBYTES) {
1537			m = m_getcl(how, MT_DATA, 0);
1538			if (m == NULL)
1539				goto bad;
1540			bcopy(mtod(n, void *), mtod(m, void *), n->m_len);
1541			bcopy(mtod(n2, void *), mtod(m, char *) + n->m_len,
1542				n2->m_len);
1543			m->m_len = n->m_len + n2->m_len;
1544			m->m_next = n2->m_next;
1545			*prev = m;
1546			m_free(n);
1547			m_free(n2);
1548			if (--curfrags <= maxfrags)	/* +1 cl -2 mbufs */
1549				return m0;
1550			/*
1551			 * Still not there, try the normal collapse
1552			 * again before we allocate another cluster.
1553			 */
1554			goto again;
1555		}
1556		prev = &n->m_next;
1557	}
1558	/*
1559	 * No place where we can collapse to a cluster; punt.
1560	 * This can occur if, for example, you request 2 frags
1561	 * but the packet requires that both be clusters (we
1562	 * never reallocate the first mbuf to avoid moving the
1563	 * packet header).
1564	 */
1565bad:
1566	return NULL;
1567}
1568
1569#ifdef MBUF_STRESS_TEST
1570
1571/*
1572 * Fragment an mbuf chain.  There's no reason you'd ever want to do
1573 * this in normal usage, but it's great for stress testing various
1574 * mbuf consumers.
1575 *
1576 * If fragmentation is not possible, the original chain will be
1577 * returned.
1578 *
1579 * Possible length values:
1580 * 0	 no fragmentation will occur
1581 * > 0	each fragment will be of the specified length
1582 * -1	each fragment will be the same random value in length
1583 * -2	each fragment's length will be entirely random
1584 * (Random values range from 1 to 256)
1585 */
1586struct mbuf *
1587m_fragment(struct mbuf *m0, int how, int length)
1588{
1589	struct mbuf *m_new = NULL, *m_final = NULL;
1590	int progress = 0;
1591
1592	if (!(m0->m_flags & M_PKTHDR))
1593		return (m0);
1594
1595	if ((length == 0) || (length < -2))
1596		return (m0);
1597
1598	m_fixhdr(m0); /* Needed sanity check */
1599
1600	m_final = m_getcl(how, MT_DATA, M_PKTHDR);
1601
1602	if (m_final == NULL)
1603		goto nospace;
1604
1605	if (m_dup_pkthdr(m_final, m0, how) == 0)
1606		goto nospace;
1607
1608	m_new = m_final;
1609
1610	if (length == -1)
1611		length = 1 + (arc4random() & 255);
1612
1613	while (progress < m0->m_pkthdr.len) {
1614		int fraglen;
1615
1616		if (length > 0)
1617			fraglen = length;
1618		else
1619			fraglen = 1 + (arc4random() & 255);
1620		if (fraglen > m0->m_pkthdr.len - progress)
1621			fraglen = m0->m_pkthdr.len - progress;
1622
1623		if (fraglen > MCLBYTES)
1624			fraglen = MCLBYTES;
1625
1626		if (m_new == NULL) {
1627			m_new = m_getcl(how, MT_DATA, 0);
1628			if (m_new == NULL)
1629				goto nospace;
1630		}
1631
1632		m_copydata(m0, progress, fraglen, mtod(m_new, caddr_t));
1633		progress += fraglen;
1634		m_new->m_len = fraglen;
1635		if (m_new != m_final)
1636			m_cat(m_final, m_new);
1637		m_new = NULL;
1638	}
1639	m_freem(m0);
1640	m0 = m_final;
1641	return (m0);
1642nospace:
1643	if (m_final)
1644		m_freem(m_final);
1645	/* Return the original chain on failure */
1646	return (m0);
1647}
1648
1649#endif
1650
1651/*
1652 * Copy the contents of uio into a properly sized mbuf chain.
1653 */
1654struct mbuf *
1655m_uiotombuf(struct uio *uio, int how, int len, int align, int flags)
1656{
1657	struct mbuf *m, *mb;
1658	int error, length;
1659	ssize_t total;
1660	int progress = 0;
1661
1662	/*
1663	 * len can be zero or an arbitrarily large value bounded by
1664	 * the total data supplied by the uio.
1665	 */
1666	if (len > 0)
1667		total = min(uio->uio_resid, len);
1668	else
1669		total = uio->uio_resid;
1670
1671	/*
1672	 * The smallest unit returned by m_getm2() is a single mbuf
1673	 * with pkthdr.  We can't align past it.
1674	 */
1675	if (align >= MHLEN)
1676		return (NULL);
1677
1678	/*
1679	 * Give us the full allocation or nothing.
1680	 * If len is zero return the smallest empty mbuf.
1681	 */
1682	m = m_getm2(NULL, max(total + align, 1), how, MT_DATA, flags);
1683	if (m == NULL)
1684		return (NULL);
1685	m->m_data += align;
1686
1687	/* Fill all mbufs with uio data and update header information. */
1688	for (mb = m; mb != NULL; mb = mb->m_next) {
1689		length = min(M_TRAILINGSPACE(mb), total - progress);
1690
1691		error = uiomove(mtod(mb, void *), length, uio);
1692		if (error) {
1693			m_freem(m);
1694			return (NULL);
1695		}
1696
1697		mb->m_len = length;
1698		progress += length;
1699		if (flags & M_PKTHDR)
1700			m->m_pkthdr.len += length;
1701	}
1702	KASSERT(progress == total, ("%s: progress != total", __func__));
1703
1704	return (m);
1705}
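
/*
 * Example (editorial sketch, not part of the original file): turning the
 * contents of a user I/O request into a packet chain while reserving room
 * for a link-layer header in front of the data; "uio" comes from the
 * caller.
 *
 *	struct mbuf *m;
 *
 *	m = m_uiotombuf(uio, M_WAITOK, 0, max_linkhdr, M_PKTHDR);
 *	if (m == NULL)
 *		return (ENOBUFS);
 */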
1706
1707/*
1708 * Copy an mbuf chain into a uio limited by len if set.
1709 */
1710int
1711m_mbuftouio(struct uio *uio, struct mbuf *m, int len)
1712{
1713	int error, length, total;
1714	int progress = 0;
1715
1716	if (len > 0)
1717		total = min(uio->uio_resid, len);
1718	else
1719		total = uio->uio_resid;
1720
1721	/* Fill the uio with data from the mbufs. */
1722	for (; m != NULL; m = m->m_next) {
1723		length = min(m->m_len, total - progress);
1724
1725		error = uiomove(mtod(m, void *), length, uio);
1726		if (error)
1727			return (error);
1728
1729		progress += length;
1730	}
1731
1732	return (0);
1733}
1734
1735/*
1736 * Create a writable copy of the mbuf chain.  While doing this
1737 * we compact the chain with a goal of producing a chain with
1738 * at most two mbufs.  The second mbuf in this chain is likely
1739 * to be a cluster.  The primary purpose of this work is to create
1740 * a writable packet for encryption, compression, etc.  The
1741 * secondary goal is to linearize the data so the data can be
1742 * passed to crypto hardware in the most efficient manner possible.
1743 */
1744struct mbuf *
1745m_unshare(struct mbuf *m0, int how)
1746{
1747	struct mbuf *m, *mprev;
1748	struct mbuf *n, *mfirst, *mlast;
1749	int len, off;
1750
1751	mprev = NULL;
1752	for (m = m0; m != NULL; m = mprev->m_next) {
1753		/*
1754		 * Regular mbufs are ignored unless there's a cluster
1755		 * in front of them that we can use to coalesce.  We do
1756		 * the latter mainly so later clusters can be coalesced
1757		 * also w/o having to handle them specially (i.e. convert
1758		 * mbuf+cluster -> cluster).  This optimization is heavily
1759		 * influenced by the assumption that we're running over
1760		 * Ethernet where MCLBYTES is large enough that the max
1761		 * packet size will permit lots of coalescing into a
1762		 * single cluster.  This in turn permits efficient
1763		 * crypto operations, especially when using hardware.
1764		 */
1765		if ((m->m_flags & M_EXT) == 0) {
1766			if (mprev && (mprev->m_flags & M_EXT) &&
1767			    m->m_len <= M_TRAILINGSPACE(mprev)) {
1768				/* XXX: this ignores mbuf types */
1769				memcpy(mtod(mprev, caddr_t) + mprev->m_len,
1770				    mtod(m, caddr_t), m->m_len);
1771				mprev->m_len += m->m_len;
1772				mprev->m_next = m->m_next;	/* unlink from chain */
1773				m_free(m);			/* reclaim mbuf */
1774#if 0
1775				newipsecstat.ips_mbcoalesced++;
1776#endif
1777			} else {
1778				mprev = m;
1779			}
1780			continue;
1781		}
1782		/*
1783		 * Writable mbufs are left alone (for now).
1784		 */
1785		if (M_WRITABLE(m)) {
1786			mprev = m;
1787			continue;
1788		}
1789
1790		/*
1791		 * Not writable, replace with a copy or coalesce with
1792		 * the previous mbuf if possible (since we have to copy
1793		 * it anyway, we try to reduce the number of mbufs and
1794		 * clusters so that future work is easier).
1795		 */
1796		KASSERT(m->m_flags & M_EXT, ("m_flags 0x%x", m->m_flags));
1797		/* NB: we only coalesce into a cluster or larger */
1798		if (mprev != NULL && (mprev->m_flags & M_EXT) &&
1799		    m->m_len <= M_TRAILINGSPACE(mprev)) {
1800			/* XXX: this ignores mbuf types */
1801			memcpy(mtod(mprev, caddr_t) + mprev->m_len,
1802			    mtod(m, caddr_t), m->m_len);
1803			mprev->m_len += m->m_len;
1804			mprev->m_next = m->m_next;	/* unlink from chain */
1805			m_free(m);			/* reclaim mbuf */
1806#if 0
1807			newipsecstat.ips_clcoalesced++;
1808#endif
1809			continue;
1810		}
1811
1812		/*
1813		 * Allocate new space to hold the copy and copy the data.
1814		 * We deal with jumbo mbufs (i.e. m_len > MCLBYTES) by
1815		 * splitting them into clusters.  We could just malloc a
1816		 * buffer and make it external but too many device drivers
1817		 * don't know how to break up the non-contiguous memory when
1818		 * doing DMA.
1819		 */
1820		n = m_getcl(how, m->m_type, m->m_flags);
1821		if (n == NULL) {
1822			m_freem(m0);
1823			return (NULL);
1824		}
1825		len = m->m_len;
1826		off = 0;
1827		mfirst = n;
1828		mlast = NULL;
1829		for (;;) {
1830			int cc = min(len, MCLBYTES);
1831			memcpy(mtod(n, caddr_t), mtod(m, caddr_t) + off, cc);
1832			n->m_len = cc;
1833			if (mlast != NULL)
1834				mlast->m_next = n;
1835			mlast = n;
1836#if 0
1837			newipsecstat.ips_clcopied++;
1838#endif
1839
1840			len -= cc;
1841			if (len <= 0)
1842				break;
1843			off += cc;
1844
1845			n = m_getcl(how, m->m_type, m->m_flags);
1846			if (n == NULL) {
1847				m_freem(mfirst);
1848				m_freem(m0);
1849				return (NULL);
1850			}
1851		}
1852		n->m_next = m->m_next;
1853		if (mprev == NULL)
1854			m0 = mfirst;		/* new head of chain */
1855		else
1856			mprev->m_next = mfirst;	/* replace old mbuf */
1857		m_free(m);			/* release old mbuf */
1858		mprev = mfirst;
1859	}
1860	return (m0);
1861}
1862
1863#ifdef MBUF_PROFILING
1864
1865#define MP_BUCKETS 32 /* don't just change this as things may overflow.*/
1866struct mbufprofile {
1867	uintmax_t wasted[MP_BUCKETS];
1868	uintmax_t used[MP_BUCKETS];
1869	uintmax_t segments[MP_BUCKETS];
1870} mbprof;
1871
1872#define MP_MAXDIGITS 21	/* strlen("16,000,000,000,000,000,000") == 21 */
1873#define MP_NUMLINES 6
1874#define MP_NUMSPERLINE 16
1875#define MP_EXTRABYTES 64	/* > strlen("used:\nwasted:\nsegments:\n") */
1876/* work out max space needed and add a bit of spare space too */
1877#define MP_MAXLINE ((MP_MAXDIGITS+1) * MP_NUMSPERLINE)
1878#define MP_BUFSIZE ((MP_MAXLINE * MP_NUMLINES) + 1 + MP_EXTRABYTES)
1879
1880char mbprofbuf[MP_BUFSIZE];
1881
1882void
1883m_profile(struct mbuf *m)
1884{
1885	int segments = 0;
1886	int used = 0;
1887	int wasted = 0;
1888
1889	while (m) {
1890		segments++;
1891		used += m->m_len;
1892		if (m->m_flags & M_EXT) {
1893			wasted += MHLEN - sizeof(m->m_ext) +
1894			    m->m_ext.ext_size - m->m_len;
1895		} else {
1896			if (m->m_flags & M_PKTHDR)
1897				wasted += MHLEN - m->m_len;
1898			else
1899				wasted += MLEN - m->m_len;
1900		}
1901		m = m->m_next;
1902	}
1903	/* be paranoid.. it helps */
1904	if (segments > MP_BUCKETS - 1)
1905		segments = MP_BUCKETS - 1;
1906	if (used > 100000)
1907		used = 100000;
1908	if (wasted > 100000)
1909		wasted = 100000;
1910	/* store in the appropriate bucket */
1911	/* don't bother locking. if it's slightly off, so what? */
1912	mbprof.segments[segments]++;
1913	mbprof.used[fls(used)]++;
1914	mbprof.wasted[fls(wasted)]++;
1915}
1916
1917static void
1918mbprof_textify(void)
1919{
1920	int offset;
1921	char *c;
1922	uint64_t *p;
1923
1924	p = &mbprof.wasted[0];
1925	c = mbprofbuf;
1926	offset = snprintf(c, MP_MAXLINE + 10,
1927	    "wasted:\n"
1928	    "%ju %ju %ju %ju %ju %ju %ju %ju "
1929	    "%ju %ju %ju %ju %ju %ju %ju %ju\n",
1930	    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
1931	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
1932#ifdef BIG_ARRAY
1933	p = &mbprof.wasted[16];
1934	c += offset;
1935	offset = snprintf(c, MP_MAXLINE,
1936	    "%ju %ju %ju %ju %ju %ju %ju %ju "
1937	    "%ju %ju %ju %ju %ju %ju %ju %ju\n",
1938	    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
1939	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
1940#endif
1941	p = &mbprof.used[0];
1942	c += offset;
1943	offset = snprintf(c, MP_MAXLINE + 10,
1944	    "used:\n"
1945	    "%ju %ju %ju %ju %ju %ju %ju %ju "
1946	    "%ju %ju %ju %ju %ju %ju %ju %ju\n",
1947	    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
1948	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
1949#ifdef BIG_ARRAY
1950	p = &mbprof.used[16];
1951	c += offset;
1952	offset = snprintf(c, MP_MAXLINE,
1953	    "%ju %ju %ju %ju %ju %ju %ju %ju "
1954	    "%ju %ju %ju %ju %ju %ju %ju %ju\n",
1955	    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
1956	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
1957#endif
1958	p = &mbprof.segments[0];
1959	c += offset;
1960	offset = snprintf(c, MP_MAXLINE + 10,
1961	    "segments:\n"
1962	    "%ju %ju %ju %ju %ju %ju %ju %ju "
1963	    "%ju %ju %ju %ju %ju %ju %ju %ju\n",
1964	    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
1965	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
1966#ifdef BIG_ARRAY
1967	p = &mbprof.segments[16];
1968	c += offset;
1969	offset = snprintf(c, MP_MAXLINE,
1970	    "%ju %ju %ju %ju %ju %ju %ju %ju "
1971	    "%ju %ju %ju %ju %ju %ju %ju %ju",
1972	    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
1973	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
1974#endif
1975}
1976
1977static int
1978mbprof_handler(SYSCTL_HANDLER_ARGS)
1979{
1980	int error;
1981
1982	mbprof_textify();
1983	error = SYSCTL_OUT(req, mbprofbuf, strlen(mbprofbuf) + 1);
1984	return (error);
1985}
1986
1987static int
1988mbprof_clr_handler(SYSCTL_HANDLER_ARGS)
1989{
1990	int clear, error;
1991
1992	clear = 0;
1993	error = sysctl_handle_int(oidp, &clear, 0, req);
1994	if (error || !req->newptr)
1995		return (error);
1996
1997	if (clear) {
1998		bzero(&mbprof, sizeof(mbprof));
1999	}
2000
2001	return (error);
2002}
2003
2004
2005SYSCTL_PROC(_kern_ipc, OID_AUTO, mbufprofile, CTLTYPE_STRING|CTLFLAG_RD,
2006	    NULL, 0, mbprof_handler, "A", "mbuf profiling statistics");
2007
2008SYSCTL_PROC(_kern_ipc, OID_AUTO, mbufprofileclr, CTLTYPE_INT|CTLFLAG_RW,
2009	    NULL, 0, mbprof_clr_handler, "I", "clear mbuf profiling statistics");
2010#endif
2011
2012