1/*
2 * Copyright (c) 1998-2013 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29/*
30 * Copyright (c) 1982, 1986, 1988, 1991, 1993
31 *	The Regents of the University of California.  All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 *    notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 *    notice, this list of conditions and the following disclaimer in the
40 *    documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 *    must display the following acknowledgement:
43 *	This product includes software developed by the University of
44 *	California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 *    may be used to endorse or promote products derived from this software
47 *    without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 *	@(#)uipc_mbuf.c	8.2 (Berkeley) 1/4/94
62 */
63/*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections.  This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70#include <sys/param.h>
71#include <sys/systm.h>
72#include <sys/malloc.h>
73#include <sys/mbuf.h>
74#include <sys/kernel.h>
75#include <sys/sysctl.h>
76#include <sys/syslog.h>
77#include <sys/protosw.h>
78#include <sys/domain.h>
79#include <sys/queue.h>
80#include <sys/proc.h>
81
82#include <dev/random/randomdev.h>
83
84#include <kern/kern_types.h>
85#include <kern/simple_lock.h>
86#include <kern/queue.h>
87#include <kern/sched_prim.h>
88#include <kern/cpu_number.h>
89#include <kern/zalloc.h>
90
91#include <libkern/OSAtomic.h>
92#include <libkern/OSDebug.h>
93#include <libkern/libkern.h>
94
95#include <IOKit/IOMapper.h>
96
97#include <machine/limits.h>
98#include <machine/machine_routines.h>
99
100#if CONFIG_MACF_NET
101#include <security/mac_framework.h>
102#endif /* MAC_NET */
103
104#include <sys/mcache.h>
105
106/*
107 * MBUF IMPLEMENTATION NOTES.
108 *
109 * There is a total of 5 per-CPU caches:
110 *
111 * MC_MBUF:
112 *	This is a cache of rudimentary objects of MSIZE in size; each
113 *	object represents an mbuf structure.  This cache preserves only
114 *	the m_type field of the mbuf during its transactions.
115 *
116 * MC_CL:
117 *	This is a cache of rudimentary objects of MCLBYTES in size; each
118 *	object represents a mcluster structure.  This cache does not
119 *	preserve the contents of the objects during its transactions.
120 *
121 * MC_BIGCL:
122 *	This is a cache of rudimentary objects of MBIGCLBYTES in size; each
123 *	object represents a mbigcluster structure.  This cache does not
124 *	preserve the contents of the objects during its transaction.
125 *
126 * MC_MBUF_CL:
127 *	This is a cache of mbufs each having a cluster attached to it.
128 *	It is backed by MC_MBUF and MC_CL rudimentary caches.  Several
129 *	fields of the mbuf related to the external cluster are preserved
130 *	during transactions.
131 *
132 * MC_MBUF_BIGCL:
133 *	This is a cache of mbufs each having a big cluster attached to it.
134 *	It is backed by MC_MBUF and MC_BIGCL rudimentary caches.  Several
135 *	fields of the mbuf related to the external cluster are preserved
136 *	during transactions.
137 *
138 * OBJECT ALLOCATION:
139 *
140 * Allocation requests are handled first at the per-CPU (mcache) layer
141 * before falling back to the slab layer.  Performance is optimal when
142 * the request is satisfied at the CPU layer because global data/lock
143 * never gets accessed.  When the slab layer is entered for allocation,
144 * the slab freelist will be checked first for available objects before
145 * the VM backing store is invoked.  Slab layer operations are serialized
146 * for all of the caches as the mbuf global lock is held most of the time.
147 * Allocation paths are different depending on the class of objects:
148 *
149 * a. Rudimentary object:
150 *
151 *	{ m_get_common(), m_clattach(), m_mclget(),
152 *	  m_mclalloc(), m_bigalloc(), m_copym_with_hdrs(),
153 *	  composite object allocation }
154 *			|	^
155 *			|	|
156 *			|	+-----------------------+
157 *			v				|
158 *	   mcache_alloc/mcache_alloc_ext()	mbuf_slab_audit()
159 *			|				^
160 *			v				|
161 *		   [CPU cache] ------->	(found?) -------+
162 *			|				|
163 *			v				|
164 *		 mbuf_slab_alloc()			|
165 *			|				|
166 *			v				|
167 *	+---------> [freelist] ------->	(found?) -------+
168 *	|		|
169 *	|		v
170 *	|	    m_clalloc()
171 *	|		|
172 *	|		v
173 *	+---<<---- kmem_mb_alloc()
174 *
175 * b. Composite object:
176 *
177 *	{ m_getpackets_internal(), m_allocpacket_internal() }
178 *			|	^
179 *			|	|
180 *			|	+------	(done) ---------+
181 *			v				|
182 *	   mcache_alloc/mcache_alloc_ext()	mbuf_cslab_audit()
183 *			|				^
184 *			v				|
185 *		   [CPU cache] ------->	(found?) -------+
186 *			|				|
187 *			v				|
188 *		 mbuf_cslab_alloc()			|
189 *			|				|
190 *			v				|
191 *		    [freelist] ------->	(found?) -------+
192 *			|				|
193 *			v				|
194 *		(rudimentary object)			|
195 *	   mcache_alloc/mcache_alloc_ext() ------>>-----+
196 *
197 * Auditing notes: If auditing is enabled, buffers will be subjected to
198 * integrity checks by the audit routine.  This is done by verifying their
199 * contents against DEADBEEF (free) pattern before returning them to caller.
200 * As part of this step, the routine will also record the transaction and
201 * pattern-fill the buffers with BADDCAFE (uninitialized) pattern.  It will
202 * also restore any constructed data structure fields if necessary.
203 *
204 * OBJECT DEALLOCATION:
205 *
206 * Freeing an object simply involves placing it into the CPU cache; this
207 * pollutes the cache to benefit subsequent allocations.  The slab layer
208 * will only be entered if the object is to be purged out of the cache.
209 * During normal operations, this happens only when the CPU layer resizes
210 * its bucket while it's adjusting to the allocation load.  Deallocation
211 * paths are different depending on the class of objects:
212 *
213 * a. Rudimentary object:
214 *
215 *	{ m_free(), m_freem_list(), composite object deallocation }
216 *			|	^
217 *			|	|
218 *			|	+------	(done) ---------+
219 *			v				|
220 *	   mcache_free/mcache_free_ext()		|
221 *			|				|
222 *			v				|
223 *		mbuf_slab_audit()			|
224 *			|				|
225 *			v				|
226 *		   [CPU cache] ---> (not purging?) -----+
227 *			|				|
228 *			v				|
229 *		 mbuf_slab_free()			|
230 *			|				|
231 *			v				|
232 *		    [freelist] ----------->>------------+
233 *	 (objects never get purged to VM)
234 *
235 * b. Composite object:
236 *
237 *	{ m_free(), m_freem_list() }
238 *			|	^
239 *			|	|
240 *			|	+------	(done) ---------+
241 *			v				|
242 *	   mcache_free/mcache_free_ext()		|
243 *			|				|
244 *			v				|
245 *		mbuf_cslab_audit()			|
246 *			|				|
247 *			v				|
248 *		   [CPU cache] ---> (not purging?) -----+
249 *			|				|
250 *			v				|
251 *		 mbuf_cslab_free()			|
252 *			|				|
253 *			v				|
254 *		    [freelist] ---> (not purging?) -----+
255 *			|				|
256 *			v				|
257 *		(rudimentary object)			|
258 *	   mcache_free/mcache_free_ext() ------->>------+
259 *
260 * Auditing notes: If auditing is enabled, the audit routine will save
261 * any constructed data structure fields (if necessary) before filling the
262 * contents of the buffers with DEADBEEF (free) pattern and recording the
263 * transaction.  Buffers that are freed (whether at CPU or slab layer) are
264 * expected to contain the free pattern.
265 *
266 * DEBUGGING:
267 *
268 * Debugging can be enabled by adding "mbuf_debug=0x3" to boot-args; this
269 * translates to the mcache flags (MCF_VERIFY | MCF_AUDIT).  Additionally,
270 * the CPU layer cache can be disabled by setting the MCF_NOCPUCACHE flag,
271 * i.e. modify the boot argument parameter to "mbuf_debug=0x13".  Leak
272 * detection may also be disabled by setting the MCF_NOLEAKLOG flag, e.g.
273 * "mbuf_debug=0x113".  Note that debugging consumes more CPU and memory.
274 *
275 * Each object is associated with exactly one mcache_audit_t structure that
276 * contains the information related to its last buffer transaction.  Given
277 * an address of an object, the audit structure can be retrieved by finding
278 * the position of the object relevant to the base address of the cluster:
279 *
280 *	+------------+			+=============+
281 *	| mbuf addr  |			| mclaudit[i] |
282 *	+------------+			+=============+
283 *	      |				| cl_audit[0] |
284 *	i = MTOBG(addr)			+-------------+
285 *	      |			+----->	| cl_audit[1] | -----> mcache_audit_t
286 *	b = BGTOM(i)		|	+-------------+
287 *	      |			|	|     ...     |
288 *	x = MCLIDX(b, addr)	|	+-------------+
289 *	      |			|	| cl_audit[7] |
290 *	      +-----------------+	+-------------+
291 *		 (e.g. x == 1)
292 *
293 * The mclaudit[] array is allocated at initialization time, but its contents
294 * get populated when the corresponding cluster is created.  Because a page
295 * can be turned into NMBPBG number of mbufs, we preserve enough space for the
296 * mbufs so that there is a 1-to-1 mapping between them.  A page that never
297 * gets (or has not yet) turned into mbufs will use only cl_audit[0] with the
298 * remaining entries unused.  For 16KB cluster, only one entry from the first
299 * page is allocated and used for the entire object.
300 */
301
302/* TODO: should be in header file */
303/* kernel translater */
304extern vm_offset_t kmem_mb_alloc(vm_map_t, int, int);
305extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va);
306extern vm_map_t mb_map;		/* special map */
307
308/* Global lock */
309decl_lck_mtx_data(static, mbuf_mlock_data);
310static lck_mtx_t *mbuf_mlock = &mbuf_mlock_data;
311static lck_attr_t *mbuf_mlock_attr;
312static lck_grp_t *mbuf_mlock_grp;
313static lck_grp_attr_t *mbuf_mlock_grp_attr;
314
315/* Back-end (common) layer */
316static void *mbuf_worker_run;	/* wait channel for worker thread */
317static int mbuf_worker_ready;	/* worker thread is runnable */
318static int mbuf_expand_mcl;	/* number of cluster creation requets */
319static int mbuf_expand_big;	/* number of big cluster creation requests */
320static int mbuf_expand_16k;	/* number of 16KB cluster creation requests */
321static int ncpu;		/* number of CPUs */
322static ppnum_t *mcl_paddr;	/* Array of cluster physical addresses */
323static ppnum_t mcl_pages;	/* Size of array (# physical pages) */
324static ppnum_t mcl_paddr_base;	/* Handle returned by IOMapper::iovmAlloc() */
325static mcache_t *ref_cache;	/* Cache of cluster reference & flags */
326static mcache_t *mcl_audit_con_cache; /* Audit contents cache */
327static unsigned int mbuf_debug;	/* patchable mbuf mcache flags */
328static unsigned int mb_normalized; /* number of packets "normalized" */
329
330#define	MB_GROWTH_AGGRESSIVE	1	/* Threshold: 1/2 of total */
331#define	MB_GROWTH_NORMAL	2	/* Threshold: 3/4 of total */
332
333typedef enum {
334	MC_MBUF = 0,	/* Regular mbuf */
335	MC_CL,		/* Cluster */
336	MC_BIGCL,	/* Large (4KB) cluster */
337	MC_16KCL,	/* Jumbo (16KB) cluster */
338	MC_MBUF_CL,	/* mbuf + cluster */
339	MC_MBUF_BIGCL,	/* mbuf + large (4KB) cluster */
340	MC_MBUF_16KCL	/* mbuf + jumbo (16KB) cluster */
341} mbuf_class_t;
342
343#define	MBUF_CLASS_MIN		MC_MBUF
344#define	MBUF_CLASS_MAX		MC_MBUF_16KCL
345#define	MBUF_CLASS_LAST		MC_16KCL
346#define	MBUF_CLASS_VALID(c) \
347	((int)(c) >= MBUF_CLASS_MIN && (int)(c) <= MBUF_CLASS_MAX)
348#define	MBUF_CLASS_COMPOSITE(c) \
349	((int)(c) > MBUF_CLASS_LAST)
350
351
352/*
353 * mbuf specific mcache allocation request flags.
354 */
355#define	MCR_COMP	MCR_USR1 /* for MC_MBUF_{CL,BIGCL,16KCL} caches */
356
357/*
358 * Per-cluster slab structure.
359 *
360 * A slab is a cluster control structure that contains one or more object
361 * chunks; the available chunks are chained in the slab's freelist (sl_head).
362 * Each time a chunk is taken out of the slab, the slab's reference count
363 * gets incremented.  When all chunks have been taken out, the empty slab
364 * gets removed (SLF_DETACHED) from the class's slab list.  A chunk that is
365 * returned to a slab causes the slab's reference count to be decremented;
366 * it also causes the slab to be reinserted back to class's slab list, if
367 * it's not already done.
368 *
369 * Compartmentalizing of the object chunks into slabs allows us to easily
370 * merge one or more slabs together when the adjacent slabs are idle, as
371 * well as to convert or move a slab from one class to another; e.g. the
372 * mbuf cluster slab can be converted to a regular cluster slab when all
373 * mbufs in the slab have been freed.
374 *
375 * A slab may also span across multiple clusters for chunks larger than
376 * a cluster's size.  In this case, only the slab of the first cluster is
377 * used.  The rest of the slabs are marked with SLF_PARTIAL to indicate
378 * that they are part of the larger slab.
379 *
380 * Each slab controls a page of memory.
381 */
382typedef struct mcl_slab {
383	struct mcl_slab	*sl_next;	/* neighboring slab */
384	u_int8_t	sl_class;	/* controlling mbuf class */
385	int8_t		sl_refcnt;	/* outstanding allocations */
386	int8_t		sl_chunks;	/* chunks (bufs) in this slab */
387	u_int16_t	sl_flags;	/* slab flags (see below) */
388	u_int16_t	sl_len;		/* slab length */
389	void		*sl_base;	/* base of allocated memory */
390	void		*sl_head;	/* first free buffer */
391	TAILQ_ENTRY(mcl_slab) sl_link;	/* next/prev slab on freelist */
392} mcl_slab_t;
393
394#define	SLF_MAPPED	0x0001		/* backed by a mapped page */
395#define	SLF_PARTIAL	0x0002		/* part of another slab */
396#define	SLF_DETACHED	0x0004		/* not in slab freelist */
397
398/*
399 * The array of slabs are broken into groups of arrays per 1MB of kernel
400 * memory to reduce the footprint.  Each group is allocated on demand
401 * whenever a new piece of memory mapped in from the VM crosses the 1MB
402 * boundary.
403 */
404#define	NSLABSPMB	((1 << MBSHIFT) >> PGSHIFT)	/* 256 slabs/grp */
405
406typedef struct mcl_slabg {
407	mcl_slab_t	slg_slab[NSLABSPMB];	/* group of slabs */
408} mcl_slabg_t;
409
410/*
411 * Number of slabs needed to control a 16KB cluster object.
412 */
413#define	NSLABSP16KB	(M16KCLBYTES >> PGSHIFT)
414
415/*
416 * Per-cluster audit structure.
417 */
418typedef struct {
419	mcache_audit_t	*cl_audit[NMBPBG];	/* array of audits */
420} mcl_audit_t;
421
422typedef struct {
423	struct thread	*msa_thread;	/* thread doing transaction */
424	struct thread	*msa_pthread;	/* previous transaction thread */
425	uint32_t	msa_tstamp;	/* transaction timestamp (ms) */
426	uint32_t	msa_ptstamp;	/* prev transaction timestamp (ms) */
427	uint16_t	msa_depth;	/* pc stack depth */
428	uint16_t	msa_pdepth;	/* previous transaction pc stack */
429	void		*msa_stack[MCACHE_STACK_DEPTH];
430	void		*msa_pstack[MCACHE_STACK_DEPTH];
431} mcl_scratch_audit_t;
432
433typedef struct {
434	/*
435	 * Size of data from the beginning of an mbuf that covers m_hdr,
436	 * pkthdr and m_ext structures.  If auditing is enabled, we allocate
437	 * a shadow mbuf structure of this size inside each audit structure,
438	 * and the contents of the real mbuf gets copied into it when the mbuf
439	 * is freed.  This allows us to pattern-fill the mbuf for integrity
440	 * check, and to preserve any constructed mbuf fields (e.g. mbuf +
441	 * cluster cache case).  Note that we don't save the contents of
442	 * clusters when they are freed; we simply pattern-fill them.
443	 */
444	u_int8_t		sc_mbuf[(MSIZE - _MHLEN) + sizeof (_m_ext_t)];
445	mcl_scratch_audit_t	sc_scratch __attribute__((aligned(8)));
446} mcl_saved_contents_t;
447
448#define	AUDIT_CONTENTS_SIZE	(sizeof (mcl_saved_contents_t))
449
450#define	MCA_SAVED_MBUF_PTR(_mca)					\
451	((struct mbuf *)(void *)((mcl_saved_contents_t *)		\
452	(_mca)->mca_contents)->sc_mbuf)
453#define	MCA_SAVED_MBUF_SIZE						\
454	(sizeof (((mcl_saved_contents_t *)0)->sc_mbuf))
455#define	MCA_SAVED_SCRATCH_PTR(_mca)					\
456	(&((mcl_saved_contents_t *)(_mca)->mca_contents)->sc_scratch)
457
458/*
459 * mbuf specific mcache audit flags
460 */
461#define	MB_INUSE	0x01	/* object has not been returned to slab */
462#define	MB_COMP_INUSE	0x02	/* object has not been returned to cslab */
463#define	MB_SCVALID	0x04	/* object has valid saved contents */
464
465/*
466 * Each of the following two arrays hold up to nmbclusters elements.
467 */
468static mcl_audit_t *mclaudit;	/* array of cluster audit information */
469static unsigned int maxclaudit;	/* max # of entries in audit table */
470static mcl_slabg_t **slabstbl;	/* cluster slabs table */
471static unsigned int maxslabgrp;	/* max # of entries in slabs table */
472static unsigned int slabgrp;	/* # of entries in slabs table */
473
474/* Globals */
475int nclusters;			/* # of clusters for non-jumbo (legacy) sizes */
476int njcl;			/* # of clusters for jumbo sizes */
477int njclbytes;			/* size of a jumbo cluster */
478union mbigcluster *mbutl;	/* first mapped cluster address */
479union mbigcluster *embutl;	/* ending virtual address of mclusters */
480int _max_linkhdr;		/* largest link-level header */
481int _max_protohdr;		/* largest protocol header */
482int max_hdr;			/* largest link+protocol header */
483int max_datalen;		/* MHLEN - max_hdr */
484
485static boolean_t mclverify;	/* debug: pattern-checking */
486static boolean_t mcltrace;	/* debug: stack tracing */
487static boolean_t mclfindleak;	/* debug: leak detection */
488static boolean_t mclexpleak;	/* debug: expose leak info to user space */
489
490static struct timeval mb_start;	/* beginning of time */
491
492/* mbuf leak detection variables */
493static struct mleak_table mleak_table;
494static mleak_stat_t *mleak_stat;
495
496#define	MLEAK_STAT_SIZE(n) \
497	((size_t)(&((mleak_stat_t *)0)->ml_trace[n]))
498
499struct mallocation {
500	mcache_obj_t *element;	/* the alloc'ed element, NULL if unused */
501	u_int32_t trace_index;	/* mtrace index for corresponding backtrace */
502	u_int32_t count;	/* How many objects were requested */
503	u_int64_t hitcount;	/* for determining hash effectiveness */
504};
505
506struct mtrace {
507	u_int64_t	collisions;
508	u_int64_t	hitcount;
509	u_int64_t	allocs;
510	u_int64_t	depth;
511	uintptr_t	addr[MLEAK_STACK_DEPTH];
512};
513
514/* Size must be a power of two for the zhash to be able to just mask off bits */
515#define	MLEAK_ALLOCATION_MAP_NUM	512
516#define	MLEAK_TRACE_MAP_NUM		256
517
518/*
519 * Sample factor for how often to record a trace.  This is overwritable
520 * by the boot-arg mleak_sample_factor.
521 */
522#define	MLEAK_SAMPLE_FACTOR		500
523
524/*
525 * Number of top leakers recorded.
526 */
527#define	MLEAK_NUM_TRACES		5
528
529#define	MB_LEAK_SPACING_64 "                    "
530#define MB_LEAK_SPACING_32 "            "
531
532
533#define	MB_LEAK_HDR_32	"\n\
534    trace [1]   trace [2]   trace [3]   trace [4]   trace [5]  \n\
535    ----------  ----------  ----------  ----------  ---------- \n\
536"
537
538#define	MB_LEAK_HDR_64	"\n\
539    trace [1]           trace [2]           trace [3]       \
540        trace [4]           trace [5]      \n\
541    ------------------  ------------------  ------------------  \
542    ------------------  ------------------ \n\
543"
544
545static uint32_t mleak_alloc_buckets = MLEAK_ALLOCATION_MAP_NUM;
546static uint32_t mleak_trace_buckets = MLEAK_TRACE_MAP_NUM;
547
548/* Hashmaps of allocations and their corresponding traces */
549static struct mallocation *mleak_allocations;
550static struct mtrace *mleak_traces;
551static struct mtrace *mleak_top_trace[MLEAK_NUM_TRACES];
552
553/* Lock to protect mleak tables from concurrent modification */
554decl_lck_mtx_data(static, mleak_lock_data);
555static lck_mtx_t *mleak_lock = &mleak_lock_data;
556static lck_attr_t *mleak_lock_attr;
557static lck_grp_t *mleak_lock_grp;
558static lck_grp_attr_t *mleak_lock_grp_attr;
559
560extern u_int32_t high_sb_max;
561
562/* The minimum number of objects that are allocated, to start. */
563#define	MINCL		32
564#define	MINBIGCL	(MINCL >> 1)
565#define	MIN16KCL	(MINCL >> 2)
566
567/* Low watermarks (only map in pages once free counts go below) */
568#define	MBIGCL_LOWAT	MINBIGCL
569#define	M16KCL_LOWAT	MIN16KCL
570
571typedef struct {
572	mbuf_class_t	mtbl_class;	/* class type */
573	mcache_t	*mtbl_cache;	/* mcache for this buffer class */
574	TAILQ_HEAD(mcl_slhead, mcl_slab) mtbl_slablist; /* slab list */
575	mcache_obj_t	*mtbl_cobjlist;	/* composite objects freelist */
576	mb_class_stat_t	*mtbl_stats;	/* statistics fetchable via sysctl */
577	u_int32_t	mtbl_maxsize;	/* maximum buffer size */
578	int		mtbl_minlimit;	/* minimum allowed */
579	int		mtbl_maxlimit;	/* maximum allowed */
580	u_int32_t	mtbl_wantpurge;	/* purge during next reclaim */
581} mbuf_table_t;
582
583#define	m_class(c)	mbuf_table[c].mtbl_class
584#define	m_cache(c)	mbuf_table[c].mtbl_cache
585#define	m_slablist(c)	mbuf_table[c].mtbl_slablist
586#define	m_cobjlist(c)	mbuf_table[c].mtbl_cobjlist
587#define	m_maxsize(c)	mbuf_table[c].mtbl_maxsize
588#define	m_minlimit(c)	mbuf_table[c].mtbl_minlimit
589#define	m_maxlimit(c)	mbuf_table[c].mtbl_maxlimit
590#define	m_wantpurge(c)	mbuf_table[c].mtbl_wantpurge
591#define	m_cname(c)	mbuf_table[c].mtbl_stats->mbcl_cname
592#define	m_size(c)	mbuf_table[c].mtbl_stats->mbcl_size
593#define	m_total(c)	mbuf_table[c].mtbl_stats->mbcl_total
594#define	m_active(c)	mbuf_table[c].mtbl_stats->mbcl_active
595#define	m_infree(c)	mbuf_table[c].mtbl_stats->mbcl_infree
596#define	m_slab_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_slab_cnt
597#define	m_alloc_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_alloc_cnt
598#define	m_free_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_free_cnt
599#define	m_notified(c)	mbuf_table[c].mtbl_stats->mbcl_notified
600#define	m_purge_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_purge_cnt
601#define	m_fail_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_fail_cnt
602#define	m_ctotal(c)	mbuf_table[c].mtbl_stats->mbcl_ctotal
603
604static mbuf_table_t mbuf_table[] = {
605	/*
606	 * The caches for mbufs, regular clusters and big clusters.
607	 */
608	{ MC_MBUF, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_MBUF)),
609	    NULL, NULL, 0, 0, 0, 0 },
610	{ MC_CL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_CL)),
611	    NULL, NULL, 0, 0, 0, 0 },
612	{ MC_BIGCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_BIGCL)),
613	    NULL, NULL, 0, 0, 0, 0 },
614	{ MC_16KCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_16KCL)),
615	    NULL, NULL, 0, 0, 0, 0 },
616	/*
617	 * The following are special caches; they serve as intermediate
618	 * caches backed by the above rudimentary caches.  Each object
619	 * in the cache is an mbuf with a cluster attached to it.  Unlike
620	 * the above caches, these intermediate caches do not directly
621	 * deal with the slab structures; instead, the constructed
622	 * cached elements are simply stored in the freelists.
623	 */
624	{ MC_MBUF_CL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
625	{ MC_MBUF_BIGCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
626	{ MC_MBUF_16KCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
627};
628
629#define	NELEM(a)	(sizeof (a) / sizeof ((a)[0]))
630
631static void *mb_waitchan = &mbuf_table;	/* wait channel for all caches */
632static int mb_waiters;			/* number of waiters */
633
634#define	MB_WDT_MAXTIME	10		/* # of secs before watchdog panic */
635static struct timeval mb_wdtstart;	/* watchdog start timestamp */
636static char *mbuf_dump_buf;
637
638#define	MBUF_DUMP_BUF_SIZE	2048
639
640/*
641 * mbuf watchdog is enabled by default on embedded platforms.  It is
642 * also toggeable via the kern.ipc.mb_watchdog sysctl.
643 */
644static unsigned int mb_watchdog = 0;
645
646/* Red zone */
647static u_int32_t mb_redzone_cookie;
648static void m_redzone_init(struct mbuf *);
649static void m_redzone_verify(struct mbuf *m);
650
651/* The following are used to serialize m_clalloc() */
652static boolean_t mb_clalloc_busy;
653static void *mb_clalloc_waitchan = &mb_clalloc_busy;
654static int mb_clalloc_waiters;
655
656static void mbuf_mtypes_sync(boolean_t);
657static int mbstat_sysctl SYSCTL_HANDLER_ARGS;
658static void mbuf_stat_sync(void);
659static int mb_stat_sysctl SYSCTL_HANDLER_ARGS;
660static int mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS;
661static int mleak_table_sysctl SYSCTL_HANDLER_ARGS;
662static char *mbuf_dump(void);
663static void mbuf_table_init(void);
664static inline void m_incref(struct mbuf *);
665static inline u_int32_t m_decref(struct mbuf *);
666static int m_clalloc(const u_int32_t, const int, const u_int32_t);
667static void mbuf_worker_thread_init(void);
668static mcache_obj_t *slab_alloc(mbuf_class_t, int);
669static void slab_free(mbuf_class_t, mcache_obj_t *);
670static unsigned int mbuf_slab_alloc(void *, mcache_obj_t ***,
671    unsigned int, int);
672static void mbuf_slab_free(void *, mcache_obj_t *, int);
673static void mbuf_slab_audit(void *, mcache_obj_t *, boolean_t);
674static void mbuf_slab_notify(void *, u_int32_t);
675static unsigned int cslab_alloc(mbuf_class_t, mcache_obj_t ***,
676    unsigned int);
677static unsigned int cslab_free(mbuf_class_t, mcache_obj_t *, int);
678static unsigned int mbuf_cslab_alloc(void *, mcache_obj_t ***,
679    unsigned int, int);
680static void mbuf_cslab_free(void *, mcache_obj_t *, int);
681static void mbuf_cslab_audit(void *, mcache_obj_t *, boolean_t);
682static int freelist_populate(mbuf_class_t, unsigned int, int);
683static void freelist_init(mbuf_class_t);
684static boolean_t mbuf_cached_above(mbuf_class_t, int);
685static boolean_t mbuf_steal(mbuf_class_t, unsigned int);
686static void m_reclaim(mbuf_class_t, unsigned int, boolean_t);
687static int m_howmany(int, size_t);
688static void mbuf_worker_thread(void);
689static void mbuf_watchdog(void);
690static boolean_t mbuf_sleep(mbuf_class_t, unsigned int, int);
691
692static void mcl_audit_init(void *, mcache_audit_t **, mcache_obj_t **,
693    size_t, unsigned int);
694static mcache_audit_t *mcl_audit_buf2mca(mbuf_class_t, mcache_obj_t *);
695static void mcl_audit_mbuf(mcache_audit_t *, void *, boolean_t, boolean_t);
696static void mcl_audit_cluster(mcache_audit_t *, void *, size_t, boolean_t,
697    boolean_t);
698static void mcl_audit_restore_mbuf(struct mbuf *, mcache_audit_t *, boolean_t);
699static void mcl_audit_save_mbuf(struct mbuf *, mcache_audit_t *);
700static void mcl_audit_scratch(mcache_audit_t *);
701static void mcl_audit_mcheck_panic(struct mbuf *);
702static void mcl_audit_verify_nextptr(void *, mcache_audit_t *);
703
704static void mleak_activate(void);
705static void mleak_logger(u_int32_t, mcache_obj_t *, boolean_t);
706static boolean_t mleak_log(uintptr_t *, mcache_obj_t *, uint32_t, int);
707static void mleak_free(mcache_obj_t *);
708static void mleak_sort_traces(void);
709static void mleak_update_stats(void);
710
711static mcl_slab_t *slab_get(void *);
712static void slab_init(mcl_slab_t *, mbuf_class_t, u_int32_t,
713    void *, void *, unsigned int, int, int);
714static void slab_insert(mcl_slab_t *, mbuf_class_t);
715static void slab_remove(mcl_slab_t *, mbuf_class_t);
716static boolean_t slab_inrange(mcl_slab_t *, void *);
717static void slab_nextptr_panic(mcl_slab_t *, void *);
718static void slab_detach(mcl_slab_t *);
719static boolean_t slab_is_detached(mcl_slab_t *);
720
721static int m_copyback0(struct mbuf **, int, int, const void *, int, int);
722static struct mbuf *m_split0(struct mbuf *, int, int, int);
723
724/* flags for m_copyback0 */
725#define	M_COPYBACK0_COPYBACK	0x0001	/* copyback from cp */
726#define	M_COPYBACK0_PRESERVE	0x0002	/* preserve original data */
727#define	M_COPYBACK0_COW		0x0004	/* do copy-on-write */
728#define	M_COPYBACK0_EXTEND	0x0008	/* extend chain */
729
730/*
731 * This flag is set for all mbufs that come out of and into the composite
732 * mbuf + cluster caches, i.e. MC_MBUF_CL and MC_MBUF_BIGCL.  mbufs that
733 * are marked with such a flag have clusters attached to them, and will be
734 * treated differently when they are freed; instead of being placed back
735 * into the mbuf and cluster freelists, the composite mbuf + cluster objects
736 * are placed back into the appropriate composite cache's freelist, and the
737 * actual freeing is deferred until the composite objects are purged.  At
738 * such a time, this flag will be cleared from the mbufs and the objects
739 * will be freed into their own separate freelists.
740 */
741#define	EXTF_COMPOSITE	0x1
742
743/*
744 * This flag indicates that the external cluster is read-only, i.e. it is
745 * or was referred to by more than one mbufs.  Once set, this flag is never
746 * cleared.
747 */
748#define	EXTF_READONLY	0x2
749#define	EXTF_MASK	(EXTF_COMPOSITE | EXTF_READONLY)
750
751#define	MEXT_RFA(m)		((m)->m_ext.ext_refflags)
752#define	MEXT_REF(m)		(MEXT_RFA(m)->refcnt)
753#define	MEXT_FLAGS(m)		(MEXT_RFA(m)->flags)
754#define	MBUF_IS_COMPOSITE(m)	\
755	(MEXT_REF(m) == 0 && (MEXT_FLAGS(m) & EXTF_MASK) == EXTF_COMPOSITE)
756
757/*
758 * Macros used to verify the integrity of the mbuf.
759 */
760#define	_MCHECK(m) {							\
761	if ((m)->m_type != MT_FREE) {					\
762		if (mclaudit == NULL)					\
763			panic("MCHECK: m_type=%d m=%p",			\
764			    (u_int16_t)(m)->m_type, m);			\
765		else							\
766			mcl_audit_mcheck_panic(m);			\
767	}								\
768}
769
770#define	MBUF_IN_MAP(addr)						\
771	((void *)(addr) >= (void *)mbutl && (void *)(addr) < (void *)embutl)
772
773#define	MRANGE(addr) {							\
774	if (!MBUF_IN_MAP(addr))						\
775		panic("MRANGE: address out of range 0x%p", addr);	\
776}
777
778/*
779 * Macro version of mtod.
780 */
781#define	MTOD(m, t)	((t)((m)->m_data))
782
783/*
784 * Macros to obtain (4KB) cluster index and base cluster address.
785 */
786
787#define	MTOBG(x)	(((char *)(x) - (char *)mbutl) >> MBIGCLSHIFT)
788#define	BGTOM(x)	((union mbigcluster *)(mbutl + (x)))
789
790/*
791 * Macro to find the mbuf index relative to a base.
792 */
793#define	MCLIDX(c, m)	(((char *)(m) - (char *)(c)) >> MSIZESHIFT)
794
795/*
796 * Same thing for 2KB cluster index.
797 */
798#define	CLBGIDX(c, m)	(((char *)(m) - (char *)(c)) >> MCLSHIFT)
799
800/*
801 * Macros used during mbuf and cluster initialization.
802 */
803#define	MBUF_INIT_PKTHDR(m) {						\
804	(m)->m_pkthdr.rcvif = NULL;					\
805	(m)->m_pkthdr.pkt_hdr = NULL;					\
806	(m)->m_pkthdr.len = 0;						\
807	(m)->m_pkthdr.csum_flags = 0;					\
808	(m)->m_pkthdr.csum_data = 0;					\
809	(m)->m_pkthdr.vlan_tag = 0;					\
810	m_classifier_init(m, 0);					\
811	m_tag_init(m, 1);						\
812	m_scratch_init(m);						\
813	m_redzone_init(m);						\
814}
815
816#define	MBUF_INIT(m, pkthdr, type) {					\
817	_MCHECK(m);							\
818	(m)->m_next = (m)->m_nextpkt = NULL;				\
819	(m)->m_len = 0;							\
820	(m)->m_type = type;						\
821	if ((pkthdr) == 0) {						\
822		(m)->m_data = (m)->m_dat;				\
823		(m)->m_flags = 0;					\
824	} else {							\
825		(m)->m_data = (m)->m_pktdat;				\
826		(m)->m_flags = M_PKTHDR;				\
827		MBUF_INIT_PKTHDR(m);					\
828	}								\
829}
830
831#define	MEXT_INIT(m, buf, size, free, arg, rfa, ref, flag) {		\
832	(m)->m_data = (m)->m_ext.ext_buf = (buf);			\
833	(m)->m_flags |= M_EXT;						\
834	(m)->m_ext.ext_size = (size);					\
835	(m)->m_ext.ext_free = (free);					\
836	(m)->m_ext.ext_arg = (arg);					\
837	(m)->m_ext.ext_refs.forward = (m)->m_ext.ext_refs.backward =	\
838	    &(m)->m_ext.ext_refs;					\
839	MEXT_RFA(m) = (rfa);						\
840	MEXT_REF(m) = (ref);						\
841	MEXT_FLAGS(m) = (flag);						\
842}
843
844#define	MBUF_CL_INIT(m, buf, rfa, ref, flag)	\
845	MEXT_INIT(m, buf, m_maxsize(MC_CL), NULL, NULL, rfa, ref, flag)
846
847#define	MBUF_BIGCL_INIT(m, buf, rfa, ref, flag)	\
848	MEXT_INIT(m, buf, m_maxsize(MC_BIGCL), m_bigfree, NULL, rfa, ref, flag)
849
850#define	MBUF_16KCL_INIT(m, buf, rfa, ref, flag)	\
851	MEXT_INIT(m, buf, m_maxsize(MC_16KCL), m_16kfree, NULL, rfa, ref, flag)
852
853/*
854 * Macro to convert BSD malloc sleep flag to mcache's
855 */
856#define	MSLEEPF(f)	((!((f) & M_DONTWAIT)) ? MCR_SLEEP : MCR_NOSLEEP)
857
858/*
859 * The structure that holds all mbuf class statistics exportable via sysctl.
860 * Similar to mbstat structure, the mb_stat structure is protected by the
861 * global mbuf lock.  It contains additional information about the classes
862 * that allows for a more accurate view of the state of the allocator.
863 */
864struct mb_stat *mb_stat;
865struct omb_stat *omb_stat;	/* For backwards compatibility */
866
867#define	MB_STAT_SIZE(n) \
868	((size_t)(&((mb_stat_t *)0)->mbs_class[n]))
869#define	OMB_STAT_SIZE(n) \
870	((size_t)(&((struct omb_stat *)0)->mbs_class[n]))
871
872/*
873 * The legacy structure holding all of the mbuf allocation statistics.
874 * The actual statistics used by the kernel are stored in the mbuf_table
875 * instead, and are updated atomically while the global mbuf lock is held.
876 * They are mirrored in mbstat to support legacy applications (e.g. netstat).
877 * Unlike before, the kernel no longer relies on the contents of mbstat for
878 * its operations (e.g. cluster expansion) because the structure is exposed
879 * to outside and could possibly be modified, therefore making it unsafe.
880 * With the exception of the mbstat.m_mtypes array (see below), all of the
881 * statistics are updated as they change.
882 */
883struct mbstat mbstat;
884
885#define	MBSTAT_MTYPES_MAX \
886	(sizeof (mbstat.m_mtypes) / sizeof (mbstat.m_mtypes[0]))
887
888/*
889 * Allocation statistics related to mbuf types (up to MT_MAX-1) are updated
890 * atomically and stored in a per-CPU structure which is lock-free; this is
891 * done in order to avoid writing to the global mbstat data structure which
892 * would cause false sharing.  During sysctl request for kern.ipc.mbstat,
893 * the statistics across all CPUs will be converged into the mbstat.m_mtypes
894 * array and returned to the application.  Any updates for types greater or
895 * equal than MT_MAX would be done atomically to the mbstat; this slows down
896 * performance but is okay since the kernel uses only up to MT_MAX-1 while
897 * anything beyond that (up to type 255) is considered a corner case.
898 */
899typedef struct {
900	unsigned int	cpu_mtypes[MT_MAX];
901} __attribute__((aligned(MAX_CPU_CACHE_LINE_SIZE), packed)) mtypes_cpu_t;
902
903typedef struct {
904	mtypes_cpu_t	mbs_cpu[1];
905} mbuf_mtypes_t;
906
907static mbuf_mtypes_t *mbuf_mtypes;	/* per-CPU statistics */
908
909#define	MBUF_MTYPES_SIZE(n) \
910	((size_t)(&((mbuf_mtypes_t *)0)->mbs_cpu[n]))
911
912#define	MTYPES_CPU(p) \
913	((mtypes_cpu_t *)(void *)((char *)(p) + MBUF_MTYPES_SIZE(cpu_number())))
914
915#define	mtype_stat_add(type, n) {					\
916	if ((unsigned)(type) < MT_MAX) {				\
917		mtypes_cpu_t *mbs = MTYPES_CPU(mbuf_mtypes);		\
918		atomic_add_32(&mbs->cpu_mtypes[type], n);		\
919	} else if ((unsigned)(type) < (unsigned)MBSTAT_MTYPES_MAX) {	\
920		atomic_add_16((int16_t *)&mbstat.m_mtypes[type], n);	\
921	}								\
922}
923
924#define	mtype_stat_sub(t, n)	mtype_stat_add(t, -(n))
925#define	mtype_stat_inc(t)	mtype_stat_add(t, 1)
926#define	mtype_stat_dec(t)	mtype_stat_sub(t, 1)
927
928static void
929mbuf_mtypes_sync(boolean_t locked)
930{
931	int m, n;
932	mtypes_cpu_t mtc;
933
934	if (locked)
935		lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
936
937	bzero(&mtc, sizeof (mtc));
938	for (m = 0; m < ncpu; m++) {
939		mtypes_cpu_t *scp = &mbuf_mtypes->mbs_cpu[m];
940		mtypes_cpu_t temp;
941
942		bcopy(&scp->cpu_mtypes, &temp.cpu_mtypes,
943		    sizeof (temp.cpu_mtypes));
944
945		for (n = 0; n < MT_MAX; n++)
946			mtc.cpu_mtypes[n] += temp.cpu_mtypes[n];
947	}
948	if (!locked)
949		lck_mtx_lock(mbuf_mlock);
950	for (n = 0; n < MT_MAX; n++)
951		mbstat.m_mtypes[n] = mtc.cpu_mtypes[n];
952	if (!locked)
953		lck_mtx_unlock(mbuf_mlock);
954}
955
956static int
957mbstat_sysctl SYSCTL_HANDLER_ARGS
958{
959#pragma unused(oidp, arg1, arg2)
960	mbuf_mtypes_sync(FALSE);
961
962	return (SYSCTL_OUT(req, &mbstat, sizeof (mbstat)));
963}
964
965static void
966mbuf_stat_sync(void)
967{
968	mb_class_stat_t *sp;
969	mcache_cpu_t *ccp;
970	mcache_t *cp;
971	int k, m, bktsize;
972
973	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
974
975	for (k = 0; k < NELEM(mbuf_table); k++) {
976		cp = m_cache(k);
977		ccp = &cp->mc_cpu[0];
978		bktsize = ccp->cc_bktsize;
979		sp = mbuf_table[k].mtbl_stats;
980
981		if (cp->mc_flags & MCF_NOCPUCACHE)
982			sp->mbcl_mc_state = MCS_DISABLED;
983		else if (cp->mc_purge_cnt > 0)
984			sp->mbcl_mc_state = MCS_PURGING;
985		else if (bktsize == 0)
986			sp->mbcl_mc_state = MCS_OFFLINE;
987		else
988			sp->mbcl_mc_state = MCS_ONLINE;
989
990		sp->mbcl_mc_cached = 0;
991		for (m = 0; m < ncpu; m++) {
992			ccp = &cp->mc_cpu[m];
993			if (ccp->cc_objs > 0)
994				sp->mbcl_mc_cached += ccp->cc_objs;
995			if (ccp->cc_pobjs > 0)
996				sp->mbcl_mc_cached += ccp->cc_pobjs;
997		}
998		sp->mbcl_mc_cached += (cp->mc_full.bl_total * bktsize);
999		sp->mbcl_active = sp->mbcl_total - sp->mbcl_mc_cached -
1000		    sp->mbcl_infree;
1001
1002		sp->mbcl_mc_waiter_cnt = cp->mc_waiter_cnt;
1003		sp->mbcl_mc_wretry_cnt = cp->mc_wretry_cnt;
1004		sp->mbcl_mc_nwretry_cnt = cp->mc_nwretry_cnt;
1005
1006		/* Calculate total count specific to each class */
1007		sp->mbcl_ctotal = sp->mbcl_total;
1008		switch (m_class(k)) {
1009		case MC_MBUF:
1010			/* Deduct mbufs used in composite caches */
1011			sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) +
1012			    m_total(MC_MBUF_BIGCL));
1013			break;
1014
1015		case MC_CL:
1016			/* Deduct clusters used in composite cache */
1017			sp->mbcl_ctotal -= m_total(MC_MBUF_CL);
1018			break;
1019
1020		case MC_BIGCL:
1021			/* Deduct clusters used in composite cache */
1022			sp->mbcl_ctotal -= m_total(MC_MBUF_BIGCL);
1023			break;
1024
1025		case MC_16KCL:
1026			/* Deduct clusters used in composite cache */
1027			sp->mbcl_ctotal -= m_total(MC_MBUF_16KCL);
1028			break;
1029
1030		default:
1031			break;
1032		}
1033	}
1034}
1035
1036static int
1037mb_stat_sysctl SYSCTL_HANDLER_ARGS
1038{
1039#pragma unused(oidp, arg1, arg2)
1040	void *statp;
1041	int k, statsz, proc64 = proc_is64bit(req->p);
1042
1043	lck_mtx_lock(mbuf_mlock);
1044	mbuf_stat_sync();
1045
1046	if (!proc64) {
1047		struct omb_class_stat *oc;
1048		struct mb_class_stat *c;
1049
1050		omb_stat->mbs_cnt = mb_stat->mbs_cnt;
1051		oc = &omb_stat->mbs_class[0];
1052		c = &mb_stat->mbs_class[0];
1053		for (k = 0; k < omb_stat->mbs_cnt; k++, oc++, c++) {
1054			(void) snprintf(oc->mbcl_cname, sizeof (oc->mbcl_cname),
1055			    "%s", c->mbcl_cname);
1056			oc->mbcl_size = c->mbcl_size;
1057			oc->mbcl_total = c->mbcl_total;
1058			oc->mbcl_active = c->mbcl_active;
1059			oc->mbcl_infree = c->mbcl_infree;
1060			oc->mbcl_slab_cnt = c->mbcl_slab_cnt;
1061			oc->mbcl_alloc_cnt = c->mbcl_alloc_cnt;
1062			oc->mbcl_free_cnt = c->mbcl_free_cnt;
1063			oc->mbcl_notified = c->mbcl_notified;
1064			oc->mbcl_purge_cnt = c->mbcl_purge_cnt;
1065			oc->mbcl_fail_cnt = c->mbcl_fail_cnt;
1066			oc->mbcl_ctotal = c->mbcl_ctotal;
1067			oc->mbcl_mc_state = c->mbcl_mc_state;
1068			oc->mbcl_mc_cached = c->mbcl_mc_cached;
1069			oc->mbcl_mc_waiter_cnt = c->mbcl_mc_waiter_cnt;
1070			oc->mbcl_mc_wretry_cnt = c->mbcl_mc_wretry_cnt;
1071			oc->mbcl_mc_nwretry_cnt = c->mbcl_mc_nwretry_cnt;
1072		}
1073		statp = omb_stat;
1074		statsz = OMB_STAT_SIZE(NELEM(mbuf_table));
1075	} else {
1076		statp = mb_stat;
1077		statsz = MB_STAT_SIZE(NELEM(mbuf_table));
1078	}
1079
1080	lck_mtx_unlock(mbuf_mlock);
1081
1082	return (SYSCTL_OUT(req, statp, statsz));
1083}
1084
1085static int
1086mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS
1087{
1088#pragma unused(oidp, arg1, arg2)
1089	int i;
1090
1091	/* Ensure leak tracing turned on */
1092	if (!mclfindleak || !mclexpleak)
1093		return (ENXIO);
1094
1095	lck_mtx_lock(mleak_lock);
1096	mleak_update_stats();
1097	i = SYSCTL_OUT(req, mleak_stat, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES));
1098	lck_mtx_unlock(mleak_lock);
1099
1100	return (i);
1101}
1102
1103static int
1104mleak_table_sysctl SYSCTL_HANDLER_ARGS
1105{
1106#pragma unused(oidp, arg1, arg2)
1107	int i = 0;
1108
1109	/* Ensure leak tracing turned on */
1110	if (!mclfindleak || !mclexpleak)
1111		return (ENXIO);
1112
1113	lck_mtx_lock(mleak_lock);
1114	i = SYSCTL_OUT(req, &mleak_table, sizeof (mleak_table));
1115	lck_mtx_unlock(mleak_lock);
1116
1117	return (i);
1118}
1119
1120static inline void
1121m_incref(struct mbuf *m)
1122{
1123	UInt32 old, new;
1124	volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m);
1125
1126	do {
1127		old = *addr;
1128		new = old + 1;
1129		ASSERT(new != 0);
1130	} while (!OSCompareAndSwap(old, new, addr));
1131
1132	/*
1133	 * If cluster is shared, mark it with (sticky) EXTF_READONLY;
1134	 * we don't clear the flag when the refcount goes back to 1
1135	 * to simplify code calling m_mclhasreference().
1136	 */
1137	if (new > 1 && !(MEXT_FLAGS(m) & EXTF_READONLY))
1138		(void) OSBitOrAtomic(EXTF_READONLY, &MEXT_FLAGS(m));
1139}
1140
1141static inline u_int32_t
1142m_decref(struct mbuf *m)
1143{
1144	UInt32 old, new;
1145	volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m);
1146
1147	do {
1148		old = *addr;
1149		new = old - 1;
1150		ASSERT(old != 0);
1151	} while (!OSCompareAndSwap(old, new, addr));
1152
1153	return (new);
1154}
1155
1156static void
1157mbuf_table_init(void)
1158{
1159	unsigned int b, c, s;
1160	int m;
1161
1162	MALLOC(omb_stat, struct omb_stat *, OMB_STAT_SIZE(NELEM(mbuf_table)),
1163	    M_TEMP, M_WAITOK | M_ZERO);
1164	VERIFY(omb_stat != NULL);
1165
1166	MALLOC(mb_stat, mb_stat_t *, MB_STAT_SIZE(NELEM(mbuf_table)),
1167	    M_TEMP, M_WAITOK | M_ZERO);
1168	VERIFY(mb_stat != NULL);
1169
1170	mb_stat->mbs_cnt = NELEM(mbuf_table);
1171	for (m = 0; m < NELEM(mbuf_table); m++)
1172		mbuf_table[m].mtbl_stats = &mb_stat->mbs_class[m];
1173
1174#if CONFIG_MBUF_JUMBO
1175	/*
1176	 * Set aside 1/3 of the mbuf cluster map for jumbo clusters; we do
1177	 * this only on platforms where jumbo cluster pool is enabled.
1178	 */
1179	njcl = nmbclusters / 3;
1180	njclbytes = M16KCLBYTES;
1181#endif /* CONFIG_MBUF_JUMBO */
1182
1183	/*
1184	 * nclusters holds both the 2KB and 4KB pools, so ensure it's
1185	 * a multiple of 4KB clusters.
1186	 */
1187	nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPBG);
1188	if (njcl > 0) {
1189		/*
1190		 * Each jumbo cluster takes 8 2KB clusters, so make
1191		 * sure that the pool size is evenly divisible by 8;
1192		 * njcl is in 2KB unit, hence treated as such.
1193		 */
1194		njcl = P2ROUNDDOWN(nmbclusters - nclusters, 8);
1195
1196		/* Update nclusters with rounded down value of njcl */
1197		nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPBG);
1198	}
1199
1200	/*
1201	 * njcl is valid only on platforms with 16KB jumbo clusters, where
1202	 * it is configured to 1/3 of the pool size.  On these platforms,
1203	 * the remaining is used for 2KB and 4KB clusters.  On platforms
1204	 * without 16KB jumbo clusters, the entire pool is used for both
1205	 * 2KB and 4KB clusters.  A 4KB cluster can either be splitted into
1206	 * 16 mbufs, or into 2 2KB clusters.
1207	 *
1208	 *  +---+---+------------ ... -----------+------- ... -------+
1209	 *  | c | b |              s             |        njcl       |
1210	 *  +---+---+------------ ... -----------+------- ... -------+
1211	 *
1212	 * 1/32th of the shared region is reserved for pure 2KB and 4KB
1213	 * clusters (1/64th each.)
1214	 */
1215	c = P2ROUNDDOWN((nclusters >> 6), 2);		/* in 2KB unit */
1216	b = P2ROUNDDOWN((nclusters >> (6 + NCLPBGSHIFT)), 2); /* in 4KB unit */
1217	s = nclusters - (c + (b << NCLPBGSHIFT));	/* in 2KB unit */
1218
1219	/*
1220	 * 1/64th (c) is reserved for 2KB clusters.
1221	 */
1222	m_minlimit(MC_CL) = c;
1223	m_maxlimit(MC_CL) = s + c;			/* in 2KB unit */
1224	m_maxsize(MC_CL) = m_size(MC_CL) = MCLBYTES;
1225	(void) snprintf(m_cname(MC_CL), MAX_MBUF_CNAME, "cl");
1226
1227	/*
1228	 * Another 1/64th (b) of the map is reserved for 4KB clusters.
1229	 * It cannot be turned into 2KB clusters or mbufs.
1230	 */
1231	m_minlimit(MC_BIGCL) = b;
1232	m_maxlimit(MC_BIGCL) = (s >> NCLPBGSHIFT) + b;	/* in 4KB unit */
1233	m_maxsize(MC_BIGCL) = m_size(MC_BIGCL) = MBIGCLBYTES;
1234	(void) snprintf(m_cname(MC_BIGCL), MAX_MBUF_CNAME, "bigcl");
1235
1236	/*
1237	 * The remaining 31/32ths (s) are all-purpose (mbufs, 2KB, or 4KB)
1238	 */
1239	m_minlimit(MC_MBUF) = 0;
1240	m_maxlimit(MC_MBUF) = (s << NMBPCLSHIFT);	/* in mbuf unit */
1241	m_maxsize(MC_MBUF) = m_size(MC_MBUF) = MSIZE;
1242	(void) snprintf(m_cname(MC_MBUF), MAX_MBUF_CNAME, "mbuf");
1243
1244	/*
1245	 * Set limits for the composite classes.
1246	 */
1247	m_minlimit(MC_MBUF_CL) = 0;
1248	m_maxlimit(MC_MBUF_CL) = m_maxlimit(MC_CL);
1249	m_maxsize(MC_MBUF_CL) = MCLBYTES;
1250	m_size(MC_MBUF_CL) = m_size(MC_MBUF) + m_size(MC_CL);
1251	(void) snprintf(m_cname(MC_MBUF_CL), MAX_MBUF_CNAME, "mbuf_cl");
1252
1253	m_minlimit(MC_MBUF_BIGCL) = 0;
1254	m_maxlimit(MC_MBUF_BIGCL) = m_maxlimit(MC_BIGCL);
1255	m_maxsize(MC_MBUF_BIGCL) = MBIGCLBYTES;
1256	m_size(MC_MBUF_BIGCL) = m_size(MC_MBUF) + m_size(MC_BIGCL);
1257	(void) snprintf(m_cname(MC_MBUF_BIGCL), MAX_MBUF_CNAME, "mbuf_bigcl");
1258
1259	/*
1260	 * And for jumbo classes.
1261	 */
1262	m_minlimit(MC_16KCL) = 0;
1263	m_maxlimit(MC_16KCL) = (njcl >> NCLPJCLSHIFT);	/* in 16KB unit */
1264	m_maxsize(MC_16KCL) = m_size(MC_16KCL) = M16KCLBYTES;
1265	(void) snprintf(m_cname(MC_16KCL), MAX_MBUF_CNAME, "16kcl");
1266
1267	m_minlimit(MC_MBUF_16KCL) = 0;
1268	m_maxlimit(MC_MBUF_16KCL) = m_maxlimit(MC_16KCL);
1269	m_maxsize(MC_MBUF_16KCL) = M16KCLBYTES;
1270	m_size(MC_MBUF_16KCL) = m_size(MC_MBUF) + m_size(MC_16KCL);
1271	(void) snprintf(m_cname(MC_MBUF_16KCL), MAX_MBUF_CNAME, "mbuf_16kcl");
1272
1273	/*
1274	 * Initialize the legacy mbstat structure.
1275	 */
1276	bzero(&mbstat, sizeof (mbstat));
1277	mbstat.m_msize = m_maxsize(MC_MBUF);
1278	mbstat.m_mclbytes = m_maxsize(MC_CL);
1279	mbstat.m_minclsize = MINCLSIZE;
1280	mbstat.m_mlen = MLEN;
1281	mbstat.m_mhlen = MHLEN;
1282	mbstat.m_bigmclbytes = m_maxsize(MC_BIGCL);
1283}
1284
1285#if defined(__LP64__)
1286typedef struct ncl_tbl {
1287	uint64_t nt_maxmem;	/* memory (sane) size */
1288	uint32_t nt_mbpool;	/* mbuf pool size */
1289} ncl_tbl_t;
1290
1291/* Non-server */
1292static ncl_tbl_t ncl_table[] = {
1293	{ (1ULL << GBSHIFT)	  /*  1 GB */,	(64 << MBSHIFT)	 /*  64 MB */ },
1294	{ (1ULL << (GBSHIFT + 3)) /*  8 GB */,	(96 << MBSHIFT)	 /*  96 MB */ },
1295	{ (1ULL << (GBSHIFT + 4)) /* 16 GB */,	(128 << MBSHIFT) /* 128 MB */ },
1296	{ 0, 0 }
1297};
1298
1299/* Server */
1300static ncl_tbl_t ncl_table_srv[] = {
1301	{ (1ULL << GBSHIFT)	  /*  1 GB */,	(96 << MBSHIFT)  /*  96 MB */ },
1302	{ (1ULL << (GBSHIFT + 2)) /*  4 GB */,	(128 << MBSHIFT) /* 128 MB */ },
1303	{ (1ULL << (GBSHIFT + 3)) /*  8 GB */,	(160 << MBSHIFT) /* 160 MB */ },
1304	{ (1ULL << (GBSHIFT + 4)) /* 16 GB */,	(192 << MBSHIFT) /* 192 MB */ },
1305	{ (1ULL << (GBSHIFT + 5)) /* 32 GB */,	(256 << MBSHIFT) /* 256 MB */ },
1306	{ (1ULL << (GBSHIFT + 6)) /* 64 GB */,	(384 << MBSHIFT) /* 384 MB */ },
1307	{ 0, 0 }
1308};
1309#endif /* __LP64__ */
1310
1311__private_extern__ unsigned int
1312mbuf_default_ncl(int server, uint64_t mem)
1313{
1314#if !defined(__LP64__)
1315#pragma unused(server)
1316	unsigned int n;
1317	/*
1318	 * 32-bit kernel (default to 64MB of mbuf pool for >= 1GB RAM).
1319	 */
1320	if ((n = ((mem / 16) / MCLBYTES)) > 32768)
1321		n = 32768;
1322#else
1323	unsigned int n, i;
1324	ncl_tbl_t *tbl = (server ? ncl_table_srv : ncl_table);
1325	/*
1326	 * 64-bit kernel (mbuf pool size based on table).
1327	 */
1328	n = tbl[0].nt_mbpool;
1329	for (i = 0; tbl[i].nt_mbpool != 0; i++) {
1330		if (mem < tbl[i].nt_maxmem)
1331			break;
1332		n = tbl[i].nt_mbpool;
1333	}
1334	n >>= MCLSHIFT;
1335#endif /* !__LP64__ */
1336	return (n);
1337}
1338
1339__private_extern__ void
1340mbinit(void)
1341{
1342	unsigned int m;
1343	unsigned int initmcl = 0;
1344	void *buf;
1345	thread_t thread = THREAD_NULL;
1346
1347	microuptime(&mb_start);
1348
1349	/*
1350	 * These MBUF_ values must be equal to their private counterparts.
1351	 */
1352	_CASSERT(MBUF_EXT == M_EXT);
1353	_CASSERT(MBUF_PKTHDR == M_PKTHDR);
1354	_CASSERT(MBUF_EOR == M_EOR);
1355	_CASSERT(MBUF_LOOP == M_LOOP);
1356	_CASSERT(MBUF_BCAST == M_BCAST);
1357	_CASSERT(MBUF_MCAST == M_MCAST);
1358	_CASSERT(MBUF_FRAG == M_FRAG);
1359	_CASSERT(MBUF_FIRSTFRAG == M_FIRSTFRAG);
1360	_CASSERT(MBUF_LASTFRAG == M_LASTFRAG);
1361	_CASSERT(MBUF_PROMISC == M_PROMISC);
1362	_CASSERT(MBUF_HASFCS == M_HASFCS);
1363
1364	_CASSERT(MBUF_TYPE_FREE == MT_FREE);
1365	_CASSERT(MBUF_TYPE_DATA == MT_DATA);
1366	_CASSERT(MBUF_TYPE_HEADER == MT_HEADER);
1367	_CASSERT(MBUF_TYPE_SOCKET == MT_SOCKET);
1368	_CASSERT(MBUF_TYPE_PCB == MT_PCB);
1369	_CASSERT(MBUF_TYPE_RTABLE == MT_RTABLE);
1370	_CASSERT(MBUF_TYPE_HTABLE == MT_HTABLE);
1371	_CASSERT(MBUF_TYPE_ATABLE == MT_ATABLE);
1372	_CASSERT(MBUF_TYPE_SONAME == MT_SONAME);
1373	_CASSERT(MBUF_TYPE_SOOPTS == MT_SOOPTS);
1374	_CASSERT(MBUF_TYPE_FTABLE == MT_FTABLE);
1375	_CASSERT(MBUF_TYPE_RIGHTS == MT_RIGHTS);
1376	_CASSERT(MBUF_TYPE_IFADDR == MT_IFADDR);
1377	_CASSERT(MBUF_TYPE_CONTROL == MT_CONTROL);
1378	_CASSERT(MBUF_TYPE_OOBDATA == MT_OOBDATA);
1379
1380	_CASSERT(MBUF_TSO_IPV4 == CSUM_TSO_IPV4);
1381	_CASSERT(MBUF_TSO_IPV6 == CSUM_TSO_IPV6);
1382	_CASSERT(MBUF_CSUM_REQ_SUM16 == CSUM_PARTIAL);
1383	_CASSERT(MBUF_CSUM_TCP_SUM16 == MBUF_CSUM_REQ_SUM16);
1384	_CASSERT(MBUF_CSUM_REQ_IP == CSUM_IP);
1385	_CASSERT(MBUF_CSUM_REQ_TCP == CSUM_TCP);
1386	_CASSERT(MBUF_CSUM_REQ_UDP == CSUM_UDP);
1387	_CASSERT(MBUF_CSUM_REQ_TCPIPV6 == CSUM_TCPIPV6);
1388	_CASSERT(MBUF_CSUM_REQ_UDPIPV6 == CSUM_UDPIPV6);
1389	_CASSERT(MBUF_CSUM_DID_IP == CSUM_IP_CHECKED);
1390	_CASSERT(MBUF_CSUM_IP_GOOD == CSUM_IP_VALID);
1391	_CASSERT(MBUF_CSUM_DID_DATA == CSUM_DATA_VALID);
1392	_CASSERT(MBUF_CSUM_PSEUDO_HDR == CSUM_PSEUDO_HDR);
1393
1394	_CASSERT(MBUF_WAITOK == M_WAIT);
1395	_CASSERT(MBUF_DONTWAIT == M_DONTWAIT);
1396	_CASSERT(MBUF_COPYALL == M_COPYALL);
1397
1398	_CASSERT(MBUF_SC2TC(MBUF_SC_BK_SYS) == MBUF_TC_BK);
1399	_CASSERT(MBUF_SC2TC(MBUF_SC_BK) == MBUF_TC_BK);
1400	_CASSERT(MBUF_SC2TC(MBUF_SC_BE) == MBUF_TC_BE);
1401	_CASSERT(MBUF_SC2TC(MBUF_SC_RD) == MBUF_TC_BE);
1402	_CASSERT(MBUF_SC2TC(MBUF_SC_OAM) == MBUF_TC_BE);
1403	_CASSERT(MBUF_SC2TC(MBUF_SC_AV) == MBUF_TC_VI);
1404	_CASSERT(MBUF_SC2TC(MBUF_SC_RV) == MBUF_TC_VI);
1405	_CASSERT(MBUF_SC2TC(MBUF_SC_VI) == MBUF_TC_VI);
1406	_CASSERT(MBUF_SC2TC(MBUF_SC_VO) == MBUF_TC_VO);
1407	_CASSERT(MBUF_SC2TC(MBUF_SC_CTL) == MBUF_TC_VO);
1408
1409	_CASSERT(MBUF_TC2SCVAL(MBUF_TC_BK) == SCVAL_BK);
1410	_CASSERT(MBUF_TC2SCVAL(MBUF_TC_BE) == SCVAL_BE);
1411	_CASSERT(MBUF_TC2SCVAL(MBUF_TC_VI) == SCVAL_VI);
1412	_CASSERT(MBUF_TC2SCVAL(MBUF_TC_VO) == SCVAL_VO);
1413
1414	/* Module specific scratch space (32-bit alignment requirement) */
1415	_CASSERT(!(offsetof(struct mbuf, m_pkthdr.pkt_mpriv) %
1416	    sizeof (uint32_t)));
1417
1418	/* Initialize random red zone cookie value */
1419	_CASSERT(sizeof (mb_redzone_cookie) ==
1420	    sizeof (((struct pkthdr *)0)->redzone));
1421	read_random(&mb_redzone_cookie, sizeof (mb_redzone_cookie));
1422
1423	/* Make sure we don't save more than we should */
1424	_CASSERT(MCA_SAVED_MBUF_SIZE <= sizeof (struct mbuf));
1425
1426	if (nmbclusters == 0)
1427		nmbclusters = NMBCLUSTERS;
1428
1429	/* This should be a sane (at least even) value by now */
1430	VERIFY(nmbclusters != 0 && !(nmbclusters & 0x1));
1431
1432	/* Setup the mbuf table */
1433	mbuf_table_init();
1434
1435	/* Global lock for common layer */
1436	mbuf_mlock_grp_attr = lck_grp_attr_alloc_init();
1437	mbuf_mlock_grp = lck_grp_alloc_init("mbuf", mbuf_mlock_grp_attr);
1438	mbuf_mlock_attr = lck_attr_alloc_init();
1439	lck_mtx_init(mbuf_mlock, mbuf_mlock_grp, mbuf_mlock_attr);
1440
1441	/*
1442	 * Allocate cluster slabs table:
1443	 *
1444	 *	maxslabgrp = (N * 2048) / (1024 * 1024)
1445	 *
1446	 * Where N is nmbclusters rounded up to the nearest 512.  This yields
1447	 * mcl_slab_g_t units, each one representing a MB of memory.
1448	 */
1449	maxslabgrp =
1450	    (P2ROUNDUP(nmbclusters, (MBSIZE >> 11)) << MCLSHIFT) >> MBSHIFT;
1451	MALLOC(slabstbl, mcl_slabg_t **, maxslabgrp * sizeof (mcl_slabg_t *),
1452	    M_TEMP, M_WAITOK | M_ZERO);
1453	VERIFY(slabstbl != NULL);
1454
1455	/*
1456	 * Allocate audit structures, if needed:
1457	 *
1458	 *	maxclaudit = (maxslabgrp * 1024 * 1024) / 4096
1459	 *
1460	 * This yields mcl_audit_t units, each one representing a page.
1461	 */
1462	PE_parse_boot_argn("mbuf_debug", &mbuf_debug, sizeof (mbuf_debug));
1463	mbuf_debug |= mcache_getflags();
1464	if (mbuf_debug & MCF_DEBUG) {
1465		maxclaudit = ((maxslabgrp << MBSHIFT) >> PGSHIFT);
1466		MALLOC(mclaudit, mcl_audit_t *, maxclaudit * sizeof (*mclaudit),
1467		    M_TEMP, M_WAITOK | M_ZERO);
1468		VERIFY(mclaudit != NULL);
1469
1470		mcl_audit_con_cache = mcache_create("mcl_audit_contents",
1471		    AUDIT_CONTENTS_SIZE, sizeof (u_int64_t), 0, MCR_SLEEP);
1472		VERIFY(mcl_audit_con_cache != NULL);
1473	}
1474	mclverify = (mbuf_debug & MCF_VERIFY);
1475	mcltrace = (mbuf_debug & MCF_TRACE);
1476	mclfindleak = !(mbuf_debug & MCF_NOLEAKLOG);
1477	mclexpleak = mclfindleak && (mbuf_debug & MCF_EXPLEAKLOG);
1478
1479	/* Enable mbuf leak logging, with a lock to protect the tables */
1480
1481	mleak_lock_grp_attr = lck_grp_attr_alloc_init();
1482	mleak_lock_grp = lck_grp_alloc_init("mleak_lock", mleak_lock_grp_attr);
1483	mleak_lock_attr = lck_attr_alloc_init();
1484	lck_mtx_init(mleak_lock, mleak_lock_grp, mleak_lock_attr);
1485
1486	mleak_activate();
1487
1488	/* Calculate the number of pages assigned to the cluster pool */
1489	mcl_pages = (nmbclusters * MCLBYTES) / CLBYTES;
1490	MALLOC(mcl_paddr, ppnum_t *, mcl_pages * sizeof (ppnum_t),
1491	    M_TEMP, M_WAITOK);
1492	VERIFY(mcl_paddr != NULL);
1493
1494	/* Register with the I/O Bus mapper */
1495	mcl_paddr_base = IOMapperIOVMAlloc(mcl_pages);
1496	bzero((char *)mcl_paddr, mcl_pages * sizeof (ppnum_t));
1497
1498	embutl = (union mbigcluster *)
1499	    ((void *)((unsigned char *)mbutl + (nmbclusters * MCLBYTES)));
1500	VERIFY((((char *)embutl - (char *)mbutl) % MBIGCLBYTES) == 0);
1501
1502	/* Prime up the freelist */
1503	PE_parse_boot_argn("initmcl", &initmcl, sizeof (initmcl));
1504	if (initmcl != 0) {
1505		initmcl >>= NCLPBGSHIFT;	/* become a 4K unit */
1506		if (initmcl > m_maxlimit(MC_BIGCL))
1507			initmcl = m_maxlimit(MC_BIGCL);
1508	}
1509	if (initmcl < m_minlimit(MC_BIGCL))
1510		initmcl = m_minlimit(MC_BIGCL);
1511
1512	lck_mtx_lock(mbuf_mlock);
1513
1514	/*
1515	 * For classes with non-zero minimum limits, populate their freelists
1516	 * so that m_total(class) is at least m_minlimit(class).
1517	 */
1518	VERIFY(m_total(MC_BIGCL) == 0 && m_minlimit(MC_BIGCL) != 0);
1519	freelist_populate(m_class(MC_BIGCL), initmcl, M_WAIT);
1520	VERIFY(m_total(MC_BIGCL) >= m_minlimit(MC_BIGCL));
1521	freelist_init(m_class(MC_CL));
1522
1523	for (m = 0; m < NELEM(mbuf_table); m++) {
1524		/* Make sure we didn't miss any */
1525		VERIFY(m_minlimit(m_class(m)) == 0 ||
1526		    m_total(m_class(m)) >= m_minlimit(m_class(m)));
1527	}
1528
1529	lck_mtx_unlock(mbuf_mlock);
1530
1531	(void) kernel_thread_start((thread_continue_t)mbuf_worker_thread_init,
1532	    NULL, &thread);
1533	thread_deallocate(thread);
1534
1535	ref_cache = mcache_create("mext_ref", sizeof (struct ext_ref),
1536	    0, 0, MCR_SLEEP);
1537
1538	/* Create the cache for each class */
1539	for (m = 0; m < NELEM(mbuf_table); m++) {
1540		void *allocfunc, *freefunc, *auditfunc, *logfunc;
1541		u_int32_t flags;
1542
1543		flags = mbuf_debug;
1544		if (m_class(m) == MC_MBUF_CL || m_class(m) == MC_MBUF_BIGCL ||
1545		    m_class(m) == MC_MBUF_16KCL) {
1546			allocfunc = mbuf_cslab_alloc;
1547			freefunc = mbuf_cslab_free;
1548			auditfunc = mbuf_cslab_audit;
1549			logfunc = mleak_logger;
1550		} else {
1551			allocfunc = mbuf_slab_alloc;
1552			freefunc = mbuf_slab_free;
1553			auditfunc = mbuf_slab_audit;
1554			logfunc = mleak_logger;
1555		}
1556
1557		/*
1558		 * Disable per-CPU caches for jumbo classes if there
1559		 * is no jumbo cluster pool available in the system.
1560		 * The cache itself is still created (but will never
1561		 * be populated) since it simplifies the code.
1562		 */
1563		if ((m_class(m) == MC_MBUF_16KCL || m_class(m) == MC_16KCL) &&
1564		    njcl == 0)
1565			flags |= MCF_NOCPUCACHE;
1566
1567		if (!mclfindleak)
1568			flags |= MCF_NOLEAKLOG;
1569
1570		m_cache(m) = mcache_create_ext(m_cname(m), m_maxsize(m),
1571		    allocfunc, freefunc, auditfunc, logfunc, mbuf_slab_notify,
1572		    (void *)(uintptr_t)m, flags, MCR_SLEEP);
1573	}
1574
1575	/*
1576	 * Allocate structure for per-CPU statistics that's aligned
1577	 * on the CPU cache boundary; this code assumes that we never
1578	 * uninitialize this framework, since the original address
1579	 * before alignment is not saved.
1580	 */
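	/*
	 * (The extra CPU_CACHE_LINE_SIZE bytes requested below leave room
	 * to round the returned pointer up to the next cache-line boundary.)
	 */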
1581	ncpu = ml_get_max_cpus();
1582	MALLOC(buf, void *, MBUF_MTYPES_SIZE(ncpu) + CPU_CACHE_LINE_SIZE,
1583	    M_TEMP, M_WAITOK);
1584	VERIFY(buf != NULL);
1585
1586	mbuf_mtypes = (mbuf_mtypes_t *)P2ROUNDUP((intptr_t)buf,
1587	    CPU_CACHE_LINE_SIZE);
1588	bzero(mbuf_mtypes, MBUF_MTYPES_SIZE(ncpu));
1589
	/*
	 * Set the max limit on sb_max to be 1/16th of the size of
	 * memory allocated for mbuf clusters.
	 */
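	/*
	 * For example, a 64 MB cluster pool (32768 clusters, assuming 2 KB
	 * clusters) yields a 4 MB ceiling for sb_max.
	 */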
1594	high_sb_max = (nmbclusters << (MCLSHIFT - 4));
1595	if (high_sb_max < sb_max) {
1596		/* sb_max is too large for this configuration, scale it down */
1597		if (high_sb_max > (1 << MBSHIFT)) {
			/* We have at least 16 MB of mbuf pool */
1599			sb_max = high_sb_max;
1600		} else if ((nmbclusters << MCLSHIFT) > (1 << MBSHIFT)) {
			/*
			 * If we have more than 1 MB of mbuf pool, cap the
			 * maximum socket buffer size at 1 MB.
			 */
1605			sb_max = high_sb_max = (1 << MBSHIFT);
1606		} else {
1607			sb_max = high_sb_max;
1608		}
1609	}
1610
1611	/* allocate space for mbuf_dump_buf */
1612	MALLOC(mbuf_dump_buf, char *, MBUF_DUMP_BUF_SIZE, M_TEMP, M_WAITOK);
1613	VERIFY(mbuf_dump_buf != NULL);
1614
1615	if (mbuf_debug & MCF_DEBUG) {
1616		printf("%s: MLEN %d, MHLEN %d\n", __func__,
1617		    (int)_MLEN, (int)_MHLEN);
1618	}
1619
1620	printf("%s: done [%d MB total pool size, (%d/%d) split]\n", __func__,
1621	    (nmbclusters << MCLSHIFT) >> MBSHIFT,
1622	    (nclusters << MCLSHIFT) >> MBSHIFT,
1623	    (njcl << MCLSHIFT) >> MBSHIFT);
1624}
1625
1626/*
1627 * Obtain a slab of object(s) from the class's freelist.
1628 */
1629static mcache_obj_t *
1630slab_alloc(mbuf_class_t class, int wait)
1631{
1632	mcl_slab_t *sp;
1633	mcache_obj_t *buf;
1634
1635	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1636
1637	VERIFY(class != MC_16KCL || njcl > 0);
1638
1639	/* This should always be NULL for us */
1640	VERIFY(m_cobjlist(class) == NULL);
1641
	/*
	 * Treat composite objects as having a longer lifespan by taking
	 * a slab from the reverse direction, in the hope that this will
	 * reduce the probability of fragmentation for slabs that hold
	 * more than one buffer chunk (e.g. mbuf slabs).  For other
	 * slabs, this probably doesn't make much of a difference.
	 */
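	/*
	 * (MCR_COMP is set when the request originates from one of the
	 * composite caches; see mbuf_cslab_alloc() below, which tags its
	 * fallback allocations from the rudimentary caches with this flag.)
	 */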
1649	if ((class == MC_MBUF || class == MC_CL) && (wait & MCR_COMP))
1650		sp = (mcl_slab_t *)TAILQ_LAST(&m_slablist(class), mcl_slhead);
1651	else
1652		sp = (mcl_slab_t *)TAILQ_FIRST(&m_slablist(class));
1653
1654	if (sp == NULL) {
1655		VERIFY(m_infree(class) == 0 && m_slab_cnt(class) == 0);
1656		/* The slab list for this class is empty */
1657		return (NULL);
1658	}
1659
1660	VERIFY(m_infree(class) > 0);
1661	VERIFY(!slab_is_detached(sp));
1662	VERIFY(sp->sl_class == class &&
1663	    (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
1664	buf = sp->sl_head;
1665	VERIFY(slab_inrange(sp, buf) && sp == slab_get(buf));
1666
1667	if (class == MC_MBUF) {
1668		sp->sl_head = buf->obj_next;
1669		VERIFY(sp->sl_head != NULL || sp->sl_refcnt == (NMBPBG - 1));
1670	} else if (class == MC_CL) {
1671		sp->sl_head = buf->obj_next;
1672		VERIFY(sp->sl_head != NULL || sp->sl_refcnt == (NCLPBG - 1));
1673	} else {
1674		sp->sl_head = NULL;
1675	}
1676	if (sp->sl_head != NULL && !slab_inrange(sp, sp->sl_head)) {
1677		slab_nextptr_panic(sp, sp->sl_head);
1678		/* In case sl_head is in the map but not in the slab */
1679		VERIFY(slab_inrange(sp, sp->sl_head));
1680		/* NOTREACHED */
1681	}
1682
1683	/* Increment slab reference */
1684	sp->sl_refcnt++;
1685
1686	if (mclaudit != NULL) {
1687		mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
1688		mca->mca_uflags = 0;
1689		/* Save contents on mbuf objects only */
1690		if (class == MC_MBUF)
1691			mca->mca_uflags |= MB_SCVALID;
1692	}
1693
1694	if (class == MC_CL) {
1695		mbstat.m_clfree = (--m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
1696		/*
1697		 * A 2K cluster slab can have at most NCLPBG references.
1698		 */
1699		VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NCLPBG &&
1700		    sp->sl_chunks == NCLPBG &&
1701		    sp->sl_len == m_maxsize(MC_BIGCL));
1702		VERIFY(sp->sl_refcnt < NCLPBG || sp->sl_head == NULL);
1703	} else if (class == MC_BIGCL) {
1704		mbstat.m_bigclfree = (--m_infree(MC_BIGCL)) +
1705		    m_infree(MC_MBUF_BIGCL);
1706		/*
1707		 * A 4K cluster slab can have at most 1 reference.
1708		 */
1709		VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
1710		    sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1711	} else if (class == MC_16KCL) {
1712		mcl_slab_t *nsp;
1713		int k;
1714
1715		--m_infree(MC_16KCL);
1716		VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
1717		    sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1718		/*
1719		 * Increment 2nd-Nth slab reference, where N is NSLABSP16KB.
1720		 * A 16KB big cluster takes NSLABSP16KB slabs, each having at
1721		 * most 1 reference.
1722		 */
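		/*
		 * (Assuming 4 KB slabs, NSLABSP16KB = 16 KB / 4 KB = 4, so
		 * this walks the three trailing slabs of the 16 KB cluster.)
		 */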
1723		for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
1724			nsp = nsp->sl_next;
1725			/* Next slab must already be present */
1726			VERIFY(nsp != NULL);
1727			nsp->sl_refcnt++;
1728			VERIFY(!slab_is_detached(nsp));
1729			VERIFY(nsp->sl_class == MC_16KCL &&
1730			    nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) &&
1731			    nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 &&
1732			    nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1733			    nsp->sl_head == NULL);
1734		}
1735	} else {
1736		VERIFY(class == MC_MBUF);
1737		--m_infree(MC_MBUF);
1738		/*
1739		 * If auditing is turned on, this check is
1740		 * deferred until later in mbuf_slab_audit().
1741		 */
1742		if (mclaudit == NULL)
1743			_MCHECK((struct mbuf *)buf);
1744		/*
1745		 * Since we have incremented the reference count above,
1746		 * an mbuf slab (formerly a 4KB cluster slab that was cut
1747		 * up into mbufs) must have a reference count between 1
1748		 * and NMBPBG at this point.
1749		 */
1750		VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NMBPBG &&
1751		    sp->sl_chunks == NMBPBG &&
1752		    sp->sl_len == m_maxsize(MC_BIGCL));
1753		VERIFY(sp->sl_refcnt < NMBPBG || sp->sl_head == NULL);
1754	}
1755
1756	/* If empty, remove this slab from the class's freelist */
1757	if (sp->sl_head == NULL) {
1758		VERIFY(class != MC_MBUF || sp->sl_refcnt == NMBPBG);
1759		VERIFY(class != MC_CL || sp->sl_refcnt == NCLPBG);
1760		slab_remove(sp, class);
1761	}
1762
1763	return (buf);
1764}
1765
1766/*
1767 * Place a slab of object(s) back into a class's slab list.
1768 */
1769static void
1770slab_free(mbuf_class_t class, mcache_obj_t *buf)
1771{
1772	mcl_slab_t *sp;
1773
1774	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1775
1776	VERIFY(class != MC_16KCL || njcl > 0);
1777	VERIFY(buf->obj_next == NULL);
1778	sp = slab_get(buf);
1779	VERIFY(sp->sl_class == class && slab_inrange(sp, buf) &&
1780	    (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
1781
1782	/* Decrement slab reference */
1783	sp->sl_refcnt--;
1784
1785	if (class == MC_CL) {
1786		VERIFY(IS_P2ALIGNED(buf, MCLBYTES));
		/*
		 * A slab that has been split into 2KB clusters can have
		 * at most 1 outstanding reference at this point.
		 */
1791		VERIFY(sp->sl_refcnt >= 0 && sp->sl_refcnt <= (NCLPBG - 1) &&
1792		    sp->sl_chunks == NCLPBG &&
1793		    sp->sl_len == m_maxsize(MC_BIGCL));
1794		VERIFY(sp->sl_refcnt < (NCLPBG - 1) ||
1795		    (slab_is_detached(sp) && sp->sl_head == NULL));
1796	} else if (class == MC_BIGCL) {
1797		VERIFY(IS_P2ALIGNED(buf, MCLBYTES));
		/*
		 * A 4KB cluster slab can have at most 1 reference,
		 * which must now be 0.
		 */
1802		VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
1803		    sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1804		VERIFY(slab_is_detached(sp));
1805	} else if (class == MC_16KCL) {
1806		mcl_slab_t *nsp;
1807		int k;
		/*
		 * A 16KB cluster takes NSLABSP16KB slabs, all of which
		 * must now have a reference count of 0.
		 */
1812		VERIFY(IS_P2ALIGNED(buf, MBIGCLBYTES));
1813		VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
1814		    sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1815		VERIFY(slab_is_detached(sp));
1816		for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
1817			nsp = nsp->sl_next;
1818			/* Next slab must already be present */
1819			VERIFY(nsp != NULL);
1820			nsp->sl_refcnt--;
1821			VERIFY(slab_is_detached(nsp));
1822			VERIFY(nsp->sl_class == MC_16KCL &&
1823			    (nsp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) &&
1824			    nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 &&
1825			    nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1826			    nsp->sl_head == NULL);
1827		}
1828	} else {
		/*
		 * A slab that has been split into mbufs has at most NMBPBG
		 * references.  Since we have decremented one reference
		 * above, the count must now be between 0 and NMBPBG-1.
		 */
1834		VERIFY(class == MC_MBUF);
1835		VERIFY(sp->sl_refcnt >= 0 && sp->sl_refcnt <= (NMBPBG - 1) &&
1836		    sp->sl_chunks == NMBPBG &&
1837		    sp->sl_len == m_maxsize(MC_BIGCL));
1838		VERIFY(sp->sl_refcnt < (NMBPBG - 1) ||
1839		    (slab_is_detached(sp) && sp->sl_head == NULL));
1840	}
1841
1842	/*
1843	 * When auditing is enabled, ensure that the buffer still
1844	 * contains the free pattern.  Otherwise it got corrupted
1845	 * while at the CPU cache layer.
1846	 */
1847	if (mclaudit != NULL) {
1848		mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
1849		if (mclverify) {
1850			mcache_audit_free_verify(mca, buf, 0, m_maxsize(class));
1851		}
1852		mca->mca_uflags &= ~MB_SCVALID;
1853	}
1854
1855	if (class == MC_CL) {
1856		mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
1857		buf->obj_next = sp->sl_head;
1858	} else if (class == MC_BIGCL) {
1859		mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
1860		    m_infree(MC_MBUF_BIGCL);
1861	} else if (class == MC_16KCL) {
1862		++m_infree(MC_16KCL);
1863	} else {
1864		++m_infree(MC_MBUF);
1865		buf->obj_next = sp->sl_head;
1866	}
1867	sp->sl_head = buf;
1868
	/*
	 * If a slab that was split into either 2KB clusters or mbufs has
	 * become completely unreferenced, and the class totals permit it,
	 * turn it back into a slab holding a single 4KB cluster.
	 */
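	/*
	 * (Assuming 256-byte mbufs and 4 KB big clusters, this coalesces
	 * NMBPBG = 16 mbufs or NCLPBG = 2 clusters back into one slab.)
	 */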
1874	if (class == MC_MBUF && sp->sl_refcnt == 0 &&
1875	    m_total(class) > m_minlimit(class) &&
1876	    m_total(MC_BIGCL) < m_maxlimit(MC_BIGCL)) {
1877		int i = NMBPBG;
1878
1879		m_total(MC_BIGCL)++;
1880		mbstat.m_bigclusters = m_total(MC_BIGCL);
1881		m_total(MC_MBUF) -= NMBPBG;
1882		mbstat.m_mbufs = m_total(MC_MBUF);
1883		m_infree(MC_MBUF) -= NMBPBG;
1884		mtype_stat_add(MT_FREE, -((unsigned)NMBPBG));
1885
1886		VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
1887		VERIFY(m_total(MC_MBUF) >= m_minlimit(MC_MBUF));
1888
1889		while (i--) {
1890			struct mbuf *m = sp->sl_head;
1891			VERIFY(m != NULL);
1892			sp->sl_head = m->m_next;
1893			m->m_next = NULL;
1894		}
1895		VERIFY(sp->sl_head == NULL);
1896
1897		/* Remove the slab from the mbuf class's slab list */
1898		slab_remove(sp, class);
1899
1900		/* Reinitialize it as a 4KB cluster slab */
1901		slab_init(sp, MC_BIGCL, sp->sl_flags, sp->sl_base, sp->sl_base,
1902		    sp->sl_len, 0, 1);
1903
1904		if (mclverify) {
1905			mcache_set_pattern(MCACHE_FREE_PATTERN,
1906			    (caddr_t)sp->sl_head, m_maxsize(MC_BIGCL));
1907		}
1908		mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
1909		    m_infree(MC_MBUF_BIGCL);
1910
1911		VERIFY(slab_is_detached(sp));
1912		/* And finally switch class */
1913		class = MC_BIGCL;
1914	} else if (class == MC_CL && sp->sl_refcnt == 0 &&
1915	    m_total(class) > m_minlimit(class) &&
1916	    m_total(MC_BIGCL) < m_maxlimit(MC_BIGCL)) {
1917		int i = NCLPBG;
1918
1919		m_total(MC_BIGCL)++;
1920		mbstat.m_bigclusters = m_total(MC_BIGCL);
1921		m_total(MC_CL) -= NCLPBG;
1922		mbstat.m_clusters = m_total(MC_CL);
1923		m_infree(MC_CL) -= NCLPBG;
1924		VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
1925		VERIFY(m_total(MC_CL) >= m_minlimit(MC_CL));
1926
1927		while (i--) {
1928			union mcluster *c = sp->sl_head;
1929			VERIFY(c != NULL);
1930			sp->sl_head = c->mcl_next;
1931			c->mcl_next = NULL;
1932		}
1933		VERIFY(sp->sl_head == NULL);
1934
1935		/* Remove the slab from the 2KB cluster class's slab list */
1936		slab_remove(sp, class);
1937
1938		/* Reinitialize it as a 4KB cluster slab */
1939		slab_init(sp, MC_BIGCL, sp->sl_flags, sp->sl_base, sp->sl_base,
1940		    sp->sl_len, 0, 1);
1941
1942		if (mclverify) {
1943			mcache_set_pattern(MCACHE_FREE_PATTERN,
1944			    (caddr_t)sp->sl_head, m_maxsize(MC_BIGCL));
1945		}
1946		mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
1947		    m_infree(MC_MBUF_BIGCL);
1948
1949		VERIFY(slab_is_detached(sp));
1950		/* And finally switch class */
1951		class = MC_BIGCL;
1952	}
1953
1954	/* Reinsert the slab to the class's slab list */
1955	if (slab_is_detached(sp))
1956		slab_insert(sp, class);
1957}
1958
1959/*
1960 * Common allocator for rudimentary objects called by the CPU cache layer
1961 * during an allocation request whenever there is no available element in the
1962 * bucket layer.  It returns one or more elements from the appropriate global
1963 * freelist.  If the freelist is empty, it will attempt to populate it and
1964 * retry the allocation.
1965 */
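/*
 * (This is the allocation callback registered for the rudimentary classes
 * via mcache_create_ext() during initialization above.)
 */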
1966static unsigned int
1967mbuf_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait)
1968{
1969	mbuf_class_t class = (mbuf_class_t)arg;
1970	unsigned int need = num;
1971	mcache_obj_t **list = *plist;
1972
1973	ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
1974	ASSERT(need > 0);
1975
1976	lck_mtx_lock(mbuf_mlock);
1977
1978	for (;;) {
1979		if ((*list = slab_alloc(class, wait)) != NULL) {
1980			(*list)->obj_next = NULL;
1981			list = *plist = &(*list)->obj_next;
1982
1983			if (--need == 0) {
				/*
				 * If the number of elements in the freelist
				 * has dropped below the low watermark (1/32
				 * of the class total), asynchronously
				 * populate the freelist now rather than
				 * doing it later when we run out of elements.
				 */
1990				if (!mbuf_cached_above(class, wait) &&
1991				    m_infree(class) < m_total(class) >> 5) {
1992					(void) freelist_populate(class, 1,
1993					    M_DONTWAIT);
1994				}
1995				break;
1996			}
1997		} else {
1998			VERIFY(m_infree(class) == 0 || class == MC_CL);
1999
2000			(void) freelist_populate(class, 1,
2001			    (wait & MCR_NOSLEEP) ? M_DONTWAIT : M_WAIT);
2002
2003			if (m_infree(class) > 0)
2004				continue;
2005
2006			/* Check if there's anything at the cache layer */
2007			if (mbuf_cached_above(class, wait))
2008				break;
2009
2010			/* watchdog checkpoint */
2011			mbuf_watchdog();
2012
2013			/* We have nothing and cannot block; give up */
2014			if (wait & MCR_NOSLEEP) {
2015				if (!(wait & MCR_TRYHARD)) {
2016					m_fail_cnt(class)++;
2017					mbstat.m_drops++;
2018					break;
2019				}
2020			}
2021
2022			/*
2023			 * If the freelist is still empty and the caller is
2024			 * willing to be blocked, sleep on the wait channel
2025			 * until an element is available.  Otherwise, if
2026			 * MCR_TRYHARD is set, do our best to satisfy the
2027			 * request without having to go to sleep.
2028			 */
2029			if (mbuf_worker_ready &&
2030			    mbuf_sleep(class, need, wait))
2031				break;
2032
2033			lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2034		}
2035	}
2036
2037	m_alloc_cnt(class) += num - need;
2038	lck_mtx_unlock(mbuf_mlock);
2039
2040	return (num - need);
2041}
2042
2043/*
2044 * Common de-allocator for rudimentary objects called by the CPU cache
2045 * layer when one or more elements need to be returned to the appropriate
2046 * global freelist.
2047 */
2048static void
2049mbuf_slab_free(void *arg, mcache_obj_t *list, __unused int purged)
2050{
2051	mbuf_class_t class = (mbuf_class_t)arg;
2052	mcache_obj_t *nlist;
2053	unsigned int num = 0;
2054	int w;
2055
2056	ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
2057
2058	lck_mtx_lock(mbuf_mlock);
2059
2060	for (;;) {
2061		nlist = list->obj_next;
2062		list->obj_next = NULL;
2063		slab_free(class, list);
2064		++num;
2065		if ((list = nlist) == NULL)
2066			break;
2067	}
2068	m_free_cnt(class) += num;
2069
2070	if ((w = mb_waiters) > 0)
2071		mb_waiters = 0;
2072
2073	lck_mtx_unlock(mbuf_mlock);
2074
2075	if (w != 0)
2076		wakeup(mb_waitchan);
2077}
2078
2079/*
2080 * Common auditor for rudimentary objects called by the CPU cache layer
2081 * during an allocation or free request.  For the former, this is called
2082 * after the objects are obtained from either the bucket or slab layer
2083 * and before they are returned to the caller.  For the latter, this is
2084 * called immediately during free and before placing the objects into
2085 * the bucket or slab layer.
2086 */
2087static void
2088mbuf_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
2089{
2090	mbuf_class_t class = (mbuf_class_t)arg;
2091	mcache_audit_t *mca;
2092
2093	ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
2094
2095	while (list != NULL) {
2096		lck_mtx_lock(mbuf_mlock);
2097		mca = mcl_audit_buf2mca(class, list);
2098
2099		/* Do the sanity checks */
2100		if (class == MC_MBUF) {
2101			mcl_audit_mbuf(mca, list, FALSE, alloc);
2102			ASSERT(mca->mca_uflags & MB_SCVALID);
2103		} else {
2104			mcl_audit_cluster(mca, list, m_maxsize(class),
2105			    alloc, TRUE);
2106			ASSERT(!(mca->mca_uflags & MB_SCVALID));
2107		}
2108		/* Record this transaction */
2109		if (mcltrace)
2110			mcache_buffer_log(mca, list, m_cache(class), &mb_start);
2111
2112		if (alloc)
2113			mca->mca_uflags |= MB_INUSE;
2114		else
2115			mca->mca_uflags &= ~MB_INUSE;
2116		/* Unpair the object (unconditionally) */
2117		mca->mca_uptr = NULL;
2118		lck_mtx_unlock(mbuf_mlock);
2119
2120		list = list->obj_next;
2121	}
2122}
2123
2124/*
2125 * Common notify routine for all caches.  It is called by mcache when
2126 * one or more objects get freed.  We use this indication to trigger
2127 * the wakeup of any sleeping threads so that they can retry their
2128 * allocation requests.
2129 */
2130static void
2131mbuf_slab_notify(void *arg, u_int32_t reason)
2132{
2133	mbuf_class_t class = (mbuf_class_t)arg;
2134	int w;
2135
2136	ASSERT(MBUF_CLASS_VALID(class));
2137
2138	if (reason != MCN_RETRYALLOC)
2139		return;
2140
2141	lck_mtx_lock(mbuf_mlock);
2142	if ((w = mb_waiters) > 0) {
2143		m_notified(class)++;
2144		mb_waiters = 0;
2145	}
2146	lck_mtx_unlock(mbuf_mlock);
2147
2148	if (w != 0)
2149		wakeup(mb_waitchan);
2150}
2151
2152/*
2153 * Obtain object(s) from the composite class's freelist.
2154 */
2155static unsigned int
2156cslab_alloc(mbuf_class_t class, mcache_obj_t ***plist, unsigned int num)
2157{
2158	unsigned int need = num;
2159	mcl_slab_t *sp, *clsp, *nsp;
2160	struct mbuf *m;
2161	mcache_obj_t **list = *plist;
2162	void *cl;
2163
2164	VERIFY(need > 0);
2165	VERIFY(class != MC_MBUF_16KCL || njcl > 0);
2166	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2167
2168	/* Get what we can from the freelist */
2169	while ((*list = m_cobjlist(class)) != NULL) {
2170		MRANGE(*list);
2171
2172		m = (struct mbuf *)*list;
2173		sp = slab_get(m);
2174		cl = m->m_ext.ext_buf;
2175		clsp = slab_get(cl);
2176		VERIFY(m->m_flags == M_EXT && cl != NULL);
2177		VERIFY(MEXT_RFA(m) != NULL && MBUF_IS_COMPOSITE(m));
2178
2179		if (class == MC_MBUF_CL) {
2180			VERIFY(clsp->sl_refcnt >= 1 &&
2181			    clsp->sl_refcnt <= NCLPBG);
2182		} else {
2183			VERIFY(clsp->sl_refcnt == 1);
2184		}
2185
2186		if (class == MC_MBUF_16KCL) {
2187			int k;
2188			for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2189				nsp = nsp->sl_next;
2190				/* Next slab must already be present */
2191				VERIFY(nsp != NULL);
2192				VERIFY(nsp->sl_refcnt == 1);
2193			}
2194		}
2195
2196		if ((m_cobjlist(class) = (*list)->obj_next) != NULL &&
2197		    !MBUF_IN_MAP(m_cobjlist(class))) {
2198			slab_nextptr_panic(sp, m_cobjlist(class));
2199			/* NOTREACHED */
2200		}
2201		(*list)->obj_next = NULL;
2202		list = *plist = &(*list)->obj_next;
2203
2204		if (--need == 0)
2205			break;
2206	}
2207	m_infree(class) -= (num - need);
2208
2209	return (num - need);
2210}
2211
2212/*
2213 * Place object(s) back into a composite class's freelist.
2214 */
2215static unsigned int
2216cslab_free(mbuf_class_t class, mcache_obj_t *list, int purged)
2217{
2218	mcache_obj_t *o, *tail;
2219	unsigned int num = 0;
2220	struct mbuf *m, *ms;
2221	mcache_audit_t *mca = NULL;
2222	mcache_obj_t *ref_list = NULL;
2223	mcl_slab_t *clsp, *nsp;
2224	void *cl;
2225	mbuf_class_t cl_class;
2226
2227	ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2228	VERIFY(class != MC_MBUF_16KCL || njcl > 0);
2229	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2230
2231	if (class == MC_MBUF_CL) {
2232		cl_class = MC_CL;
2233	} else if (class == MC_MBUF_BIGCL) {
2234		cl_class = MC_BIGCL;
2235	} else {
2236		VERIFY(class == MC_MBUF_16KCL);
2237		cl_class = MC_16KCL;
2238	}
2239
2240	o = tail = list;
2241
2242	while ((m = ms = (struct mbuf *)o) != NULL) {
2243		mcache_obj_t *rfa, *nexto = o->obj_next;
2244
2245		/* Do the mbuf sanity checks */
2246		if (mclaudit != NULL) {
2247			mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2248			if (mclverify) {
2249				mcache_audit_free_verify(mca, m, 0,
2250				    m_maxsize(MC_MBUF));
2251			}
2252			ms = MCA_SAVED_MBUF_PTR(mca);
2253		}
2254
2255		/* Do the cluster sanity checks */
2256		cl = ms->m_ext.ext_buf;
2257		clsp = slab_get(cl);
2258		if (mclverify) {
2259			size_t size = m_maxsize(cl_class);
2260			mcache_audit_free_verify(mcl_audit_buf2mca(cl_class,
2261			    (mcache_obj_t *)cl), cl, 0, size);
2262		}
2263		VERIFY(ms->m_type == MT_FREE);
2264		VERIFY(ms->m_flags == M_EXT);
2265		VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2266		if (cl_class == MC_CL) {
2267			VERIFY(clsp->sl_refcnt >= 1 &&
2268			    clsp->sl_refcnt <= NCLPBG);
2269		} else {
2270			VERIFY(clsp->sl_refcnt == 1);
2271		}
2272		if (cl_class == MC_16KCL) {
2273			int k;
2274			for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2275				nsp = nsp->sl_next;
2276				/* Next slab must already be present */
2277				VERIFY(nsp != NULL);
2278				VERIFY(nsp->sl_refcnt == 1);
2279			}
2280		}
2281
		/*
		 * If we're asked to purge, restore the actual mbuf using the
		 * contents of the shadow structure (if auditing is enabled)
		 * and clear the EXTF_COMPOSITE flag from the mbuf, as we are
		 * about to free it and the attached cluster into their caches.
		 */
2288		if (purged) {
2289			/* Restore constructed mbuf fields */
2290			if (mclaudit != NULL)
2291				mcl_audit_restore_mbuf(m, mca, TRUE);
2292
2293			MEXT_REF(m) = 0;
2294			MEXT_FLAGS(m) = 0;
2295
2296			rfa = (mcache_obj_t *)(void *)MEXT_RFA(m);
2297			rfa->obj_next = ref_list;
2298			ref_list = rfa;
2299			MEXT_RFA(m) = NULL;
2300
2301			m->m_type = MT_FREE;
2302			m->m_flags = m->m_len = 0;
2303			m->m_next = m->m_nextpkt = NULL;
2304
2305			/* Save mbuf fields and make auditing happy */
2306			if (mclaudit != NULL)
2307				mcl_audit_mbuf(mca, o, FALSE, FALSE);
2308
2309			VERIFY(m_total(class) > 0);
2310			m_total(class)--;
2311
2312			/* Free the mbuf */
2313			o->obj_next = NULL;
2314			slab_free(MC_MBUF, o);
2315
2316			/* And free the cluster */
2317			((mcache_obj_t *)cl)->obj_next = NULL;
2318			if (class == MC_MBUF_CL)
2319				slab_free(MC_CL, cl);
2320			else if (class == MC_MBUF_BIGCL)
2321				slab_free(MC_BIGCL, cl);
2322			else
2323				slab_free(MC_16KCL, cl);
2324		}
2325
2326		++num;
2327		tail = o;
2328		o = nexto;
2329	}
2330
2331	if (!purged) {
2332		tail->obj_next = m_cobjlist(class);
2333		m_cobjlist(class) = list;
2334		m_infree(class) += num;
2335	} else if (ref_list != NULL) {
2336		mcache_free_ext(ref_cache, ref_list);
2337	}
2338
2339	return (num);
2340}
2341
2342/*
2343 * Common allocator for composite objects called by the CPU cache layer
2344 * during an allocation request whenever there is no available element in
2345 * the bucket layer.  It returns one or more composite elements from the
2346 * appropriate global freelist.  If the freelist is empty, it will attempt
2347 * to obtain the rudimentary objects from their caches and construct them
2348 * into composite mbuf + cluster objects.
2349 */
2350static unsigned int
2351mbuf_cslab_alloc(void *arg, mcache_obj_t ***plist, unsigned int needed,
2352    int wait)
2353{
2354	mbuf_class_t class = (mbuf_class_t)arg;
2355	mbuf_class_t cl_class = 0;
2356	unsigned int num = 0, cnum = 0, want = needed;
2357	mcache_obj_t *ref_list = NULL;
2358	mcache_obj_t *mp_list = NULL;
2359	mcache_obj_t *clp_list = NULL;
2360	mcache_obj_t **list;
2361	struct ext_ref *rfa;
2362	struct mbuf *m;
2363	void *cl;
2364
2365	ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2366	ASSERT(needed > 0);
2367
2368	VERIFY(class != MC_MBUF_16KCL || njcl > 0);
2369
2370	/* There should not be any slab for this class */
2371	VERIFY(m_slab_cnt(class) == 0 &&
2372	    m_slablist(class).tqh_first == NULL &&
2373	    m_slablist(class).tqh_last == NULL);
2374
2375	lck_mtx_lock(mbuf_mlock);
2376
2377	/* Try using the freelist first */
2378	num = cslab_alloc(class, plist, needed);
2379	list = *plist;
2380	if (num == needed) {
2381		m_alloc_cnt(class) += num;
2382		lck_mtx_unlock(mbuf_mlock);
2383		return (needed);
2384	}
2385
2386	lck_mtx_unlock(mbuf_mlock);
2387
2388	/*
2389	 * We could not satisfy the request using the freelist alone;
2390	 * allocate from the appropriate rudimentary caches and use
2391	 * whatever we can get to construct the composite objects.
2392	 */
2393	needed -= num;
2394
2395	/*
2396	 * Mark these allocation requests as coming from a composite cache.
2397	 * Also, if the caller is willing to be blocked, mark the request
2398	 * with MCR_FAILOK such that we don't end up sleeping at the mbuf
2399	 * slab layer waiting for the individual object when one or more
2400	 * of the already-constructed composite objects are available.
2401	 */
2402	wait |= MCR_COMP;
2403	if (!(wait & MCR_NOSLEEP))
2404		wait |= MCR_FAILOK;
2405
2406	/* allocate mbufs */
2407	needed = mcache_alloc_ext(m_cache(MC_MBUF), &mp_list, needed, wait);
2408	if (needed == 0) {
2409		ASSERT(mp_list == NULL);
2410		goto fail;
2411	}
2412
2413	/* allocate clusters */
2414	if (class == MC_MBUF_CL) {
2415		cl_class = MC_CL;
2416	} else if (class == MC_MBUF_BIGCL) {
2417		cl_class = MC_BIGCL;
2418	} else {
2419		VERIFY(class == MC_MBUF_16KCL);
2420		cl_class = MC_16KCL;
2421	}
2422	needed = mcache_alloc_ext(m_cache(cl_class), &clp_list, needed, wait);
2423	if (needed == 0) {
2424		ASSERT(clp_list == NULL);
2425		goto fail;
2426	}
2427
2428	needed = mcache_alloc_ext(ref_cache, &ref_list, needed, wait);
2429	if (needed == 0) {
2430		ASSERT(ref_list == NULL);
2431		goto fail;
2432	}
2433
	/*
	 * By this time "needed" is MIN(mbuf, cluster, ref).  Any leftovers
	 * will be freed accordingly before we return to the caller.
	 */
2438	for (cnum = 0; cnum < needed; cnum++) {
2439		struct mbuf *ms;
2440
2441		m = ms = (struct mbuf *)mp_list;
2442		mp_list = mp_list->obj_next;
2443
2444		cl = clp_list;
2445		clp_list = clp_list->obj_next;
2446		((mcache_obj_t *)cl)->obj_next = NULL;
2447
2448		rfa = (struct ext_ref *)ref_list;
2449		ref_list = ref_list->obj_next;
2450		((mcache_obj_t *)(void *)rfa)->obj_next = NULL;
2451
2452		/*
2453		 * If auditing is enabled, construct the shadow mbuf
2454		 * in the audit structure instead of in the actual one.
2455		 * mbuf_cslab_audit() will take care of restoring the
2456		 * contents after the integrity check.
2457		 */
2458		if (mclaudit != NULL) {
2459			mcache_audit_t *mca, *cl_mca;
2460
2461			lck_mtx_lock(mbuf_mlock);
2462			mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2463			ms = MCA_SAVED_MBUF_PTR(mca);
2464			cl_mca = mcl_audit_buf2mca(MC_CL, (mcache_obj_t *)cl);
2465
			/*
			 * Pair them up.  Note that this is done at the time
			 * the mbuf+cluster objects are constructed.  This
			 * information should be treated as a "best effort"
			 * debugging hint, since more than one mbuf can refer
			 * to a cluster.  In that case, the cluster might not
			 * be freed along with the mbuf it was paired with.
			 */
2474			mca->mca_uptr = cl_mca;
2475			cl_mca->mca_uptr = mca;
2476
2477			ASSERT(mca->mca_uflags & MB_SCVALID);
2478			ASSERT(!(cl_mca->mca_uflags & MB_SCVALID));
2479			lck_mtx_unlock(mbuf_mlock);
2480
2481			/* Technically, they are in the freelist */
2482			if (mclverify) {
2483				size_t size;
2484
2485				mcache_set_pattern(MCACHE_FREE_PATTERN, m,
2486				    m_maxsize(MC_MBUF));
2487
2488				if (class == MC_MBUF_CL)
2489					size = m_maxsize(MC_CL);
2490				else if (class == MC_MBUF_BIGCL)
2491					size = m_maxsize(MC_BIGCL);
2492				else
2493					size = m_maxsize(MC_16KCL);
2494
2495				mcache_set_pattern(MCACHE_FREE_PATTERN, cl,
2496				    size);
2497			}
2498		}
2499
2500		MBUF_INIT(ms, 0, MT_FREE);
2501		if (class == MC_MBUF_16KCL) {
2502			MBUF_16KCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2503		} else if (class == MC_MBUF_BIGCL) {
2504			MBUF_BIGCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2505		} else {
2506			MBUF_CL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2507		}
2508		VERIFY(ms->m_flags == M_EXT);
2509		VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2510
2511		*list = (mcache_obj_t *)m;
2512		(*list)->obj_next = NULL;
2513		list = *plist = &(*list)->obj_next;
2514	}
2515
2516fail:
2517	/*
2518	 * Free up what's left of the above.
2519	 */
2520	if (mp_list != NULL)
2521		mcache_free_ext(m_cache(MC_MBUF), mp_list);
2522	if (clp_list != NULL)
2523		mcache_free_ext(m_cache(cl_class), clp_list);
2524	if (ref_list != NULL)
2525		mcache_free_ext(ref_cache, ref_list);
2526
2527	lck_mtx_lock(mbuf_mlock);
2528	if (num > 0 || cnum > 0) {
2529		m_total(class) += cnum;
2530		VERIFY(m_total(class) <= m_maxlimit(class));
2531		m_alloc_cnt(class) += num + cnum;
2532	}
2533	if ((num + cnum) < want)
2534		m_fail_cnt(class) += (want - (num + cnum));
2535	lck_mtx_unlock(mbuf_mlock);
2536
2537	return (num + cnum);
2538}
2539
2540/*
2541 * Common de-allocator for composite objects called by the CPU cache
2542 * layer when one or more elements need to be returned to the appropriate
2543 * global freelist.
2544 */
2545static void
2546mbuf_cslab_free(void *arg, mcache_obj_t *list, int purged)
2547{
2548	mbuf_class_t class = (mbuf_class_t)arg;
2549	unsigned int num;
2550	int w;
2551
2552	ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2553
2554	lck_mtx_lock(mbuf_mlock);
2555
2556	num = cslab_free(class, list, purged);
2557	m_free_cnt(class) += num;
2558
2559	if ((w = mb_waiters) > 0)
2560		mb_waiters = 0;
2561
2562	lck_mtx_unlock(mbuf_mlock);
2563
2564	if (w != 0)
2565		wakeup(mb_waitchan);
2566}
2567
2568/*
2569 * Common auditor for composite objects called by the CPU cache layer
2570 * during an allocation or free request.  For the former, this is called
2571 * after the objects are obtained from either the bucket or slab layer
2572 * and before they are returned to the caller.  For the latter, this is
2573 * called immediately during free and before placing the objects into
2574 * the bucket or slab layer.
2575 */
2576static void
2577mbuf_cslab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
2578{
2579	mbuf_class_t class = (mbuf_class_t)arg;
2580	mcache_audit_t *mca;
2581	struct mbuf *m, *ms;
2582	mcl_slab_t *clsp, *nsp;
2583	size_t size;
2584	void *cl;
2585
2586	ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2587
2588	while ((m = ms = (struct mbuf *)list) != NULL) {
2589		lck_mtx_lock(mbuf_mlock);
2590		/* Do the mbuf sanity checks and record its transaction */
2591		mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2592		mcl_audit_mbuf(mca, m, TRUE, alloc);
2593		if (mcltrace)
2594			mcache_buffer_log(mca, m, m_cache(class), &mb_start);
2595
2596		if (alloc)
2597			mca->mca_uflags |= MB_COMP_INUSE;
2598		else
2599			mca->mca_uflags &= ~MB_COMP_INUSE;
2600
		/*
		 * Use the shadow mbuf in the audit structure if we are
		 * freeing, since the contents of the actual mbuf have been
		 * pattern-filled by the above call to mcl_audit_mbuf().
		 */
2606		if (!alloc && mclverify)
2607			ms = MCA_SAVED_MBUF_PTR(mca);
2608
2609		/* Do the cluster sanity checks and record its transaction */
2610		cl = ms->m_ext.ext_buf;
2611		clsp = slab_get(cl);
2612		VERIFY(ms->m_flags == M_EXT && cl != NULL);
2613		VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2614		if (class == MC_MBUF_CL)
2615			VERIFY(clsp->sl_refcnt >= 1 &&
2616			    clsp->sl_refcnt <= NCLPBG);
2617		else
2618			VERIFY(clsp->sl_refcnt == 1);
2619
2620		if (class == MC_MBUF_16KCL) {
2621			int k;
2622			for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2623				nsp = nsp->sl_next;
2624				/* Next slab must already be present */
2625				VERIFY(nsp != NULL);
2626				VERIFY(nsp->sl_refcnt == 1);
2627			}
2628		}
2629
2630		mca = mcl_audit_buf2mca(MC_CL, cl);
2631		if (class == MC_MBUF_CL)
2632			size = m_maxsize(MC_CL);
2633		else if (class == MC_MBUF_BIGCL)
2634			size = m_maxsize(MC_BIGCL);
2635		else
2636			size = m_maxsize(MC_16KCL);
2637		mcl_audit_cluster(mca, cl, size, alloc, FALSE);
2638		if (mcltrace)
2639			mcache_buffer_log(mca, cl, m_cache(class), &mb_start);
2640
2641		if (alloc)
2642			mca->mca_uflags |= MB_COMP_INUSE;
2643		else
2644			mca->mca_uflags &= ~MB_COMP_INUSE;
2645		lck_mtx_unlock(mbuf_mlock);
2646
2647		list = list->obj_next;
2648	}
2649}
2650
2651/*
2652 * Allocate some number of mbuf clusters and place on cluster freelist.
2653 */
2654static int
2655m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize)
2656{
2657	int i;
2658	vm_size_t size = 0;
2659	int numpages = 0, large_buffer = (bufsize == m_maxsize(MC_16KCL));
2660	vm_offset_t page = 0;
2661	mcache_audit_t *mca_list = NULL;
2662	mcache_obj_t *con_list = NULL;
2663	mcl_slab_t *sp;
2664
2665	VERIFY(bufsize == m_maxsize(MC_BIGCL) ||
2666	    bufsize == m_maxsize(MC_16KCL));
2667
2668	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2669
2670	/*
2671	 * Multiple threads may attempt to populate the cluster map one
2672	 * after another.  Since we drop the lock below prior to acquiring
2673	 * the physical page(s), our view of the cluster map may no longer
2674	 * be accurate, and we could end up over-committing the pages beyond
2675	 * the maximum allowed for each class.  To prevent it, this entire
2676	 * operation (including the page mapping) is serialized.
2677	 */
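	/*
	 * (The serialization uses mb_clalloc_busy as the busy flag and
	 * mb_clalloc_waitchan as the wait channel; waiters are woken again
	 * on both the success and failure paths further down.)
	 */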
2678	while (mb_clalloc_busy) {
2679		mb_clalloc_waiters++;
2680		(void) msleep(mb_clalloc_waitchan, mbuf_mlock,
2681		    (PZERO-1), "m_clalloc", NULL);
2682		lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2683	}
2684
2685	/* We are busy now; tell everyone else to go away */
2686	mb_clalloc_busy = TRUE;
2687
2688	/*
2689	 * Honor the caller's wish to block or not block.  We have a way
2690	 * to grow the pool asynchronously using the mbuf worker thread.
2691	 */
2692	i = m_howmany(num, bufsize);
2693	if (i == 0 || (wait & M_DONTWAIT))
2694		goto out;
2695
2696	lck_mtx_unlock(mbuf_mlock);
2697
2698	size = round_page(i * bufsize);
2699	page = kmem_mb_alloc(mb_map, size, large_buffer);
2700
	/*
	 * If we asked for "n" 16KB physically contiguous chunks
	 * and didn't get them, retry without the contiguity
	 * restriction.
	 */
2706	if (large_buffer && page == 0)
2707		page = kmem_mb_alloc(mb_map, size, 0);
2708
2709	if (page == 0) {
2710		if (bufsize == m_maxsize(MC_BIGCL)) {
			/* The request was for 4KB; fall back to a single page */
2712			size = NBPG;
2713			page = kmem_mb_alloc(mb_map, size, 0);
2714		}
2715
2716		if (page == 0) {
2717			lck_mtx_lock(mbuf_mlock);
2718			goto out;
2719		}
2720	}
2721
2722	VERIFY(IS_P2ALIGNED(page, NBPG));
2723	numpages = size / NBPG;
2724
2725	/* If auditing is enabled, allocate the audit structures now */
2726	if (mclaudit != NULL) {
2727		int needed;
2728
2729		/*
2730		 * Yes, I realize this is a waste of memory for clusters
2731		 * that never get transformed into mbufs, as we may end
2732		 * up with NMBPBG-1 unused audit structures per cluster.
2733		 * But doing so tremendously simplifies the allocation
2734		 * strategy, since at this point we are not holding the
2735		 * mbuf lock and the caller is okay to be blocked.
2736		 */
2737		if (bufsize == m_maxsize(MC_BIGCL)) {
2738			needed = numpages * NMBPBG;
2739
2740			i = mcache_alloc_ext(mcl_audit_con_cache,
2741			    &con_list, needed, MCR_SLEEP);
2742
2743			VERIFY(con_list != NULL && i == needed);
2744		} else {
2745			needed = numpages / NSLABSP16KB;
2746		}
2747
2748		i = mcache_alloc_ext(mcache_audit_cache,
2749		    (mcache_obj_t **)&mca_list, needed, MCR_SLEEP);
2750
2751		VERIFY(mca_list != NULL && i == needed);
2752	}
2753
2754	lck_mtx_lock(mbuf_mlock);
2755
2756	for (i = 0; i < numpages; i++, page += NBPG) {
2757		ppnum_t offset = ((char *)page - (char *)mbutl) / NBPG;
2758		ppnum_t new_page = pmap_find_phys(kernel_pmap, page);
2759
		/*
		 * If there is a mapper, the appropriate I/O page is returned;
		 * zero out the page first to discard its past contents and
		 * avoid exposing leftover kernel memory.
		 */
2765		VERIFY(offset < mcl_pages);
2766		if (mcl_paddr_base != 0) {
2767			bzero((void *)(uintptr_t) page, page_size);
2768			new_page = IOMapperInsertPage(mcl_paddr_base,
2769			    offset, new_page);
2770		}
2771		mcl_paddr[offset] = new_page;
2772
2773		/* Pattern-fill this fresh page */
2774		if (mclverify) {
2775			mcache_set_pattern(MCACHE_FREE_PATTERN,
2776			    (caddr_t)page, NBPG);
2777		}
2778		if (bufsize == m_maxsize(MC_BIGCL)) {
2779			union mbigcluster *mbc = (union mbigcluster *)page;
2780
2781			/* One for the entire page */
2782			sp = slab_get(mbc);
2783			if (mclaudit != NULL) {
2784				mcl_audit_init(mbc, &mca_list, &con_list,
2785				    AUDIT_CONTENTS_SIZE, NMBPBG);
2786			}
2787			VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
2788			slab_init(sp, MC_BIGCL, SLF_MAPPED,
2789			    mbc, mbc, bufsize, 0, 1);
2790
2791			/* Insert this slab */
2792			slab_insert(sp, MC_BIGCL);
2793
2794			/* Update stats now since slab_get() drops the lock */
2795			mbstat.m_bigclfree = ++m_infree(MC_BIGCL) +
2796			    m_infree(MC_MBUF_BIGCL);
2797			mbstat.m_bigclusters = ++m_total(MC_BIGCL);
2798			VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
2799		} else if ((i % NSLABSP16KB) == 0) {
2800			union m16kcluster *m16kcl = (union m16kcluster *)page;
2801			mcl_slab_t *nsp;
2802			int k;
2803
2804			VERIFY(njcl > 0);
2805			/* One for the entire 16KB */
2806			sp = slab_get(m16kcl);
2807			if (mclaudit != NULL)
2808				mcl_audit_init(m16kcl, &mca_list, NULL, 0, 1);
2809
2810			VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
2811			slab_init(sp, MC_16KCL, SLF_MAPPED,
2812			    m16kcl, m16kcl, bufsize, 0, 1);
2813
2814			/*
2815			 * 2nd-Nth page's slab is part of the first one,
2816			 * where N is NSLABSP16KB.
2817			 */
2818			for (k = 1; k < NSLABSP16KB; k++) {
2819				nsp = slab_get(((union mbigcluster *)page) + k);
2820				VERIFY(nsp->sl_refcnt == 0 &&
2821				    nsp->sl_flags == 0);
2822				slab_init(nsp, MC_16KCL,
2823				    SLF_MAPPED | SLF_PARTIAL,
2824				    m16kcl, NULL, 0, 0, 0);
2825			}
2826
2827			/* Insert this slab */
2828			slab_insert(sp, MC_16KCL);
2829
2830			/* Update stats now since slab_get() drops the lock */
2831			m_infree(MC_16KCL)++;
2832			m_total(MC_16KCL)++;
2833			VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
2834		}
2835	}
2836	VERIFY(mca_list == NULL && con_list == NULL);
2837
2838	/* We're done; let others enter */
2839	mb_clalloc_busy = FALSE;
2840	if (mb_clalloc_waiters > 0) {
2841		mb_clalloc_waiters = 0;
2842		wakeup(mb_clalloc_waitchan);
2843	}
2844
2845	if (bufsize == m_maxsize(MC_BIGCL))
2846		return (numpages);
2847
2848	VERIFY(bufsize == m_maxsize(MC_16KCL));
2849	return (numpages / NSLABSP16KB);
2850
2851out:
2852	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2853
2854	/* We're done; let others enter */
2855	mb_clalloc_busy = FALSE;
2856	if (mb_clalloc_waiters > 0) {
2857		mb_clalloc_waiters = 0;
2858		wakeup(mb_clalloc_waitchan);
2859	}
2860
	/*
	 * When non-blocking, we kick the worker thread if we have to grow
	 * the pool or if the number of free clusters is less than requested.
	 */
2865	if (bufsize == m_maxsize(MC_BIGCL)) {
2866		if (i > 0) {
2867			/*
2868			 * Remember total number of 4KB clusters needed
2869			 * at this time.
2870			 */
2871			i += m_total(MC_BIGCL);
2872			if (i > mbuf_expand_big) {
2873				mbuf_expand_big = i;
2874				if (mbuf_worker_ready)
2875					wakeup((caddr_t)&mbuf_worker_run);
2876			}
2877		}
2878
2879		if (m_infree(MC_BIGCL) >= num)
2880			return (1);
2881	} else {
2882		if (i > 0) {
2883			/*
2884			 * Remember total number of 16KB clusters needed
2885			 * at this time.
2886			 */
2887			i += m_total(MC_16KCL);
2888			if (i > mbuf_expand_16k) {
2889				mbuf_expand_16k = i;
2890				if (mbuf_worker_ready)
2891					wakeup((caddr_t)&mbuf_worker_run);
2892			}
2893		}
2894
2895		if (m_infree(MC_16KCL) >= num)
2896			return (1);
2897	}
2898	return (0);
2899}
2900
2901/*
2902 * Populate the global freelist of the corresponding buffer class.
2903 */
2904static int
2905freelist_populate(mbuf_class_t class, unsigned int num, int wait)
2906{
2907	mcache_obj_t *o = NULL;
2908	int i, numpages = 0, count;
2909
2910	VERIFY(class == MC_MBUF || class == MC_CL || class == MC_BIGCL ||
2911	    class == MC_16KCL);
2912
2913	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2914
2915	switch (class) {
2916	case MC_MBUF:
2917	case MC_CL:
2918	case MC_BIGCL:
2919		numpages = (num * m_size(class) + NBPG - 1) / NBPG;
2920		i = m_clalloc(numpages, wait, m_maxsize(MC_BIGCL));
2921
2922		/* Respect the 4KB clusters minimum limit */
2923		if (m_total(MC_BIGCL) == m_maxlimit(MC_BIGCL) &&
2924		    m_infree(MC_BIGCL) <= m_minlimit(MC_BIGCL)) {
2925			if (class != MC_BIGCL || (wait & MCR_COMP))
2926				return (0);
2927		}
2928		if (class == MC_BIGCL)
2929			return (i != 0);
2930		break;
2931
2932	case MC_16KCL:
2933		return (m_clalloc(num, wait, m_maxsize(class)) != 0);
2934		/* NOTREACHED */
2935
2936	default:
2937		VERIFY(0);
2938		/* NOTREACHED */
2939	}
2940
2941	VERIFY(class == MC_MBUF || class == MC_CL);
2942
2943	/* how many objects will we cut the page into? */
2944	int numobj = (class == MC_MBUF ? NMBPBG : NCLPBG);
2945
2946	for (count = 0; count < numpages; count++) {
2947
2948		/* respect totals, minlimit, maxlimit */
2949		if (m_total(MC_BIGCL) <= m_minlimit(MC_BIGCL) ||
2950		    m_total(class) >= m_maxlimit(class))
2951			break;
2952
2953		if ((o = slab_alloc(MC_BIGCL, wait)) == NULL)
2954			break;
2955
2956		struct mbuf *m = (struct mbuf *)o;
2957		union mcluster *c = (union mcluster *)o;
2958		mcl_slab_t *sp = slab_get(o);
2959		mcache_audit_t *mca = NULL;
2960
2961		VERIFY(slab_is_detached(sp) &&
2962		    (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
2963
		/*
		 * Make sure that the cluster is unmolested
		 * while on the freelist.
		 */
2968		if (mclverify) {
2969			mca = mcl_audit_buf2mca(MC_BIGCL, o);
2970			mcache_audit_free_verify(mca, o, 0,
2971			    m_maxsize(MC_BIGCL));
2972		}
2973
2974		/* Reinitialize it as an mbuf or 2K slab */
2975		slab_init(sp, class, sp->sl_flags,
2976		    sp->sl_base, NULL, sp->sl_len, 0, numobj);
2977
2978		VERIFY(o == (mcache_obj_t *)sp->sl_base);
2979		VERIFY(sp->sl_head == NULL);
2980
2981		VERIFY(m_total(MC_BIGCL) > 0);
2982		m_total(MC_BIGCL)--;
2983		mbstat.m_bigclusters = m_total(MC_BIGCL);
2984
2985		m_total(class) += numobj;
2986		m_infree(class) += numobj;
2987
2988		VERIFY(m_total(MC_BIGCL) >= m_minlimit(MC_BIGCL));
2989		VERIFY(m_total(class) <= m_maxlimit(class));
2990
2991		i = numobj;
2992		if (class == MC_MBUF) {
2993			mbstat.m_mbufs = m_total(MC_MBUF);
2994			mtype_stat_add(MT_FREE, NMBPBG);
2995			while (i--) {
2996				/*
2997				 * If auditing is enabled, construct the
2998				 * shadow mbuf in the audit structure
2999				 * instead of the actual one.
3000				 * mbuf_slab_audit() will take care of
3001				 * restoring the contents after the
3002				 * integrity check.
3003				 */
3004				if (mclaudit != NULL) {
3005					struct mbuf *ms;
3006					mca = mcl_audit_buf2mca(MC_MBUF,
3007					    (mcache_obj_t *)m);
3008					ms = MCA_SAVED_MBUF_PTR(mca);
3009					ms->m_type = MT_FREE;
3010				} else {
3011					m->m_type = MT_FREE;
3012				}
3013				m->m_next = sp->sl_head;
3014				sp->sl_head = (void *)m++;
3015			}
3016		} else { /* MC_CL */
3017			mbstat.m_clfree =
3018			    m_infree(MC_CL) + m_infree(MC_MBUF_CL);
3019			mbstat.m_clusters = m_total(MC_CL);
3020			while (i--) {
3021				c->mcl_next = sp->sl_head;
3022				sp->sl_head = (void *)c++;
3023			}
3024		}
3025
3026		/* Insert into the mbuf or 2k slab list */
3027		slab_insert(sp, class);
3028
3029		if ((i = mb_waiters) > 0)
3030			mb_waiters = 0;
3031		if (i != 0)
3032			wakeup(mb_waitchan);
3033	}
3034	return (count != 0);
3035}
3036
3037/*
3038 * For each class, initialize the freelist to hold m_minlimit() objects.
3039 */
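/*
 * (In the initialization path above this is invoked for MC_CL, while
 * MC_BIGCL is seeded directly via freelist_populate().)
 */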
3040static void
3041freelist_init(mbuf_class_t class)
3042{
3043	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3044
3045	VERIFY(class == MC_CL || class == MC_BIGCL);
3046	VERIFY(m_total(class) == 0);
3047	VERIFY(m_minlimit(class) > 0);
3048
3049	while (m_total(class) < m_minlimit(class))
3050		(void) freelist_populate(class, m_minlimit(class), M_WAIT);
3051
3052	VERIFY(m_total(class) >= m_minlimit(class));
3053}
3054
/*
 * (Inaccurately) check if it might be worth a trip back to the
 * mcache layer due to the availability of objects there.  We'll
 * end up back here if there's nothing up there.
 */
3060static boolean_t
3061mbuf_cached_above(mbuf_class_t class, int wait)
3062{
3063	switch (class) {
3064	case MC_MBUF:
3065		if (wait & MCR_COMP)
3066			return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL)) ||
3067			    !mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL)));
3068		break;
3069
3070	case MC_CL:
3071		if (wait & MCR_COMP)
3072			return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL)));
3073		break;
3074
3075	case MC_BIGCL:
3076		if (wait & MCR_COMP)
3077			return (!mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL)));
3078		break;
3079
3080	case MC_16KCL:
3081		if (wait & MCR_COMP)
3082			return (!mcache_bkt_isempty(m_cache(MC_MBUF_16KCL)));
3083		break;
3084
3085	case MC_MBUF_CL:
3086	case MC_MBUF_BIGCL:
3087	case MC_MBUF_16KCL:
3088		break;
3089
3090	default:
3091		VERIFY(0);
3092		/* NOTREACHED */
3093	}
3094
3095	return (!mcache_bkt_isempty(m_cache(class)));
3096}
3097
3098/*
3099 * If possible, convert constructed objects to raw ones.
3100 */
3101static boolean_t
3102mbuf_steal(mbuf_class_t class, unsigned int num)
3103{
3104	mcache_obj_t *top = NULL;
3105	mcache_obj_t **list = &top;
3106	unsigned int tot = 0;
3107
3108	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3109
3110	switch (class) {
3111	case MC_MBUF:
3112	case MC_CL:
3113	case MC_BIGCL:
3114	case MC_16KCL:
3115		return (FALSE);
3116
3117	case MC_MBUF_CL:
3118	case MC_MBUF_BIGCL:
3119	case MC_MBUF_16KCL:
3120		/* Get the required number of constructed objects if possible */
3121		if (m_infree(class) > m_minlimit(class)) {
3122			tot = cslab_alloc(class, &list,
3123			    MIN(num, m_infree(class)));
3124		}
3125
3126		/* And destroy them to get back the raw objects */
3127		if (top != NULL)
3128			(void) cslab_free(class, top, 1);
3129		break;
3130
3131	default:
3132		VERIFY(0);
3133		/* NOTREACHED */
3134	}
3135
3136	return (tot == num);
3137}
3138
3139static void
3140m_reclaim(mbuf_class_t class, unsigned int num, boolean_t comp)
3141{
3142	int m, bmap = 0;
3143
3144	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3145
3146	VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL));
3147	VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
3148	VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
3149
3150	/*
3151	 * This logic can be made smarter; for now, simply mark
3152	 * all other related classes as potential victims.
3153	 */
3154	switch (class) {
3155	case MC_MBUF:
3156		m_wantpurge(MC_CL)++;
3157		m_wantpurge(MC_BIGCL)++;
3158		m_wantpurge(MC_MBUF_CL)++;
3159		m_wantpurge(MC_MBUF_BIGCL)++;
3160		break;
3161
3162	case MC_CL:
3163		m_wantpurge(MC_MBUF)++;
3164		m_wantpurge(MC_BIGCL)++;
3165		m_wantpurge(MC_MBUF_BIGCL)++;
3166		if (!comp)
3167			m_wantpurge(MC_MBUF_CL)++;
3168		break;
3169
3170	case MC_BIGCL:
3171		m_wantpurge(MC_MBUF)++;
3172		m_wantpurge(MC_CL)++;
3173		m_wantpurge(MC_MBUF_CL)++;
3174		if (!comp)
3175			m_wantpurge(MC_MBUF_BIGCL)++;
3176		break;
3177
3178	case MC_16KCL:
3179		if (!comp)
3180			m_wantpurge(MC_MBUF_16KCL)++;
3181		break;
3182
3183	default:
3184		VERIFY(0);
3185		/* NOTREACHED */
3186	}
3187
	/*
	 * Run through each marked class and check if we really need to
	 * purge (and therefore temporarily disable) the per-CPU cache
	 * layer used by the class.  If so, remember the classes since
	 * we are going to drop the lock below prior to purging.
	 */
3194	for (m = 0; m < NELEM(mbuf_table); m++) {
3195		if (m_wantpurge(m) > 0) {
3196			m_wantpurge(m) = 0;
			/*
			 * Try hard to steal the required number of objects
			 * from the freelists of other mbuf classes.  Only
			 * purge and disable the per-CPU cache layer when
			 * we don't have enough; it's the last resort.
			 */
3203			if (!mbuf_steal(m, num))
3204				bmap |= (1 << m);
3205		}
3206	}
3207
3208	lck_mtx_unlock(mbuf_mlock);
3209
3210	if (bmap != 0) {
3211		/* signal the domains to drain */
3212		net_drain_domains();
3213
		/* Sigh; we have no other choice but to ask mcache to purge */
3215		for (m = 0; m < NELEM(mbuf_table); m++) {
3216			if ((bmap & (1 << m)) &&
3217			    mcache_purge_cache(m_cache(m))) {
3218				lck_mtx_lock(mbuf_mlock);
3219				m_purge_cnt(m)++;
3220				mbstat.m_drain++;
3221				lck_mtx_unlock(mbuf_mlock);
3222			}
3223		}
3224	} else {
3225		/*
3226		 * Request mcache to reap extra elements from all of its caches;
3227		 * note that all reaps are serialized and happen only at a fixed
3228		 * interval.
3229		 */
3230		mcache_reap();
3231	}
3232	lck_mtx_lock(mbuf_mlock);
3233}
3234
3235static inline struct mbuf *
3236m_get_common(int wait, short type, int hdr)
3237{
3238	struct mbuf *m;
3239	int mcflags = MSLEEPF(wait);
3240
3241	/* Is this due to a non-blocking retry?  If so, then try harder */
3242	if (mcflags & MCR_NOSLEEP)
3243		mcflags |= MCR_TRYHARD;
3244
3245	m = mcache_alloc(m_cache(MC_MBUF), mcflags);
3246	if (m != NULL) {
3247		MBUF_INIT(m, hdr, type);
3248		mtype_stat_inc(type);
3249		mtype_stat_dec(MT_FREE);
3250#if CONFIG_MACF_NET
3251		if (hdr && mac_init_mbuf(m, wait) != 0) {
3252			m_free(m);
3253			return (NULL);
3254		}
#endif /* CONFIG_MACF_NET */
3256	}
3257	return (m);
3258}
3259
3260/*
3261 * Space allocation routines; these are also available as macros
3262 * for critical paths.
3263 */
3264#define	_M_GET(wait, type)	m_get_common(wait, type, 0)
3265#define	_M_GETHDR(wait, type)	m_get_common(wait, type, 1)
3266#define	_M_RETRY(wait, type)	_M_GET(wait, type)
3267#define	_M_RETRYHDR(wait, type)	_M_GETHDR(wait, type)
3268#define	_MGET(m, how, type)	((m) = _M_GET(how, type))
3269#define	_MGETHDR(m, how, type)	((m) = _M_GETHDR(how, type))
3270
3271struct mbuf *
3272m_get(int wait, int type)
3273{
3274	return (_M_GET(wait, type));
3275}
3276
3277struct mbuf *
3278m_gethdr(int wait, int type)
3279{
3280	return (_M_GETHDR(wait, type));
3281}
3282
3283struct mbuf *
3284m_retry(int wait, int type)
3285{
3286	return (_M_RETRY(wait, type));
3287}
3288
3289struct mbuf *
3290m_retryhdr(int wait, int type)
3291{
3292	return (_M_RETRYHDR(wait, type));
3293}
3294
3295struct mbuf *
3296m_getclr(int wait, int type)
3297{
3298	struct mbuf *m;
3299
3300	_MGET(m, wait, type);
3301	if (m != NULL)
3302		bzero(MTOD(m, caddr_t), MLEN);
3303	return (m);
3304}
3305
3306struct mbuf *
3307m_free(struct mbuf *m)
3308{
3309	struct mbuf *n = m->m_next;
3310
3311	if (m->m_type == MT_FREE)
3312		panic("m_free: freeing an already freed mbuf");
3313
3314	if (m->m_flags & M_PKTHDR) {
3315		/* Check for scratch area overflow */
3316		m_redzone_verify(m);
		/* Free the aux data and tags if any are present */
3318		m_tag_delete_chain(m, NULL);
3319	}
3320
3321	if (m->m_flags & M_EXT) {
3322		u_int32_t refcnt;
3323		u_int32_t composite;
3324
3325		refcnt = m_decref(m);
3326		composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
3327		if (refcnt == 0 && !composite) {
3328			if (m->m_ext.ext_free == NULL) {
3329				mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
3330			} else if (m->m_ext.ext_free == m_bigfree) {
3331				mcache_free(m_cache(MC_BIGCL),
3332				    m->m_ext.ext_buf);
3333			} else if (m->m_ext.ext_free == m_16kfree) {
3334				mcache_free(m_cache(MC_16KCL),
3335				    m->m_ext.ext_buf);
3336			} else {
3337				(*(m->m_ext.ext_free))(m->m_ext.ext_buf,
3338				    m->m_ext.ext_size, m->m_ext.ext_arg);
3339			}
3340			mcache_free(ref_cache, MEXT_RFA(m));
3341			MEXT_RFA(m) = NULL;
3342		} else if (refcnt == 0 && composite) {
3343			VERIFY(m->m_type != MT_FREE);
3344
3345			mtype_stat_dec(m->m_type);
3346			mtype_stat_inc(MT_FREE);
3347
3348			m->m_type = MT_FREE;
3349			m->m_flags = M_EXT;
3350			m->m_len = 0;
3351			m->m_next = m->m_nextpkt = NULL;
3352
3353			MEXT_FLAGS(m) &= ~EXTF_READONLY;
3354
3355			/* "Free" into the intermediate cache */
3356			if (m->m_ext.ext_free == NULL) {
3357				mcache_free(m_cache(MC_MBUF_CL), m);
3358			} else if (m->m_ext.ext_free == m_bigfree) {
3359				mcache_free(m_cache(MC_MBUF_BIGCL), m);
3360			} else {
3361				VERIFY(m->m_ext.ext_free == m_16kfree);
3362				mcache_free(m_cache(MC_MBUF_16KCL), m);
3363			}
3364			return (n);
3365		}
3366	}
3367
3368	if (m->m_type != MT_FREE) {
3369		mtype_stat_dec(m->m_type);
3370		mtype_stat_inc(MT_FREE);
3371	}
3372
3373	m->m_type = MT_FREE;
3374	m->m_flags = m->m_len = 0;
3375	m->m_next = m->m_nextpkt = NULL;
3376
3377	mcache_free(m_cache(MC_MBUF), m);
3378
3379	return (n);
3380}
3381
3382__private_extern__ struct mbuf *
3383m_clattach(struct mbuf *m, int type, caddr_t extbuf,
3384    void (*extfree)(caddr_t, u_int, caddr_t), u_int extsize, caddr_t extarg,
3385    int wait)
3386{
3387	struct ext_ref *rfa = NULL;
3388
3389	if (m == NULL && (m = _M_GETHDR(wait, type)) == NULL)
3390		return (NULL);
3391
3392	if (m->m_flags & M_EXT) {
3393		u_int32_t refcnt;
3394		u_int32_t composite;
3395
3396		refcnt = m_decref(m);
3397		composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
3398		if (refcnt == 0 && !composite) {
3399			if (m->m_ext.ext_free == NULL) {
3400				mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
3401			} else if (m->m_ext.ext_free == m_bigfree) {
3402				mcache_free(m_cache(MC_BIGCL),
3403				    m->m_ext.ext_buf);
3404			} else if (m->m_ext.ext_free == m_16kfree) {
3405				mcache_free(m_cache(MC_16KCL),
3406				    m->m_ext.ext_buf);
3407			} else {
3408				(*(m->m_ext.ext_free))(m->m_ext.ext_buf,
3409				    m->m_ext.ext_size, m->m_ext.ext_arg);
3410			}
3411			/* Re-use the reference structure */
3412			rfa = MEXT_RFA(m);
3413		} else if (refcnt == 0 && composite) {
3414			VERIFY(m->m_type != MT_FREE);
3415
3416			mtype_stat_dec(m->m_type);
3417			mtype_stat_inc(MT_FREE);
3418
3419			m->m_type = MT_FREE;
3420			m->m_flags = M_EXT;
3421			m->m_len = 0;
3422			m->m_next = m->m_nextpkt = NULL;
3423
3424			MEXT_FLAGS(m) &= ~EXTF_READONLY;
3425
3426			/* "Free" into the intermediate cache */
3427			if (m->m_ext.ext_free == NULL) {
3428				mcache_free(m_cache(MC_MBUF_CL), m);
3429			} else if (m->m_ext.ext_free == m_bigfree) {
3430				mcache_free(m_cache(MC_MBUF_BIGCL), m);
3431			} else {
3432				VERIFY(m->m_ext.ext_free == m_16kfree);
3433				mcache_free(m_cache(MC_MBUF_16KCL), m);
3434			}
3435			/*
3436			 * Allocate a new mbuf, since we didn't divorce
3437			 * the composite mbuf + cluster pair above.
3438			 */
3439			if ((m = _M_GETHDR(wait, type)) == NULL)
3440				return (NULL);
3441		}
3442	}
3443
3444	if (rfa == NULL &&
3445	    (rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
3446		m_free(m);
3447		return (NULL);
3448	}
3449
3450	MEXT_INIT(m, extbuf, extsize, extfree, extarg, rfa, 1, 0);
3451
3452	return (m);
3453}
3454
3455/*
 * Perform `fast' allocation of mbuf clusters from a cache of recently-freed
 * clusters.  (If the cache is empty, new clusters are allocated en masse.)
3458 */
3459struct mbuf *
3460m_getcl(int wait, int type, int flags)
3461{
3462	struct mbuf *m;
3463	int mcflags = MSLEEPF(wait);
3464	int hdr = (flags & M_PKTHDR);
3465
3466	/* Is this due to a non-blocking retry?  If so, then try harder */
3467	if (mcflags & MCR_NOSLEEP)
3468		mcflags |= MCR_TRYHARD;
3469
3470	m = mcache_alloc(m_cache(MC_MBUF_CL), mcflags);
3471	if (m != NULL) {
3472		u_int32_t flag;
3473		struct ext_ref *rfa;
3474		void *cl;
3475
3476		VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
3477		cl = m->m_ext.ext_buf;
3478		rfa = MEXT_RFA(m);
3479
3480		ASSERT(cl != NULL && rfa != NULL);
3481		VERIFY(MBUF_IS_COMPOSITE(m) && m->m_ext.ext_free == NULL);
3482
3483		flag = MEXT_FLAGS(m);
3484
3485		MBUF_INIT(m, hdr, type);
3486		MBUF_CL_INIT(m, cl, rfa, 1, flag);
3487
3488		mtype_stat_inc(type);
3489		mtype_stat_dec(MT_FREE);
3490#if CONFIG_MACF_NET
3491		if (hdr && mac_init_mbuf(m, wait) != 0) {
3492			m_freem(m);
3493			return (NULL);
3494		}
#endif /* CONFIG_MACF_NET */
3496	}
3497	return (m);
3498}
3499
/* m_mclget() adds an mbuf cluster to a normal mbuf */
3501struct mbuf *
3502m_mclget(struct mbuf *m, int wait)
3503{
3504	struct ext_ref *rfa;
3505
3506	if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3507		return (m);
3508
3509	m->m_ext.ext_buf = m_mclalloc(wait);
3510	if (m->m_ext.ext_buf != NULL) {
3511		MBUF_CL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3512	} else {
3513		mcache_free(ref_cache, rfa);
3514	}
3515	return (m);
3516}
3517
3518/* Allocate an mbuf cluster */
3519caddr_t
3520m_mclalloc(int wait)
3521{
3522	int mcflags = MSLEEPF(wait);
3523
3524	/* Is this due to a non-blocking retry?  If so, then try harder */
3525	if (mcflags & MCR_NOSLEEP)
3526		mcflags |= MCR_TRYHARD;
3527
3528	return (mcache_alloc(m_cache(MC_CL), mcflags));
3529}
3530
3531/* Free an mbuf cluster */
3532void
3533m_mclfree(caddr_t p)
3534{
3535	mcache_free(m_cache(MC_CL), p);
3536}
3537
3538/*
 * m_mclhasreference() checks if a cluster of an mbuf is referenced by
3540 * another mbuf; see comments in m_incref() regarding EXTF_READONLY.
3541 */
3542int
3543m_mclhasreference(struct mbuf *m)
3544{
3545	if (!(m->m_flags & M_EXT))
3546		return (0);
3547
3548	ASSERT(MEXT_RFA(m) != NULL);
3549
3550	return ((MEXT_FLAGS(m) & EXTF_READONLY) ? 1 : 0);
3551}
3552
3553__private_extern__ caddr_t
3554m_bigalloc(int wait)
3555{
3556	int mcflags = MSLEEPF(wait);
3557
3558	/* Is this due to a non-blocking retry?  If so, then try harder */
3559	if (mcflags & MCR_NOSLEEP)
3560		mcflags |= MCR_TRYHARD;
3561
3562	return (mcache_alloc(m_cache(MC_BIGCL), mcflags));
3563}
3564
3565__private_extern__ void
3566m_bigfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
3567{
3568	mcache_free(m_cache(MC_BIGCL), p);
3569}
3570
/* m_mbigget() adds a 4KB mbuf cluster to a normal mbuf */
3572__private_extern__ struct mbuf *
3573m_mbigget(struct mbuf *m, int wait)
3574{
3575	struct ext_ref *rfa;
3576
3577	if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3578		return (m);
3579
	m->m_ext.ext_buf = m_bigalloc(wait);
3581	if (m->m_ext.ext_buf != NULL) {
3582		MBUF_BIGCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3583	} else {
3584		mcache_free(ref_cache, rfa);
3585	}
3586	return (m);
3587}
3588
3589__private_extern__ caddr_t
3590m_16kalloc(int wait)
3591{
3592	int mcflags = MSLEEPF(wait);
3593
3594	/* Is this due to a non-blocking retry?  If so, then try harder */
3595	if (mcflags & MCR_NOSLEEP)
3596		mcflags |= MCR_TRYHARD;
3597
3598	return (mcache_alloc(m_cache(MC_16KCL), mcflags));
3599}
3600
3601__private_extern__ void
3602m_16kfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
3603{
3604	mcache_free(m_cache(MC_16KCL), p);
3605}
3606
/* m_m16kget() adds a 16KB mbuf cluster to a normal mbuf */
3608__private_extern__ struct mbuf *
3609m_m16kget(struct mbuf *m, int wait)
3610{
3611	struct ext_ref *rfa;
3612
3613	if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3614		return (m);
3615
	m->m_ext.ext_buf = m_16kalloc(wait);
3617	if (m->m_ext.ext_buf != NULL) {
3618		MBUF_16KCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3619	} else {
3620		mcache_free(ref_cache, rfa);
3621	}
3622	return (m);
3623}
3624
3625/*
3626 * "Move" mbuf pkthdr from "from" to "to".
3627 * "from" must have M_PKTHDR set, and "to" must be empty.
3628 */
3629void
3630m_copy_pkthdr(struct mbuf *to, struct mbuf *from)
3631{
3632	VERIFY(from->m_flags & M_PKTHDR);
3633
3634	/* Check for scratch area overflow */
3635	m_redzone_verify(from);
3636
3637	if (to->m_flags & M_PKTHDR) {
3638		/* Check for scratch area overflow */
3639		m_redzone_verify(to);
3640		/* We will be taking over the tags of 'to' */
3641		m_tag_delete_chain(to, NULL);
3642	}
3643	to->m_pkthdr = from->m_pkthdr;		/* especially tags */
3644	m_classifier_init(from, 0);		/* purge classifier info */
3645	m_tag_init(from, 1);			/* purge all tags from src */
3646	m_scratch_init(from);			/* clear src scratch area */
3647	to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
3648	if ((to->m_flags & M_EXT) == 0)
3649		to->m_data = to->m_pktdat;
3650	m_redzone_init(to);			/* setup red zone on dst */
3651}
3652
3653/*
3654 * Duplicate "from"'s mbuf pkthdr in "to".
3655 * "from" must have M_PKTHDR set, and "to" must be empty.
3656 * In particular, this does a deep copy of the packet tags.
3657 */
3658static int
3659m_dup_pkthdr(struct mbuf *to, struct mbuf *from, int how)
3660{
3661	VERIFY(from->m_flags & M_PKTHDR);
3662
3663	/* Check for scratch area overflow */
3664	m_redzone_verify(from);
3665
3666	if (to->m_flags & M_PKTHDR) {
3667		/* Check for scratch area overflow */
3668		m_redzone_verify(to);
3669		/* We will be taking over the tags of 'to' */
3670		m_tag_delete_chain(to, NULL);
3671	}
3672	to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
3673	if ((to->m_flags & M_EXT) == 0)
3674		to->m_data = to->m_pktdat;
3675	to->m_pkthdr = from->m_pkthdr;
3676	m_redzone_init(to);			/* setup red zone on dst */
3677	m_tag_init(to, 0);			/* preserve dst static tags */
3678	return (m_tag_copy_chain(to, from, how));
3679}
3680
3681void
3682m_copy_pftag(struct mbuf *to, struct mbuf *from)
3683{
3684	to->m_pkthdr.pf_mtag = from->m_pkthdr.pf_mtag;
3685#if PF_ECN
3686	to->m_pkthdr.pf_mtag.pftag_hdr = NULL;
3687	to->m_pkthdr.pf_mtag.pftag_flags &= ~(PF_TAG_HDR_INET|PF_TAG_HDR_INET6);
3688#endif /* PF_ECN */
3689}
3690
3691void
3692m_classifier_init(struct mbuf *m, uint32_t pktf_mask)
3693{
3694	VERIFY(m->m_flags & M_PKTHDR);
3695
3696	m->m_pkthdr.pkt_proto = 0;
3697	m->m_pkthdr.pkt_flowsrc = 0;
3698	m->m_pkthdr.pkt_flowid = 0;
3699	m->m_pkthdr.pkt_flags &= pktf_mask;	/* caller-defined mask */
3700	/* preserve service class and interface info for loopback packets */
3701	if (!(m->m_pkthdr.pkt_flags & PKTF_LOOP))
3702		(void) m_set_service_class(m, MBUF_SC_BE);
3703	if (!(m->m_pkthdr.pkt_flags & PKTF_IFAINFO))
3704		m->m_pkthdr.pkt_ifainfo = 0;
3705#if MEASURE_BW
3706	m->m_pkthdr.pkt_bwseq  = 0;
3707#endif /* MEASURE_BW */
3708}
3709
3710void
3711m_copy_classifier(struct mbuf *to, struct mbuf *from)
3712{
3713	VERIFY(to->m_flags & M_PKTHDR);
3714	VERIFY(from->m_flags & M_PKTHDR);
3715
3716	to->m_pkthdr.pkt_proto = from->m_pkthdr.pkt_proto;
3717	to->m_pkthdr.pkt_flowsrc = from->m_pkthdr.pkt_flowsrc;
3718	to->m_pkthdr.pkt_flowid = from->m_pkthdr.pkt_flowid;
3719	to->m_pkthdr.pkt_flags = from->m_pkthdr.pkt_flags;
3720	(void) m_set_service_class(to, from->m_pkthdr.pkt_svc);
3721	to->m_pkthdr.pkt_ifainfo  = from->m_pkthdr.pkt_ifainfo;
3722	to->m_pkthdr.ipsec_policy = from->m_pkthdr.ipsec_policy;
3723#if MEASURE_BW
3724	to->m_pkthdr.pkt_bwseq  = from->m_pkthdr.pkt_bwseq;
3725#endif /* MEASURE_BW */
3726}
3727
3728/*
3729 * Return a list of mbuf hdrs that point to clusters.  Try for num_needed;
 * if wantall is not set, return whatever number is available.  Set up the
3731 * first num_with_pkthdrs with mbuf hdrs configured as packet headers; these
3732 * are chained on the m_nextpkt field.  Any packets requested beyond this
3733 * are chained onto the last packet header's m_next field.  The size of
3734 * the cluster is controlled by the parameter bufsize.
3735 */
3736__private_extern__ struct mbuf *
3737m_getpackets_internal(unsigned int *num_needed, int num_with_pkthdrs,
3738    int wait, int wantall, size_t bufsize)
3739{
3740	struct mbuf *m;
3741	struct mbuf **np, *top;
3742	unsigned int pnum, needed = *num_needed;
3743	mcache_obj_t *mp_list = NULL;
3744	int mcflags = MSLEEPF(wait);
3745	u_int32_t flag;
3746	struct ext_ref *rfa;
3747	mcache_t *cp;
3748	void *cl;
3749
3750	ASSERT(bufsize == m_maxsize(MC_CL) ||
3751	    bufsize == m_maxsize(MC_BIGCL) ||
3752	    bufsize == m_maxsize(MC_16KCL));
3753
3754	/*
3755	 * Caller must first check for njcl because this
3756	 * routine is internal and not exposed/used via KPI.
3757	 */
3758	VERIFY(bufsize != m_maxsize(MC_16KCL) || njcl > 0);
3759
3760	top = NULL;
3761	np = &top;
3762	pnum = 0;
3763
3764	/*
	 * If the caller doesn't want all the requested buffers, or if this
	 * is a non-blocking retry, try hard to get what we can without
	 * blocking.  This effectively overrides MCR_SLEEP, since this
	 * thread will not go to sleep if we can't get all the buffers.
3769	 */
3770	if (!wantall || (mcflags & MCR_NOSLEEP))
3771		mcflags |= MCR_TRYHARD;
3772
3773	/* Allocate the composite mbuf + cluster elements from the cache */
3774	if (bufsize == m_maxsize(MC_CL))
3775		cp = m_cache(MC_MBUF_CL);
3776	else if (bufsize == m_maxsize(MC_BIGCL))
3777		cp = m_cache(MC_MBUF_BIGCL);
3778	else
3779		cp = m_cache(MC_MBUF_16KCL);
3780	needed = mcache_alloc_ext(cp, &mp_list, needed, mcflags);
3781
3782	for (pnum = 0; pnum < needed; pnum++) {
3783		m = (struct mbuf *)mp_list;
3784		mp_list = mp_list->obj_next;
3785
3786		VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
3787		cl = m->m_ext.ext_buf;
3788		rfa = MEXT_RFA(m);
3789
3790		ASSERT(cl != NULL && rfa != NULL);
3791		VERIFY(MBUF_IS_COMPOSITE(m));
3792
3793		flag = MEXT_FLAGS(m);
3794
3795		MBUF_INIT(m, num_with_pkthdrs, MT_DATA);
3796		if (bufsize == m_maxsize(MC_16KCL)) {
3797			MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
3798		} else if (bufsize == m_maxsize(MC_BIGCL)) {
3799			MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
3800		} else {
3801			MBUF_CL_INIT(m, cl, rfa, 1, flag);
3802		}
3803
3804		if (num_with_pkthdrs > 0) {
3805			--num_with_pkthdrs;
3806#if CONFIG_MACF_NET
3807			if (mac_mbuf_label_init(m, wait) != 0) {
3808				m_freem(m);
3809				break;
3810			}
#endif /* CONFIG_MACF_NET */
3812		}
3813
3814		*np = m;
3815		if (num_with_pkthdrs > 0)
3816			np = &m->m_nextpkt;
3817		else
3818			np = &m->m_next;
3819	}
3820	ASSERT(pnum != *num_needed || mp_list == NULL);
3821	if (mp_list != NULL)
3822		mcache_free_ext(cp, mp_list);
3823
3824	if (pnum > 0) {
3825		mtype_stat_add(MT_DATA, pnum);
3826		mtype_stat_sub(MT_FREE, pnum);
3827	}
3828
3829	if (wantall && (pnum != *num_needed)) {
3830		if (top != NULL)
3831			m_freem_list(top);
3832		return (NULL);
3833	}
3834
3835	if (pnum > *num_needed) {
3836		printf("%s: File a radar related to <rdar://10146739>. \
3837			needed = %u, pnum = %u, num_needed = %u \n",
3838			__func__, needed, pnum, *num_needed);
3839	}
3840
3841	*num_needed = pnum;
3842	return (top);
3843}
3844
3845/*
 * Return a list of mbufs linked by m_nextpkt.  Try for numlist, and if
 * wantall is not set, return whatever number is available.  The size of
 * each mbuf in the list is controlled by the parameter packetlen.  Each
 * mbuf of the list may have a chain of mbufs linked by m_next; each mbuf
 * in the chain is called a segment.  If maxsegments is not NULL and the
 * value pointed to is not zero, it specifies the maximum number of segments
 * for a chain of mbufs.  If maxsegments is NULL or the value pointed to
 * is zero, the caller places no restriction on the number of segments.
 * The actual number of segments of an mbuf chain is returned in the value
 * pointed to by maxsegments.
3856 */
3857__private_extern__ struct mbuf *
3858m_allocpacket_internal(unsigned int *numlist, size_t packetlen,
3859    unsigned int *maxsegments, int wait, int wantall, size_t wantsize)
3860{
3861	struct mbuf **np, *top, *first = NULL;
3862	size_t bufsize, r_bufsize;
3863	unsigned int num = 0;
3864	unsigned int nsegs = 0;
3865	unsigned int needed, resid;
3866	int mcflags = MSLEEPF(wait);
3867	mcache_obj_t *mp_list = NULL, *rmp_list = NULL;
3868	mcache_t *cp = NULL, *rcp = NULL;
3869
3870	if (*numlist == 0)
3871		return (NULL);
3872
3873	top = NULL;
3874	np = &top;
3875
3876	if (wantsize == 0) {
3877		if (packetlen <= MINCLSIZE) {
3878			bufsize = packetlen;
3879		} else if (packetlen > m_maxsize(MC_CL)) {
3880			/* Use 4KB if jumbo cluster pool isn't available */
3881			if (packetlen <= m_maxsize(MC_BIGCL) || njcl == 0)
3882				bufsize = m_maxsize(MC_BIGCL);
3883			else
3884				bufsize = m_maxsize(MC_16KCL);
3885		} else {
3886			bufsize = m_maxsize(MC_CL);
3887		}
3888	} else if (wantsize == m_maxsize(MC_CL) ||
3889	    wantsize == m_maxsize(MC_BIGCL) ||
3890	    (wantsize == m_maxsize(MC_16KCL) && njcl > 0)) {
3891		bufsize = wantsize;
3892	} else {
3893		return (NULL);
3894	}
3895
3896	if (bufsize <= MHLEN) {
3897		nsegs = 1;
3898	} else if (bufsize <= MINCLSIZE) {
3899		if (maxsegments != NULL && *maxsegments == 1) {
3900			bufsize = m_maxsize(MC_CL);
3901			nsegs = 1;
3902		} else {
3903			nsegs = 2;
3904		}
3905	} else if (bufsize == m_maxsize(MC_16KCL)) {
3906		VERIFY(njcl > 0);
3907		nsegs = ((packetlen - 1) >> (PGSHIFT + 2)) + 1;
3908	} else if (bufsize == m_maxsize(MC_BIGCL)) {
3909		nsegs = ((packetlen - 1) >> PGSHIFT) + 1;
3910	} else {
3911		nsegs = ((packetlen - 1) >> MCLSHIFT) + 1;
3912	}
3913	if (maxsegments != NULL) {
3914		if (*maxsegments && nsegs > *maxsegments) {
3915			*maxsegments = nsegs;
3916			return (NULL);
3917		}
3918		*maxsegments = nsegs;
3919	}
3920
3921	/*
	 * If the caller doesn't want all the requested buffers, or if this
	 * is a non-blocking retry, try hard to get what we can without
	 * blocking.  This effectively overrides MCR_SLEEP, since this
	 * thread will not go to sleep if we can't get all the buffers.
3926	 */
3927	if (!wantall || (mcflags & MCR_NOSLEEP))
3928		mcflags |= MCR_TRYHARD;
3929
3930	/*
3931	 * Simple case where all elements in the lists/chains are mbufs.
3932	 * Unless bufsize is greater than MHLEN, each segment chain is made
3933	 * up of exactly 1 mbuf.  Otherwise, each segment chain is made up
3934	 * of 2 mbufs; the second one is used for the residual data, i.e.
3935	 * the remaining data that cannot fit into the first mbuf.
3936	 */
3937	if (bufsize <= MINCLSIZE) {
3938		/* Allocate the elements in one shot from the mbuf cache */
3939		ASSERT(bufsize <= MHLEN || nsegs == 2);
3940		cp = m_cache(MC_MBUF);
3941		needed = mcache_alloc_ext(cp, &mp_list,
3942		    (*numlist) * nsegs, mcflags);
3943
3944		/*
3945		 * The number of elements must be even if we are to use an
3946		 * mbuf (instead of a cluster) to store the residual data.
3947		 * If we couldn't allocate the requested number of mbufs,
3948		 * trim the number down (if it's odd) in order to avoid
3949		 * creating a partial segment chain.
3950		 */
3951		if (bufsize > MHLEN && (needed & 0x1))
3952			needed--;
3953
3954		while (num < needed) {
3955			struct mbuf *m;
3956
3957			m = (struct mbuf *)mp_list;
3958			mp_list = mp_list->obj_next;
3959			ASSERT(m != NULL);
3960
3961			MBUF_INIT(m, 1, MT_DATA);
3962#if CONFIG_MACF_NET
3963			if (mac_init_mbuf(m, wait) != 0) {
3964				m_free(m);
3965				break;
3966			}
#endif /* CONFIG_MACF_NET */
3968			num++;
3969			if (bufsize > MHLEN) {
3970				/* A second mbuf for this segment chain */
3971				m->m_next = (struct mbuf *)mp_list;
3972				mp_list = mp_list->obj_next;
3973				ASSERT(m->m_next != NULL);
3974
3975				MBUF_INIT(m->m_next, 0, MT_DATA);
3976				num++;
3977			}
3978			*np = m;
3979			np = &m->m_nextpkt;
3980		}
3981		ASSERT(num != *numlist || mp_list == NULL);
3982
3983		if (num > 0) {
3984			mtype_stat_add(MT_DATA, num);
3985			mtype_stat_sub(MT_FREE, num);
3986		}
3987		num /= nsegs;
3988
3989		/* We've got them all; return to caller */
3990		if (num == *numlist)
3991			return (top);
3992
3993		goto fail;
3994	}
3995
3996	/*
3997	 * Complex cases where elements are made up of one or more composite
3998	 * mbufs + cluster, depending on packetlen.  Each N-segment chain can
3999	 * be illustrated as follows:
4000	 *
4001	 * [mbuf + cluster 1] [mbuf + cluster 2] ... [mbuf + cluster N]
4002	 *
4003	 * Every composite mbuf + cluster element comes from the intermediate
4004	 * cache (either MC_MBUF_CL or MC_MBUF_BIGCL).  For space efficiency,
4005	 * the last composite element will come from the MC_MBUF_CL cache,
4006	 * unless the residual data is larger than 2KB where we use the
4007	 * big cluster composite cache (MC_MBUF_BIGCL) instead.  Residual
4008	 * data is defined as extra data beyond the first element that cannot
4009	 * fit into the previous element, i.e. there is no residual data if
4010	 * the chain only has 1 segment.
4011	 */
4012	r_bufsize = bufsize;
4013	resid = packetlen > bufsize ? packetlen % bufsize : 0;
4014	if (resid > 0) {
4015		/* There is residual data; figure out the cluster size */
4016		if (wantsize == 0 && packetlen > MINCLSIZE) {
4017			/*
4018			 * Caller didn't request that all of the segments
4019			 * in the chain use the same cluster size; use the
4020			 * smaller of the cluster sizes.
4021			 */
4022			if (njcl > 0 && resid > m_maxsize(MC_BIGCL))
4023				r_bufsize = m_maxsize(MC_16KCL);
4024			else if (resid > m_maxsize(MC_CL))
4025				r_bufsize = m_maxsize(MC_BIGCL);
4026			else
4027				r_bufsize = m_maxsize(MC_CL);
4028		} else {
4029			/* Use the same cluster size as the other segments */
4030			resid = 0;
4031		}
4032	}
4033
4034	needed = *numlist;
4035	if (resid > 0) {
4036		/*
4037		 * Attempt to allocate composite mbuf + cluster elements for
4038		 * the residual data in each chain; record the number of such
4039		 * elements that can be allocated so that we know how many
4040		 * segment chains we can afford to create.
4041		 */
4042		if (r_bufsize <= m_maxsize(MC_CL))
4043			rcp = m_cache(MC_MBUF_CL);
4044		else if (r_bufsize <= m_maxsize(MC_BIGCL))
4045			rcp = m_cache(MC_MBUF_BIGCL);
4046		else
4047			rcp = m_cache(MC_MBUF_16KCL);
4048		needed = mcache_alloc_ext(rcp, &rmp_list, *numlist, mcflags);
4049
4050		if (needed == 0)
4051			goto fail;
4052
4053		/* This is temporarily reduced for calculation */
4054		ASSERT(nsegs > 1);
4055		nsegs--;
4056	}
4057
4058	/*
4059	 * Attempt to allocate the rest of the composite mbuf + cluster
4060	 * elements for the number of segment chains that we need.
4061	 */
4062	if (bufsize <= m_maxsize(MC_CL))
4063		cp = m_cache(MC_MBUF_CL);
4064	else if (bufsize <= m_maxsize(MC_BIGCL))
4065		cp = m_cache(MC_MBUF_BIGCL);
4066	else
4067		cp = m_cache(MC_MBUF_16KCL);
4068	needed = mcache_alloc_ext(cp, &mp_list, needed * nsegs, mcflags);
4069
4070	/* Round it down to avoid creating a partial segment chain */
4071	needed = (needed / nsegs) * nsegs;
4072	if (needed == 0)
4073		goto fail;
4074
4075	if (resid > 0) {
4076		/*
4077		 * We're about to construct the chain(s); take into account
4078		 * the number of segments we have created above to hold the
4079		 * residual data for each chain, as well as restore the
4080		 * original count of segments per chain.
4081		 */
4082		ASSERT(nsegs > 0);
4083		needed += needed / nsegs;
4084		nsegs++;
4085	}
4086
4087	for (;;) {
4088		struct mbuf *m;
4089		u_int32_t flag;
4090		struct ext_ref *rfa;
4091		void *cl;
4092		int pkthdr;
4093
4094		++num;
4095		if (nsegs == 1 || (num % nsegs) != 0 || resid == 0) {
4096			m = (struct mbuf *)mp_list;
4097			mp_list = mp_list->obj_next;
4098		} else {
4099			m = (struct mbuf *)rmp_list;
4100			rmp_list = rmp_list->obj_next;
4101		}
4102		ASSERT(m != NULL);
4103		VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
4104		VERIFY(m->m_ext.ext_free == NULL ||
4105		    m->m_ext.ext_free == m_bigfree ||
4106		    m->m_ext.ext_free == m_16kfree);
4107
4108		cl = m->m_ext.ext_buf;
4109		rfa = MEXT_RFA(m);
4110
4111		ASSERT(cl != NULL && rfa != NULL);
4112		VERIFY(MBUF_IS_COMPOSITE(m));
4113
4114		flag = MEXT_FLAGS(m);
4115
4116		pkthdr = (nsegs == 1 || (num % nsegs) == 1);
4117		if (pkthdr)
4118			first = m;
4119		MBUF_INIT(m, pkthdr, MT_DATA);
4120		if (m->m_ext.ext_free == m_16kfree) {
4121			MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
4122		} else if (m->m_ext.ext_free == m_bigfree) {
4123			MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
4124		} else {
4125			MBUF_CL_INIT(m, cl, rfa, 1, flag);
4126		}
4127#if CONFIG_MACF_NET
4128		if (pkthdr && mac_init_mbuf(m, wait) != 0) {
4129			--num;
4130			m_freem(m);
4131			break;
4132		}
#endif /* CONFIG_MACF_NET */
4134
4135		*np = m;
4136		if ((num % nsegs) == 0)
4137			np = &first->m_nextpkt;
4138		else
4139			np = &m->m_next;
4140
4141		if (num == needed)
4142			break;
4143	}
4144
4145	if (num > 0) {
4146		mtype_stat_add(MT_DATA, num);
4147		mtype_stat_sub(MT_FREE, num);
4148	}
4149
4150	num /= nsegs;
4151
4152	/* We've got them all; return to caller */
4153	if (num == *numlist) {
4154		ASSERT(mp_list == NULL && rmp_list == NULL);
4155		return (top);
4156	}
4157
4158fail:
4159	/* Free up what's left of the above */
4160	if (mp_list != NULL)
4161		mcache_free_ext(cp, mp_list);
4162	if (rmp_list != NULL)
4163		mcache_free_ext(rcp, rmp_list);
4164	if (wantall && top != NULL) {
4165		m_freem(top);
4166		return (NULL);
4167	}
4168	*numlist = num;
4169	return (top);
4170}
4171
4172/*
 * Best effort to get an mbuf cluster + pkthdr.  Used by drivers to allocate
 * packets on the receive ring.
4175 */
4176__private_extern__ struct mbuf *
4177m_getpacket_how(int wait)
4178{
4179	unsigned int num_needed = 1;
4180
4181	return (m_getpackets_internal(&num_needed, 1, wait, 1,
4182	    m_maxsize(MC_CL)));
4183}
4184
4185/*
 * Best effort to get an mbuf cluster + pkthdr.  Used by drivers to allocate
 * packets on the receive ring.
4188 */
4189struct mbuf *
4190m_getpacket(void)
4191{
4192	unsigned int num_needed = 1;
4193
4194	return (m_getpackets_internal(&num_needed, 1, M_WAIT, 1,
4195	    m_maxsize(MC_CL)));
4196}
4197
4198/*
4199 * Return a list of mbuf hdrs that point to clusters.  Try for num_needed;
 * if this can't be met, return whatever number is available.  Set up the
4201 * first num_with_pkthdrs with mbuf hdrs configured as packet headers.  These
4202 * are chained on the m_nextpkt field.  Any packets requested beyond this are
4203 * chained onto the last packet header's m_next field.
4204 */
4205struct mbuf *
4206m_getpackets(int num_needed, int num_with_pkthdrs, int how)
4207{
4208	unsigned int n = num_needed;
4209
4210	return (m_getpackets_internal(&n, num_with_pkthdrs, how, 0,
4211	    m_maxsize(MC_CL)));
4212}
4213
4214/*
4215 * Return a list of mbuf hdrs set up as packet hdrs chained together
 * on the m_nextpkt field.
4217 */
4218struct mbuf *
4219m_getpackethdrs(int num_needed, int how)
4220{
4221	struct mbuf *m;
4222	struct mbuf **np, *top;
4223
4224	top = NULL;
4225	np = &top;
4226
4227	while (num_needed--) {
4228		m = _M_RETRYHDR(how, MT_DATA);
4229		if (m == NULL)
4230			break;
4231
4232		*np = m;
4233		np = &m->m_nextpkt;
4234	}
4235
4236	return (top);
4237}
4238
4239/*
 * Free an mbuf list (m_nextpkt) while following m_next.  Returns the count
 * of packets freed.  Used by the drivers.
4242 */
4243int
4244m_freem_list(struct mbuf *m)
4245{
4246	struct mbuf *nextpkt;
4247	mcache_obj_t *mp_list = NULL;
4248	mcache_obj_t *mcl_list = NULL;
4249	mcache_obj_t *mbc_list = NULL;
4250	mcache_obj_t *m16k_list = NULL;
4251	mcache_obj_t *m_mcl_list = NULL;
4252	mcache_obj_t *m_mbc_list = NULL;
4253	mcache_obj_t *m_m16k_list = NULL;
4254	mcache_obj_t *ref_list = NULL;
4255	int pktcount = 0;
4256	int mt_free = 0, mt_data = 0, mt_header = 0, mt_soname = 0, mt_tag = 0;
4257
4258	while (m != NULL) {
4259		pktcount++;
4260
4261		nextpkt = m->m_nextpkt;
4262		m->m_nextpkt = NULL;
4263
4264		while (m != NULL) {
4265			struct mbuf *next = m->m_next;
4266			mcache_obj_t *o, *rfa;
4267			u_int32_t refcnt, composite;
4268
4269			if (m->m_type == MT_FREE)
4270				panic("m_free: freeing an already freed mbuf");
4271
4272			if (m->m_type != MT_FREE)
4273				mt_free++;
4274
4275			if (m->m_flags & M_PKTHDR) {
4276				/* Check for scratch area overflow */
4277				m_redzone_verify(m);
4278				/* Free the aux data and tags if there is any */
4279				m_tag_delete_chain(m, NULL);
4280			}
4281
4282			if (!(m->m_flags & M_EXT))
4283				goto simple_free;
4284
4285			o = (mcache_obj_t *)(void *)m->m_ext.ext_buf;
4286			refcnt = m_decref(m);
4287			composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
4288			if (refcnt == 0 && !composite) {
4289				if (m->m_ext.ext_free == NULL) {
4290					o->obj_next = mcl_list;
4291					mcl_list = o;
4292				} else if (m->m_ext.ext_free == m_bigfree) {
4293					o->obj_next = mbc_list;
4294					mbc_list = o;
4295				} else if (m->m_ext.ext_free == m_16kfree) {
4296					o->obj_next = m16k_list;
4297					m16k_list = o;
4298				} else {
4299					(*(m->m_ext.ext_free))((caddr_t)o,
4300					    m->m_ext.ext_size,
4301					    m->m_ext.ext_arg);
4302				}
4303				rfa = (mcache_obj_t *)(void *)MEXT_RFA(m);
4304				rfa->obj_next = ref_list;
4305				ref_list = rfa;
4306				MEXT_RFA(m) = NULL;
4307			} else if (refcnt == 0 && composite) {
4308				VERIFY(m->m_type != MT_FREE);
4309				/*
4310				 * Amortize the costs of atomic operations
4311				 * by doing them at the end, if possible.
4312				 */
4313				if (m->m_type == MT_DATA)
4314					mt_data++;
4315				else if (m->m_type == MT_HEADER)
4316					mt_header++;
4317				else if (m->m_type == MT_SONAME)
4318					mt_soname++;
4319				else if (m->m_type == MT_TAG)
4320					mt_tag++;
4321				else
4322					mtype_stat_dec(m->m_type);
4323
4324				m->m_type = MT_FREE;
4325				m->m_flags = M_EXT;
4326				m->m_len = 0;
4327				m->m_next = m->m_nextpkt = NULL;
4328
4329				MEXT_FLAGS(m) &= ~EXTF_READONLY;
4330
4331				/* "Free" into the intermediate cache */
4332				o = (mcache_obj_t *)m;
4333				if (m->m_ext.ext_free == NULL) {
4334					o->obj_next = m_mcl_list;
4335					m_mcl_list = o;
4336				} else if (m->m_ext.ext_free == m_bigfree) {
4337					o->obj_next = m_mbc_list;
4338					m_mbc_list = o;
4339				} else {
4340					VERIFY(m->m_ext.ext_free == m_16kfree);
4341					o->obj_next = m_m16k_list;
4342					m_m16k_list = o;
4343				}
4344				m = next;
4345				continue;
4346			}
4347simple_free:
4348			/*
4349			 * Amortize the costs of atomic operations
4350			 * by doing them at the end, if possible.
4351			 */
4352			if (m->m_type == MT_DATA)
4353				mt_data++;
4354			else if (m->m_type == MT_HEADER)
4355				mt_header++;
4356			else if (m->m_type == MT_SONAME)
4357				mt_soname++;
4358			else if (m->m_type == MT_TAG)
4359				mt_tag++;
4360			else if (m->m_type != MT_FREE)
4361				mtype_stat_dec(m->m_type);
4362
4363			m->m_type = MT_FREE;
4364			m->m_flags = m->m_len = 0;
4365			m->m_next = m->m_nextpkt = NULL;
4366
4367			((mcache_obj_t *)m)->obj_next = mp_list;
4368			mp_list = (mcache_obj_t *)m;
4369
4370			m = next;
4371		}
4372
4373		m = nextpkt;
4374	}
4375
4376	if (mt_free > 0)
4377		mtype_stat_add(MT_FREE, mt_free);
4378	if (mt_data > 0)
4379		mtype_stat_sub(MT_DATA, mt_data);
4380	if (mt_header > 0)
4381		mtype_stat_sub(MT_HEADER, mt_header);
4382	if (mt_soname > 0)
4383		mtype_stat_sub(MT_SONAME, mt_soname);
4384	if (mt_tag > 0)
4385		mtype_stat_sub(MT_TAG, mt_tag);
4386
4387	if (mp_list != NULL)
4388		mcache_free_ext(m_cache(MC_MBUF), mp_list);
4389	if (mcl_list != NULL)
4390		mcache_free_ext(m_cache(MC_CL), mcl_list);
4391	if (mbc_list != NULL)
4392		mcache_free_ext(m_cache(MC_BIGCL), mbc_list);
4393	if (m16k_list != NULL)
4394		mcache_free_ext(m_cache(MC_16KCL), m16k_list);
4395	if (m_mcl_list != NULL)
4396		mcache_free_ext(m_cache(MC_MBUF_CL), m_mcl_list);
4397	if (m_mbc_list != NULL)
4398		mcache_free_ext(m_cache(MC_MBUF_BIGCL), m_mbc_list);
4399	if (m_m16k_list != NULL)
4400		mcache_free_ext(m_cache(MC_MBUF_16KCL), m_m16k_list);
4401	if (ref_list != NULL)
4402		mcache_free_ext(ref_cache, ref_list);
4403
4404	return (pktcount);
4405}
4406
4407void
4408m_freem(struct mbuf *m)
4409{
4410	while (m != NULL)
4411		m = m_free(m);
4412}
4413
4414/*
4415 * Mbuffer utility routines.
4416 */
4417
4418/*
4419 * Compute the amount of space available before the current start
4420 * of data in an mbuf.
4421 */
4422int
4423m_leadingspace(struct mbuf *m)
4424{
4425	if (m->m_flags & M_EXT) {
4426		if (MCLHASREFERENCE(m))
4427			return (0);
4428		return (m->m_data - m->m_ext.ext_buf);
4429	}
4430	if (m->m_flags & M_PKTHDR)
4431		return (m->m_data - m->m_pktdat);
4432	return (m->m_data - m->m_dat);
4433}
4434
4435/*
4436 * Compute the amount of space available after the end of data in an mbuf.
4437 */
4438int
4439m_trailingspace(struct mbuf *m)
4440{
4441	if (m->m_flags & M_EXT) {
4442		if (MCLHASREFERENCE(m))
4443			return (0);
4444		return (m->m_ext.ext_buf + m->m_ext.ext_size -
4445		    (m->m_data + m->m_len));
4446	}
4447	return (&m->m_dat[MLEN] - (m->m_data + m->m_len));
4448}
4449
4450/*
4451 * Lesser-used path for M_PREPEND: allocate new mbuf to prepend to chain,
4452 * copy junk along.  Does not adjust packet header length.
4453 */
4454struct mbuf *
4455m_prepend(struct mbuf *m, int len, int how)
4456{
4457	struct mbuf *mn;
4458
4459	_MGET(mn, how, m->m_type);
4460	if (mn == NULL) {
4461		m_freem(m);
4462		return (NULL);
4463	}
4464	if (m->m_flags & M_PKTHDR) {
4465		M_COPY_PKTHDR(mn, m);
4466		m->m_flags &= ~M_PKTHDR;
4467	}
4468	mn->m_next = m;
4469	m = mn;
4470	if (len < MHLEN)
4471		MH_ALIGN(m, len);
4472	m->m_len = len;
4473	return (m);
4474}
4475
4476/*
4477 * Replacement for old M_PREPEND macro: allocate new mbuf to prepend to
4478 * chain, copy junk along, and adjust length.
4479 */
4480struct mbuf *
4481m_prepend_2(struct mbuf *m, int len, int how)
4482{
4483	if (M_LEADINGSPACE(m) >= len) {
4484		m->m_data -= len;
4485		m->m_len += len;
4486	} else {
4487		m = m_prepend(m, len, how);
4488	}
4489	if ((m) && (m->m_flags & M_PKTHDR))
4490		m->m_pkthdr.len += len;
4491	return (m);
4492}
4493
4494/*
 * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
 * continuing for "len" bytes.  If len is M_COPYALL, copy to the end of
 * the mbuf chain.  The wait parameter is a choice of M_WAIT/M_DONTWAIT
 * from the caller.
4498 */
4499int MCFail;
4500
4501struct mbuf *
4502m_copym_mode(struct mbuf *m, int off0, int len, int wait, uint32_t mode)
4503{
4504	struct mbuf *n, *mhdr = NULL, **np;
4505	int off = off0;
4506	struct mbuf *top;
4507	int copyhdr = 0;
4508
4509	if (off < 0 || len < 0)
4510		panic("m_copym: invalid offset %d or len %d", off, len);
4511
4512	if (off == 0 && (m->m_flags & M_PKTHDR)) {
4513		mhdr = m;
4514		copyhdr = 1;
4515	}
4516
4517	while (off >= m->m_len) {
4518		if (m->m_next == NULL)
4519			panic("m_copym: invalid mbuf chain");
4520		off -= m->m_len;
4521		m = m->m_next;
4522	}
4523	np = &top;
4524	top = NULL;
4525
4526	while (len > 0) {
4527		if (m == NULL) {
4528			if (len != M_COPYALL)
4529				panic("m_copym: len != M_COPYALL");
4530			break;
4531		}
4532
4533		n = _M_RETRY(wait, m->m_type);
4534		*np = n;
4535
4536		if (n == NULL)
4537			goto nospace;
4538
4539		if (copyhdr != 0) {
4540			if (mode == M_COPYM_MOVE_HDR) {
4541				M_COPY_PKTHDR(n, mhdr);
4542			} else if (mode == M_COPYM_COPY_HDR) {
4543				if (m_dup_pkthdr(n, mhdr, wait) == 0)
4544					goto nospace;
4545			}
4546			if (len == M_COPYALL)
4547				n->m_pkthdr.len -= off0;
4548			else
4549				n->m_pkthdr.len = len;
4550			copyhdr = 0;
4551		}
4552		if (len == M_COPYALL) {
4553			if (MIN(len, (m->m_len - off)) == len) {
4554				printf("m->m_len %d - off %d = %d, %d\n",
4555				    m->m_len, off, m->m_len - off,
4556				    MIN(len, (m->m_len - off)));
4557			}
4558		}
4559		n->m_len = MIN(len, (m->m_len - off));
4560		if (n->m_len == M_COPYALL) {
4561			printf("n->m_len == M_COPYALL, fixing\n");
4562			n->m_len = MHLEN;
4563		}
4564		if (m->m_flags & M_EXT) {
4565			n->m_ext = m->m_ext;
4566			m_incref(m);
4567			n->m_data = m->m_data + off;
4568			n->m_flags |= M_EXT;
4569		} else {
4570			bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t),
4571			    (unsigned)n->m_len);
4572		}
4573		if (len != M_COPYALL)
4574			len -= n->m_len;
4575		off = 0;
4576		m = m->m_next;
4577		np = &n->m_next;
4578	}
4579
4580	if (top == NULL)
4581		MCFail++;
4582
4583	return (top);
4584nospace:
4585
4586	m_freem(top);
4587	MCFail++;
4588	return (NULL);
4589}
4590
4591
4592struct mbuf *
4593m_copym(struct mbuf *m, int off0, int len, int wait)
4594{
4595	return (m_copym_mode(m, off0, len, wait, M_COPYM_MOVE_HDR));
4596}
4597
4598/*
 * Equivalent to m_copym except that all necessary mbuf hdrs are allocated
 * within this routine.  Also, the last mbuf and offset accessed are passed
 * out and can be passed back in to avoid having to rescan the entire mbuf
 * list (normally hung off of the socket).
4603 */
4604struct mbuf *
4605m_copym_with_hdrs(struct mbuf *m, int off0, int len0, int wait,
4606    struct mbuf **m_lastm, int *m_off, uint32_t mode)
4607{
4608	struct mbuf *n, **np = NULL;
4609	int off = off0, len = len0;
4610	struct mbuf *top = NULL;
4611	int mcflags = MSLEEPF(wait);
4612	int copyhdr = 0;
4613	int type = 0;
4614	mcache_obj_t *list = NULL;
4615	int needed = 0;
4616
4617	if (off == 0 && (m->m_flags & M_PKTHDR))
4618		copyhdr = 1;
4619
4620	if (*m_lastm != NULL) {
4621		m = *m_lastm;
4622		off = *m_off;
4623	} else {
4624		while (off >= m->m_len) {
4625			off -= m->m_len;
4626			m = m->m_next;
4627		}
4628	}
4629
4630	n = m;
4631	while (len > 0) {
4632		needed++;
4633		ASSERT(n != NULL);
4634		len -= MIN(len, (n->m_len - ((needed == 1) ? off : 0)));
4635		n = n->m_next;
4636	}
4637	needed++;
4638	len = len0;
4639
4640	/*
4641	 * If the caller doesn't want to be put to sleep, mark it with
4642	 * MCR_TRYHARD so that we may reclaim buffers from other places
4643	 * before giving up.
4644	 */
4645	if (mcflags & MCR_NOSLEEP)
4646		mcflags |= MCR_TRYHARD;
4647
4648	if (mcache_alloc_ext(m_cache(MC_MBUF), &list, needed,
4649	    mcflags) != needed)
4650		goto nospace;
4651
4652	needed = 0;
4653	while (len > 0) {
4654		n = (struct mbuf *)list;
4655		list = list->obj_next;
4656		ASSERT(n != NULL && m != NULL);
4657
4658		type = (top == NULL) ? MT_HEADER : m->m_type;
4659		MBUF_INIT(n, (top == NULL), type);
4660#if CONFIG_MACF_NET
4661		if (top == NULL && mac_mbuf_label_init(n, wait) != 0) {
4662			mtype_stat_inc(MT_HEADER);
4663			mtype_stat_dec(MT_FREE);
4664			m_free(n);
4665			goto nospace;
4666		}
#endif /* CONFIG_MACF_NET */
4668
4669		if (top == NULL) {
4670			top = n;
4671			np = &top->m_next;
4672			continue;
4673		} else {
4674			needed++;
4675			*np = n;
4676		}
4677
4678		if (copyhdr) {
4679			if (mode == M_COPYM_MOVE_HDR) {
4680				M_COPY_PKTHDR(n, m);
4681			} else if (mode == M_COPYM_COPY_HDR) {
4682				if (m_dup_pkthdr(n, m, wait) == 0)
4683					goto nospace;
4684			}
4685			n->m_pkthdr.len = len;
4686			copyhdr = 0;
4687		}
4688		n->m_len = MIN(len, (m->m_len - off));
4689
4690		if (m->m_flags & M_EXT) {
4691			n->m_ext = m->m_ext;
4692			m_incref(m);
4693			n->m_data = m->m_data + off;
4694			n->m_flags |= M_EXT;
4695		} else {
4696			bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t),
4697			    (unsigned)n->m_len);
4698		}
4699		len -= n->m_len;
4700
4701		if (len == 0) {
4702			if ((off + n->m_len) == m->m_len) {
4703				*m_lastm = m->m_next;
4704				*m_off  = 0;
4705			} else {
4706				*m_lastm = m;
4707				*m_off  = off + n->m_len;
4708			}
4709			break;
4710		}
4711		off = 0;
4712		m = m->m_next;
4713		np = &n->m_next;
4714	}
4715
4716	mtype_stat_inc(MT_HEADER);
4717	mtype_stat_add(type, needed);
4718	mtype_stat_sub(MT_FREE, needed + 1);
4719
4720	ASSERT(list == NULL);
4721	return (top);
4722
4723nospace:
4724	if (list != NULL)
4725		mcache_free_ext(m_cache(MC_MBUF), list);
4726	if (top != NULL)
4727		m_freem(top);
4728	MCFail++;
4729	return (NULL);
4730}
4731
4732/*
4733 * Copy data from an mbuf chain starting "off" bytes from the beginning,
4734 * continuing for "len" bytes, into the indicated buffer.
4735 */
4736void
4737m_copydata(struct mbuf *m, int off, int len, void *vp)
4738{
4739	unsigned count;
4740	char *cp = vp;
4741
4742	if (off < 0 || len < 0)
4743		panic("m_copydata: invalid offset %d or len %d", off, len);
4744
4745	while (off > 0) {
4746		if (m == NULL)
4747			panic("m_copydata: invalid mbuf chain");
4748		if (off < m->m_len)
4749			break;
4750		off -= m->m_len;
4751		m = m->m_next;
4752	}
4753	while (len > 0) {
4754		if (m == NULL)
4755			panic("m_copydata: invalid mbuf chain");
4756		count = MIN(m->m_len - off, len);
4757		bcopy(MTOD(m, caddr_t) + off, cp, count);
4758		len -= count;
4759		cp += count;
4760		off = 0;
4761		m = m->m_next;
4762	}
4763}
4764
4765/*
4766 * Concatenate mbuf chain n to m.  Both chains must be of the same type
4767 * (e.g. MT_DATA).  Any m_pkthdr is not updated.
4768 */
4769void
4770m_cat(struct mbuf *m, struct mbuf *n)
4771{
4772	while (m->m_next)
4773		m = m->m_next;
4774	while (n) {
4775		if ((m->m_flags & M_EXT) ||
4776		    m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) {
4777			/* just join the two chains */
4778			m->m_next = n;
4779			return;
4780		}
4781		/* splat the data from one into the other */
4782		bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
4783		    (u_int)n->m_len);
4784		m->m_len += n->m_len;
4785		n = m_free(n);
4786	}
4787}
4788
4789void
4790m_adj(struct mbuf *mp, int req_len)
4791{
4792	int len = req_len;
4793	struct mbuf *m;
4794	int count;
4795
4796	if ((m = mp) == NULL)
4797		return;
4798	if (len >= 0) {
4799		/*
4800		 * Trim from head.
4801		 */
4802		while (m != NULL && len > 0) {
4803			if (m->m_len <= len) {
4804				len -= m->m_len;
4805				m->m_len = 0;
4806				m = m->m_next;
4807			} else {
4808				m->m_len -= len;
4809				m->m_data += len;
4810				len = 0;
4811			}
4812		}
4813		m = mp;
4814		if (m->m_flags & M_PKTHDR)
4815			m->m_pkthdr.len -= (req_len - len);
4816	} else {
4817		/*
4818		 * Trim from tail.  Scan the mbuf chain,
4819		 * calculating its length and finding the last mbuf.
4820		 * If the adjustment only affects this mbuf, then just
4821		 * adjust and return.  Otherwise, rescan and truncate
4822		 * after the remaining size.
4823		 */
4824		len = -len;
4825		count = 0;
4826		for (;;) {
4827			count += m->m_len;
4828			if (m->m_next == (struct mbuf *)0)
4829				break;
4830			m = m->m_next;
4831		}
4832		if (m->m_len >= len) {
4833			m->m_len -= len;
4834			m = mp;
4835			if (m->m_flags & M_PKTHDR)
4836				m->m_pkthdr.len -= len;
4837			return;
4838		}
4839		count -= len;
4840		if (count < 0)
4841			count = 0;
4842		/*
4843		 * Correct length for chain is "count".
4844		 * Find the mbuf with last data, adjust its length,
4845		 * and toss data from remaining mbufs on chain.
4846		 */
4847		m = mp;
4848		if (m->m_flags & M_PKTHDR)
4849			m->m_pkthdr.len = count;
4850		for (; m; m = m->m_next) {
4851			if (m->m_len >= count) {
4852				m->m_len = count;
4853				break;
4854			}
4855			count -= m->m_len;
4856		}
4857		while ((m = m->m_next))
4858			m->m_len = 0;
4859	}
4860}
4861
4862/*
 * Rearrange an mbuf chain so that len bytes are contiguous
4864 * and in the data area of an mbuf (so that mtod and dtom
4865 * will work for a structure of size len).  Returns the resulting
4866 * mbuf chain on success, frees it and returns null on failure.
4867 * If there is room, it will add up to max_protohdr-len extra bytes to the
4868 * contiguous region in an attempt to avoid being called next time.
4869 */
4870int MPFail;
4871
4872struct mbuf *
4873m_pullup(struct mbuf *n, int len)
4874{
4875	struct mbuf *m;
4876	int count;
4877	int space;
4878
4879	/*
4880	 * If first mbuf has no cluster, and has room for len bytes
4881	 * without shifting current data, pullup into it,
4882	 * otherwise allocate a new mbuf to prepend to the chain.
4883	 */
4884	if ((n->m_flags & M_EXT) == 0 &&
4885	    n->m_data + len < &n->m_dat[MLEN] && n->m_next) {
4886		if (n->m_len >= len)
4887			return (n);
4888		m = n;
4889		n = n->m_next;
4890		len -= m->m_len;
4891	} else {
4892		if (len > MHLEN)
4893			goto bad;
4894		_MGET(m, M_DONTWAIT, n->m_type);
4895		if (m == 0)
4896			goto bad;
4897		m->m_len = 0;
4898		if (n->m_flags & M_PKTHDR) {
4899			M_COPY_PKTHDR(m, n);
4900			n->m_flags &= ~M_PKTHDR;
4901		}
4902	}
4903	space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
4904	do {
4905		count = MIN(MIN(MAX(len, max_protohdr), space), n->m_len);
4906		bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
4907		    (unsigned)count);
4908		len -= count;
4909		m->m_len += count;
4910		n->m_len -= count;
4911		space -= count;
4912		if (n->m_len)
4913			n->m_data += count;
4914		else
4915			n = m_free(n);
4916	} while (len > 0 && n);
4917	if (len > 0) {
4918		(void) m_free(m);
4919		goto bad;
4920	}
4921	m->m_next = n;
4922	return (m);
4923bad:
4924	m_freem(n);
4925	MPFail++;
4926	return (0);
4927}
4928
4929/*
4930 * Like m_pullup(), except a new mbuf is always allocated, and we allow
4931 * the amount of empty space before the data in the new mbuf to be specified
4932 * (in the event that the caller expects to prepend later).
4933 */
4934__private_extern__ int MSFail = 0;
4935
4936__private_extern__ struct mbuf *
4937m_copyup(struct mbuf *n, int len, int dstoff)
4938{
4939	struct mbuf *m;
4940	int count, space;
4941
4942	if (len > (MHLEN - dstoff))
4943		goto bad;
4944	MGET(m, M_DONTWAIT, n->m_type);
4945	if (m == NULL)
4946		goto bad;
4947	m->m_len = 0;
4948	if (n->m_flags & M_PKTHDR) {
4949		m_copy_pkthdr(m, n);
4950		n->m_flags &= ~M_PKTHDR;
4951	}
4952	m->m_data += dstoff;
4953	space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
4954	do {
4955		count = min(min(max(len, max_protohdr), space), n->m_len);
4956		memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t),
4957		    (unsigned)count);
4958		len -= count;
4959		m->m_len += count;
4960		n->m_len -= count;
4961		space -= count;
4962		if (n->m_len)
4963			n->m_data += count;
4964		else
4965			n = m_free(n);
4966	} while (len > 0 && n);
4967	if (len > 0) {
4968		(void) m_free(m);
4969		goto bad;
4970	}
4971	m->m_next = n;
4972	return (m);
4973bad:
4974	m_freem(n);
4975	MSFail++;
4976	return (NULL);
4977}
4978
4979/*
 * Partition an mbuf chain into two pieces, returning the tail --
4981 * all but the first len0 bytes.  In case of failure, it returns NULL and
4982 * attempts to restore the chain to its original state.
4983 */
4984struct mbuf *
4985m_split(struct mbuf *m0, int len0, int wait)
4986{
4987	return (m_split0(m0, len0, wait, 1));
4988}
4989
4990static struct mbuf *
4991m_split0(struct mbuf *m0, int len0, int wait, int copyhdr)
4992{
4993	struct mbuf *m, *n;
4994	unsigned len = len0, remain;
4995
4996	for (m = m0; m && len > m->m_len; m = m->m_next)
4997		len -= m->m_len;
4998	if (m == NULL)
4999		return (NULL);
5000	remain = m->m_len - len;
5001	if (copyhdr && (m0->m_flags & M_PKTHDR)) {
5002		_MGETHDR(n, wait, m0->m_type);
5003		if (n == NULL)
5004			return (NULL);
5005		n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
5006		n->m_pkthdr.len = m0->m_pkthdr.len - len0;
5007		m0->m_pkthdr.len = len0;
5008		if (m->m_flags & M_EXT)
5009			goto extpacket;
5010		if (remain > MHLEN) {
5011			/* m can't be the lead packet */
5012			MH_ALIGN(n, 0);
5013			n->m_next = m_split(m, len, wait);
5014			if (n->m_next == NULL) {
5015				(void) m_free(n);
5016				return (NULL);
5017			} else
5018				return (n);
5019		} else
5020			MH_ALIGN(n, remain);
5021	} else if (remain == 0) {
5022		n = m->m_next;
5023		m->m_next = NULL;
5024		return (n);
5025	} else {
5026		_MGET(n, wait, m->m_type);
5027		if (n == NULL)
5028			return (NULL);
5029		M_ALIGN(n, remain);
5030	}
5031extpacket:
5032	if (m->m_flags & M_EXT) {
5033		n->m_flags |= M_EXT;
5034		n->m_ext = m->m_ext;
5035		m_incref(m);
5036		n->m_data = m->m_data + len;
5037	} else {
5038		bcopy(MTOD(m, caddr_t) + len, MTOD(n, caddr_t), remain);
5039	}
5040	n->m_len = remain;
5041	m->m_len = len;
5042	n->m_next = m->m_next;
5043	m->m_next = NULL;
5044	return (n);
5045}
5046
5047/*
5048 * Routine to copy from device local memory into mbufs.
5049 */
5050struct mbuf *
5051m_devget(char *buf, int totlen, int off0, struct ifnet *ifp,
5052    void (*copy)(const void *, void *, size_t))
5053{
5054	struct mbuf *m;
5055	struct mbuf *top = NULL, **mp = &top;
5056	int off = off0, len;
5057	char *cp;
5058	char *epkt;
5059
5060	cp = buf;
5061	epkt = cp + totlen;
5062	if (off) {
5063		/*
5064		 * If 'off' is non-zero, packet is trailer-encapsulated,
5065		 * so we have to skip the type and length fields.
5066		 */
5067		cp += off + 2 * sizeof (u_int16_t);
5068		totlen -= 2 * sizeof (u_int16_t);
5069	}
5070	_MGETHDR(m, M_DONTWAIT, MT_DATA);
5071	if (m == NULL)
5072		return (NULL);
5073	m->m_pkthdr.rcvif = ifp;
5074	m->m_pkthdr.len = totlen;
5075	m->m_len = MHLEN;
5076
5077	while (totlen > 0) {
5078		if (top != NULL) {
5079			_MGET(m, M_DONTWAIT, MT_DATA);
5080			if (m == NULL) {
5081				m_freem(top);
5082				return (NULL);
5083			}
5084			m->m_len = MLEN;
5085		}
5086		len = MIN(totlen, epkt - cp);
5087		if (len >= MINCLSIZE) {
5088			MCLGET(m, M_DONTWAIT);
5089			if (m->m_flags & M_EXT) {
5090				m->m_len = len = MIN(len, m_maxsize(MC_CL));
5091			} else {
5092				/* give up when it's out of cluster mbufs */
5093				if (top != NULL)
5094					m_freem(top);
5095				m_freem(m);
5096				return (NULL);
5097			}
5098		} else {
5099			/*
5100			 * Place initial small packet/header at end of mbuf.
5101			 */
5102			if (len < m->m_len) {
5103				if (top == NULL &&
5104				    len + max_linkhdr <= m->m_len)
5105					m->m_data += max_linkhdr;
5106				m->m_len = len;
5107			} else {
5108				len = m->m_len;
5109			}
5110		}
5111		if (copy)
5112			copy(cp, MTOD(m, caddr_t), (unsigned)len);
5113		else
5114			bcopy(cp, MTOD(m, caddr_t), (unsigned)len);
5115		cp += len;
5116		*mp = m;
5117		mp = &m->m_next;
5118		totlen -= len;
5119		if (cp == epkt)
5120			cp = buf;
5121	}
5122	return (top);
5123}
5124
5125#ifndef MBUF_GROWTH_NORMAL_THRESH
5126#define	MBUF_GROWTH_NORMAL_THRESH 25
5127#endif
5128
5129/*
5130 * Cluster freelist allocation check.
5131 */
5132static int
5133m_howmany(int num, size_t bufsize)
5134{
5135	int i = 0, j = 0;
5136	u_int32_t m_mbclusters, m_clusters, m_bigclusters, m_16kclusters;
5137	u_int32_t m_mbfree, m_clfree, m_bigclfree, m_16kclfree;
5138	u_int32_t sumclusters, freeclusters;
5139	u_int32_t percent_pool, percent_kmem;
5140	u_int32_t mb_growth, mb_growth_thresh;
5141
5142	VERIFY(bufsize == m_maxsize(MC_BIGCL) ||
5143	    bufsize == m_maxsize(MC_16KCL));
5144
5145	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
5146
5147	/* Numbers in 2K cluster units */
5148	m_mbclusters = m_total(MC_MBUF) >> NMBPCLSHIFT;
5149	m_clusters = m_total(MC_CL);
5150	m_bigclusters = m_total(MC_BIGCL) << NCLPBGSHIFT;
5151	m_16kclusters = m_total(MC_16KCL);
5152	sumclusters = m_mbclusters + m_clusters + m_bigclusters;
5153
5154	m_mbfree = m_infree(MC_MBUF) >> NMBPCLSHIFT;
5155	m_clfree = m_infree(MC_CL);
5156	m_bigclfree = m_infree(MC_BIGCL) << NCLPBGSHIFT;
5157	m_16kclfree = m_infree(MC_16KCL);
5158	freeclusters = m_mbfree + m_clfree + m_bigclfree;
5159
5160	/* Bail if we've maxed out the mbuf memory map */
5161	if ((bufsize == m_maxsize(MC_BIGCL) && sumclusters >= nclusters) ||
5162	    (njcl > 0 && bufsize == m_maxsize(MC_16KCL) &&
5163	    (m_16kclusters << NCLPJCLSHIFT) >= njcl)) {
5164		return (0);
5165	}
5166
5167	if (bufsize == m_maxsize(MC_BIGCL)) {
5168		/* Under minimum */
5169		if (m_bigclusters < m_minlimit(MC_BIGCL))
5170			return (m_minlimit(MC_BIGCL) - m_bigclusters);
5171
5172		percent_pool =
5173		    ((sumclusters - freeclusters) * 100) / sumclusters;
5174		percent_kmem = (sumclusters * 100) / nclusters;
5175
5176		/*
		 * If a light/normal user, grow conservatively (75%);
		 * if a heavy user, grow aggressively (50%).
5179		 */
5180		if (percent_kmem < MBUF_GROWTH_NORMAL_THRESH)
5181			mb_growth = MB_GROWTH_NORMAL;
5182		else
5183			mb_growth = MB_GROWTH_AGGRESSIVE;
5184
5185		if (percent_kmem < 5) {
5186			/* For initial allocations */
5187			i = num;
5188		} else {
5189			/* Return if >= MBIGCL_LOWAT clusters available */
5190			if (m_infree(MC_BIGCL) >= MBIGCL_LOWAT &&
5191			    m_total(MC_BIGCL) >=
5192			    MBIGCL_LOWAT + m_minlimit(MC_BIGCL))
5193				return (0);
5194
5195			/* Ensure at least num clusters are accessible */
5196			if (num >= m_infree(MC_BIGCL))
5197				i = num - m_infree(MC_BIGCL);
5198			if (num > m_total(MC_BIGCL) - m_minlimit(MC_BIGCL))
5199				j = num - (m_total(MC_BIGCL) -
5200				    m_minlimit(MC_BIGCL));
5201
5202			i = MAX(i, j);
5203
5204			/*
5205			 * Grow pool if percent_pool > 75 (normal growth)
5206			 * or percent_pool > 50 (aggressive growth).
5207			 */
5208			mb_growth_thresh = 100 - (100 / (1 << mb_growth));
5209			if (percent_pool > mb_growth_thresh)
5210				j = ((sumclusters + num) >> mb_growth) -
5211				    freeclusters;
5212			i = MAX(i, j);
5213		}
5214
5215		/* Check to ensure we didn't go over limits */
5216		if (i + m_bigclusters >= m_maxlimit(MC_BIGCL))
5217			i = m_maxlimit(MC_BIGCL) - m_bigclusters;
5218		if ((i << 1) + sumclusters >= nclusters)
5219			i = (nclusters - sumclusters) >> 1;
5220		VERIFY((m_total(MC_BIGCL) + i) <= m_maxlimit(MC_BIGCL));
5221		VERIFY(sumclusters + (i << 1) <= nclusters);
5222
5223	} else { /* 16K CL */
5224		VERIFY(njcl > 0);
5225		/* Under minimum */
5226		if (m_16kclusters < MIN16KCL)
5227			return (MIN16KCL - m_16kclusters);
5228		if (m_16kclfree >= M16KCL_LOWAT)
5229			return (0);
5230
5231		/* Ensure at least num clusters are available */
5232		if (num >= m_16kclfree)
5233			i = num - m_16kclfree;
5234
5235		/* Always grow 16KCL pool aggressively */
5236		if (((m_16kclusters + num) >> 1) > m_16kclfree)
5237			j = ((m_16kclusters + num) >> 1) - m_16kclfree;
5238		i = MAX(i, j);
5239
5240		/* Check to ensure we don't go over limit */
5241		if (i + m_16kclusters >= m_maxlimit(MC_16KCL))
5242			i = m_maxlimit(MC_16KCL) - m_16kclusters;
5243		VERIFY((m_total(MC_16KCL) + i) <= m_maxlimit(MC_16KCL));
5244	}
5245	return (i);
5246}
5247/*
5248 * Return the number of bytes in the mbuf chain, m.
5249 */
5250unsigned int
5251m_length(struct mbuf *m)
5252{
5253	struct mbuf *m0;
5254	unsigned int pktlen;
5255
5256	if (m->m_flags & M_PKTHDR)
5257		return (m->m_pkthdr.len);
5258
5259	pktlen = 0;
5260	for (m0 = m; m0 != NULL; m0 = m0->m_next)
5261		pktlen += m0->m_len;
5262	return (pktlen);
5263}
5264
5265/*
5266 * Copy data from a buffer back into the indicated mbuf chain,
5267 * starting "off" bytes from the beginning, extending the mbuf
5268 * chain if necessary.
5269 */
5270void
5271m_copyback(struct mbuf *m0, int off, int len, const void *cp)
5272{
5273#if DEBUG
5274	struct mbuf *origm = m0;
5275	int error;
5276#endif /* DEBUG */
5277
5278	if (m0 == NULL)
5279		return;
5280
5281#if DEBUG
5282	error =
5283#endif /* DEBUG */
5284	m_copyback0(&m0, off, len, cp,
5285	    M_COPYBACK0_COPYBACK | M_COPYBACK0_EXTEND, M_DONTWAIT);
5286
5287#if DEBUG
5288	if (error != 0 || (m0 != NULL && origm != m0))
5289		panic("m_copyback");
5290#endif /* DEBUG */
5291}
5292
5293struct mbuf *
5294m_copyback_cow(struct mbuf *m0, int off, int len, const void *cp, int how)
5295{
5296	int error;
5297
5298	/* don't support chain expansion */
5299	VERIFY(off + len <= m_length(m0));
5300
5301	error = m_copyback0(&m0, off, len, cp,
5302	    M_COPYBACK0_COPYBACK | M_COPYBACK0_COW, how);
5303	if (error) {
5304		/*
5305		 * no way to recover from partial success.
5306		 * just free the chain.
5307		 */
5308		m_freem(m0);
5309		return (NULL);
5310	}
5311	return (m0);
5312}
5313
5314/*
 * m_makewritable: ensure that the specified range is writable.
5316 */
5317int
5318m_makewritable(struct mbuf **mp, int off, int len, int how)
5319{
5320	int error;
5321#if DEBUG
5322	struct mbuf *n;
5323	int origlen, reslen;
5324
5325	origlen = m_length(*mp);
5326#endif /* DEBUG */
5327
5328#if 0 /* M_COPYALL is large enough */
5329	if (len == M_COPYALL)
5330		len = m_length(*mp) - off; /* XXX */
5331#endif
5332
5333	error = m_copyback0(mp, off, len, NULL,
5334	    M_COPYBACK0_PRESERVE | M_COPYBACK0_COW, how);
5335
5336#if DEBUG
5337	reslen = 0;
5338	for (n = *mp; n; n = n->m_next)
5339		reslen += n->m_len;
5340	if (origlen != reslen)
5341		panic("m_makewritable: length changed");
5342	if (((*mp)->m_flags & M_PKTHDR) && reslen != (*mp)->m_pkthdr.len)
5343		panic("m_makewritable: inconsist");
5344#endif /* DEBUG */
5345
5346	return (error);
5347}
5348
5349static int
5350m_copyback0(struct mbuf **mp0, int off, int len, const void *vp, int flags,
5351    int how)
5352{
5353	int mlen;
5354	struct mbuf *m, *n;
5355	struct mbuf **mp;
5356	int totlen = 0;
5357	const char *cp = vp;
5358
5359	VERIFY(mp0 != NULL);
5360	VERIFY(*mp0 != NULL);
5361	VERIFY((flags & M_COPYBACK0_PRESERVE) == 0 || cp == NULL);
5362	VERIFY((flags & M_COPYBACK0_COPYBACK) == 0 || cp != NULL);
5363
5364	/*
5365	 * we don't bother to update "totlen" in the case of M_COPYBACK0_COW,
5366	 * assuming that M_COPYBACK0_EXTEND and M_COPYBACK0_COW are exclusive.
5367	 */
5368
5369	VERIFY((~flags & (M_COPYBACK0_EXTEND|M_COPYBACK0_COW)) != 0);
5370
5371	mp = mp0;
5372	m = *mp;
5373	while (off > (mlen = m->m_len)) {
5374		off -= mlen;
5375		totlen += mlen;
5376		if (m->m_next == NULL) {
5377			int tspace;
5378extend:
5379			if (!(flags & M_COPYBACK0_EXTEND))
5380				goto out;
5381
5382			/*
5383			 * try to make some space at the end of "m".
5384			 */
5385
5386			mlen = m->m_len;
5387			if (off + len >= MINCLSIZE &&
5388			    !(m->m_flags & M_EXT) && m->m_len == 0) {
5389				MCLGET(m, how);
5390			}
5391			tspace = M_TRAILINGSPACE(m);
5392			if (tspace > 0) {
5393				tspace = MIN(tspace, off + len);
5394				VERIFY(tspace > 0);
5395				bzero(mtod(m, char *) + m->m_len,
5396				    MIN(off, tspace));
5397				m->m_len += tspace;
5398				off += mlen;
5399				totlen -= mlen;
5400				continue;
5401			}
5402
5403			/*
5404			 * need to allocate an mbuf.
5405			 */
5406
5407			if (off + len >= MINCLSIZE) {
5408				n = m_getcl(how, m->m_type, 0);
5409			} else {
5410				n = _M_GET(how, m->m_type);
5411			}
5412			if (n == NULL) {
5413				goto out;
5414			}
5415			n->m_len = 0;
5416			n->m_len = MIN(M_TRAILINGSPACE(n), off + len);
5417			bzero(mtod(n, char *), MIN(n->m_len, off));
5418			m->m_next = n;
5419		}
5420		mp = &m->m_next;
5421		m = m->m_next;
5422	}
5423	while (len > 0) {
5424		mlen = m->m_len - off;
5425		if (mlen != 0 && m_mclhasreference(m)) {
5426			char *datap;
5427			int eatlen;
5428
5429			/*
5430			 * this mbuf is read-only.
5431			 * allocate a new writable mbuf and try again.
5432			 */
5433
5434#if DIAGNOSTIC
5435			if (!(flags & M_COPYBACK0_COW))
5436				panic("m_copyback0: read-only");
5437#endif /* DIAGNOSTIC */
5438
5439			/*
5440			 * if we're going to write into the middle of
			 * an mbuf, split it first.
5442			 */
5443			if (off > 0 && len < mlen) {
5444				n = m_split0(m, off, how, 0);
5445				if (n == NULL)
5446					goto enobufs;
5447				m->m_next = n;
5448				mp = &m->m_next;
5449				m = n;
5450				off = 0;
5451				continue;
5452			}
5453
5454			/*
			 * XXX TODO: coalesce into the trailing space of
5456			 * the previous mbuf when possible.
5457			 */
5458
5459			/*
5460			 * allocate a new mbuf.  copy packet header if needed.
5461			 */
5462			n = _M_GET(how, m->m_type);
5463			if (n == NULL)
5464				goto enobufs;
5465			if (off == 0 && (m->m_flags & M_PKTHDR)) {
5466				M_COPY_PKTHDR(n, m);
5467				n->m_len = MHLEN;
5468			} else {
5469				if (len >= MINCLSIZE)
5470					MCLGET(n, M_DONTWAIT);
5471				n->m_len =
5472				    (n->m_flags & M_EXT) ? MCLBYTES : MLEN;
5473			}
5474			if (n->m_len > len)
5475				n->m_len = len;
5476
5477			/*
			 * free the region which has been overwritten,
			 * copying data from old mbufs if requested.
5480			 */
5481			if (flags & M_COPYBACK0_PRESERVE)
5482				datap = mtod(n, char *);
5483			else
5484				datap = NULL;
5485			eatlen = n->m_len;
5486			VERIFY(off == 0 || eatlen >= mlen);
5487			if (off > 0) {
5488				VERIFY(len >= mlen);
5489				m->m_len = off;
5490				m->m_next = n;
5491				if (datap) {
5492					m_copydata(m, off, mlen, datap);
5493					datap += mlen;
5494				}
5495				eatlen -= mlen;
5496				mp = &m->m_next;
5497				m = m->m_next;
5498			}
5499			while (m != NULL && m_mclhasreference(m) &&
5500			    n->m_type == m->m_type && eatlen > 0) {
5501				mlen = MIN(eatlen, m->m_len);
5502				if (datap) {
5503					m_copydata(m, 0, mlen, datap);
5504					datap += mlen;
5505				}
5506				m->m_data += mlen;
5507				m->m_len -= mlen;
5508				eatlen -= mlen;
5509				if (m->m_len == 0)
5510					*mp = m = m_free(m);
5511			}
5512			if (eatlen > 0)
5513				n->m_len -= eatlen;
5514			n->m_next = m;
5515			*mp = m = n;
5516			continue;
5517		}
5518		mlen = MIN(mlen, len);
5519		if (flags & M_COPYBACK0_COPYBACK) {
5520			bcopy(cp, mtod(m, caddr_t) + off, (unsigned)mlen);
5521			cp += mlen;
5522		}
5523		len -= mlen;
5524		mlen += off;
5525		off = 0;
5526		totlen += mlen;
5527		if (len == 0)
5528			break;
5529		if (m->m_next == NULL) {
5530			goto extend;
5531		}
5532		mp = &m->m_next;
5533		m = m->m_next;
5534	}
5535out:
5536	if (((m = *mp0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) {
5537		VERIFY(flags & M_COPYBACK0_EXTEND);
5538		m->m_pkthdr.len = totlen;
5539	}
5540
5541	return (0);
5542
5543enobufs:
5544	return (ENOBUFS);
5545}
5546
5547uint64_t
5548mcl_to_paddr(char *addr)
5549{
5550	vm_offset_t base_phys;
5551
5552	if (!MBUF_IN_MAP(addr))
5553		return (0);
5554	base_phys = mcl_paddr[atop_64(addr - (char *)mbutl)];
5555
5556	if (base_phys == 0)
5557		return (0);
5558	return ((uint64_t)(ptoa_64(base_phys) | ((uint64_t)addr & PAGE_MASK)));
5559}
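
#if 0
/*
 * Illustrative sketch (hypothetical; the structure and helper below are
 * made-up names for this example): derive the physical address of each
 * mbuf's data when building a scatter/gather list.  Assumes no mbuf's
 * data crosses a page boundary (cf. m_normalize()), since mcl_to_paddr()
 * only translates the page containing "addr".
 */
struct example_seg {
	uint64_t	es_paddr;	/* physical address of segment */
	uint32_t	es_len;		/* segment length in bytes */
};

static int
example_build_sglist(struct mbuf *m0, struct example_seg *segs, int maxsegs)
{
	struct mbuf *m;
	int n = 0;

	for (m = m0; m != NULL; m = m->m_next) {
		if (m->m_len == 0)
			continue;
		if (n == maxsegs)
			return (EFBIG);
		segs[n].es_paddr = mcl_to_paddr(mtod(m, char *));
		if (segs[n].es_paddr == 0)
			return (EINVAL);	/* not in the mbuf map */
		segs[n].es_len = m->m_len;
		n++;
	}
	return (0);
}
#endif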
5560
5561/*
5562 * Dup the mbuf chain passed in.  The whole thing.  No cute additional cruft.
5563 * And really copy the thing.  That way, we don't "precompute" checksums
5564 * for unsuspecting consumers.  Assumption: m->m_nextpkt == 0.  Trick: for
 * small packets, don't dup into a cluster.  That way received packets
5566 * don't take up too much room in the sockbuf (cf. sbspace()).
5567 */
5568int MDFail;
5569
5570struct mbuf *
5571m_dup(struct mbuf *m, int how)
5572{
5573	struct mbuf *n, **np;
5574	struct mbuf *top;
5575	int copyhdr = 0;
5576
5577	np = &top;
5578	top = NULL;
5579	if (m->m_flags & M_PKTHDR)
5580		copyhdr = 1;
5581
5582	/*
5583	 * Quick check: if we have one mbuf and its data fits in an
5584	 *  mbuf with packet header, just copy and go.
5585	 */
5586	if (m->m_next == NULL) {
5587		/* Then just move the data into an mbuf and be done... */
5588		if (copyhdr) {
5589			if (m->m_pkthdr.len <= MHLEN && m->m_len <= MHLEN) {
5590				if ((n = _M_GETHDR(how, m->m_type)) == NULL)
5591					return (NULL);
5592				n->m_len = m->m_len;
5593				m_dup_pkthdr(n, m, how);
5594				bcopy(m->m_data, n->m_data, m->m_len);
5595				return (n);
5596			}
5597		} else if (m->m_len <= MLEN) {
5598			if ((n = _M_GET(how, m->m_type)) == NULL)
5599				return (NULL);
5600			bcopy(m->m_data, n->m_data, m->m_len);
5601			n->m_len = m->m_len;
5602			return (n);
5603		}
5604	}
5605	while (m != NULL) {
5606#if BLUE_DEBUG
5607		kprintf("<%x: %x, %x, %x\n", m, m->m_flags, m->m_len,
5608		    m->m_data);
5609#endif
5610		if (copyhdr)
5611			n = _M_GETHDR(how, m->m_type);
5612		else
5613			n = _M_GET(how, m->m_type);
5614		if (n == NULL)
5615			goto nospace;
5616		if (m->m_flags & M_EXT) {
5617			if (m->m_len <= m_maxsize(MC_CL))
5618				MCLGET(n, how);
5619			else if (m->m_len <= m_maxsize(MC_BIGCL))
5620				n = m_mbigget(n, how);
5621			else if (m->m_len <= m_maxsize(MC_16KCL) && njcl > 0)
5622				n = m_m16kget(n, how);
5623			if (!(n->m_flags & M_EXT)) {
5624				(void) m_free(n);
5625				goto nospace;
5626			}
5627		}
5628		*np = n;
5629		if (copyhdr) {
5630			/* Don't use M_COPY_PKTHDR: preserve m_data */
5631			m_dup_pkthdr(n, m, how);
5632			copyhdr = 0;
5633			if (!(n->m_flags & M_EXT))
5634				n->m_data = n->m_pktdat;
5635		}
5636		n->m_len = m->m_len;
5637		/*
		 * Get the dup on the same boundary as the original.
		 * Assume that the two mbufs have the same offset to the
		 * data area (up to word boundaries).
5641		 */
5642		bcopy(MTOD(m, caddr_t), MTOD(n, caddr_t), (unsigned)n->m_len);
5643		m = m->m_next;
5644		np = &n->m_next;
5645#if BLUE_DEBUG
5646		kprintf(">%x: %x, %x, %x\n", n, n->m_flags, n->m_len,
5647		    n->m_data);
5648#endif
5649	}
5650
5651	if (top == NULL)
5652		MDFail++;
5653	return (top);
5654
5655nospace:
5656	m_freem(top);
5657	MDFail++;
5658	return (NULL);
5659}
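
#if 0
/*
 * Illustrative sketch (hypothetical helper; names are examples only):
 * hand a private deep copy of a packet to a monitoring path.  m_dup()
 * copies the data itself, so the copy shares no cluster storage with
 * the original; on failure it returns NULL and the original chain is
 * untouched.
 */
static void
example_mirror_packet(struct mbuf *m, void (*deliver)(struct mbuf *))
{
	struct mbuf *copy;

	if ((copy = m_dup(m, M_DONTWAIT)) != NULL)
		deliver(copy);	/* consumer now owns "copy" */
}
#endif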
5660
5661#define	MBUF_MULTIPAGES(m)						\
5662	(((m)->m_flags & M_EXT) &&					\
5663	((IS_P2ALIGNED((m)->m_data, NBPG) && (m)->m_len > NBPG) ||	\
5664	(!IS_P2ALIGNED((m)->m_data, NBPG) &&				\
5665	P2ROUNDUP((m)->m_data, NBPG) < ((uintptr_t)(m)->m_data + (m)->m_len))))
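
/*
 * Worked example (illustrative, assuming a 4 KB page, i.e. NBPG == 0x1000):
 * a cluster-backed mbuf whose m_data starts 0x800 bytes into a page with
 * m_len == 0x900 has only 0x800 bytes left before the next page boundary,
 * so its data spills onto the following page and MBUF_MULTIPAGES() is
 * true.  A page-aligned mbuf with m_len <= NBPG, or an unaligned one whose
 * data ends at or before the next page boundary, evaluates false.
 */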
5666
5667static struct mbuf *
5668m_expand(struct mbuf *m, struct mbuf **last)
5669{
5670	struct mbuf *top = NULL;
5671	struct mbuf **nm = &top;
5672	uintptr_t data0, data;
5673	unsigned int len0, len;
5674
5675	VERIFY(MBUF_MULTIPAGES(m));
5676	VERIFY(m->m_next == NULL);
5677	data0 = (uintptr_t)m->m_data;
5678	len0 = m->m_len;
5679	*last = top;
5680
5681	for (;;) {
5682		struct mbuf *n;
5683
5684		data = data0;
5685		if (IS_P2ALIGNED(data, NBPG) && len0 > NBPG)
5686			len = NBPG;
5687		else if (!IS_P2ALIGNED(data, NBPG) &&
5688		    P2ROUNDUP(data, NBPG) < (data + len0))
5689			len = P2ROUNDUP(data, NBPG) - data;
5690		else
5691			len = len0;
5692
5693		VERIFY(len > 0);
5694		VERIFY(m->m_flags & M_EXT);
5695		m->m_data = (void *)data;
5696		m->m_len = len;
5697
5698		*nm = *last = m;
5699		nm = &m->m_next;
5700		m->m_next = NULL;
5701
5702		data0 += len;
5703		len0 -= len;
5704		if (len0 == 0)
5705			break;
5706
5707		n = _M_RETRY(M_DONTWAIT, MT_DATA);
5708		if (n == NULL) {
5709			m_freem(top);
5710			top = *last = NULL;
5711			break;
5712		}
5713
5714		n->m_ext = m->m_ext;
5715		m_incref(m);
5716		n->m_flags |= M_EXT;
5717		m = n;
5718	}
5719	return (top);
5720}
5721
5722struct mbuf *
5723m_normalize(struct mbuf *m)
5724{
5725	struct mbuf *top = NULL;
5726	struct mbuf **nm = &top;
5727	boolean_t expanded = FALSE;
5728
5729	while (m != NULL) {
5730		struct mbuf *n;
5731
5732		n = m->m_next;
5733		m->m_next = NULL;
5734
5735		/* Does the data cross one or more page boundaries? */
5736		if (MBUF_MULTIPAGES(m)) {
5737			struct mbuf *last;
5738			if ((m = m_expand(m, &last)) == NULL) {
5739				m_freem(n);
5740				m_freem(top);
5741				top = NULL;
5742				break;
5743			}
5744			*nm = m;
5745			nm = &last->m_next;
5746			expanded = TRUE;
5747		} else {
5748			*nm = m;
5749			nm = &m->m_next;
5750		}
5751		m = n;
5752	}
5753	if (expanded)
5754		atomic_add_32(&mb_normalized, 1);
5755	return (top);
5756}
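
#if 0
/*
 * Illustrative sketch (hypothetical helper; for illustration only):
 * make sure no mbuf's data spans a page boundary before handing the
 * chain to code that maps one physical page per DMA segment (see
 * mcl_to_paddr() above).  On allocation failure m_normalize() frees
 * the chain and returns NULL, so the caller must not touch "m" again.
 */
static struct mbuf *
example_prepare_for_dma(struct mbuf *m)
{
	return (m_normalize(m));
}
#endif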
5757
5758/*
5759 * Append the specified data to the indicated mbuf chain,
 * extending the mbuf chain if the new data does not fit in
5761 * existing space.
5762 *
5763 * Return 1 if able to complete the job; otherwise 0.
5764 */
5765int
5766m_append(struct mbuf *m0, int len, caddr_t cp)
5767{
5768	struct mbuf *m, *n;
5769	int remainder, space;
5770
5771	for (m = m0; m->m_next != NULL; m = m->m_next)
5772		;
5773	remainder = len;
5774	space = M_TRAILINGSPACE(m);
5775	if (space > 0) {
5776		/*
5777		 * Copy into available space.
5778		 */
5779		if (space > remainder)
5780			space = remainder;
5781		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
5782		m->m_len += space;
5783		cp += space, remainder -= space;
5784	}
5785	while (remainder > 0) {
5786		/*
5787		 * Allocate a new mbuf; could check space
5788		 * and allocate a cluster instead.
5789		 */
5790		n = m_get(M_WAITOK, m->m_type);
5791		if (n == NULL)
5792			break;
5793		n->m_len = min(MLEN, remainder);
5794		bcopy(cp, mtod(n, caddr_t), n->m_len);
5795		cp += n->m_len;
5796		remainder -= n->m_len;
5797		m->m_next = n;
5798		m = n;
5799	}
5800	if (m0->m_flags & M_PKTHDR)
5801		m0->m_pkthdr.len += len - remainder;
5802	return (remainder == 0);
5803}
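
#if 0
/*
 * Illustrative sketch (hypothetical helper; names are examples only):
 * append a small trailer to a packet.  m_append() copies into trailing
 * space when it can and allocates extra mbufs otherwise, returning 1 on
 * success and 0 on allocation failure.
 */
static int
example_add_trailer(struct mbuf *m0, caddr_t trailer, int len)
{
	if (m_append(m0, len, trailer) == 0)
		return (ENOBUFS);
	return (0);
}
#endif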
5804
5805struct mbuf *
5806m_last(struct mbuf *m)
5807{
5808	while (m->m_next != NULL)
5809		m = m->m_next;
5810	return (m);
5811}
5812
5813unsigned int
5814m_fixhdr(struct mbuf *m0)
5815{
5816	u_int len;
5817
5818	VERIFY(m0->m_flags & M_PKTHDR);
5819
5820	len = m_length2(m0, NULL);
5821	m0->m_pkthdr.len = len;
5822	return (len);
5823}
5824
5825unsigned int
5826m_length2(struct mbuf *m0, struct mbuf **last)
5827{
5828	struct mbuf *m;
5829	u_int len;
5830
5831	len = 0;
5832	for (m = m0; m != NULL; m = m->m_next) {
5833		len += m->m_len;
5834		if (m->m_next == NULL)
5835			break;
5836	}
5837	if (last != NULL)
5838		*last = m;
5839	return (len);
5840}
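
#if 0
/*
 * Illustrative sketch (hypothetical helper; for illustration only):
 * after splicing extra mbufs onto a packet by hand (i.e. by touching
 * m_next directly), use m_length2() to find the tail and m_fixhdr() to
 * resynchronize m_pkthdr.len with the sum of the per-mbuf lengths.
 * Assumes "m0" carries M_PKTHDR.
 */
static void
example_splice_and_resync(struct mbuf *m0, struct mbuf *extra)
{
	struct mbuf *last;

	(void) m_length2(m0, &last);	/* locate the last mbuf */
	last->m_next = extra;		/* splice in the new mbufs */
	(void) m_fixhdr(m0);		/* recompute m_pkthdr.len */
}
#endif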
5841
5842/*
 * Defragment an mbuf chain, returning the shortest possible chain of mbufs
 * and clusters.  If allocation fails and this cannot be completed, NULL will
 * be returned, but the passed-in chain will be unchanged.  Upon success,
5846 * the original chain will be freed, and the new chain will be returned.
5847 *
5848 * If a non-packet header is passed in, the original mbuf (chain?) will
5849 * be returned unharmed.
5850 *
 * If offset is specified, the first mbuf in the chain will have a leading
5852 * space of the amount stated by the "off" parameter.
5853 *
5854 * This routine requires that the m_pkthdr.header field of the original
5855 * mbuf chain is cleared by the caller.
5856 */
5857struct mbuf *
5858m_defrag_offset(struct mbuf *m0, u_int32_t off, int how)
5859{
5860	struct mbuf *m_new = NULL, *m_final = NULL;
5861	int progress = 0, length, pktlen;
5862
5863	if (!(m0->m_flags & M_PKTHDR))
5864		return (m0);
5865
5866	VERIFY(off < MHLEN);
5867	m_fixhdr(m0); /* Needed sanity check */
5868
5869	pktlen = m0->m_pkthdr.len + off;
5870	if (pktlen > MHLEN)
5871		m_final = m_getcl(how, MT_DATA, M_PKTHDR);
5872	else
5873		m_final = m_gethdr(how, MT_DATA);
5874
5875	if (m_final == NULL)
5876		goto nospace;
5877
5878	if (off > 0) {
5879		pktlen -= off;
5880		m_final->m_data += off;
5881	}
5882
5883	/*
5884	 * Caller must have handled the contents pointed to by this
5885	 * pointer before coming here, as otherwise it will point to
5886	 * the original mbuf which will get freed upon success.
5887	 */
5888	VERIFY(m0->m_pkthdr.pkt_hdr == NULL);
5889
5890	if (m_dup_pkthdr(m_final, m0, how) == 0)
5891		goto nospace;
5892
5893	m_new = m_final;
5894
5895	while (progress < pktlen) {
5896		length = pktlen - progress;
5897		if (length > MCLBYTES)
5898			length = MCLBYTES;
5899		length -= ((m_new == m_final) ? off : 0);
5900
5901		if (m_new == NULL) {
5902			if (length > MLEN)
5903				m_new = m_getcl(how, MT_DATA, 0);
5904			else
5905				m_new = m_get(how, MT_DATA);
5906			if (m_new == NULL)
5907				goto nospace;
5908		}
5909
5910		m_copydata(m0, progress, length, mtod(m_new, caddr_t));
5911		progress += length;
5912		m_new->m_len = length;
5913		if (m_new != m_final)
5914			m_cat(m_final, m_new);
5915		m_new = NULL;
5916	}
5917	m_freem(m0);
5918	m0 = m_final;
5919	return (m0);
5920nospace:
5921	if (m_final)
5922		m_freem(m_final);
5923	return (NULL);
5924}
5925
5926struct mbuf *
5927m_defrag(struct mbuf *m0, int how)
5928{
5929	return (m_defrag_offset(m0, 0, how));
5930}
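
#if 0
/*
 * Illustrative sketch (hypothetical helper; names are examples only):
 * coalesce a long chain into the fewest possible mbufs/clusters while
 * reserving 16 bytes of leading space for a header to be prepended
 * later.  The caller must already have cleared m_pkthdr.pkt_hdr; on
 * success the original chain has been freed, on failure it is returned
 * unchanged.
 */
static struct mbuf *
example_coalesce(struct mbuf *m)
{
	struct mbuf *n;

	if ((n = m_defrag_offset(m, 16, M_DONTWAIT)) == NULL)
		return (m);	/* allocation failed; "m" still valid */
	return (n);		/* "m" has been freed; use "n" instead */
}
#endif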
5931
5932void
5933m_mchtype(struct mbuf *m, int t)
5934{
5935	mtype_stat_inc(t);
5936	mtype_stat_dec(m->m_type);
5937	(m)->m_type = t;
5938}
5939
5940void *
5941m_mtod(struct mbuf *m)
5942{
5943	return (MTOD(m, void *));
5944}
5945
5946struct mbuf *
5947m_dtom(void *x)
5948{
5949	return ((struct mbuf *)((uintptr_t)(x) & ~(MSIZE-1)));
5950}
5951
5952void
5953m_mcheck(struct mbuf *m)
5954{
5955	_MCHECK(m);
5956}
5957
5958/*
5959 * Return a pointer to mbuf/offset of location in mbuf chain.
5960 */
5961struct mbuf *
5962m_getptr(struct mbuf *m, int loc, int *off)
5963{
5964
5965	while (loc >= 0) {
5966		/* Normal end of search. */
5967		if (m->m_len > loc) {
5968			*off = loc;
5969			return (m);
5970		} else {
5971			loc -= m->m_len;
5972			if (m->m_next == NULL) {
5973				if (loc == 0) {
5974					/* Point at the end of valid data. */
5975					*off = m->m_len;
5976					return (m);
5977				}
5978				return (NULL);
5979			}
5980			m = m->m_next;
5981		}
5982	}
5983	return (NULL);
5984}
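
#if 0
/*
 * Illustrative sketch (hypothetical helper; for illustration only):
 * read the byte at position "loc" of a chain without pulling data up.
 * m_getptr() returns NULL when the location lies beyond the chain; when
 * loc equals the chain length it returns the last mbuf with *off equal
 * to m_len, which is also out of range for a read, hence the extra check.
 */
static int
example_peek_byte(struct mbuf *m0, int loc, u_int8_t *out)
{
	struct mbuf *m;
	int off;

	m = m_getptr(m0, loc, &off);
	if (m == NULL || off >= m->m_len)
		return (ERANGE);
	*out = *(mtod(m, u_int8_t *) + off);
	return (0);
}
#endif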
5985
5986/*
5987 * Inform the corresponding mcache(s) that there's a waiter below.
5988 */
5989static void
5990mbuf_waiter_inc(mbuf_class_t class, boolean_t comp)
5991{
5992	mcache_waiter_inc(m_cache(class));
5993	if (comp) {
5994		if (class == MC_CL) {
5995			mcache_waiter_inc(m_cache(MC_MBUF_CL));
5996		} else if (class == MC_BIGCL) {
5997			mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
5998		} else if (class == MC_16KCL) {
5999			mcache_waiter_inc(m_cache(MC_MBUF_16KCL));
6000		} else {
6001			mcache_waiter_inc(m_cache(MC_MBUF_CL));
6002			mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
6003		}
6004	}
6005}
6006
6007/*
6008 * Inform the corresponding mcache(s) that there's no more waiter below.
6009 */
6010static void
6011mbuf_waiter_dec(mbuf_class_t class, boolean_t comp)
6012{
6013	mcache_waiter_dec(m_cache(class));
6014	if (comp) {
6015		if (class == MC_CL) {
6016			mcache_waiter_dec(m_cache(MC_MBUF_CL));
6017		} else if (class == MC_BIGCL) {
6018			mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
6019		} else if (class == MC_16KCL) {
6020			mcache_waiter_dec(m_cache(MC_MBUF_16KCL));
6021		} else {
6022			mcache_waiter_dec(m_cache(MC_MBUF_CL));
6023			mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
6024		}
6025	}
6026}
6027
6028/*
6029 * Called during slab (blocking and non-blocking) allocation.  If there
6030 * is at least one waiter, and the time since the first waiter is blocked
 * is at least one waiter, and the time since the first waiter was blocked
6032 */
6033static void
6034mbuf_watchdog(void)
6035{
6036	struct timeval now;
6037	unsigned int since;
6038
6039	if (mb_waiters == 0 || !mb_watchdog)
6040		return;
6041
6042	microuptime(&now);
6043	since = now.tv_sec - mb_wdtstart.tv_sec;
6044	if (since >= MB_WDT_MAXTIME) {
6045		panic_plain("%s: %d waiters stuck for %u secs\n%s", __func__,
6046		    mb_waiters, since, mbuf_dump());
6047		/* NOTREACHED */
6048	}
6049}
6050
6051/*
6052 * Called during blocking allocation.  Returns TRUE if one or more objects
 * are available at the per-CPU cache layer and the allocation should be
 * retried at that level.
6055 */
6056static boolean_t
6057mbuf_sleep(mbuf_class_t class, unsigned int num, int wait)
6058{
6059	boolean_t mcache_retry = FALSE;
6060
6061	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
6062
6063	/* Check if there's anything at the cache layer */
6064	if (mbuf_cached_above(class, wait)) {
6065		mcache_retry = TRUE;
6066		goto done;
6067	}
6068
6069	/* Nothing?  Then try hard to get it from somewhere */
6070	m_reclaim(class, num, (wait & MCR_COMP));
6071
6072	/* We tried hard and got something? */
6073	if (m_infree(class) > 0) {
6074		mbstat.m_wait++;
6075		goto done;
6076	} else if (mbuf_cached_above(class, wait)) {
6077		mbstat.m_wait++;
6078		mcache_retry = TRUE;
6079		goto done;
6080	} else if (wait & MCR_TRYHARD) {
6081		mcache_retry = TRUE;
6082		goto done;
6083	}
6084
6085	/*
6086	 * There's really nothing for us right now; inform the
6087	 * cache(s) that there is a waiter below and go to sleep.
6088	 */
6089	mbuf_waiter_inc(class, (wait & MCR_COMP));
6090
6091	VERIFY(!(wait & MCR_NOSLEEP));
6092
6093	/*
6094	 * If this is the first waiter, arm the watchdog timer.  Otherwise
6095	 * check if we need to panic the system due to watchdog timeout.
6096	 */
6097	if (mb_waiters == 0)
6098		microuptime(&mb_wdtstart);
6099	else
6100		mbuf_watchdog();
6101
6102	mb_waiters++;
6103	(void) msleep(mb_waitchan, mbuf_mlock, (PZERO-1), m_cname(class), NULL);
6104
6105	/* We are now up; stop getting notified until next round */
6106	mbuf_waiter_dec(class, (wait & MCR_COMP));
6107
6108	/* We waited and got something */
6109	if (m_infree(class) > 0) {
6110		mbstat.m_wait++;
6111		goto done;
6112	} else if (mbuf_cached_above(class, wait)) {
6113		mbstat.m_wait++;
6114		mcache_retry = TRUE;
6115	}
6116done:
6117	return (mcache_retry);
6118}
6119
6120static void
6121mbuf_worker_thread(void)
6122{
6123	int mbuf_expand;
6124
6125	while (1) {
6126		lck_mtx_lock(mbuf_mlock);
6127
6128		mbuf_expand = 0;
6129		if (mbuf_expand_mcl) {
6130			int n;
6131
			/* Adjust to current number of clusters in use */
6133			n = mbuf_expand_mcl -
6134			    (m_total(MC_CL) - m_infree(MC_CL));
6135			if ((n + m_total(MC_CL)) > m_maxlimit(MC_CL))
6136				n = m_maxlimit(MC_CL) - m_total(MC_CL);
6137			mbuf_expand_mcl = 0;
6138
6139			if (n > 0 && freelist_populate(MC_CL, n, M_WAIT) > 0)
6140				mbuf_expand++;
6141		}
6142		if (mbuf_expand_big) {
6143			int n;
6144
			/* Adjust to current number of 4 KB clusters in use */
6146			n = mbuf_expand_big -
6147			    (m_total(MC_BIGCL) - m_infree(MC_BIGCL));
6148			if ((n + m_total(MC_BIGCL)) > m_maxlimit(MC_BIGCL))
6149				n = m_maxlimit(MC_BIGCL) - m_total(MC_BIGCL);
6150			mbuf_expand_big = 0;
6151
6152			if (n > 0 && freelist_populate(MC_BIGCL, n, M_WAIT) > 0)
6153				mbuf_expand++;
6154		}
6155		if (mbuf_expand_16k) {
6156			int n;
6157
			/* Adjust to current number of 16 KB clusters in use */
6159			n = mbuf_expand_16k -
6160			    (m_total(MC_16KCL) - m_infree(MC_16KCL));
6161			if ((n + m_total(MC_16KCL)) > m_maxlimit(MC_16KCL))
6162				n = m_maxlimit(MC_16KCL) - m_total(MC_16KCL);
6163			mbuf_expand_16k = 0;
6164
6165			if (n > 0)
6166				(void) freelist_populate(MC_16KCL, n, M_WAIT);
6167		}
6168
6169		/*
6170		 * Because we can run out of memory before filling the mbuf
		 * map, we should not allocate more clusters than there are
6172		 * mbufs -- otherwise we could have a large number of useless
6173		 * clusters allocated.
6174		 */
6175		if (mbuf_expand) {
6176			while (m_total(MC_MBUF) <
6177			    (m_total(MC_BIGCL) + m_total(MC_CL))) {
6178				if (freelist_populate(MC_MBUF, 1, M_WAIT) == 0)
6179					break;
6180			}
6181		}
6182
6183		lck_mtx_unlock(mbuf_mlock);
6184
6185		assert_wait(&mbuf_worker_run, THREAD_UNINT);
6186		(void) thread_block((thread_continue_t)mbuf_worker_thread);
6187	}
6188}
6189
6190static void
6191mbuf_worker_thread_init(void)
6192{
6193	mbuf_worker_ready++;
6194	mbuf_worker_thread();
6195}
6196
6197static mcl_slab_t *
6198slab_get(void *buf)
6199{
6200	mcl_slabg_t *slg;
6201	unsigned int ix, k;
6202
6203	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
6204
6205	VERIFY(MBUF_IN_MAP(buf));
6206	ix = ((char *)buf - (char *)mbutl) >> MBSHIFT;
6207	VERIFY(ix < maxslabgrp);
6208
6209	if ((slg = slabstbl[ix]) == NULL) {
6210		/*
6211		 * In the current implementation, we never shrink the memory
6212		 * pool (hence the cluster map); if we attempt to reallocate
6213		 * a cluster group when it's already allocated, panic since
		 * this is a sign of memory corruption (slabstbl[ix] got
6215		 * nullified).  This also means that there shouldn't be any
6216		 * hole in the kernel sub-map for the mbuf pool.
6217		 */
6218		++slabgrp;
6219		VERIFY(ix < slabgrp);
6220		/*
6221		 * Slabs expansion can only be done single threaded; when
6222		 * we get here, it must be as a result of m_clalloc() which
6223		 * is serialized and therefore mb_clalloc_busy must be set.
6224		 */
6225		VERIFY(mb_clalloc_busy);
6226		lck_mtx_unlock(mbuf_mlock);
6227
6228		/* This is a new buffer; create the slabs group for it */
6229		MALLOC(slg, mcl_slabg_t *, sizeof (*slg), M_TEMP,
6230		    M_WAITOK | M_ZERO);
6231		VERIFY(slg != NULL);
6232
6233		lck_mtx_lock(mbuf_mlock);
6234		/*
6235		 * No other thread could have gone into m_clalloc() after
6236		 * we dropped the lock above, so verify that it's true.
6237		 */
6238		VERIFY(mb_clalloc_busy);
6239
6240		slabstbl[ix] = slg;
6241
6242		/* Chain each slab in the group to its forward neighbor */
6243		for (k = 1; k < NSLABSPMB; k++)
6244			slg->slg_slab[k - 1].sl_next = &slg->slg_slab[k];
6245		VERIFY(slg->slg_slab[NSLABSPMB - 1].sl_next == NULL);
6246
6247		/* And chain the last slab in the previous group to this */
6248		if (ix > 0) {
6249			VERIFY(slabstbl[ix - 1]->
6250			    slg_slab[NSLABSPMB - 1].sl_next == NULL);
6251			slabstbl[ix - 1]->slg_slab[NSLABSPMB - 1].sl_next =
6252			    &slg->slg_slab[0];
6253		}
6254	}
6255
6256	ix = MTOBG(buf) % NSLABSPMB;
6257	VERIFY(ix < NSLABSPMB);
6258
6259	return (&slg->slg_slab[ix]);
6260}
6261
6262static void
6263slab_init(mcl_slab_t *sp, mbuf_class_t class, u_int32_t flags,
6264    void *base, void *head, unsigned int len, int refcnt, int chunks)
6265{
6266	sp->sl_class = class;
6267	sp->sl_flags = flags;
6268	sp->sl_base = base;
6269	sp->sl_head = head;
6270	sp->sl_len = len;
6271	sp->sl_refcnt = refcnt;
6272	sp->sl_chunks = chunks;
6273	slab_detach(sp);
6274}
6275
6276static void
6277slab_insert(mcl_slab_t *sp, mbuf_class_t class)
6278{
6279	VERIFY(slab_is_detached(sp));
6280	m_slab_cnt(class)++;
6281	TAILQ_INSERT_TAIL(&m_slablist(class), sp, sl_link);
6282	sp->sl_flags &= ~SLF_DETACHED;
6283	if (class == MC_16KCL) {
6284		int k;
6285		for (k = 1; k < NSLABSP16KB; k++) {
6286			sp = sp->sl_next;
6287			/* Next slab must already be present */
6288			VERIFY(sp != NULL);
6289			VERIFY(slab_is_detached(sp));
6290			sp->sl_flags &= ~SLF_DETACHED;
6291		}
6292	}
6293}
6294
6295static void
6296slab_remove(mcl_slab_t *sp, mbuf_class_t class)
6297{
6298	VERIFY(!slab_is_detached(sp));
6299	VERIFY(m_slab_cnt(class) > 0);
6300	m_slab_cnt(class)--;
6301	TAILQ_REMOVE(&m_slablist(class), sp, sl_link);
6302	slab_detach(sp);
6303	if (class == MC_16KCL) {
6304		int k;
6305		for (k = 1; k < NSLABSP16KB; k++) {
6306			sp = sp->sl_next;
6307			/* Next slab must already be present */
6308			VERIFY(sp != NULL);
6309			VERIFY(!slab_is_detached(sp));
6310			slab_detach(sp);
6311		}
6312	}
6313}
6314
6315static boolean_t
6316slab_inrange(mcl_slab_t *sp, void *buf)
6317{
6318	return ((uintptr_t)buf >= (uintptr_t)sp->sl_base &&
6319	    (uintptr_t)buf < ((uintptr_t)sp->sl_base + sp->sl_len));
6320}
6321
6322#undef panic
6323
6324static void
6325slab_nextptr_panic(mcl_slab_t *sp, void *addr)
6326{
6327	int i;
6328	unsigned int chunk_len = sp->sl_len / sp->sl_chunks;
6329	uintptr_t buf = (uintptr_t)sp->sl_base;
6330
6331	for (i = 0; i < sp->sl_chunks; i++, buf += chunk_len) {
6332		void *next = ((mcache_obj_t *)buf)->obj_next;
6333		if (next != addr)
6334			continue;
6335		if (!mclverify) {
6336			if (next != NULL && !MBUF_IN_MAP(next)) {
6337				mcache_t *cp = m_cache(sp->sl_class);
6338				panic("%s: %s buffer %p in slab %p modified "
6339				    "after free at offset 0: %p out of range "
6340				    "[%p-%p)\n", __func__, cp->mc_name,
6341				    (void *)buf, sp, next, mbutl, embutl);
6342				/* NOTREACHED */
6343			}
6344		} else {
6345			mcache_audit_t *mca = mcl_audit_buf2mca(sp->sl_class,
6346			    (mcache_obj_t *)buf);
6347			mcl_audit_verify_nextptr(next, mca);
6348		}
6349	}
6350}
6351
6352static void
6353slab_detach(mcl_slab_t *sp)
6354{
6355	sp->sl_link.tqe_next = (mcl_slab_t *)-1;
6356	sp->sl_link.tqe_prev = (mcl_slab_t **)-1;
6357	sp->sl_flags |= SLF_DETACHED;
6358}
6359
6360static boolean_t
6361slab_is_detached(mcl_slab_t *sp)
6362{
6363	return ((intptr_t)sp->sl_link.tqe_next == -1 &&
6364	    (intptr_t)sp->sl_link.tqe_prev == -1 &&
6365	    (sp->sl_flags & SLF_DETACHED));
6366}
6367
6368static void
6369mcl_audit_init(void *buf, mcache_audit_t **mca_list,
6370    mcache_obj_t **con_list, size_t con_size, unsigned int num)
6371{
6372	mcache_audit_t *mca, *mca_tail;
6373	mcache_obj_t *con = NULL;
6374	boolean_t save_contents = (con_list != NULL);
6375	unsigned int i, ix;
6376
6377	ASSERT(num <= NMBPBG);
6378	ASSERT(con_list == NULL || con_size != 0);
6379
6380	ix = MTOBG(buf);
6381	VERIFY(ix < maxclaudit);
6382
6383	/* Make sure we haven't been here before */
6384	for (i = 0; i < NMBPBG; i++)
6385		VERIFY(mclaudit[ix].cl_audit[i] == NULL);
6386
6387	mca = mca_tail = *mca_list;
6388	if (save_contents)
6389		con = *con_list;
6390
6391	for (i = 0; i < num; i++) {
6392		mcache_audit_t *next;
6393
6394		next = mca->mca_next;
6395		bzero(mca, sizeof (*mca));
6396		mca->mca_next = next;
6397		mclaudit[ix].cl_audit[i] = mca;
6398
6399		/* Attach the contents buffer if requested */
6400		if (save_contents) {
6401			mcl_saved_contents_t *msc =
6402			    (mcl_saved_contents_t *)(void *)con;
6403
6404			VERIFY(msc != NULL);
6405			VERIFY(IS_P2ALIGNED(msc, sizeof (u_int64_t)));
6406			VERIFY(con_size == sizeof (*msc));
6407			mca->mca_contents_size = con_size;
6408			mca->mca_contents = msc;
6409			con = con->obj_next;
6410			bzero(mca->mca_contents, mca->mca_contents_size);
6411		}
6412
6413		mca_tail = mca;
6414		mca = mca->mca_next;
6415	}
6416
6417	if (save_contents)
6418		*con_list = con;
6419
6420	*mca_list = mca_tail->mca_next;
6421	mca_tail->mca_next = NULL;
6422}
6423
6424/*
6425 * Given an address of a buffer (mbuf/2KB/4KB/16KB), return
6426 * the corresponding audit structure for that buffer.
6427 */
6428static mcache_audit_t *
6429mcl_audit_buf2mca(mbuf_class_t class, mcache_obj_t *o)
6430{
6431	mcache_audit_t *mca = NULL;
6432	int ix = MTOBG(o);
6433
6434	VERIFY(ix < maxclaudit);
6435	VERIFY(IS_P2ALIGNED(o, MIN(m_maxsize(class), NBPG)));
6436
6437	switch (class) {
6438	case MC_MBUF:
6439		/*
6440		 * For the mbuf case, find the index of the page
6441		 * used by the mbuf and use that index to locate the
6442		 * base address of the page.  Then find out the
6443		 * mbuf index relative to the page base and use
6444		 * it to locate the audit structure.
6445		 */
6446		VERIFY(MCLIDX(BGTOM(ix), o) < (int)NMBPBG);
6447		mca = mclaudit[ix].cl_audit[MCLIDX(BGTOM(ix), o)];
6448		break;
6449
6450	case MC_CL:
6451		/*
6452		 * Same thing as above, but for 2KB clusters in a page.
6453		 */
6454		VERIFY(CLBGIDX(BGTOM(ix), o) < (int)NCLPBG);
6455		mca = mclaudit[ix].cl_audit[CLBGIDX(BGTOM(ix), o)];
6456		break;
6457
6458	case MC_BIGCL:
6459	case MC_16KCL:
6460		/*
6461		 * Same as above, but only return the first element.
6462		 */
6463		mca = mclaudit[ix].cl_audit[0];
6464		break;
6465
6466	default:
6467		VERIFY(0);
6468		/* NOTREACHED */
6469	}
6470
6471	return (mca);
6472}
6473
6474static void
6475mcl_audit_mbuf(mcache_audit_t *mca, void *addr, boolean_t composite,
6476    boolean_t alloc)
6477{
6478	struct mbuf *m = addr;
6479	mcache_obj_t *next = ((mcache_obj_t *)m)->obj_next;
6480
6481	VERIFY(mca->mca_contents != NULL &&
6482	    mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
6483
6484	if (mclverify)
6485		mcl_audit_verify_nextptr(next, mca);
6486
6487	if (!alloc) {
6488		/* Save constructed mbuf fields */
6489		mcl_audit_save_mbuf(m, mca);
6490		if (mclverify) {
6491			mcache_set_pattern(MCACHE_FREE_PATTERN, m,
6492			    m_maxsize(MC_MBUF));
6493		}
6494		((mcache_obj_t *)m)->obj_next = next;
6495		return;
6496	}
6497
6498	/* Check if the buffer has been corrupted while in freelist */
6499	if (mclverify) {
6500		mcache_audit_free_verify_set(mca, addr, 0, m_maxsize(MC_MBUF));
6501	}
6502	/* Restore constructed mbuf fields */
6503	mcl_audit_restore_mbuf(m, mca, composite);
6504}
6505
6506static void
6507mcl_audit_restore_mbuf(struct mbuf *m, mcache_audit_t *mca, boolean_t composite)
6508{
6509	struct mbuf *ms = MCA_SAVED_MBUF_PTR(mca);
6510
6511	if (composite) {
6512		struct mbuf *next = m->m_next;
6513		VERIFY(ms->m_flags == M_EXT && MEXT_RFA(ms) != NULL &&
6514		    MBUF_IS_COMPOSITE(ms));
6515		VERIFY(mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
6516		/*
		 * We could have hand-picked the mbuf fields and restored
		 * them individually, but that would be a maintenance
6519		 * headache.  Instead, restore everything that was saved;
6520		 * the mbuf layer will recheck and reinitialize anyway.
6521		 */
6522		bcopy(ms, m, MCA_SAVED_MBUF_SIZE);
6523		m->m_next = next;
6524	} else {
6525		/*
6526		 * For a regular mbuf (no cluster attached) there's nothing
6527		 * to restore other than the type field, which is expected
6528		 * to be MT_FREE.
6529		 */
6530		m->m_type = ms->m_type;
6531	}
6532	_MCHECK(m);
6533}
6534
6535static void
6536mcl_audit_save_mbuf(struct mbuf *m, mcache_audit_t *mca)
6537{
6538	VERIFY(mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
6539	_MCHECK(m);
6540	bcopy(m, MCA_SAVED_MBUF_PTR(mca), MCA_SAVED_MBUF_SIZE);
6541}
6542
6543static void
6544mcl_audit_cluster(mcache_audit_t *mca, void *addr, size_t size, boolean_t alloc,
6545    boolean_t save_next)
6546{
6547	mcache_obj_t *next = ((mcache_obj_t *)addr)->obj_next;
6548
6549	if (!alloc) {
6550		if (mclverify) {
6551			mcache_set_pattern(MCACHE_FREE_PATTERN, addr, size);
6552		}
6553		if (save_next) {
6554			mcl_audit_verify_nextptr(next, mca);
6555			((mcache_obj_t *)addr)->obj_next = next;
6556		}
6557	} else if (mclverify) {
6558		/* Check if the buffer has been corrupted while in freelist */
6559		mcl_audit_verify_nextptr(next, mca);
6560		mcache_audit_free_verify_set(mca, addr, 0, size);
6561	}
6562}
6563
6564static void
6565mcl_audit_scratch(mcache_audit_t *mca)
6566{
6567	void *stack[MCACHE_STACK_DEPTH + 1];
6568	mcl_scratch_audit_t *msa;
6569	struct timeval now;
6570
6571	VERIFY(mca->mca_contents != NULL);
6572	msa = MCA_SAVED_SCRATCH_PTR(mca);
6573
6574	msa->msa_pthread = msa->msa_thread;
6575	msa->msa_thread = current_thread();
6576	bcopy(msa->msa_stack, msa->msa_pstack, sizeof (msa->msa_pstack));
6577	msa->msa_pdepth = msa->msa_depth;
6578	bzero(stack, sizeof (stack));
6579	msa->msa_depth = OSBacktrace(stack, MCACHE_STACK_DEPTH + 1) - 1;
	bcopy(&stack[1], msa->msa_stack, sizeof (msa->msa_stack));
6581
6582	msa->msa_ptstamp = msa->msa_tstamp;
6583	microuptime(&now);
	/* tstamp is in ms relative to mb_start */
6585	msa->msa_tstamp = ((now.tv_usec - mb_start.tv_usec) / 1000);
6586	if ((now.tv_sec - mb_start.tv_sec) > 0)
6587		msa->msa_tstamp += ((now.tv_sec - mb_start.tv_sec) * 1000);
6588}
6589
6590static void
6591mcl_audit_mcheck_panic(struct mbuf *m)
6592{
6593	mcache_audit_t *mca;
6594
6595	MRANGE(m);
6596	mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
6597
6598	panic("mcl_audit: freed mbuf %p with type 0x%x (instead of 0x%x)\n%s\n",
6599	    m, (u_int16_t)m->m_type, MT_FREE, mcache_dump_mca(mca));
6600	/* NOTREACHED */
6601}
6602
6603static void
6604mcl_audit_verify_nextptr(void *next, mcache_audit_t *mca)
6605{
6606	if (next != NULL && !MBUF_IN_MAP(next) &&
6607	    (next != (void *)MCACHE_FREE_PATTERN || !mclverify)) {
6608		panic("mcl_audit: buffer %p modified after free at offset 0: "
6609		    "%p out of range [%p-%p)\n%s\n",
6610		    mca->mca_addr, next, mbutl, embutl, mcache_dump_mca(mca));
6611		/* NOTREACHED */
6612	}
6613}
6614
6615/* This function turns on mbuf leak detection */
6616static void
6617mleak_activate(void)
6618{
6619	mleak_table.mleak_sample_factor = MLEAK_SAMPLE_FACTOR;
6620	PE_parse_boot_argn("mleak_sample_factor",
6621	    &mleak_table.mleak_sample_factor,
6622	    sizeof (mleak_table.mleak_sample_factor));
6623
6624	if (mleak_table.mleak_sample_factor == 0)
6625		mclfindleak = 0;
6626
6627	if (mclfindleak == 0)
6628		return;
6629
6630	vm_size_t alloc_size =
6631	    mleak_alloc_buckets * sizeof (struct mallocation);
6632	vm_size_t trace_size = mleak_trace_buckets * sizeof (struct mtrace);
6633
6634	MALLOC(mleak_allocations, struct mallocation *, alloc_size,
6635	    M_TEMP, M_WAITOK | M_ZERO);
6636	VERIFY(mleak_allocations != NULL);
6637
6638	MALLOC(mleak_traces, struct mtrace *, trace_size,
6639	    M_TEMP, M_WAITOK | M_ZERO);
6640	VERIFY(mleak_traces != NULL);
6641
6642	MALLOC(mleak_stat, mleak_stat_t *, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES),
6643	    M_TEMP, M_WAITOK | M_ZERO);
6644	VERIFY(mleak_stat != NULL);
6645	mleak_stat->ml_cnt = MLEAK_NUM_TRACES;
6646#ifdef __LP64__
6647	mleak_stat->ml_isaddr64 = 1;
6648#endif /* __LP64__ */
6649}
6650
6651static void
6652mleak_logger(u_int32_t num, mcache_obj_t *addr, boolean_t alloc)
6653{
6654	int temp;
6655
6656	if (mclfindleak == 0)
6657		return;
6658
6659	if (!alloc)
6660		return (mleak_free(addr));
6661
6662	temp = atomic_add_32_ov(&mleak_table.mleak_capture, 1);
6663
6664	if ((temp % mleak_table.mleak_sample_factor) == 0 && addr != NULL) {
6665		uintptr_t bt[MLEAK_STACK_DEPTH];
6666		int logged = fastbacktrace(bt, MLEAK_STACK_DEPTH);
6667		mleak_log(bt, addr, logged, num);
6668	}
6669}
6670
/*
 * This function records the allocation in the mleak_allocations table
 * and the backtrace in the mleak_traces table.  If the allocation slot
 * is already in use, the old allocation is replaced with the new one.
 * If the trace slot is already in use, return early (incrementing its
 * refcount when the recorded trace is the same).
 */
6677static boolean_t
6678mleak_log(uintptr_t *bt, mcache_obj_t *addr, uint32_t depth, int num)
6679{
6680	struct mallocation *allocation;
6681	struct mtrace *trace;
6682	uint32_t trace_index;
6683
6684	/* Quit if someone else modifying the tables */
6685	if (!lck_mtx_try_lock_spin(mleak_lock)) {
6686		mleak_table.total_conflicts++;
6687		return (FALSE);
6688	}
6689
6690	allocation = &mleak_allocations[hashaddr((uintptr_t)addr,
6691	    mleak_alloc_buckets)];
6692	trace_index = hashbacktrace(bt, depth, mleak_trace_buckets);
6693	trace = &mleak_traces[trace_index];
6694
6695	VERIFY(allocation <= &mleak_allocations[mleak_alloc_buckets - 1]);
6696	VERIFY(trace <= &mleak_traces[mleak_trace_buckets - 1]);
6697
6698	allocation->hitcount++;
6699	trace->hitcount++;
6700
6701	/*
6702	 * If the allocation bucket we want is occupied
6703	 * and the occupier has the same trace, just bail.
6704	 */
6705	if (allocation->element != NULL &&
6706	    trace_index == allocation->trace_index) {
6707		mleak_table.alloc_collisions++;
6708		lck_mtx_unlock(mleak_lock);
6709		return (TRUE);
6710	}
6711
	/*
	 * Step 1: Store the backtrace in the traces array;
	 * allocs == 0 means the trace bucket is free.
	 */
6716	if (trace->allocs > 0 &&
6717	    bcmp(trace->addr, bt, (depth * sizeof (uintptr_t))) != 0) {
6718		/* Different, unique trace, but the same hash! Bail out. */
6719		trace->collisions++;
6720		mleak_table.trace_collisions++;
6721		lck_mtx_unlock(mleak_lock);
6722		return (TRUE);
6723	} else if (trace->allocs > 0) {
6724		/* Same trace, already added, so increment refcount */
6725		trace->allocs++;
6726	} else {
6727		/* Found an unused trace bucket, so record the trace here */
6728		if (trace->depth != 0) {
			/* This slot was previously used but is free now */
6730			mleak_table.trace_overwrites++;
6731		}
6732		mleak_table.trace_recorded++;
6733		trace->allocs = 1;
6734		memcpy(trace->addr, bt, (depth * sizeof (uintptr_t)));
6735		trace->depth = depth;
6736		trace->collisions = 0;
6737	}
6738
6739	/* Step 2: Store the allocation record in the allocations array */
6740	if (allocation->element != NULL) {
6741		/*
6742		 * Replace an existing allocation.  No need to preserve
6743		 * because only a subset of the allocations are being
6744		 * recorded anyway.
6745		 */
6746		mleak_table.alloc_collisions++;
6747	} else if (allocation->trace_index != 0) {
6748		mleak_table.alloc_overwrites++;
6749	}
6750	allocation->element = addr;
6751	allocation->trace_index = trace_index;
6752	allocation->count = num;
6753	mleak_table.alloc_recorded++;
6754	mleak_table.outstanding_allocs++;
6755
6756	lck_mtx_unlock(mleak_lock);
6757	return (TRUE);
6758}
6759
6760static void
6761mleak_free(mcache_obj_t *addr)
6762{
6763	while (addr != NULL) {
6764		struct mallocation *allocation = &mleak_allocations
6765		    [hashaddr((uintptr_t)addr, mleak_alloc_buckets)];
6766
6767		if (allocation->element == addr &&
6768		    allocation->trace_index < mleak_trace_buckets) {
6769			lck_mtx_lock_spin(mleak_lock);
6770			if (allocation->element == addr &&
6771			    allocation->trace_index < mleak_trace_buckets) {
6772				struct mtrace *trace;
6773				trace = &mleak_traces[allocation->trace_index];
6774				/* allocs = 0 means trace bucket is unused */
6775				if (trace->allocs > 0)
6776					trace->allocs--;
6777				if (trace->allocs == 0)
6778					trace->depth = 0;
6779				/* NULL element means alloc bucket is unused */
6780				allocation->element = NULL;
6781				mleak_table.outstanding_allocs--;
6782			}
6783			lck_mtx_unlock(mleak_lock);
6784		}
6785		addr = addr->obj_next;
6786	}
6787}
6788
6789static void
mleak_sort_traces(void)
6791{
6792	int i, j, k;
6793	struct mtrace *swap;
6794
	for (i = 0; i < MLEAK_NUM_TRACES; i++)
6796		mleak_top_trace[i] = NULL;
6797
	for (i = 0, j = 0; j < MLEAK_NUM_TRACES &&
	    i < mleak_trace_buckets; i++) {
6800		if (mleak_traces[i].allocs <= 0)
6801			continue;
6802
6803		mleak_top_trace[j] = &mleak_traces[i];
6804		for (k = j; k > 0; k--) {
6805			if (mleak_top_trace[k]->allocs <=
6806			    mleak_top_trace[k-1]->allocs)
6807				break;
6808
6809			swap = mleak_top_trace[k-1];
6810			mleak_top_trace[k-1] = mleak_top_trace[k];
6811			mleak_top_trace[k] = swap;
6812		}
6813		j++;
6814	}
6815
6816	j--;
	for (; i < mleak_trace_buckets; i++) {
6818		if (mleak_traces[i].allocs <= mleak_top_trace[j]->allocs)
6819			continue;
6820
6821		mleak_top_trace[j] = &mleak_traces[i];
6822
6823		for (k = j; k > 0; k--) {
6824			if (mleak_top_trace[k]->allocs <=
6825			    mleak_top_trace[k-1]->allocs)
6826				break;
6827
6828			swap = mleak_top_trace[k-1];
6829			mleak_top_trace[k-1] = mleak_top_trace[k];
6830			mleak_top_trace[k] = swap;
6831		}
6832	}
6833}
6834
6835static void
mleak_update_stats(void)
6837{
6838	mleak_trace_stat_t *mltr;
6839	int i;
6840
6841	VERIFY(mleak_stat != NULL);
6842#ifdef __LP64__
6843	VERIFY(mleak_stat->ml_isaddr64);
6844#else
6845	VERIFY(!mleak_stat->ml_isaddr64);
6846#endif /* !__LP64__ */
6847	VERIFY(mleak_stat->ml_cnt == MLEAK_NUM_TRACES);
6848
6849	mleak_sort_traces();
6850
6851	mltr = &mleak_stat->ml_trace[0];
6852	bzero(mltr, sizeof (*mltr) * MLEAK_NUM_TRACES);
	for (i = 0; i < MLEAK_NUM_TRACES; i++) {
		int j;
6855
6856		if (mleak_top_trace[i] == NULL ||
6857		    mleak_top_trace[i]->allocs == 0)
6858			continue;
6859
6860		mltr->mltr_collisions	= mleak_top_trace[i]->collisions;
6861		mltr->mltr_hitcount	= mleak_top_trace[i]->hitcount;
6862		mltr->mltr_allocs	= mleak_top_trace[i]->allocs;
6863		mltr->mltr_depth	= mleak_top_trace[i]->depth;
6864
6865		VERIFY(mltr->mltr_depth <= MLEAK_STACK_DEPTH);
6866		for (j = 0; j < mltr->mltr_depth; j++)
6867			mltr->mltr_addr[j] = mleak_top_trace[i]->addr[j];
6868
6869		mltr++;
6870	}
6871}
6872
6873static struct mbtypes {
6874	int		mt_type;
6875	const char	*mt_name;
6876} mbtypes[] = {
6877	{ MT_DATA,	"data" },
6878	{ MT_OOBDATA,	"oob data" },
6879	{ MT_CONTROL,	"ancillary data" },
6880	{ MT_HEADER,	"packet headers" },
6881	{ MT_SOCKET,	"socket structures" },
6882	{ MT_PCB,	"protocol control blocks" },
6883	{ MT_RTABLE,	"routing table entries" },
6884	{ MT_HTABLE,	"IMP host table entries" },
6885	{ MT_ATABLE,	"address resolution tables" },
6886	{ MT_FTABLE,	"fragment reassembly queue headers" },
6887	{ MT_SONAME,	"socket names and addresses" },
6888	{ MT_SOOPTS,	"socket options" },
6889	{ MT_RIGHTS,	"access rights" },
6890	{ MT_IFADDR,	"interface addresses" },
6891	{ MT_TAG,	"packet tags" },
6892	{ 0,		NULL }
6893};
6894
6895#define	MBUF_DUMP_BUF_CHK() {	\
6896	clen -= k;		\
6897	if (clen < 1)		\
6898		goto done;	\
6899	c += k;			\
6900}
6901
6902static char *
6903mbuf_dump(void)
6904{
6905	unsigned long totmem = 0, totfree = 0, totmbufs, totused, totpct;
6906	u_int32_t m_mbufs = 0, m_clfree = 0, m_bigclfree = 0;
6907	u_int32_t m_mbufclfree = 0, m_mbufbigclfree = 0;
6908	u_int32_t m_16kclusters = 0, m_16kclfree = 0, m_mbuf16kclfree = 0;
6909	int nmbtypes = sizeof (mbstat.m_mtypes) / sizeof (short);
6910	uint8_t seen[256];
6911	struct mbtypes *mp;
6912	mb_class_stat_t *sp;
6913	mleak_trace_stat_t *mltr;
6914	char *c = mbuf_dump_buf;
6915	int i, k, clen = MBUF_DUMP_BUF_SIZE;
6916
6917	mbuf_dump_buf[0] = '\0';
6918
6919	/* synchronize all statistics in the mbuf table */
6920	mbuf_stat_sync();
6921	mbuf_mtypes_sync(TRUE);
6922
6923	sp = &mb_stat->mbs_class[0];
6924	for (i = 0; i < mb_stat->mbs_cnt; i++, sp++) {
6925		u_int32_t mem;
6926
6927		if (m_class(i) == MC_MBUF) {
6928			m_mbufs = sp->mbcl_active;
6929		} else if (m_class(i) == MC_CL) {
6930			m_clfree = sp->mbcl_total - sp->mbcl_active;
6931		} else if (m_class(i) == MC_BIGCL) {
6932			m_bigclfree = sp->mbcl_total - sp->mbcl_active;
6933		} else if (njcl > 0 && m_class(i) == MC_16KCL) {
6934			m_16kclfree = sp->mbcl_total - sp->mbcl_active;
6935			m_16kclusters = sp->mbcl_total;
6936		} else if (m_class(i) == MC_MBUF_CL) {
6937			m_mbufclfree = sp->mbcl_total - sp->mbcl_active;
6938		} else if (m_class(i) == MC_MBUF_BIGCL) {
6939			m_mbufbigclfree = sp->mbcl_total - sp->mbcl_active;
6940		} else if (njcl > 0 && m_class(i) == MC_MBUF_16KCL) {
6941			m_mbuf16kclfree = sp->mbcl_total - sp->mbcl_active;
6942		}
6943
6944		mem = sp->mbcl_ctotal * sp->mbcl_size;
6945		totmem += mem;
6946		totfree += (sp->mbcl_mc_cached + sp->mbcl_infree) *
6947		    sp->mbcl_size;
	}
6950
6951	/* adjust free counts to include composite caches */
6952	m_clfree += m_mbufclfree;
6953	m_bigclfree += m_mbufbigclfree;
6954	m_16kclfree += m_mbuf16kclfree;
6955
6956	totmbufs = 0;
6957	for (mp = mbtypes; mp->mt_name != NULL; mp++)
6958		totmbufs += mbstat.m_mtypes[mp->mt_type];
6959	if (totmbufs > m_mbufs)
6960		totmbufs = m_mbufs;
6961	k = snprintf(c, clen, "%lu/%u mbufs in use:\n", totmbufs, m_mbufs);
6962	MBUF_DUMP_BUF_CHK();
6963
6964	bzero(&seen, sizeof (seen));
6965	for (mp = mbtypes; mp->mt_name != NULL; mp++) {
6966		if (mbstat.m_mtypes[mp->mt_type] != 0) {
6967			seen[mp->mt_type] = 1;
6968			k = snprintf(c, clen, "\t%u mbufs allocated to %s\n",
6969			    mbstat.m_mtypes[mp->mt_type], mp->mt_name);
6970			MBUF_DUMP_BUF_CHK();
6971		}
6972	}
6973	seen[MT_FREE] = 1;
6974	for (i = 0; i < nmbtypes; i++)
6975		if (!seen[i] && mbstat.m_mtypes[i] != 0) {
6976			k = snprintf(c, clen, "\t%u mbufs allocated to "
6977			    "<mbuf type %d>\n", mbstat.m_mtypes[i], i);
6978			MBUF_DUMP_BUF_CHK();
6979		}
6980	if ((m_mbufs - totmbufs) > 0) {
6981		k = snprintf(c, clen, "\t%lu mbufs allocated to caches\n",
6982		    m_mbufs - totmbufs);
6983		MBUF_DUMP_BUF_CHK();
6984	}
6985	k = snprintf(c, clen, "%u/%u mbuf 2KB clusters in use\n"
6986	    "%u/%u mbuf 4KB clusters in use\n",
6987	    (unsigned int)(mbstat.m_clusters - m_clfree),
6988	    (unsigned int)mbstat.m_clusters,
6989	    (unsigned int)(mbstat.m_bigclusters - m_bigclfree),
6990	    (unsigned int)mbstat.m_bigclusters);
6991	MBUF_DUMP_BUF_CHK();
6992
6993	if (njcl > 0) {
6994		k = snprintf(c, clen, "%u/%u mbuf %uKB clusters in use\n",
6995		    m_16kclusters - m_16kclfree, m_16kclusters,
6996		    njclbytes / 1024);
6997		MBUF_DUMP_BUF_CHK();
6998	}
6999	totused = totmem - totfree;
7000	if (totmem == 0) {
7001		totpct = 0;
7002	} else if (totused < (ULONG_MAX / 100)) {
7003		totpct = (totused * 100) / totmem;
7004	} else {
7005		u_long totmem1 = totmem / 100;
7006		u_long totused1 = totused / 100;
7007		totpct = (totused1 * 100) / totmem1;
7008	}
7009	k = snprintf(c, clen, "%lu KB allocated to network (approx. %lu%% "
7010	    "in use)\n", totmem / 1024, totpct);
7011	MBUF_DUMP_BUF_CHK();
7012
7013	/* mbuf leak detection statistics */
7014	mleak_update_stats();
7015
7016	k = snprintf(c, clen, "\nmbuf leak detection table:\n");
7017	MBUF_DUMP_BUF_CHK();
7018	k = snprintf(c, clen, "\ttotal captured: %u (one per %u)\n",
7019	    mleak_table.mleak_capture / mleak_table.mleak_sample_factor,
7020	    mleak_table.mleak_sample_factor);
7021	MBUF_DUMP_BUF_CHK();
7022	k = snprintf(c, clen, "\ttotal allocs outstanding: %llu\n",
7023	    mleak_table.outstanding_allocs);
7024	MBUF_DUMP_BUF_CHK();
7025	k = snprintf(c, clen, "\tnew hash recorded: %llu allocs, %llu traces\n",
7026	    mleak_table.alloc_recorded, mleak_table.trace_recorded);
7027	MBUF_DUMP_BUF_CHK();
7028	k = snprintf(c, clen, "\thash collisions: %llu allocs, %llu traces\n",
7029	    mleak_table.alloc_collisions, mleak_table.trace_collisions);
7030	MBUF_DUMP_BUF_CHK();
7031	k = snprintf(c, clen, "\toverwrites: %llu allocs, %llu traces\n",
7032	    mleak_table.alloc_overwrites, mleak_table.trace_overwrites);
7033	MBUF_DUMP_BUF_CHK();
7034	k = snprintf(c, clen, "\tlock conflicts: %llu\n\n",
7035	    mleak_table.total_conflicts);
7036	MBUF_DUMP_BUF_CHK();
7037
7038	k = snprintf(c, clen, "top %d outstanding traces:\n",
7039	    mleak_stat->ml_cnt);
7040	MBUF_DUMP_BUF_CHK();
7041	for (i = 0; i < mleak_stat->ml_cnt; i++) {
7042		mltr = &mleak_stat->ml_trace[i];
7043		k = snprintf(c, clen, "[%d] %llu outstanding alloc(s), "
7044		    "%llu hit(s), %llu collision(s)\n", (i + 1),
7045		    mltr->mltr_allocs, mltr->mltr_hitcount,
7046		    mltr->mltr_collisions);
7047		MBUF_DUMP_BUF_CHK();
7048	}
7049
7050	if (mleak_stat->ml_isaddr64)
7051		k = snprintf(c, clen, MB_LEAK_HDR_64);
7052	else
7053		k = snprintf(c, clen, MB_LEAK_HDR_32);
7054	MBUF_DUMP_BUF_CHK();
7055
7056	for (i = 0; i < MLEAK_STACK_DEPTH; i++) {
7057		int j;
7058		k = snprintf(c, clen, "%2d: ", (i + 1));
7059		MBUF_DUMP_BUF_CHK();
7060		for (j = 0; j < mleak_stat->ml_cnt; j++) {
7061			mltr = &mleak_stat->ml_trace[j];
7062			if (i < mltr->mltr_depth) {
7063				if (mleak_stat->ml_isaddr64) {
7064					k = snprintf(c, clen, "0x%0llx  ",
7065					    mltr->mltr_addr[i]);
7066				} else {
7067					k = snprintf(c, clen,
7068					    "0x%08x  ",
7069					    (u_int32_t)mltr->mltr_addr[i]);
7070				}
7071			} else {
7072				if (mleak_stat->ml_isaddr64)
7073					k = snprintf(c, clen,
7074					    MB_LEAK_SPACING_64);
7075				else
7076					k = snprintf(c, clen,
7077					    MB_LEAK_SPACING_32);
7078			}
7079			MBUF_DUMP_BUF_CHK();
7080		}
7081		k = snprintf(c, clen, "\n");
7082		MBUF_DUMP_BUF_CHK();
7083	}
7084done:
7085	return (mbuf_dump_buf);
7086}
7087
7088#undef MBUF_DUMP_BUF_CHK
7089
/*
 * Convert between a regular and a packet header mbuf.  The caller indicates
 * the desired form via "hdr"; this routine does the work, including setting
 * or clearing M_PKTHDR.
 */
7094int
7095m_reinit(struct mbuf *m, int hdr)
7096{
7097	int ret = 0;
7098
7099	if (hdr) {
7100		VERIFY(!(m->m_flags & M_PKTHDR));
7101		if (!(m->m_flags & M_EXT) &&
7102		    (m->m_data != m->m_dat || m->m_len > 0)) {
7103			/*
7104			 * If there's no external cluster attached and the
7105			 * mbuf appears to contain user data, we cannot
7106			 * safely convert this to a packet header mbuf,
7107			 * as the packet header structure might overlap
7108			 * with the data.
7109			 */
7110			printf("%s: cannot set M_PKTHDR on altered mbuf %p, "
7111			    "m_data %p (expected %p), m_len %d (expected 0)\n",
7112			    __func__, m, m->m_data, m->m_dat, m->m_len);
7113			ret = EBUSY;
7114		} else {
7115			VERIFY((m->m_flags & M_EXT) || m->m_data == m->m_dat);
7116			m->m_flags |= M_PKTHDR;
7117			MBUF_INIT_PKTHDR(m);
7118		}
7119	} else {
7120		/* Check for scratch area overflow */
7121		m_redzone_verify(m);
7122		/* Free the aux data and tags if there is any */
7123		m_tag_delete_chain(m, NULL);
7124		m->m_flags &= ~M_PKTHDR;
7125	}
7126
7127	return (ret);
7128}
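
#if 0
/*
 * Illustrative sketch (hypothetical helper; for illustration only):
 * promote a freshly allocated plain mbuf into a packet header mbuf.
 * m_reinit() refuses with EBUSY when the mbuf already appears to hold
 * data in its internal buffer, since the packet header would overlap
 * that data.
 */
static int
example_make_pkthdr(struct mbuf *m)
{
	if (m->m_flags & M_PKTHDR)
		return (0);		/* nothing to do */
	return (m_reinit(m, 1));
}
#endif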
7129
7130void
7131m_scratch_init(struct mbuf *m)
7132{
7133	VERIFY(m->m_flags & M_PKTHDR);
7134
7135	bzero(&m->m_pkthdr.pkt_mpriv, sizeof (m->m_pkthdr.pkt_mpriv));
7136}
7137
7138u_int32_t
7139m_scratch_get(struct mbuf *m, u_int8_t **p)
7140{
7141	VERIFY(m->m_flags & M_PKTHDR);
7142
7143	if (mcltrace) {
7144		mcache_audit_t *mca;
7145
7146		lck_mtx_lock(mbuf_mlock);
7147		mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
7148		if (mca->mca_uflags & MB_SCVALID)
7149			mcl_audit_scratch(mca);
7150		lck_mtx_unlock(mbuf_mlock);
7151	}
7152
7153	*p = (u_int8_t *)&m->m_pkthdr.pkt_mpriv;
7154	return (sizeof (m->m_pkthdr.pkt_mpriv));
7155}
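
#if 0
/*
 * Illustrative sketch (hypothetical helper; names are examples only):
 * stash a small per-packet cookie in the packet header scratch area.
 * m_scratch_get() returns the size of the area, which bounds how much
 * the caller may store there.
 */
static void
example_stash_cookie(struct mbuf *m, u_int32_t cookie)
{
	u_int8_t *scratch;
	u_int32_t scratch_len;

	m_scratch_init(m);		/* zero the scratch area first */
	scratch_len = m_scratch_get(m, &scratch);
	VERIFY(scratch_len >= sizeof (cookie));
	bcopy(&cookie, scratch, sizeof (cookie));
}
#endif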
7156
7157static void
7158m_redzone_init(struct mbuf *m)
7159{
7160	VERIFY(m->m_flags & M_PKTHDR);
7161	/*
	 * Each mbuf has a unique red zone pattern, which is the XOR
7163	 * of the red zone cookie and the address of the mbuf.
7164	 */
7165	m->m_pkthdr.redzone = ((u_int32_t)(uintptr_t)m) ^ mb_redzone_cookie;
7166}
7167
7168static void
7169m_redzone_verify(struct mbuf *m)
7170{
7171	u_int32_t mb_redzone;
7172
7173	VERIFY(m->m_flags & M_PKTHDR);
7174
7175	mb_redzone = ((u_int32_t)(uintptr_t)m) ^ mb_redzone_cookie;
7176	if (m->m_pkthdr.redzone != mb_redzone) {
7177		panic("mbuf %p redzone violation with value 0x%x "
7178		    "(instead of 0x%x, using cookie 0x%x)\n",
7179		    m, m->m_pkthdr.redzone, mb_redzone, mb_redzone_cookie);
7180		/* NOTREACHED */
7181	}
7182}
7183
7184SYSCTL_DECL(_kern_ipc);
7185SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat,
7186    CTLFLAG_RD | CTLFLAG_LOCKED,
7187    0, 0, mbstat_sysctl, "S,mbstat", "");
7188SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_stat,
7189    CTLFLAG_RD | CTLFLAG_LOCKED,
7190    0, 0, mb_stat_sysctl, "S,mb_stat", "");
7191SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_top_trace,
7192    CTLFLAG_RD | CTLFLAG_LOCKED,
7193    0, 0, mleak_top_trace_sysctl, "S,mb_top_trace", "");
7194SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_table,
7195    CTLFLAG_RD | CTLFLAG_LOCKED,
7196    0, 0, mleak_table_sysctl, "S,mleak_table", "");
7197SYSCTL_INT(_kern_ipc, OID_AUTO, mleak_sample_factor,
7198    CTLFLAG_RW | CTLFLAG_LOCKED, &mleak_table.mleak_sample_factor, 0, "");
7199SYSCTL_INT(_kern_ipc, OID_AUTO, mb_normalized,
7200    CTLFLAG_RD | CTLFLAG_LOCKED, &mb_normalized, 0, "");
7201SYSCTL_INT(_kern_ipc, OID_AUTO, mb_watchdog,
7202    CTLFLAG_RW | CTLFLAG_LOCKED, &mb_watchdog, 0, "");
7203