1/*
2 * Copyright (c) 1998-2014 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29/*
30 * Copyright (c) 1982, 1986, 1988, 1991, 1993
31 *	The Regents of the University of California.  All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 *    notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 *    notice, this list of conditions and the following disclaimer in the
40 *    documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 *    must display the following acknowledgement:
43 *	This product includes software developed by the University of
44 *	California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 *    may be used to endorse or promote products derived from this software
47 *    without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 *	@(#)uipc_mbuf.c	8.2 (Berkeley) 1/4/94
62 */
63/*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections.  This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70#include <sys/param.h>
71#include <sys/systm.h>
72#include <sys/malloc.h>
73#include <sys/mbuf.h>
74#include <sys/kernel.h>
75#include <sys/sysctl.h>
76#include <sys/syslog.h>
77#include <sys/protosw.h>
78#include <sys/domain.h>
79#include <sys/queue.h>
80#include <sys/proc.h>
81
82#include <dev/random/randomdev.h>
83
84#include <kern/kern_types.h>
85#include <kern/simple_lock.h>
86#include <kern/queue.h>
87#include <kern/sched_prim.h>
88#include <kern/cpu_number.h>
89#include <kern/zalloc.h>
90
91#include <libkern/OSAtomic.h>
92#include <libkern/OSDebug.h>
93#include <libkern/libkern.h>
94
95#include <IOKit/IOMapper.h>
96
97#include <machine/limits.h>
98#include <machine/machine_routines.h>
99
100#if CONFIG_MACF_NET
101#include <security/mac_framework.h>
102#endif /* MAC_NET */
103
104#include <sys/mcache.h>
105#include <net/ntstat.h>
106
107/*
108 * MBUF IMPLEMENTATION NOTES.
109 *
110 * There is a total of 5 per-CPU caches:
111 *
112 * MC_MBUF:
113 *	This is a cache of rudimentary objects of MSIZE in size; each
114 *	object represents an mbuf structure.  This cache preserves only
115 *	the m_type field of the mbuf during its transactions.
116 *
117 * MC_CL:
118 *	This is a cache of rudimentary objects of MCLBYTES in size; each
119 *	object represents a mcluster structure.  This cache does not
120 *	preserve the contents of the objects during its transactions.
121 *
122 * MC_BIGCL:
123 *	This is a cache of rudimentary objects of MBIGCLBYTES in size; each
124 *	object represents a mbigcluster structure.  This cache does not
125 *	preserve the contents of the objects during its transaction.
126 *
127 * MC_MBUF_CL:
128 *	This is a cache of mbufs each having a cluster attached to it.
129 *	It is backed by MC_MBUF and MC_CL rudimentary caches.  Several
130 *	fields of the mbuf related to the external cluster are preserved
131 *	during transactions.
132 *
133 * MC_MBUF_BIGCL:
134 *	This is a cache of mbufs each having a big cluster attached to it.
135 *	It is backed by MC_MBUF and MC_BIGCL rudimentary caches.  Several
136 *	fields of the mbuf related to the external cluster are preserved
137 *	during transactions.
138 *
139 * OBJECT ALLOCATION:
140 *
141 * Allocation requests are handled first at the per-CPU (mcache) layer
142 * before falling back to the slab layer.  Performance is optimal when
143 * the request is satisfied at the CPU layer because global data/lock
144 * never gets accessed.  When the slab layer is entered for allocation,
145 * the slab freelist will be checked first for available objects before
146 * the VM backing store is invoked.  Slab layer operations are serialized
147 * for all of the caches as the mbuf global lock is held most of the time.
148 * Allocation paths are different depending on the class of objects:
149 *
150 * a. Rudimentary object:
151 *
152 *	{ m_get_common(), m_clattach(), m_mclget(),
153 *	  m_mclalloc(), m_bigalloc(), m_copym_with_hdrs(),
154 *	  composite object allocation }
155 *			|	^
156 *			|	|
157 *			|	+-----------------------+
158 *			v				|
159 *	   mcache_alloc/mcache_alloc_ext()	mbuf_slab_audit()
160 *			|				^
161 *			v				|
162 *		   [CPU cache] ------->	(found?) -------+
163 *			|				|
164 *			v				|
165 *		 mbuf_slab_alloc()			|
166 *			|				|
167 *			v				|
168 *	+---------> [freelist] ------->	(found?) -------+
169 *	|		|
170 *	|		v
171 *	|	    m_clalloc()
172 *	|		|
173 *	|		v
174 *	+---<<---- kmem_mb_alloc()
175 *
176 * b. Composite object:
177 *
178 *	{ m_getpackets_internal(), m_allocpacket_internal() }
179 *			|	^
180 *			|	|
181 *			|	+------	(done) ---------+
182 *			v				|
183 *	   mcache_alloc/mcache_alloc_ext()	mbuf_cslab_audit()
184 *			|				^
185 *			v				|
186 *		   [CPU cache] ------->	(found?) -------+
187 *			|				|
188 *			v				|
189 *		 mbuf_cslab_alloc()			|
190 *			|				|
191 *			v				|
192 *		    [freelist] ------->	(found?) -------+
193 *			|				|
194 *			v				|
195 *		(rudimentary object)			|
196 *	   mcache_alloc/mcache_alloc_ext() ------>>-----+
197 *
198 * Auditing notes: If auditing is enabled, buffers will be subjected to
199 * integrity checks by the audit routine.  This is done by verifying their
200 * contents against DEADBEEF (free) pattern before returning them to caller.
201 * As part of this step, the routine will also record the transaction and
202 * pattern-fill the buffers with BADDCAFE (uninitialized) pattern.  It will
203 * also restore any constructed data structure fields if necessary.
204 *
205 * OBJECT DEALLOCATION:
206 *
207 * Freeing an object simply involves placing it into the CPU cache; this
208 * pollutes the cache to benefit subsequent allocations.  The slab layer
209 * will only be entered if the object is to be purged out of the cache.
210 * During normal operations, this happens only when the CPU layer resizes
211 * its bucket while it's adjusting to the allocation load.  Deallocation
212 * paths are different depending on the class of objects:
213 *
214 * a. Rudimentary object:
215 *
216 *	{ m_free(), m_freem_list(), composite object deallocation }
217 *			|	^
218 *			|	|
219 *			|	+------	(done) ---------+
220 *			v				|
221 *	   mcache_free/mcache_free_ext()		|
222 *			|				|
223 *			v				|
224 *		mbuf_slab_audit()			|
225 *			|				|
226 *			v				|
227 *		   [CPU cache] ---> (not purging?) -----+
228 *			|				|
229 *			v				|
230 *		 mbuf_slab_free()			|
231 *			|				|
232 *			v				|
233 *		    [freelist] ----------->>------------+
234 *	 (objects never get purged to VM)
235 *
236 * b. Composite object:
237 *
238 *	{ m_free(), m_freem_list() }
239 *			|	^
240 *			|	|
241 *			|	+------	(done) ---------+
242 *			v				|
243 *	   mcache_free/mcache_free_ext()		|
244 *			|				|
245 *			v				|
246 *		mbuf_cslab_audit()			|
247 *			|				|
248 *			v				|
249 *		   [CPU cache] ---> (not purging?) -----+
250 *			|				|
251 *			v				|
252 *		 mbuf_cslab_free()			|
253 *			|				|
254 *			v				|
255 *		    [freelist] ---> (not purging?) -----+
256 *			|				|
257 *			v				|
258 *		(rudimentary object)			|
259 *	   mcache_free/mcache_free_ext() ------->>------+
260 *
261 * Auditing notes: If auditing is enabled, the audit routine will save
262 * any constructed data structure fields (if necessary) before filling the
263 * contents of the buffers with DEADBEEF (free) pattern and recording the
264 * transaction.  Buffers that are freed (whether at CPU or slab layer) are
265 * expected to contain the free pattern.
266 *
267 * DEBUGGING:
268 *
269 * Debugging can be enabled by adding "mbuf_debug=0x3" to boot-args; this
270 * translates to the mcache flags (MCF_VERIFY | MCF_AUDIT).  Additionally,
271 * the CPU layer cache can be disabled by setting the MCF_NOCPUCACHE flag,
272 * i.e. modify the boot argument parameter to "mbuf_debug=0x13".  Leak
273 * detection may also be disabled by setting the MCF_NOLEAKLOG flag, e.g.
274 * "mbuf_debug=0x113".  Note that debugging consumes more CPU and memory.
275 *
276 * Each object is associated with exactly one mcache_audit_t structure that
277 * contains the information related to its last buffer transaction.  Given
278 * an address of an object, the audit structure can be retrieved by finding
279 * the position of the object relevant to the base address of the cluster:
280 *
281 *	+------------+			+=============+
282 *	| mbuf addr  |			| mclaudit[i] |
283 *	+------------+			+=============+
284 *	      |				| cl_audit[0] |
285 *	i = MTOBG(addr)			+-------------+
286 *	      |			+----->	| cl_audit[1] | -----> mcache_audit_t
287 *	b = BGTOM(i)		|	+-------------+
288 *	      |			|	|     ...     |
289 *	x = MCLIDX(b, addr)	|	+-------------+
290 *	      |			|	| cl_audit[7] |
291 *	      +-----------------+	+-------------+
292 *		 (e.g. x == 1)
293 *
294 * The mclaudit[] array is allocated at initialization time, but its contents
295 * get populated when the corresponding cluster is created.  Because a page
296 * can be turned into NMBPBG number of mbufs, we preserve enough space for the
297 * mbufs so that there is a 1-to-1 mapping between them.  A page that never
298 * gets (or has not yet) turned into mbufs will use only cl_audit[0] with the
299 * remaining entries unused.  For 16KB cluster, only one entry from the first
300 * page is allocated and used for the entire object.
301 */
302
303/* TODO: should be in header file */
304/* kernel translater */
305extern vm_offset_t kmem_mb_alloc(vm_map_t, int, int);
306extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va);
307extern vm_map_t mb_map;		/* special map */
308
309/* Global lock */
310decl_lck_mtx_data(static, mbuf_mlock_data);
311static lck_mtx_t *mbuf_mlock = &mbuf_mlock_data;
312static lck_attr_t *mbuf_mlock_attr;
313static lck_grp_t *mbuf_mlock_grp;
314static lck_grp_attr_t *mbuf_mlock_grp_attr;
315
316/* Back-end (common) layer */
317static void *mbuf_worker_run;	/* wait channel for worker thread */
318static int mbuf_worker_ready;	/* worker thread is runnable */
319static int mbuf_expand_mcl;	/* number of cluster creation requets */
320static int mbuf_expand_big;	/* number of big cluster creation requests */
321static int mbuf_expand_16k;	/* number of 16KB cluster creation requests */
322static int ncpu;		/* number of CPUs */
323static ppnum_t *mcl_paddr;	/* Array of cluster physical addresses */
324static ppnum_t mcl_pages;	/* Size of array (# physical pages) */
325static ppnum_t mcl_paddr_base;	/* Handle returned by IOMapper::iovmAlloc() */
326static mcache_t *ref_cache;	/* Cache of cluster reference & flags */
327static mcache_t *mcl_audit_con_cache; /* Audit contents cache */
328static unsigned int mbuf_debug;	/* patchable mbuf mcache flags */
329static unsigned int mb_normalized; /* number of packets "normalized" */
330
331#define	MB_GROWTH_AGGRESSIVE	1	/* Threshold: 1/2 of total */
332#define	MB_GROWTH_NORMAL	2	/* Threshold: 3/4 of total */
333
334typedef enum {
335	MC_MBUF = 0,	/* Regular mbuf */
336	MC_CL,		/* Cluster */
337	MC_BIGCL,	/* Large (4KB) cluster */
338	MC_16KCL,	/* Jumbo (16KB) cluster */
339	MC_MBUF_CL,	/* mbuf + cluster */
340	MC_MBUF_BIGCL,	/* mbuf + large (4KB) cluster */
341	MC_MBUF_16KCL	/* mbuf + jumbo (16KB) cluster */
342} mbuf_class_t;
343
344#define	MBUF_CLASS_MIN		MC_MBUF
345#define	MBUF_CLASS_MAX		MC_MBUF_16KCL
346#define	MBUF_CLASS_LAST		MC_16KCL
347#define	MBUF_CLASS_VALID(c) \
348	((int)(c) >= MBUF_CLASS_MIN && (int)(c) <= MBUF_CLASS_MAX)
349#define	MBUF_CLASS_COMPOSITE(c) \
350	((int)(c) > MBUF_CLASS_LAST)
351
352
353/*
354 * mbuf specific mcache allocation request flags.
355 */
356#define	MCR_COMP	MCR_USR1 /* for MC_MBUF_{CL,BIGCL,16KCL} caches */
357
358/*
359 * Per-cluster slab structure.
360 *
361 * A slab is a cluster control structure that contains one or more object
362 * chunks; the available chunks are chained in the slab's freelist (sl_head).
363 * Each time a chunk is taken out of the slab, the slab's reference count
364 * gets incremented.  When all chunks have been taken out, the empty slab
365 * gets removed (SLF_DETACHED) from the class's slab list.  A chunk that is
366 * returned to a slab causes the slab's reference count to be decremented;
367 * it also causes the slab to be reinserted back to class's slab list, if
368 * it's not already done.
369 *
370 * Compartmentalizing of the object chunks into slabs allows us to easily
371 * merge one or more slabs together when the adjacent slabs are idle, as
372 * well as to convert or move a slab from one class to another; e.g. the
373 * mbuf cluster slab can be converted to a regular cluster slab when all
374 * mbufs in the slab have been freed.
375 *
376 * A slab may also span across multiple clusters for chunks larger than
377 * a cluster's size.  In this case, only the slab of the first cluster is
378 * used.  The rest of the slabs are marked with SLF_PARTIAL to indicate
379 * that they are part of the larger slab.
380 *
381 * Each slab controls a page of memory.
382 */
383typedef struct mcl_slab {
384	struct mcl_slab	*sl_next;	/* neighboring slab */
385	u_int8_t	sl_class;	/* controlling mbuf class */
386	int8_t		sl_refcnt;	/* outstanding allocations */
387	int8_t		sl_chunks;	/* chunks (bufs) in this slab */
388	u_int16_t	sl_flags;	/* slab flags (see below) */
389	u_int16_t	sl_len;		/* slab length */
390	void		*sl_base;	/* base of allocated memory */
391	void		*sl_head;	/* first free buffer */
392	TAILQ_ENTRY(mcl_slab) sl_link;	/* next/prev slab on freelist */
393} mcl_slab_t;
394
395#define	SLF_MAPPED	0x0001		/* backed by a mapped page */
396#define	SLF_PARTIAL	0x0002		/* part of another slab */
397#define	SLF_DETACHED	0x0004		/* not in slab freelist */
398
399/*
400 * The array of slabs are broken into groups of arrays per 1MB of kernel
401 * memory to reduce the footprint.  Each group is allocated on demand
402 * whenever a new piece of memory mapped in from the VM crosses the 1MB
403 * boundary.
404 */
405#define	NSLABSPMB	((1 << MBSHIFT) >> PGSHIFT)	/* 256 slabs/grp */
406
407typedef struct mcl_slabg {
408	mcl_slab_t	slg_slab[NSLABSPMB];	/* group of slabs */
409} mcl_slabg_t;
410
411/*
412 * Number of slabs needed to control a 16KB cluster object.
413 */
414#define	NSLABSP16KB	(M16KCLBYTES >> PGSHIFT)
415
416/*
417 * Per-cluster audit structure.
418 */
419typedef struct {
420	mcache_audit_t	*cl_audit[NMBPBG];	/* array of audits */
421} mcl_audit_t;
422
423typedef struct {
424	struct thread	*msa_thread;	/* thread doing transaction */
425	struct thread	*msa_pthread;	/* previous transaction thread */
426	uint32_t	msa_tstamp;	/* transaction timestamp (ms) */
427	uint32_t	msa_ptstamp;	/* prev transaction timestamp (ms) */
428	uint16_t	msa_depth;	/* pc stack depth */
429	uint16_t	msa_pdepth;	/* previous transaction pc stack */
430	void		*msa_stack[MCACHE_STACK_DEPTH];
431	void		*msa_pstack[MCACHE_STACK_DEPTH];
432} mcl_scratch_audit_t;
433
434typedef struct {
435	/*
436	 * Size of data from the beginning of an mbuf that covers m_hdr,
437	 * pkthdr and m_ext structures.  If auditing is enabled, we allocate
438	 * a shadow mbuf structure of this size inside each audit structure,
439	 * and the contents of the real mbuf gets copied into it when the mbuf
440	 * is freed.  This allows us to pattern-fill the mbuf for integrity
441	 * check, and to preserve any constructed mbuf fields (e.g. mbuf +
442	 * cluster cache case).  Note that we don't save the contents of
443	 * clusters when they are freed; we simply pattern-fill them.
444	 */
445	u_int8_t		sc_mbuf[(MSIZE - _MHLEN) + sizeof (_m_ext_t)];
446	mcl_scratch_audit_t	sc_scratch __attribute__((aligned(8)));
447} mcl_saved_contents_t;
448
449#define	AUDIT_CONTENTS_SIZE	(sizeof (mcl_saved_contents_t))
450
451#define	MCA_SAVED_MBUF_PTR(_mca)					\
452	((struct mbuf *)(void *)((mcl_saved_contents_t *)		\
453	(_mca)->mca_contents)->sc_mbuf)
454#define	MCA_SAVED_MBUF_SIZE						\
455	(sizeof (((mcl_saved_contents_t *)0)->sc_mbuf))
456#define	MCA_SAVED_SCRATCH_PTR(_mca)					\
457	(&((mcl_saved_contents_t *)(_mca)->mca_contents)->sc_scratch)
458
459/*
460 * mbuf specific mcache audit flags
461 */
462#define	MB_INUSE	0x01	/* object has not been returned to slab */
463#define	MB_COMP_INUSE	0x02	/* object has not been returned to cslab */
464#define	MB_SCVALID	0x04	/* object has valid saved contents */
465
466/*
467 * Each of the following two arrays hold up to nmbclusters elements.
468 */
469static mcl_audit_t *mclaudit;	/* array of cluster audit information */
470static unsigned int maxclaudit;	/* max # of entries in audit table */
471static mcl_slabg_t **slabstbl;	/* cluster slabs table */
472static unsigned int maxslabgrp;	/* max # of entries in slabs table */
473static unsigned int slabgrp;	/* # of entries in slabs table */
474
475/* Globals */
476int nclusters;			/* # of clusters for non-jumbo (legacy) sizes */
477int njcl;			/* # of clusters for jumbo sizes */
478int njclbytes;			/* size of a jumbo cluster */
479union mbigcluster *mbutl;	/* first mapped cluster address */
480union mbigcluster *embutl;	/* ending virtual address of mclusters */
481int _max_linkhdr;		/* largest link-level header */
482int _max_protohdr;		/* largest protocol header */
483int max_hdr;			/* largest link+protocol header */
484int max_datalen;		/* MHLEN - max_hdr */
485
486static boolean_t mclverify;	/* debug: pattern-checking */
487static boolean_t mcltrace;	/* debug: stack tracing */
488static boolean_t mclfindleak;	/* debug: leak detection */
489static boolean_t mclexpleak;	/* debug: expose leak info to user space */
490
491static struct timeval mb_start;	/* beginning of time */
492
493/* mbuf leak detection variables */
494static struct mleak_table mleak_table;
495static mleak_stat_t *mleak_stat;
496
497#define	MLEAK_STAT_SIZE(n) \
498	((size_t)(&((mleak_stat_t *)0)->ml_trace[n]))
499
500struct mallocation {
501	mcache_obj_t *element;	/* the alloc'ed element, NULL if unused */
502	u_int32_t trace_index;	/* mtrace index for corresponding backtrace */
503	u_int32_t count;	/* How many objects were requested */
504	u_int64_t hitcount;	/* for determining hash effectiveness */
505};
506
507struct mtrace {
508	u_int64_t	collisions;
509	u_int64_t	hitcount;
510	u_int64_t	allocs;
511	u_int64_t	depth;
512	uintptr_t	addr[MLEAK_STACK_DEPTH];
513};
514
515/* Size must be a power of two for the zhash to be able to just mask off bits */
516#define	MLEAK_ALLOCATION_MAP_NUM	512
517#define	MLEAK_TRACE_MAP_NUM		256
518
519/*
520 * Sample factor for how often to record a trace.  This is overwritable
521 * by the boot-arg mleak_sample_factor.
522 */
523#define	MLEAK_SAMPLE_FACTOR		500
524
525/*
526 * Number of top leakers recorded.
527 */
528#define	MLEAK_NUM_TRACES		5
529
530#define	MB_LEAK_SPACING_64 "                    "
531#define MB_LEAK_SPACING_32 "            "
532
533
534#define	MB_LEAK_HDR_32	"\n\
535    trace [1]   trace [2]   trace [3]   trace [4]   trace [5]  \n\
536    ----------  ----------  ----------  ----------  ---------- \n\
537"
538
539#define	MB_LEAK_HDR_64	"\n\
540    trace [1]           trace [2]           trace [3]       \
541        trace [4]           trace [5]      \n\
542    ------------------  ------------------  ------------------  \
543    ------------------  ------------------ \n\
544"
545
546static uint32_t mleak_alloc_buckets = MLEAK_ALLOCATION_MAP_NUM;
547static uint32_t mleak_trace_buckets = MLEAK_TRACE_MAP_NUM;
548
549/* Hashmaps of allocations and their corresponding traces */
550static struct mallocation *mleak_allocations;
551static struct mtrace *mleak_traces;
552static struct mtrace *mleak_top_trace[MLEAK_NUM_TRACES];
553
554/* Lock to protect mleak tables from concurrent modification */
555decl_lck_mtx_data(static, mleak_lock_data);
556static lck_mtx_t *mleak_lock = &mleak_lock_data;
557static lck_attr_t *mleak_lock_attr;
558static lck_grp_t *mleak_lock_grp;
559static lck_grp_attr_t *mleak_lock_grp_attr;
560
561extern u_int32_t high_sb_max;
562
563/* The minimum number of objects that are allocated, to start. */
564#define	MINCL		32
565#define	MINBIGCL	(MINCL >> 1)
566#define	MIN16KCL	(MINCL >> 2)
567
568/* Low watermarks (only map in pages once free counts go below) */
569#define	MBIGCL_LOWAT	MINBIGCL
570#define	M16KCL_LOWAT	MIN16KCL
571
572typedef struct {
573	mbuf_class_t	mtbl_class;	/* class type */
574	mcache_t	*mtbl_cache;	/* mcache for this buffer class */
575	TAILQ_HEAD(mcl_slhead, mcl_slab) mtbl_slablist; /* slab list */
576	mcache_obj_t	*mtbl_cobjlist;	/* composite objects freelist */
577	mb_class_stat_t	*mtbl_stats;	/* statistics fetchable via sysctl */
578	u_int32_t	mtbl_maxsize;	/* maximum buffer size */
579	int		mtbl_minlimit;	/* minimum allowed */
580	int		mtbl_maxlimit;	/* maximum allowed */
581	u_int32_t	mtbl_wantpurge;	/* purge during next reclaim */
582	uint32_t	mtbl_avgtotal;  /* average total on iOS */
583} mbuf_table_t;
584
585#define	m_class(c)	mbuf_table[c].mtbl_class
586#define	m_cache(c)	mbuf_table[c].mtbl_cache
587#define	m_slablist(c)	mbuf_table[c].mtbl_slablist
588#define	m_cobjlist(c)	mbuf_table[c].mtbl_cobjlist
589#define	m_maxsize(c)	mbuf_table[c].mtbl_maxsize
590#define	m_minlimit(c)	mbuf_table[c].mtbl_minlimit
591#define	m_maxlimit(c)	mbuf_table[c].mtbl_maxlimit
592#define	m_wantpurge(c)	mbuf_table[c].mtbl_wantpurge
593#define	m_avgtotal(c)	mbuf_table[c].mtbl_avgtotal
594#define	m_cname(c)	mbuf_table[c].mtbl_stats->mbcl_cname
595#define	m_size(c)	mbuf_table[c].mtbl_stats->mbcl_size
596#define	m_total(c)	mbuf_table[c].mtbl_stats->mbcl_total
597#define	m_active(c)	mbuf_table[c].mtbl_stats->mbcl_active
598#define	m_infree(c)	mbuf_table[c].mtbl_stats->mbcl_infree
599#define	m_slab_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_slab_cnt
600#define	m_alloc_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_alloc_cnt
601#define	m_free_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_free_cnt
602#define	m_notified(c)	mbuf_table[c].mtbl_stats->mbcl_notified
603#define	m_purge_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_purge_cnt
604#define	m_fail_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_fail_cnt
605#define	m_ctotal(c)	mbuf_table[c].mtbl_stats->mbcl_ctotal
606#define	m_peak(c)	mbuf_table[c].mtbl_stats->mbcl_peak_reported
607#define	m_release_cnt(c) mbuf_table[c].mtbl_stats->mbcl_release_cnt
608
609static mbuf_table_t mbuf_table[] = {
610	/*
611	 * The caches for mbufs, regular clusters and big clusters.
612	 * The average total values were based on data gathered by actual
613	 * usage patterns on iOS.
614	 */
615	{ MC_MBUF, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_MBUF)),
616	    NULL, NULL, 0, 0, 0, 0, 3000 },
617	{ MC_CL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_CL)),
618	    NULL, NULL, 0, 0, 0, 0, 2000 },
619	{ MC_BIGCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_BIGCL)),
620	    NULL, NULL, 0, 0, 0, 0, 1000 },
621	{ MC_16KCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_16KCL)),
622	    NULL, NULL, 0, 0, 0, 0, 1000 },
623	/*
624	 * The following are special caches; they serve as intermediate
625	 * caches backed by the above rudimentary caches.  Each object
626	 * in the cache is an mbuf with a cluster attached to it.  Unlike
627	 * the above caches, these intermediate caches do not directly
628	 * deal with the slab structures; instead, the constructed
629	 * cached elements are simply stored in the freelists.
630	 */
631	{ MC_MBUF_CL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 2000 },
632	{ MC_MBUF_BIGCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 1000 },
633	{ MC_MBUF_16KCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 1000 },
634};
635
636#define	NELEM(a)	(sizeof (a) / sizeof ((a)[0]))
637
638static void *mb_waitchan = &mbuf_table;	/* wait channel for all caches */
639static int mb_waiters;			/* number of waiters */
640
641boolean_t mb_peak_newreport = FALSE;
642boolean_t mb_peak_firstreport = FALSE;
643
644/* generate a report by default after 1 week of uptime */
645#define	MBUF_PEAK_FIRST_REPORT_THRESHOLD	604800
646
647#define	MB_WDT_MAXTIME	10		/* # of secs before watchdog panic */
648static struct timeval mb_wdtstart;	/* watchdog start timestamp */
649static char *mbuf_dump_buf;
650
651#define	MBUF_DUMP_BUF_SIZE	2048
652
653/*
654 * mbuf watchdog is enabled by default on embedded platforms.  It is
655 * also toggeable via the kern.ipc.mb_watchdog sysctl.
656 * Garbage collection is also enabled by default on embedded platforms.
657 * mb_drain_maxint controls the amount of time to wait (in seconds) before
658 * consecutive calls to m_drain().
659 */
660static unsigned int mb_watchdog = 0;
661static unsigned int mb_drain_maxint = 0;
662
663/* Red zone */
664static u_int32_t mb_redzone_cookie;
665static void m_redzone_init(struct mbuf *);
666static void m_redzone_verify(struct mbuf *m);
667
668/* The following are used to serialize m_clalloc() */
669static boolean_t mb_clalloc_busy;
670static void *mb_clalloc_waitchan = &mb_clalloc_busy;
671static int mb_clalloc_waiters;
672
673static void mbuf_mtypes_sync(boolean_t);
674static int mbstat_sysctl SYSCTL_HANDLER_ARGS;
675static void mbuf_stat_sync(void);
676static int mb_stat_sysctl SYSCTL_HANDLER_ARGS;
677static int mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS;
678static int mleak_table_sysctl SYSCTL_HANDLER_ARGS;
679static char *mbuf_dump(void);
680static void mbuf_table_init(void);
681static inline void m_incref(struct mbuf *);
682static inline u_int32_t m_decref(struct mbuf *);
683static int m_clalloc(const u_int32_t, const int, const u_int32_t);
684static void mbuf_worker_thread_init(void);
685static mcache_obj_t *slab_alloc(mbuf_class_t, int);
686static void slab_free(mbuf_class_t, mcache_obj_t *);
687static unsigned int mbuf_slab_alloc(void *, mcache_obj_t ***,
688    unsigned int, int);
689static void mbuf_slab_free(void *, mcache_obj_t *, int);
690static void mbuf_slab_audit(void *, mcache_obj_t *, boolean_t);
691static void mbuf_slab_notify(void *, u_int32_t);
692static unsigned int cslab_alloc(mbuf_class_t, mcache_obj_t ***,
693    unsigned int);
694static unsigned int cslab_free(mbuf_class_t, mcache_obj_t *, int);
695static unsigned int mbuf_cslab_alloc(void *, mcache_obj_t ***,
696    unsigned int, int);
697static void mbuf_cslab_free(void *, mcache_obj_t *, int);
698static void mbuf_cslab_audit(void *, mcache_obj_t *, boolean_t);
699static int freelist_populate(mbuf_class_t, unsigned int, int);
700static void freelist_init(mbuf_class_t);
701static boolean_t mbuf_cached_above(mbuf_class_t, int);
702static boolean_t mbuf_steal(mbuf_class_t, unsigned int);
703static void m_reclaim(mbuf_class_t, unsigned int, boolean_t);
704static int m_howmany(int, size_t);
705static void mbuf_worker_thread(void);
706static void mbuf_watchdog(void);
707static boolean_t mbuf_sleep(mbuf_class_t, unsigned int, int);
708
709static void mcl_audit_init(void *, mcache_audit_t **, mcache_obj_t **,
710    size_t, unsigned int);
711static void mcl_audit_free(void *, unsigned int);
712static mcache_audit_t *mcl_audit_buf2mca(mbuf_class_t, mcache_obj_t *);
713static void mcl_audit_mbuf(mcache_audit_t *, void *, boolean_t, boolean_t);
714static void mcl_audit_cluster(mcache_audit_t *, void *, size_t, boolean_t,
715    boolean_t);
716static void mcl_audit_restore_mbuf(struct mbuf *, mcache_audit_t *, boolean_t);
717static void mcl_audit_save_mbuf(struct mbuf *, mcache_audit_t *);
718static void mcl_audit_scratch(mcache_audit_t *);
719static void mcl_audit_mcheck_panic(struct mbuf *);
720static void mcl_audit_verify_nextptr(void *, mcache_audit_t *);
721
722static void mleak_activate(void);
723static void mleak_logger(u_int32_t, mcache_obj_t *, boolean_t);
724static boolean_t mleak_log(uintptr_t *, mcache_obj_t *, uint32_t, int);
725static void mleak_free(mcache_obj_t *);
726static void mleak_sort_traces(void);
727static void mleak_update_stats(void);
728
729static mcl_slab_t *slab_get(void *);
730static void slab_init(mcl_slab_t *, mbuf_class_t, u_int32_t,
731    void *, void *, unsigned int, int, int);
732static void slab_insert(mcl_slab_t *, mbuf_class_t);
733static void slab_remove(mcl_slab_t *, mbuf_class_t);
734static boolean_t slab_inrange(mcl_slab_t *, void *);
735static void slab_nextptr_panic(mcl_slab_t *, void *);
736static void slab_detach(mcl_slab_t *);
737static boolean_t slab_is_detached(mcl_slab_t *);
738
739static int m_copyback0(struct mbuf **, int, int, const void *, int, int);
740static struct mbuf *m_split0(struct mbuf *, int, int, int);
741__private_extern__ void mbuf_report_peak_usage(void);
742static boolean_t mbuf_report_usage(mbuf_class_t);
743
744/* flags for m_copyback0 */
745#define	M_COPYBACK0_COPYBACK	0x0001	/* copyback from cp */
746#define	M_COPYBACK0_PRESERVE	0x0002	/* preserve original data */
747#define	M_COPYBACK0_COW		0x0004	/* do copy-on-write */
748#define	M_COPYBACK0_EXTEND	0x0008	/* extend chain */
749
750/*
751 * This flag is set for all mbufs that come out of and into the composite
752 * mbuf + cluster caches, i.e. MC_MBUF_CL and MC_MBUF_BIGCL.  mbufs that
753 * are marked with such a flag have clusters attached to them, and will be
754 * treated differently when they are freed; instead of being placed back
755 * into the mbuf and cluster freelists, the composite mbuf + cluster objects
756 * are placed back into the appropriate composite cache's freelist, and the
757 * actual freeing is deferred until the composite objects are purged.  At
758 * such a time, this flag will be cleared from the mbufs and the objects
759 * will be freed into their own separate freelists.
760 */
761#define	EXTF_COMPOSITE	0x1
762
763/*
764 * This flag indicates that the external cluster is read-only, i.e. it is
765 * or was referred to by more than one mbufs.  Once set, this flag is never
766 * cleared.
767 */
768#define	EXTF_READONLY	0x2
769#define	EXTF_MASK	(EXTF_COMPOSITE | EXTF_READONLY)
770
771#define	MEXT_RFA(m)		((m)->m_ext.ext_refflags)
772#define	MEXT_REF(m)		(MEXT_RFA(m)->refcnt)
773#define	MEXT_FLAGS(m)		(MEXT_RFA(m)->flags)
774#define	MBUF_IS_COMPOSITE(m)	\
775	(MEXT_REF(m) == 0 && (MEXT_FLAGS(m) & EXTF_MASK) == EXTF_COMPOSITE)
776
777/*
778 * Macros used to verify the integrity of the mbuf.
779 */
780#define	_MCHECK(m) {							\
781	if ((m)->m_type != MT_FREE) {					\
782		if (mclaudit == NULL)					\
783			panic("MCHECK: m_type=%d m=%p",			\
784			    (u_int16_t)(m)->m_type, m);			\
785		else							\
786			mcl_audit_mcheck_panic(m);			\
787	}								\
788}
789
790#define	MBUF_IN_MAP(addr)						\
791	((void *)(addr) >= (void *)mbutl && (void *)(addr) < (void *)embutl)
792
793#define	MRANGE(addr) {							\
794	if (!MBUF_IN_MAP(addr))						\
795		panic("MRANGE: address out of range 0x%p", addr);	\
796}
797
798/*
799 * Macro version of mtod.
800 */
801#define	MTOD(m, t)	((t)((m)->m_data))
802
803/*
804 * Macros to obtain (4KB) cluster index and base cluster address.
805 */
806
807#define	MTOBG(x)	(((char *)(x) - (char *)mbutl) >> MBIGCLSHIFT)
808#define	BGTOM(x)	((union mbigcluster *)(mbutl + (x)))
809
810/*
811 * Macro to find the mbuf index relative to a base.
812 */
813#define	MCLIDX(c, m)	(((char *)(m) - (char *)(c)) >> MSIZESHIFT)
814
815/*
816 * Same thing for 2KB cluster index.
817 */
818#define	CLBGIDX(c, m)	(((char *)(m) - (char *)(c)) >> MCLSHIFT)
819
820/*
821 * Macros used during mbuf and cluster initialization.
822 */
823#define	MBUF_INIT_PKTHDR(m) {						\
824	(m)->m_pkthdr.rcvif = NULL;					\
825	(m)->m_pkthdr.pkt_hdr = NULL;					\
826	(m)->m_pkthdr.len = 0;						\
827	(m)->m_pkthdr.csum_flags = 0;					\
828	(m)->m_pkthdr.csum_data = 0;					\
829	(m)->m_pkthdr.vlan_tag = 0;					\
830	m_classifier_init(m, 0);					\
831	m_tag_init(m, 1);						\
832	m_scratch_init(m);						\
833	m_redzone_init(m);						\
834}
835
836#define	MBUF_INIT(m, pkthdr, type) {					\
837	_MCHECK(m);							\
838	(m)->m_next = (m)->m_nextpkt = NULL;				\
839	(m)->m_len = 0;							\
840	(m)->m_type = type;						\
841	if ((pkthdr) == 0) {						\
842		(m)->m_data = (m)->m_dat;				\
843		(m)->m_flags = 0;					\
844	} else {							\
845		(m)->m_data = (m)->m_pktdat;				\
846		(m)->m_flags = M_PKTHDR;				\
847		MBUF_INIT_PKTHDR(m);					\
848	}								\
849}
850
851#define	MEXT_INIT(m, buf, size, free, arg, rfa, ref, flag) {		\
852	(m)->m_data = (m)->m_ext.ext_buf = (buf);			\
853	(m)->m_flags |= M_EXT;						\
854	(m)->m_ext.ext_size = (size);					\
855	(m)->m_ext.ext_free = (free);					\
856	(m)->m_ext.ext_arg = (arg);					\
857	(m)->m_ext.ext_refs.forward = (m)->m_ext.ext_refs.backward =	\
858	    &(m)->m_ext.ext_refs;					\
859	MEXT_RFA(m) = (rfa);						\
860	MEXT_REF(m) = (ref);						\
861	MEXT_FLAGS(m) = (flag);						\
862}
863
864#define	MBUF_CL_INIT(m, buf, rfa, ref, flag)	\
865	MEXT_INIT(m, buf, m_maxsize(MC_CL), NULL, NULL, rfa, ref, flag)
866
867#define	MBUF_BIGCL_INIT(m, buf, rfa, ref, flag)	\
868	MEXT_INIT(m, buf, m_maxsize(MC_BIGCL), m_bigfree, NULL, rfa, ref, flag)
869
870#define	MBUF_16KCL_INIT(m, buf, rfa, ref, flag)	\
871	MEXT_INIT(m, buf, m_maxsize(MC_16KCL), m_16kfree, NULL, rfa, ref, flag)
872
873/*
874 * Macro to convert BSD malloc sleep flag to mcache's
875 */
876#define	MSLEEPF(f)	((!((f) & M_DONTWAIT)) ? MCR_SLEEP : MCR_NOSLEEP)
877
878/*
879 * The structure that holds all mbuf class statistics exportable via sysctl.
880 * Similar to mbstat structure, the mb_stat structure is protected by the
881 * global mbuf lock.  It contains additional information about the classes
882 * that allows for a more accurate view of the state of the allocator.
883 */
884struct mb_stat *mb_stat;
885struct omb_stat *omb_stat;	/* For backwards compatibility */
886
887#define	MB_STAT_SIZE(n) \
888	((size_t)(&((mb_stat_t *)0)->mbs_class[n]))
889#define	OMB_STAT_SIZE(n) \
890	((size_t)(&((struct omb_stat *)0)->mbs_class[n]))
891
892/*
893 * The legacy structure holding all of the mbuf allocation statistics.
894 * The actual statistics used by the kernel are stored in the mbuf_table
895 * instead, and are updated atomically while the global mbuf lock is held.
896 * They are mirrored in mbstat to support legacy applications (e.g. netstat).
897 * Unlike before, the kernel no longer relies on the contents of mbstat for
898 * its operations (e.g. cluster expansion) because the structure is exposed
899 * to outside and could possibly be modified, therefore making it unsafe.
900 * With the exception of the mbstat.m_mtypes array (see below), all of the
901 * statistics are updated as they change.
902 */
903struct mbstat mbstat;
904
905#define	MBSTAT_MTYPES_MAX \
906	(sizeof (mbstat.m_mtypes) / sizeof (mbstat.m_mtypes[0]))
907
908/*
909 * Allocation statistics related to mbuf types (up to MT_MAX-1) are updated
910 * atomically and stored in a per-CPU structure which is lock-free; this is
911 * done in order to avoid writing to the global mbstat data structure which
912 * would cause false sharing.  During sysctl request for kern.ipc.mbstat,
913 * the statistics across all CPUs will be converged into the mbstat.m_mtypes
914 * array and returned to the application.  Any updates for types greater or
915 * equal than MT_MAX would be done atomically to the mbstat; this slows down
916 * performance but is okay since the kernel uses only up to MT_MAX-1 while
917 * anything beyond that (up to type 255) is considered a corner case.
918 */
919typedef struct {
920	unsigned int	cpu_mtypes[MT_MAX];
921} __attribute__((aligned(MAX_CPU_CACHE_LINE_SIZE), packed)) mtypes_cpu_t;
922
923typedef struct {
924	mtypes_cpu_t	mbs_cpu[1];
925} mbuf_mtypes_t;
926
927static mbuf_mtypes_t *mbuf_mtypes;	/* per-CPU statistics */
928
929#define	MBUF_MTYPES_SIZE(n) \
930	((size_t)(&((mbuf_mtypes_t *)0)->mbs_cpu[n]))
931
932#define	MTYPES_CPU(p) \
933	((mtypes_cpu_t *)(void *)((char *)(p) + MBUF_MTYPES_SIZE(cpu_number())))
934
935#define	mtype_stat_add(type, n) {					\
936	if ((unsigned)(type) < MT_MAX) {				\
937		mtypes_cpu_t *mbs = MTYPES_CPU(mbuf_mtypes);		\
938		atomic_add_32(&mbs->cpu_mtypes[type], n);		\
939	} else if ((unsigned)(type) < (unsigned)MBSTAT_MTYPES_MAX) {	\
940		atomic_add_16((int16_t *)&mbstat.m_mtypes[type], n);	\
941	}								\
942}
943
944#define	mtype_stat_sub(t, n)	mtype_stat_add(t, -(n))
945#define	mtype_stat_inc(t)	mtype_stat_add(t, 1)
946#define	mtype_stat_dec(t)	mtype_stat_sub(t, 1)
947
948static void
949mbuf_mtypes_sync(boolean_t locked)
950{
951	int m, n;
952	mtypes_cpu_t mtc;
953
954	if (locked)
955		lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
956
957	bzero(&mtc, sizeof (mtc));
958	for (m = 0; m < ncpu; m++) {
959		mtypes_cpu_t *scp = &mbuf_mtypes->mbs_cpu[m];
960		mtypes_cpu_t temp;
961
962		bcopy(&scp->cpu_mtypes, &temp.cpu_mtypes,
963		    sizeof (temp.cpu_mtypes));
964
965		for (n = 0; n < MT_MAX; n++)
966			mtc.cpu_mtypes[n] += temp.cpu_mtypes[n];
967	}
968	if (!locked)
969		lck_mtx_lock(mbuf_mlock);
970	for (n = 0; n < MT_MAX; n++)
971		mbstat.m_mtypes[n] = mtc.cpu_mtypes[n];
972	if (!locked)
973		lck_mtx_unlock(mbuf_mlock);
974}
975
976static int
977mbstat_sysctl SYSCTL_HANDLER_ARGS
978{
979#pragma unused(oidp, arg1, arg2)
980	mbuf_mtypes_sync(FALSE);
981
982	return (SYSCTL_OUT(req, &mbstat, sizeof (mbstat)));
983}
984
985static void
986mbuf_stat_sync(void)
987{
988	mb_class_stat_t *sp;
989	mcache_cpu_t *ccp;
990	mcache_t *cp;
991	int k, m, bktsize;
992
993	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
994
995	for (k = 0; k < NELEM(mbuf_table); k++) {
996		cp = m_cache(k);
997		ccp = &cp->mc_cpu[0];
998		bktsize = ccp->cc_bktsize;
999		sp = mbuf_table[k].mtbl_stats;
1000
1001		if (cp->mc_flags & MCF_NOCPUCACHE)
1002			sp->mbcl_mc_state = MCS_DISABLED;
1003		else if (cp->mc_purge_cnt > 0)
1004			sp->mbcl_mc_state = MCS_PURGING;
1005		else if (bktsize == 0)
1006			sp->mbcl_mc_state = MCS_OFFLINE;
1007		else
1008			sp->mbcl_mc_state = MCS_ONLINE;
1009
1010		sp->mbcl_mc_cached = 0;
1011		for (m = 0; m < ncpu; m++) {
1012			ccp = &cp->mc_cpu[m];
1013			if (ccp->cc_objs > 0)
1014				sp->mbcl_mc_cached += ccp->cc_objs;
1015			if (ccp->cc_pobjs > 0)
1016				sp->mbcl_mc_cached += ccp->cc_pobjs;
1017		}
1018		sp->mbcl_mc_cached += (cp->mc_full.bl_total * bktsize);
1019		sp->mbcl_active = sp->mbcl_total - sp->mbcl_mc_cached -
1020		    sp->mbcl_infree;
1021
1022		sp->mbcl_mc_waiter_cnt = cp->mc_waiter_cnt;
1023		sp->mbcl_mc_wretry_cnt = cp->mc_wretry_cnt;
1024		sp->mbcl_mc_nwretry_cnt = cp->mc_nwretry_cnt;
1025
1026		/* Calculate total count specific to each class */
1027		sp->mbcl_ctotal = sp->mbcl_total;
1028		switch (m_class(k)) {
1029		case MC_MBUF:
1030			/* Deduct mbufs used in composite caches */
1031			sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) +
1032			    m_total(MC_MBUF_BIGCL));
1033			break;
1034
1035		case MC_CL:
1036			/* Deduct clusters used in composite cache */
1037			sp->mbcl_ctotal -= m_total(MC_MBUF_CL);
1038			break;
1039
1040		case MC_BIGCL:
1041			/* Deduct clusters used in composite cache */
1042			sp->mbcl_ctotal -= m_total(MC_MBUF_BIGCL);
1043			break;
1044
1045		case MC_16KCL:
1046			/* Deduct clusters used in composite cache */
1047			sp->mbcl_ctotal -= m_total(MC_MBUF_16KCL);
1048			break;
1049
1050		default:
1051			break;
1052		}
1053	}
1054}
1055
1056static int
1057mb_stat_sysctl SYSCTL_HANDLER_ARGS
1058{
1059#pragma unused(oidp, arg1, arg2)
1060	void *statp;
1061	int k, statsz, proc64 = proc_is64bit(req->p);
1062
1063	lck_mtx_lock(mbuf_mlock);
1064	mbuf_stat_sync();
1065
1066	if (!proc64) {
1067		struct omb_class_stat *oc;
1068		struct mb_class_stat *c;
1069
1070		omb_stat->mbs_cnt = mb_stat->mbs_cnt;
1071		oc = &omb_stat->mbs_class[0];
1072		c = &mb_stat->mbs_class[0];
1073		for (k = 0; k < omb_stat->mbs_cnt; k++, oc++, c++) {
1074			(void) snprintf(oc->mbcl_cname, sizeof (oc->mbcl_cname),
1075			    "%s", c->mbcl_cname);
1076			oc->mbcl_size = c->mbcl_size;
1077			oc->mbcl_total = c->mbcl_total;
1078			oc->mbcl_active = c->mbcl_active;
1079			oc->mbcl_infree = c->mbcl_infree;
1080			oc->mbcl_slab_cnt = c->mbcl_slab_cnt;
1081			oc->mbcl_alloc_cnt = c->mbcl_alloc_cnt;
1082			oc->mbcl_free_cnt = c->mbcl_free_cnt;
1083			oc->mbcl_notified = c->mbcl_notified;
1084			oc->mbcl_purge_cnt = c->mbcl_purge_cnt;
1085			oc->mbcl_fail_cnt = c->mbcl_fail_cnt;
1086			oc->mbcl_ctotal = c->mbcl_ctotal;
1087			oc->mbcl_release_cnt = c->mbcl_release_cnt;
1088			oc->mbcl_mc_state = c->mbcl_mc_state;
1089			oc->mbcl_mc_cached = c->mbcl_mc_cached;
1090			oc->mbcl_mc_waiter_cnt = c->mbcl_mc_waiter_cnt;
1091			oc->mbcl_mc_wretry_cnt = c->mbcl_mc_wretry_cnt;
1092			oc->mbcl_mc_nwretry_cnt = c->mbcl_mc_nwretry_cnt;
1093		}
1094		statp = omb_stat;
1095		statsz = OMB_STAT_SIZE(NELEM(mbuf_table));
1096	} else {
1097		statp = mb_stat;
1098		statsz = MB_STAT_SIZE(NELEM(mbuf_table));
1099	}
1100
1101	lck_mtx_unlock(mbuf_mlock);
1102
1103	return (SYSCTL_OUT(req, statp, statsz));
1104}
1105
1106static int
1107mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS
1108{
1109#pragma unused(oidp, arg1, arg2)
1110	int i;
1111
1112	/* Ensure leak tracing turned on */
1113	if (!mclfindleak || !mclexpleak)
1114		return (ENXIO);
1115
1116	lck_mtx_lock(mleak_lock);
1117	mleak_update_stats();
1118	i = SYSCTL_OUT(req, mleak_stat, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES));
1119	lck_mtx_unlock(mleak_lock);
1120
1121	return (i);
1122}
1123
1124static int
1125mleak_table_sysctl SYSCTL_HANDLER_ARGS
1126{
1127#pragma unused(oidp, arg1, arg2)
1128	int i = 0;
1129
1130	/* Ensure leak tracing turned on */
1131	if (!mclfindleak || !mclexpleak)
1132		return (ENXIO);
1133
1134	lck_mtx_lock(mleak_lock);
1135	i = SYSCTL_OUT(req, &mleak_table, sizeof (mleak_table));
1136	lck_mtx_unlock(mleak_lock);
1137
1138	return (i);
1139}
1140
1141static inline void
1142m_incref(struct mbuf *m)
1143{
1144	UInt32 old, new;
1145	volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m);
1146
1147	do {
1148		old = *addr;
1149		new = old + 1;
1150		ASSERT(new != 0);
1151	} while (!OSCompareAndSwap(old, new, addr));
1152
1153	/*
1154	 * If cluster is shared, mark it with (sticky) EXTF_READONLY;
1155	 * we don't clear the flag when the refcount goes back to 1
1156	 * to simplify code calling m_mclhasreference().
1157	 */
1158	if (new > 1 && !(MEXT_FLAGS(m) & EXTF_READONLY))
1159		(void) OSBitOrAtomic(EXTF_READONLY, &MEXT_FLAGS(m));
1160}
1161
1162static inline u_int32_t
1163m_decref(struct mbuf *m)
1164{
1165	UInt32 old, new;
1166	volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m);
1167
1168	do {
1169		old = *addr;
1170		new = old - 1;
1171		ASSERT(old != 0);
1172	} while (!OSCompareAndSwap(old, new, addr));
1173
1174	return (new);
1175}
1176
1177static void
1178mbuf_table_init(void)
1179{
1180	unsigned int b, c, s;
1181	int m;
1182
1183	MALLOC(omb_stat, struct omb_stat *, OMB_STAT_SIZE(NELEM(mbuf_table)),
1184	    M_TEMP, M_WAITOK | M_ZERO);
1185	VERIFY(omb_stat != NULL);
1186
1187	MALLOC(mb_stat, mb_stat_t *, MB_STAT_SIZE(NELEM(mbuf_table)),
1188	    M_TEMP, M_WAITOK | M_ZERO);
1189	VERIFY(mb_stat != NULL);
1190
1191	mb_stat->mbs_cnt = NELEM(mbuf_table);
1192	for (m = 0; m < NELEM(mbuf_table); m++)
1193		mbuf_table[m].mtbl_stats = &mb_stat->mbs_class[m];
1194
1195#if CONFIG_MBUF_JUMBO
1196	/*
1197	 * Set aside 1/3 of the mbuf cluster map for jumbo clusters; we do
1198	 * this only on platforms where jumbo cluster pool is enabled.
1199	 */
1200	njcl = nmbclusters / 3;
1201	njclbytes = M16KCLBYTES;
1202#endif /* CONFIG_MBUF_JUMBO */
1203
1204	/*
1205	 * nclusters holds both the 2KB and 4KB pools, so ensure it's
1206	 * a multiple of 4KB clusters.
1207	 */
1208	nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPBG);
1209	if (njcl > 0) {
1210		/*
1211		 * Each jumbo cluster takes 8 2KB clusters, so make
1212		 * sure that the pool size is evenly divisible by 8;
1213		 * njcl is in 2KB unit, hence treated as such.
1214		 */
1215		njcl = P2ROUNDDOWN(nmbclusters - nclusters, 8);
1216
1217		/* Update nclusters with rounded down value of njcl */
1218		nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPBG);
1219	}
1220
1221	/*
1222	 * njcl is valid only on platforms with 16KB jumbo clusters, where
1223	 * it is configured to 1/3 of the pool size.  On these platforms,
1224	 * the remaining is used for 2KB and 4KB clusters.  On platforms
1225	 * without 16KB jumbo clusters, the entire pool is used for both
1226	 * 2KB and 4KB clusters.  A 4KB cluster can either be splitted into
1227	 * 16 mbufs, or into 2 2KB clusters.
1228	 *
1229	 *  +---+---+------------ ... -----------+------- ... -------+
1230	 *  | c | b |              s             |        njcl       |
1231	 *  +---+---+------------ ... -----------+------- ... -------+
1232	 *
1233	 * 1/32th of the shared region is reserved for pure 2KB and 4KB
1234	 * clusters (1/64th each.)
1235	 */
1236	c = P2ROUNDDOWN((nclusters >> 6), 2);		/* in 2KB unit */
1237	b = P2ROUNDDOWN((nclusters >> (6 + NCLPBGSHIFT)), 2); /* in 4KB unit */
1238	s = nclusters - (c + (b << NCLPBGSHIFT));	/* in 2KB unit */
1239
1240	/*
1241	 * 1/64th (c) is reserved for 2KB clusters.
1242	 */
1243	m_minlimit(MC_CL) = c;
1244	m_maxlimit(MC_CL) = s + c;			/* in 2KB unit */
1245	m_maxsize(MC_CL) = m_size(MC_CL) = MCLBYTES;
1246	(void) snprintf(m_cname(MC_CL), MAX_MBUF_CNAME, "cl");
1247
1248	/*
1249	 * Another 1/64th (b) of the map is reserved for 4KB clusters.
1250	 * It cannot be turned into 2KB clusters or mbufs.
1251	 */
1252	m_minlimit(MC_BIGCL) = b;
1253	m_maxlimit(MC_BIGCL) = (s >> NCLPBGSHIFT) + b;	/* in 4KB unit */
1254	m_maxsize(MC_BIGCL) = m_size(MC_BIGCL) = MBIGCLBYTES;
1255	(void) snprintf(m_cname(MC_BIGCL), MAX_MBUF_CNAME, "bigcl");
1256
1257	/*
1258	 * The remaining 31/32ths (s) are all-purpose (mbufs, 2KB, or 4KB)
1259	 */
1260	m_minlimit(MC_MBUF) = 0;
1261	m_maxlimit(MC_MBUF) = (s << NMBPCLSHIFT);	/* in mbuf unit */
1262	m_maxsize(MC_MBUF) = m_size(MC_MBUF) = MSIZE;
1263	(void) snprintf(m_cname(MC_MBUF), MAX_MBUF_CNAME, "mbuf");
1264
1265	/*
1266	 * Set limits for the composite classes.
1267	 */
1268	m_minlimit(MC_MBUF_CL) = 0;
1269	m_maxlimit(MC_MBUF_CL) = m_maxlimit(MC_CL);
1270	m_maxsize(MC_MBUF_CL) = MCLBYTES;
1271	m_size(MC_MBUF_CL) = m_size(MC_MBUF) + m_size(MC_CL);
1272	(void) snprintf(m_cname(MC_MBUF_CL), MAX_MBUF_CNAME, "mbuf_cl");
1273
1274	m_minlimit(MC_MBUF_BIGCL) = 0;
1275	m_maxlimit(MC_MBUF_BIGCL) = m_maxlimit(MC_BIGCL);
1276	m_maxsize(MC_MBUF_BIGCL) = MBIGCLBYTES;
1277	m_size(MC_MBUF_BIGCL) = m_size(MC_MBUF) + m_size(MC_BIGCL);
1278	(void) snprintf(m_cname(MC_MBUF_BIGCL), MAX_MBUF_CNAME, "mbuf_bigcl");
1279
1280	/*
1281	 * And for jumbo classes.
1282	 */
1283	m_minlimit(MC_16KCL) = 0;
1284	m_maxlimit(MC_16KCL) = (njcl >> NCLPJCLSHIFT);	/* in 16KB unit */
1285	m_maxsize(MC_16KCL) = m_size(MC_16KCL) = M16KCLBYTES;
1286	(void) snprintf(m_cname(MC_16KCL), MAX_MBUF_CNAME, "16kcl");
1287
1288	m_minlimit(MC_MBUF_16KCL) = 0;
1289	m_maxlimit(MC_MBUF_16KCL) = m_maxlimit(MC_16KCL);
1290	m_maxsize(MC_MBUF_16KCL) = M16KCLBYTES;
1291	m_size(MC_MBUF_16KCL) = m_size(MC_MBUF) + m_size(MC_16KCL);
1292	(void) snprintf(m_cname(MC_MBUF_16KCL), MAX_MBUF_CNAME, "mbuf_16kcl");
1293
1294	/*
1295	 * Initialize the legacy mbstat structure.
1296	 */
1297	bzero(&mbstat, sizeof (mbstat));
1298	mbstat.m_msize = m_maxsize(MC_MBUF);
1299	mbstat.m_mclbytes = m_maxsize(MC_CL);
1300	mbstat.m_minclsize = MINCLSIZE;
1301	mbstat.m_mlen = MLEN;
1302	mbstat.m_mhlen = MHLEN;
1303	mbstat.m_bigmclbytes = m_maxsize(MC_BIGCL);
1304}
1305
1306#if defined(__LP64__)
1307typedef struct ncl_tbl {
1308	uint64_t nt_maxmem;	/* memory (sane) size */
1309	uint32_t nt_mbpool;	/* mbuf pool size */
1310} ncl_tbl_t;
1311
1312/* Non-server */
1313static ncl_tbl_t ncl_table[] = {
1314	{ (1ULL << GBSHIFT)	  /*  1 GB */,	(64 << MBSHIFT)	 /*  64 MB */ },
1315	{ (1ULL << (GBSHIFT + 3)) /*  8 GB */,	(96 << MBSHIFT)	 /*  96 MB */ },
1316	{ (1ULL << (GBSHIFT + 4)) /* 16 GB */,	(128 << MBSHIFT) /* 128 MB */ },
1317	{ 0, 0 }
1318};
1319
1320/* Server */
1321static ncl_tbl_t ncl_table_srv[] = {
1322	{ (1ULL << GBSHIFT)	  /*  1 GB */,	(96 << MBSHIFT)  /*  96 MB */ },
1323	{ (1ULL << (GBSHIFT + 2)) /*  4 GB */,	(128 << MBSHIFT) /* 128 MB */ },
1324	{ (1ULL << (GBSHIFT + 3)) /*  8 GB */,	(160 << MBSHIFT) /* 160 MB */ },
1325	{ (1ULL << (GBSHIFT + 4)) /* 16 GB */,	(192 << MBSHIFT) /* 192 MB */ },
1326	{ (1ULL << (GBSHIFT + 5)) /* 32 GB */,	(256 << MBSHIFT) /* 256 MB */ },
1327	{ (1ULL << (GBSHIFT + 6)) /* 64 GB */,	(384 << MBSHIFT) /* 384 MB */ },
1328	{ 0, 0 }
1329};
1330#endif /* __LP64__ */
1331
1332__private_extern__ unsigned int
1333mbuf_default_ncl(int server, uint64_t mem)
1334{
1335#if !defined(__LP64__)
1336#pragma unused(server)
1337	unsigned int n;
1338	/*
1339	 * 32-bit kernel (default to 64MB of mbuf pool for >= 1GB RAM).
1340	 */
1341	if ((n = ((mem / 16) / MCLBYTES)) > 32768)
1342		n = 32768;
1343#else
1344	unsigned int n, i;
1345	ncl_tbl_t *tbl = (server ? ncl_table_srv : ncl_table);
1346	/*
1347	 * 64-bit kernel (mbuf pool size based on table).
1348	 */
1349	n = tbl[0].nt_mbpool;
1350	for (i = 0; tbl[i].nt_mbpool != 0; i++) {
1351		if (mem < tbl[i].nt_maxmem)
1352			break;
1353		n = tbl[i].nt_mbpool;
1354	}
1355	n >>= MCLSHIFT;
1356#endif /* !__LP64__ */
1357	return (n);
1358}
1359
1360__private_extern__ void
1361mbinit(void)
1362{
1363	unsigned int m;
1364	unsigned int initmcl = 0;
1365	void *buf;
1366	thread_t thread = THREAD_NULL;
1367
1368	microuptime(&mb_start);
1369
1370	/*
1371	 * These MBUF_ values must be equal to their private counterparts.
1372	 */
1373	_CASSERT(MBUF_EXT == M_EXT);
1374	_CASSERT(MBUF_PKTHDR == M_PKTHDR);
1375	_CASSERT(MBUF_EOR == M_EOR);
1376	_CASSERT(MBUF_LOOP == M_LOOP);
1377	_CASSERT(MBUF_BCAST == M_BCAST);
1378	_CASSERT(MBUF_MCAST == M_MCAST);
1379	_CASSERT(MBUF_FRAG == M_FRAG);
1380	_CASSERT(MBUF_FIRSTFRAG == M_FIRSTFRAG);
1381	_CASSERT(MBUF_LASTFRAG == M_LASTFRAG);
1382	_CASSERT(MBUF_PROMISC == M_PROMISC);
1383	_CASSERT(MBUF_HASFCS == M_HASFCS);
1384
1385	_CASSERT(MBUF_TYPE_FREE == MT_FREE);
1386	_CASSERT(MBUF_TYPE_DATA == MT_DATA);
1387	_CASSERT(MBUF_TYPE_HEADER == MT_HEADER);
1388	_CASSERT(MBUF_TYPE_SOCKET == MT_SOCKET);
1389	_CASSERT(MBUF_TYPE_PCB == MT_PCB);
1390	_CASSERT(MBUF_TYPE_RTABLE == MT_RTABLE);
1391	_CASSERT(MBUF_TYPE_HTABLE == MT_HTABLE);
1392	_CASSERT(MBUF_TYPE_ATABLE == MT_ATABLE);
1393	_CASSERT(MBUF_TYPE_SONAME == MT_SONAME);
1394	_CASSERT(MBUF_TYPE_SOOPTS == MT_SOOPTS);
1395	_CASSERT(MBUF_TYPE_FTABLE == MT_FTABLE);
1396	_CASSERT(MBUF_TYPE_RIGHTS == MT_RIGHTS);
1397	_CASSERT(MBUF_TYPE_IFADDR == MT_IFADDR);
1398	_CASSERT(MBUF_TYPE_CONTROL == MT_CONTROL);
1399	_CASSERT(MBUF_TYPE_OOBDATA == MT_OOBDATA);
1400
1401	_CASSERT(MBUF_TSO_IPV4 == CSUM_TSO_IPV4);
1402	_CASSERT(MBUF_TSO_IPV6 == CSUM_TSO_IPV6);
1403	_CASSERT(MBUF_CSUM_REQ_SUM16 == CSUM_PARTIAL);
1404	_CASSERT(MBUF_CSUM_TCP_SUM16 == MBUF_CSUM_REQ_SUM16);
1405	_CASSERT(MBUF_CSUM_REQ_IP == CSUM_IP);
1406	_CASSERT(MBUF_CSUM_REQ_TCP == CSUM_TCP);
1407	_CASSERT(MBUF_CSUM_REQ_UDP == CSUM_UDP);
1408	_CASSERT(MBUF_CSUM_REQ_TCPIPV6 == CSUM_TCPIPV6);
1409	_CASSERT(MBUF_CSUM_REQ_UDPIPV6 == CSUM_UDPIPV6);
1410	_CASSERT(MBUF_CSUM_DID_IP == CSUM_IP_CHECKED);
1411	_CASSERT(MBUF_CSUM_IP_GOOD == CSUM_IP_VALID);
1412	_CASSERT(MBUF_CSUM_DID_DATA == CSUM_DATA_VALID);
1413	_CASSERT(MBUF_CSUM_PSEUDO_HDR == CSUM_PSEUDO_HDR);
1414
1415	_CASSERT(MBUF_WAITOK == M_WAIT);
1416	_CASSERT(MBUF_DONTWAIT == M_DONTWAIT);
1417	_CASSERT(MBUF_COPYALL == M_COPYALL);
1418
1419	_CASSERT(MBUF_SC2TC(MBUF_SC_BK_SYS) == MBUF_TC_BK);
1420	_CASSERT(MBUF_SC2TC(MBUF_SC_BK) == MBUF_TC_BK);
1421	_CASSERT(MBUF_SC2TC(MBUF_SC_BE) == MBUF_TC_BE);
1422	_CASSERT(MBUF_SC2TC(MBUF_SC_RD) == MBUF_TC_BE);
1423	_CASSERT(MBUF_SC2TC(MBUF_SC_OAM) == MBUF_TC_BE);
1424	_CASSERT(MBUF_SC2TC(MBUF_SC_AV) == MBUF_TC_VI);
1425	_CASSERT(MBUF_SC2TC(MBUF_SC_RV) == MBUF_TC_VI);
1426	_CASSERT(MBUF_SC2TC(MBUF_SC_VI) == MBUF_TC_VI);
1427	_CASSERT(MBUF_SC2TC(MBUF_SC_VO) == MBUF_TC_VO);
1428	_CASSERT(MBUF_SC2TC(MBUF_SC_CTL) == MBUF_TC_VO);
1429
1430	_CASSERT(MBUF_TC2SCVAL(MBUF_TC_BK) == SCVAL_BK);
1431	_CASSERT(MBUF_TC2SCVAL(MBUF_TC_BE) == SCVAL_BE);
1432	_CASSERT(MBUF_TC2SCVAL(MBUF_TC_VI) == SCVAL_VI);
1433	_CASSERT(MBUF_TC2SCVAL(MBUF_TC_VO) == SCVAL_VO);
1434
1435	/* Module specific scratch space (32-bit alignment requirement) */
1436	_CASSERT(!(offsetof(struct mbuf, m_pkthdr.pkt_mpriv) %
1437	    sizeof (uint32_t)));
1438
1439	/* Initialize random red zone cookie value */
1440	_CASSERT(sizeof (mb_redzone_cookie) ==
1441	    sizeof (((struct pkthdr *)0)->redzone));
1442	read_random(&mb_redzone_cookie, sizeof (mb_redzone_cookie));
1443
1444	/* Make sure we don't save more than we should */
1445	_CASSERT(MCA_SAVED_MBUF_SIZE <= sizeof (struct mbuf));
1446
1447	if (nmbclusters == 0)
1448		nmbclusters = NMBCLUSTERS;
1449
1450	/* This should be a sane (at least even) value by now */
1451	VERIFY(nmbclusters != 0 && !(nmbclusters & 0x1));
1452
1453	/* Setup the mbuf table */
1454	mbuf_table_init();
1455
1456	/* Global lock for common layer */
1457	mbuf_mlock_grp_attr = lck_grp_attr_alloc_init();
1458	mbuf_mlock_grp = lck_grp_alloc_init("mbuf", mbuf_mlock_grp_attr);
1459	mbuf_mlock_attr = lck_attr_alloc_init();
1460	lck_mtx_init(mbuf_mlock, mbuf_mlock_grp, mbuf_mlock_attr);
1461
1462	/*
1463	 * Allocate cluster slabs table:
1464	 *
1465	 *	maxslabgrp = (N * 2048) / (1024 * 1024)
1466	 *
1467	 * Where N is nmbclusters rounded up to the nearest 512.  This yields
1468	 * mcl_slab_g_t units, each one representing a MB of memory.
1469	 */
1470	maxslabgrp =
1471	    (P2ROUNDUP(nmbclusters, (MBSIZE >> 11)) << MCLSHIFT) >> MBSHIFT;
1472	MALLOC(slabstbl, mcl_slabg_t **, maxslabgrp * sizeof (mcl_slabg_t *),
1473	    M_TEMP, M_WAITOK | M_ZERO);
1474	VERIFY(slabstbl != NULL);
1475
1476	/*
1477	 * Allocate audit structures, if needed:
1478	 *
1479	 *	maxclaudit = (maxslabgrp * 1024 * 1024) / 4096
1480	 *
1481	 * This yields mcl_audit_t units, each one representing a page.
1482	 */
1483	PE_parse_boot_argn("mbuf_debug", &mbuf_debug, sizeof (mbuf_debug));
1484	mbuf_debug |= mcache_getflags();
1485	if (mbuf_debug & MCF_DEBUG) {
1486		maxclaudit = ((maxslabgrp << MBSHIFT) >> PGSHIFT);
1487		MALLOC(mclaudit, mcl_audit_t *, maxclaudit * sizeof (*mclaudit),
1488		    M_TEMP, M_WAITOK | M_ZERO);
1489		VERIFY(mclaudit != NULL);
1490
1491		mcl_audit_con_cache = mcache_create("mcl_audit_contents",
1492		    AUDIT_CONTENTS_SIZE, sizeof (u_int64_t), 0, MCR_SLEEP);
1493		VERIFY(mcl_audit_con_cache != NULL);
1494	}
1495	mclverify = (mbuf_debug & MCF_VERIFY);
1496	mcltrace = (mbuf_debug & MCF_TRACE);
1497	mclfindleak = !(mbuf_debug & MCF_NOLEAKLOG);
1498	mclexpleak = mclfindleak && (mbuf_debug & MCF_EXPLEAKLOG);
1499
1500	/* Enable mbuf leak logging, with a lock to protect the tables */
1501
1502	mleak_lock_grp_attr = lck_grp_attr_alloc_init();
1503	mleak_lock_grp = lck_grp_alloc_init("mleak_lock", mleak_lock_grp_attr);
1504	mleak_lock_attr = lck_attr_alloc_init();
1505	lck_mtx_init(mleak_lock, mleak_lock_grp, mleak_lock_attr);
1506
1507	mleak_activate();
1508
1509	/* Calculate the number of pages assigned to the cluster pool */
1510	mcl_pages = (nmbclusters * MCLBYTES) / CLBYTES;
1511	MALLOC(mcl_paddr, ppnum_t *, mcl_pages * sizeof (ppnum_t),
1512	    M_TEMP, M_WAITOK);
1513	VERIFY(mcl_paddr != NULL);
1514
1515	/* Register with the I/O Bus mapper */
1516	mcl_paddr_base = IOMapperIOVMAlloc(mcl_pages);
1517	bzero((char *)mcl_paddr, mcl_pages * sizeof (ppnum_t));
1518
1519	embutl = (union mbigcluster *)
1520	    ((void *)((unsigned char *)mbutl + (nmbclusters * MCLBYTES)));
1521	VERIFY((((char *)embutl - (char *)mbutl) % MBIGCLBYTES) == 0);
1522
1523	/* Prime up the freelist */
1524	PE_parse_boot_argn("initmcl", &initmcl, sizeof (initmcl));
1525	if (initmcl != 0) {
1526		initmcl >>= NCLPBGSHIFT;	/* become a 4K unit */
1527		if (initmcl > m_maxlimit(MC_BIGCL))
1528			initmcl = m_maxlimit(MC_BIGCL);
1529	}
1530	if (initmcl < m_minlimit(MC_BIGCL))
1531		initmcl = m_minlimit(MC_BIGCL);
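	/*
	 * Note on the conversion above: the "initmcl" boot-arg is given in
	 * 2KB-cluster units and is shifted down by NCLPBGSHIFT to 4KB
	 * units (assuming NCLPBG == 2, i.e. two 2KB clusters per 4KB big
	 * cluster).  For example, a hypothetical "initmcl=256" would prime
	 * the freelist with 128 4KB clusters, clamped to the per-class
	 * minimum and maximum limits.
	 */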
1532
1533	lck_mtx_lock(mbuf_mlock);
1534
1535	/*
1536	 * For classes with non-zero minimum limits, populate their freelists
1537	 * so that m_total(class) is at least m_minlimit(class).
1538	 */
1539	VERIFY(m_total(MC_BIGCL) == 0 && m_minlimit(MC_BIGCL) != 0);
1540	freelist_populate(m_class(MC_BIGCL), initmcl, M_WAIT);
1541	VERIFY(m_total(MC_BIGCL) >= m_minlimit(MC_BIGCL));
1542	freelist_init(m_class(MC_CL));
1543
1544	for (m = 0; m < NELEM(mbuf_table); m++) {
1545		/* Make sure we didn't miss any */
1546		VERIFY(m_minlimit(m_class(m)) == 0 ||
1547		    m_total(m_class(m)) >= m_minlimit(m_class(m)));
1548
1549		/* populate the initial sizes and report from there on */
1550		m_peak(m_class(m)) = m_total(m_class(m));
1551	}
1552	mb_peak_newreport = FALSE;
1553
1554	lck_mtx_unlock(mbuf_mlock);
1555
1556	(void) kernel_thread_start((thread_continue_t)mbuf_worker_thread_init,
1557	    NULL, &thread);
1558	thread_deallocate(thread);
1559
1560	ref_cache = mcache_create("mext_ref", sizeof (struct ext_ref),
1561	    0, 0, MCR_SLEEP);
1562
1563	/* Create the cache for each class */
1564	for (m = 0; m < NELEM(mbuf_table); m++) {
1565		void *allocfunc, *freefunc, *auditfunc, *logfunc;
1566		u_int32_t flags;
1567
1568		flags = mbuf_debug;
1569		if (m_class(m) == MC_MBUF_CL || m_class(m) == MC_MBUF_BIGCL ||
1570		    m_class(m) == MC_MBUF_16KCL) {
1571			allocfunc = mbuf_cslab_alloc;
1572			freefunc = mbuf_cslab_free;
1573			auditfunc = mbuf_cslab_audit;
1574			logfunc = mleak_logger;
1575		} else {
1576			allocfunc = mbuf_slab_alloc;
1577			freefunc = mbuf_slab_free;
1578			auditfunc = mbuf_slab_audit;
1579			logfunc = mleak_logger;
1580		}
1581
1582		/*
1583		 * Disable per-CPU caches for jumbo classes if there
1584		 * is no jumbo cluster pool available in the system.
1585		 * The cache itself is still created (but will never
1586		 * be populated) since it simplifies the code.
1587		 */
1588		if ((m_class(m) == MC_MBUF_16KCL || m_class(m) == MC_16KCL) &&
1589		    njcl == 0)
1590			flags |= MCF_NOCPUCACHE;
1591
1592		if (!mclfindleak)
1593			flags |= MCF_NOLEAKLOG;
1594
1595		m_cache(m) = mcache_create_ext(m_cname(m), m_maxsize(m),
1596		    allocfunc, freefunc, auditfunc, logfunc, mbuf_slab_notify,
1597		    (void *)(uintptr_t)m, flags, MCR_SLEEP);
1598	}
1599
1600	/*
1601	 * Allocate structure for per-CPU statistics that's aligned
1602	 * on the CPU cache boundary; this code assumes that we never
1603	 * uninitialize this framework, since the original address
1604	 * before alignment is not saved.
1605	 */
1606	ncpu = ml_get_max_cpus();
1607	MALLOC(buf, void *, MBUF_MTYPES_SIZE(ncpu) + CPU_CACHE_LINE_SIZE,
1608	    M_TEMP, M_WAITOK);
1609	VERIFY(buf != NULL);
1610
1611	mbuf_mtypes = (mbuf_mtypes_t *)P2ROUNDUP((intptr_t)buf,
1612	    CPU_CACHE_LINE_SIZE);
1613	bzero(mbuf_mtypes, MBUF_MTYPES_SIZE(ncpu));
1614
1615	/*
1616	 * Set the max limit on sb_max to be 1/16th of the size of
1617	 * memory allocated for mbuf clusters.
1618	 */
1619	high_sb_max = (nmbclusters << (MCLSHIFT - 4));
1620	if (high_sb_max < sb_max) {
1621		/* sb_max is too large for this configuration, scale it down */
1622		if (high_sb_max > (1 << MBSHIFT)) {
1623			/* We have at least 16 MB of mbuf pool */
1624			sb_max = high_sb_max;
1625		} else if ((nmbclusters << MCLSHIFT) > (1 << MBSHIFT)) {
1626			/*
1627			 * If we have more than 1 MB of mbuf pool, cap the size
1628			 * of the max socket buffer at 1 MB.
1629			 */
1630			sb_max = high_sb_max = (1 << MBSHIFT);
1631		} else {
1632			sb_max = high_sb_max;
1633		}
1634	}
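	/*
	 * Illustrative trace of the scaling above (hypothetical figures):
	 * with nmbclusters == 1024 the pool is 2 MB and high_sb_max ==
	 * 1024 << (MCLSHIFT - 4) == 128 KB; if the configured sb_max is
	 * larger, the middle branch applies (pool > 1 MB, high_sb_max <=
	 * 1 MB) and sb_max is capped at 1 MB.  With nmbclusters == 32768
	 * the pool is 64 MB and high_sb_max == 4 MB, so sb_max is lowered
	 * to 4 MB only if it was configured higher than that.
	 */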
1635
1636	/* allocate space for mbuf_dump_buf */
1637	MALLOC(mbuf_dump_buf, char *, MBUF_DUMP_BUF_SIZE, M_TEMP, M_WAITOK);
1638	VERIFY(mbuf_dump_buf != NULL);
1639
1640	if (mbuf_debug & MCF_DEBUG) {
1641		printf("%s: MLEN %d, MHLEN %d\n", __func__,
1642		    (int)_MLEN, (int)_MHLEN);
1643	}
1644
1645	printf("%s: done [%d MB total pool size, (%d/%d) split]\n", __func__,
1646	    (nmbclusters << MCLSHIFT) >> MBSHIFT,
1647	    (nclusters << MCLSHIFT) >> MBSHIFT,
1648	    (njcl << MCLSHIFT) >> MBSHIFT);
1649}
1650
1651/*
1652 * Obtain a slab of object(s) from the class's freelist.
1653 */
1654static mcache_obj_t *
1655slab_alloc(mbuf_class_t class, int wait)
1656{
1657	mcl_slab_t *sp;
1658	mcache_obj_t *buf;
1659
1660	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1661
1662	VERIFY(class != MC_16KCL || njcl > 0);
1663
1664	/* This should always be NULL for us */
1665	VERIFY(m_cobjlist(class) == NULL);
1666
1667	/*
1668	 * Treat composite objects as having a longer lifespan by taking
1669	 * a slab from the reverse direction, in the hope that this
1670	 * reduces the probability of fragmentation for slabs that hold
1671	 * more than one buffer chunk (e.g. mbuf slabs).  For other
1672	 * slabs, this probably doesn't make much of a difference.
1673	 */
1674	if ((class == MC_MBUF || class == MC_CL) && (wait & MCR_COMP))
1675		sp = (mcl_slab_t *)TAILQ_LAST(&m_slablist(class), mcl_slhead);
1676	else
1677		sp = (mcl_slab_t *)TAILQ_FIRST(&m_slablist(class));
1678
1679	if (sp == NULL) {
1680		VERIFY(m_infree(class) == 0 && m_slab_cnt(class) == 0);
1681		/* The slab list for this class is empty */
1682		return (NULL);
1683	}
1684
1685	VERIFY(m_infree(class) > 0);
1686	VERIFY(!slab_is_detached(sp));
1687	VERIFY(sp->sl_class == class &&
1688	    (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
1689	buf = sp->sl_head;
1690	VERIFY(slab_inrange(sp, buf) && sp == slab_get(buf));
1691
1692	if (class == MC_MBUF) {
1693		sp->sl_head = buf->obj_next;
1694		VERIFY(sp->sl_head != NULL || sp->sl_refcnt == (NMBPBG - 1));
1695	} else if (class == MC_CL) {
1696		sp->sl_head = buf->obj_next;
1697		VERIFY(sp->sl_head != NULL || sp->sl_refcnt == (NCLPBG - 1));
1698	} else {
1699		sp->sl_head = NULL;
1700	}
1701	if (sp->sl_head != NULL && !slab_inrange(sp, sp->sl_head)) {
1702		slab_nextptr_panic(sp, sp->sl_head);
1703		/* In case sl_head is in the map but not in the slab */
1704		VERIFY(slab_inrange(sp, sp->sl_head));
1705		/* NOTREACHED */
1706	}
1707
1708	/* Increment slab reference */
1709	sp->sl_refcnt++;
1710
1711	if (mclaudit != NULL) {
1712		mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
1713		mca->mca_uflags = 0;
1714		/* Save contents on mbuf objects only */
1715		if (class == MC_MBUF)
1716			mca->mca_uflags |= MB_SCVALID;
1717	}
1718
1719	if (class == MC_CL) {
1720		mbstat.m_clfree = (--m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
1721		/*
1722		 * A 2K cluster slab can have at most NCLPBG references.
1723		 */
1724		VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NCLPBG &&
1725		    sp->sl_chunks == NCLPBG &&
1726		    sp->sl_len == m_maxsize(MC_BIGCL));
1727		VERIFY(sp->sl_refcnt < NCLPBG || sp->sl_head == NULL);
1728	} else if (class == MC_BIGCL) {
1729		mbstat.m_bigclfree = (--m_infree(MC_BIGCL)) +
1730		    m_infree(MC_MBUF_BIGCL);
1731		/*
1732		 * A 4K cluster slab can have at most 1 reference.
1733		 */
1734		VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
1735		    sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1736	} else if (class == MC_16KCL) {
1737		mcl_slab_t *nsp;
1738		int k;
1739
1740		--m_infree(MC_16KCL);
1741		VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
1742		    sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1743		/*
1744		 * Increment 2nd-Nth slab reference, where N is NSLABSP16KB.
1745		 * A 16KB big cluster takes NSLABSP16KB slabs, each having at
1746		 * most 1 reference.
1747		 */
1748		for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
1749			nsp = nsp->sl_next;
1750			/* Next slab must already be present */
1751			VERIFY(nsp != NULL);
1752			nsp->sl_refcnt++;
1753			VERIFY(!slab_is_detached(nsp));
1754			VERIFY(nsp->sl_class == MC_16KCL &&
1755			    nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) &&
1756			    nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 &&
1757			    nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1758			    nsp->sl_head == NULL);
1759		}
1760	} else {
1761		VERIFY(class == MC_MBUF);
1762		--m_infree(MC_MBUF);
1763		/*
1764		 * If auditing is turned on, this check is
1765		 * deferred until later in mbuf_slab_audit().
1766		 */
1767		if (mclaudit == NULL)
1768			_MCHECK((struct mbuf *)buf);
1769		/*
1770		 * Since we have incremented the reference count above,
1771		 * an mbuf slab (formerly a 4KB cluster slab that was cut
1772		 * up into mbufs) must have a reference count between 1
1773		 * and NMBPBG at this point.
1774		 */
1775		VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NMBPBG &&
1776		    sp->sl_chunks == NMBPBG &&
1777		    sp->sl_len == m_maxsize(MC_BIGCL));
1778		VERIFY(sp->sl_refcnt < NMBPBG || sp->sl_head == NULL);
1779	}
1780
1781	/* If empty, remove this slab from the class's freelist */
1782	if (sp->sl_head == NULL) {
1783		VERIFY(class != MC_MBUF || sp->sl_refcnt == NMBPBG);
1784		VERIFY(class != MC_CL || sp->sl_refcnt == NCLPBG);
1785		slab_remove(sp, class);
1786	}
1787
1788	return (buf);
1789}
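
/*
 * Summary of the per-class slab shapes checked by the VERIFY()s above.
 * The concrete counts are assumptions for illustration (e.g. NMBPBG == 16
 * and NCLPBG == 2 with 256-byte mbufs and 4KB big clusters); the code
 * relies only on the symbolic constants:
 *
 *	MC_MBUF		one slab, NMBPBG chunks, refcnt in 1..NMBPBG
 *	MC_CL		one slab, NCLPBG chunks, refcnt in 1..NCLPBG
 *	MC_BIGCL	one slab, 1 chunk, refcnt exactly 1
 *	MC_16KCL	NSLABSP16KB slabs; the leading slab carries the
 *			buffer and each trailing SLF_PARTIAL slab takes
 *			one reference
 */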
1790
1791/*
1792 * Place a slab of object(s) back into a class's slab list.
1793 */
1794static void
1795slab_free(mbuf_class_t class, mcache_obj_t *buf)
1796{
1797	mcl_slab_t *sp;
1798
1799	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1800
1801	VERIFY(class != MC_16KCL || njcl > 0);
1802	VERIFY(buf->obj_next == NULL);
1803	sp = slab_get(buf);
1804	VERIFY(sp->sl_class == class && slab_inrange(sp, buf) &&
1805	    (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
1806
1807	/* Decrement slab reference */
1808	sp->sl_refcnt--;
1809
1810	if (class == MC_CL) {
1811		VERIFY(IS_P2ALIGNED(buf, MCLBYTES));
1812		/*
1813		 * A slab that has been split into 2KB clusters can have
1814		 * at most 1 outstanding reference at this point.
1815		 */
1816		VERIFY(sp->sl_refcnt >= 0 && sp->sl_refcnt <= (NCLPBG - 1) &&
1817		    sp->sl_chunks == NCLPBG &&
1818		    sp->sl_len == m_maxsize(MC_BIGCL));
1819		VERIFY(sp->sl_refcnt < (NCLPBG - 1) ||
1820		    (slab_is_detached(sp) && sp->sl_head == NULL));
1821	} else if (class == MC_BIGCL) {
1822		VERIFY(IS_P2ALIGNED(buf, MCLBYTES));
1823		/*
1824		 * A 4KB cluster slab can have at most 1 reference
1825		 * which must be 0 at this point.
1826		 */
1827		VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
1828		    sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1829		VERIFY(slab_is_detached(sp));
1830	} else if (class == MC_16KCL) {
1831		mcl_slab_t *nsp;
1832		int k;
1833		/*
1834		 * A 16KB cluster takes NSLABSP16KB slabs, all of which
1835		 * must now have a reference count of 0.
1836		 */
1837		VERIFY(IS_P2ALIGNED(buf, MBIGCLBYTES));
1838		VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
1839		    sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1840		VERIFY(slab_is_detached(sp));
1841		for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
1842			nsp = nsp->sl_next;
1843			/* Next slab must already be present */
1844			VERIFY(nsp != NULL);
1845			nsp->sl_refcnt--;
1846			VERIFY(slab_is_detached(nsp));
1847			VERIFY(nsp->sl_class == MC_16KCL &&
1848			    (nsp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) &&
1849			    nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 &&
1850			    nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1851			    nsp->sl_head == NULL);
1852		}
1853	} else {
1854		/*
1855		 * A slab that has been split into mbufs can have at most
1856		 * NMBPBG references.  Since we have decremented one reference
1857		 * above, the count must now be between 0 and NMBPBG-1.
1858		 */
1859		VERIFY(class == MC_MBUF);
1860		VERIFY(sp->sl_refcnt >= 0 && sp->sl_refcnt <= (NMBPBG - 1) &&
1861		    sp->sl_chunks == NMBPBG &&
1862		    sp->sl_len == m_maxsize(MC_BIGCL));
1863		VERIFY(sp->sl_refcnt < (NMBPBG - 1) ||
1864		    (slab_is_detached(sp) && sp->sl_head == NULL));
1865	}
1866
1867	/*
1868	 * When auditing is enabled, ensure that the buffer still
1869	 * contains the free pattern.  Otherwise it got corrupted
1870	 * while at the CPU cache layer.
1871	 */
1872	if (mclaudit != NULL) {
1873		mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
1874		if (mclverify) {
1875			mcache_audit_free_verify(mca, buf, 0, m_maxsize(class));
1876		}
1877		mca->mca_uflags &= ~MB_SCVALID;
1878	}
1879
1880	if (class == MC_CL) {
1881		mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
1882		buf->obj_next = sp->sl_head;
1883	} else if (class == MC_BIGCL) {
1884		mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
1885		    m_infree(MC_MBUF_BIGCL);
1886	} else if (class == MC_16KCL) {
1887		++m_infree(MC_16KCL);
1888	} else {
1889		++m_infree(MC_MBUF);
1890		buf->obj_next = sp->sl_head;
1891	}
1892	sp->sl_head = buf;
1893
1894	/*
1895	 * If a slab has been split into either one which holds 2KB
1896	 * clusters or one which holds mbufs, turn it back into one
1897	 * which holds a single 4KB cluster.
1898	 */
1899	if (class == MC_MBUF && sp->sl_refcnt == 0 &&
1900	    m_total(class) > m_minlimit(class) &&
1901	    m_total(MC_BIGCL) < m_maxlimit(MC_BIGCL)) {
1902		int i = NMBPBG;
1903
1904		m_total(MC_BIGCL)++;
1905		mbstat.m_bigclusters = m_total(MC_BIGCL);
1906		m_total(MC_MBUF) -= NMBPBG;
1907		mbstat.m_mbufs = m_total(MC_MBUF);
1908		m_infree(MC_MBUF) -= NMBPBG;
1909		mtype_stat_add(MT_FREE, -((unsigned)NMBPBG));
1910
1911		VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
1912		VERIFY(m_total(MC_MBUF) >= m_minlimit(MC_MBUF));
1913
1914		while (i--) {
1915			struct mbuf *m = sp->sl_head;
1916			VERIFY(m != NULL);
1917			sp->sl_head = m->m_next;
1918			m->m_next = NULL;
1919		}
1920		VERIFY(sp->sl_head == NULL);
1921
1922		/* Remove the slab from the mbuf class's slab list */
1923		slab_remove(sp, class);
1924
1925		/* Reinitialize it as a 4KB cluster slab */
1926		slab_init(sp, MC_BIGCL, sp->sl_flags, sp->sl_base, sp->sl_base,
1927		    sp->sl_len, 0, 1);
1928
1929		if (mclverify) {
1930			mcache_set_pattern(MCACHE_FREE_PATTERN,
1931			    (caddr_t)sp->sl_head, m_maxsize(MC_BIGCL));
1932		}
1933		mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
1934		    m_infree(MC_MBUF_BIGCL);
1935
1936		VERIFY(slab_is_detached(sp));
1937		/* And finally switch class */
1938		class = MC_BIGCL;
1939	} else if (class == MC_CL && sp->sl_refcnt == 0 &&
1940	    m_total(class) > m_minlimit(class) &&
1941	    m_total(MC_BIGCL) < m_maxlimit(MC_BIGCL)) {
1942		int i = NCLPBG;
1943
1944		m_total(MC_BIGCL)++;
1945		mbstat.m_bigclusters = m_total(MC_BIGCL);
1946		m_total(MC_CL) -= NCLPBG;
1947		mbstat.m_clusters = m_total(MC_CL);
1948		m_infree(MC_CL) -= NCLPBG;
1949		VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
1950		VERIFY(m_total(MC_CL) >= m_minlimit(MC_CL));
1951
1952		while (i--) {
1953			union mcluster *c = sp->sl_head;
1954			VERIFY(c != NULL);
1955			sp->sl_head = c->mcl_next;
1956			c->mcl_next = NULL;
1957		}
1958		VERIFY(sp->sl_head == NULL);
1959
1960		/* Remove the slab from the 2KB cluster class's slab list */
1961		slab_remove(sp, class);
1962
1963		/* Reinitialize it as a 4KB cluster slab */
1964		slab_init(sp, MC_BIGCL, sp->sl_flags, sp->sl_base, sp->sl_base,
1965		    sp->sl_len, 0, 1);
1966
1967		if (mclverify) {
1968			mcache_set_pattern(MCACHE_FREE_PATTERN,
1969			    (caddr_t)sp->sl_head, m_maxsize(MC_BIGCL));
1970		}
1971		mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
1972		    m_infree(MC_MBUF_BIGCL);
1973
1974		VERIFY(slab_is_detached(sp));
1975		/* And finally switch class */
1976		class = MC_BIGCL;
1977	}
1978
1979	/* Reinsert the slab to the class's slab list */
1980	if (slab_is_detached(sp))
1981		slab_insert(sp, class);
1982}
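
/*
 * Illustration of the coalescing done above (NMBPBG == 16 is an assumed
 * value for the example): when the last mbuf cut from a 4KB cluster is
 * freed and the slab's reference count reaches 0, and provided that
 * m_total(MC_MBUF) > m_minlimit(MC_MBUF) and m_total(MC_BIGCL) <
 * m_maxlimit(MC_BIGCL), the 16 mbufs are unthreaded from sl_head, the
 * slab is re-initialized as a single MC_BIGCL chunk and the reclaimed
 * 4KB cluster is inserted into the big cluster freelist.  The same
 * applies to a slab split into NCLPBG 2KB clusters.
 */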
1983
1984/*
1985 * Common allocator for rudimentary objects called by the CPU cache layer
1986 * during an allocation request whenever there is no available element in the
1987 * bucket layer.  It returns one or more elements from the appropriate global
1988 * freelist.  If the freelist is empty, it will attempt to populate it and
1989 * retry the allocation.
1990 */
1991static unsigned int
1992mbuf_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait)
1993{
1994	mbuf_class_t class = (mbuf_class_t)arg;
1995	unsigned int need = num;
1996	mcache_obj_t **list = *plist;
1997
1998	ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
1999	ASSERT(need > 0);
2000
2001	lck_mtx_lock(mbuf_mlock);
2002
2003	for (;;) {
2004		if ((*list = slab_alloc(class, wait)) != NULL) {
2005			(*list)->obj_next = NULL;
2006			list = *plist = &(*list)->obj_next;
2007
2008			if (--need == 0) {
2009				/*
2010				 * If the number of elements in the freelist
2011				 * has dropped below the low watermark (1/32
2012				 * of the total), asynchronously populate it
2013				 * now rather than when we run out of elements.
2014				 */
2015				if (!mbuf_cached_above(class, wait) &&
2016				    m_infree(class) < m_total(class) >> 5) {
2017					(void) freelist_populate(class, 1,
2018					    M_DONTWAIT);
2019				}
2020				break;
2021			}
2022		} else {
2023			VERIFY(m_infree(class) == 0 || class == MC_CL);
2024
2025			(void) freelist_populate(class, 1,
2026			    (wait & MCR_NOSLEEP) ? M_DONTWAIT : M_WAIT);
2027
2028			if (m_infree(class) > 0)
2029				continue;
2030
2031			/* Check if there's anything at the cache layer */
2032			if (mbuf_cached_above(class, wait))
2033				break;
2034
2035			/* watchdog checkpoint */
2036			mbuf_watchdog();
2037
2038			/* We have nothing and cannot block; give up */
2039			if (wait & MCR_NOSLEEP) {
2040				if (!(wait & MCR_TRYHARD)) {
2041					m_fail_cnt(class)++;
2042					mbstat.m_drops++;
2043					break;
2044				}
2045			}
2046
2047			/*
2048			 * If the freelist is still empty and the caller is
2049			 * willing to be blocked, sleep on the wait channel
2050			 * until an element is available.  Otherwise, if
2051			 * MCR_TRYHARD is set, do our best to satisfy the
2052			 * request without having to go to sleep.
2053			 */
2054			if (mbuf_worker_ready &&
2055			    mbuf_sleep(class, need, wait))
2056				break;
2057
2058			lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2059		}
2060	}
2061
2062	m_alloc_cnt(class) += num - need;
2063	lck_mtx_unlock(mbuf_mlock);
2064
2065	return (num - need);
2066}
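
/*
 * Usage sketch (hedged; the mcache internals live outside this file):
 * mbuf_slab_alloc() is not called directly but is registered as the
 * backend allocator via mcache_create_ext() during initialization, so a
 * miss at the CPU/bucket layer ends up roughly as
 *
 *	mcache_obj_t *top = NULL, **list = &top;
 *	unsigned int got;
 *
 *	got = mbuf_slab_alloc((void *)(uintptr_t)MC_MBUF, &list, 4,
 *	    MCR_NOSLEEP);
 *
 * after which up to "got" raw mbufs are chained from "top" via obj_next.
 */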
2067
2068/*
2069 * Common de-allocator for rudimentary objects called by the CPU cache
2070 * layer when one or more elements need to be returned to the appropriate
2071 * global freelist.
2072 */
2073static void
2074mbuf_slab_free(void *arg, mcache_obj_t *list, __unused int purged)
2075{
2076	mbuf_class_t class = (mbuf_class_t)arg;
2077	mcache_obj_t *nlist;
2078	unsigned int num = 0;
2079	int w;
2080
2081	ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
2082
2083	lck_mtx_lock(mbuf_mlock);
2084
2085	for (;;) {
2086		nlist = list->obj_next;
2087		list->obj_next = NULL;
2088		slab_free(class, list);
2089		++num;
2090		if ((list = nlist) == NULL)
2091			break;
2092	}
2093	m_free_cnt(class) += num;
2094
2095	if ((w = mb_waiters) > 0)
2096		mb_waiters = 0;
2097
2098	lck_mtx_unlock(mbuf_mlock);
2099
2100	if (w != 0)
2101		wakeup(mb_waitchan);
2102}
2103
2104/*
2105 * Common auditor for rudimentary objects called by the CPU cache layer
2106 * during an allocation or free request.  For the former, this is called
2107 * after the objects are obtained from either the bucket or slab layer
2108 * and before they are returned to the caller.  For the latter, this is
2109 * called immediately during free and before placing the objects into
2110 * the bucket or slab layer.
2111 */
2112static void
2113mbuf_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
2114{
2115	mbuf_class_t class = (mbuf_class_t)arg;
2116	mcache_audit_t *mca;
2117
2118	ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
2119
2120	while (list != NULL) {
2121		lck_mtx_lock(mbuf_mlock);
2122		mca = mcl_audit_buf2mca(class, list);
2123
2124		/* Do the sanity checks */
2125		if (class == MC_MBUF) {
2126			mcl_audit_mbuf(mca, list, FALSE, alloc);
2127			ASSERT(mca->mca_uflags & MB_SCVALID);
2128		} else {
2129			mcl_audit_cluster(mca, list, m_maxsize(class),
2130			    alloc, TRUE);
2131			ASSERT(!(mca->mca_uflags & MB_SCVALID));
2132		}
2133		/* Record this transaction */
2134		if (mcltrace)
2135			mcache_buffer_log(mca, list, m_cache(class), &mb_start);
2136
2137		if (alloc)
2138			mca->mca_uflags |= MB_INUSE;
2139		else
2140			mca->mca_uflags &= ~MB_INUSE;
2141		/* Unpair the object (unconditionally) */
2142		mca->mca_uptr = NULL;
2143		lck_mtx_unlock(mbuf_mlock);
2144
2145		list = list->obj_next;
2146	}
2147}
2148
2149/*
2150 * Common notify routine for all caches.  It is called by mcache when
2151 * one or more objects get freed.  We use this indication to trigger
2152 * the wakeup of any sleeping threads so that they can retry their
2153 * allocation requests.
2154 */
2155static void
2156mbuf_slab_notify(void *arg, u_int32_t reason)
2157{
2158	mbuf_class_t class = (mbuf_class_t)arg;
2159	int w;
2160
2161	ASSERT(MBUF_CLASS_VALID(class));
2162
2163	if (reason != MCN_RETRYALLOC)
2164		return;
2165
2166	lck_mtx_lock(mbuf_mlock);
2167	if ((w = mb_waiters) > 0) {
2168		m_notified(class)++;
2169		mb_waiters = 0;
2170	}
2171	lck_mtx_unlock(mbuf_mlock);
2172
2173	if (w != 0)
2174		wakeup(mb_waitchan);
2175}
2176
2177/*
2178 * Obtain object(s) from the composite class's freelist.
2179 */
2180static unsigned int
2181cslab_alloc(mbuf_class_t class, mcache_obj_t ***plist, unsigned int num)
2182{
2183	unsigned int need = num;
2184	mcl_slab_t *sp, *clsp, *nsp;
2185	struct mbuf *m;
2186	mcache_obj_t **list = *plist;
2187	void *cl;
2188
2189	VERIFY(need > 0);
2190	VERIFY(class != MC_MBUF_16KCL || njcl > 0);
2191	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2192
2193	/* Get what we can from the freelist */
2194	while ((*list = m_cobjlist(class)) != NULL) {
2195		MRANGE(*list);
2196
2197		m = (struct mbuf *)*list;
2198		sp = slab_get(m);
2199		cl = m->m_ext.ext_buf;
2200		clsp = slab_get(cl);
2201		VERIFY(m->m_flags == M_EXT && cl != NULL);
2202		VERIFY(MEXT_RFA(m) != NULL && MBUF_IS_COMPOSITE(m));
2203
2204		if (class == MC_MBUF_CL) {
2205			VERIFY(clsp->sl_refcnt >= 1 &&
2206			    clsp->sl_refcnt <= NCLPBG);
2207		} else {
2208			VERIFY(clsp->sl_refcnt == 1);
2209		}
2210
2211		if (class == MC_MBUF_16KCL) {
2212			int k;
2213			for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2214				nsp = nsp->sl_next;
2215				/* Next slab must already be present */
2216				VERIFY(nsp != NULL);
2217				VERIFY(nsp->sl_refcnt == 1);
2218			}
2219		}
2220
2221		if ((m_cobjlist(class) = (*list)->obj_next) != NULL &&
2222		    !MBUF_IN_MAP(m_cobjlist(class))) {
2223			slab_nextptr_panic(sp, m_cobjlist(class));
2224			/* NOTREACHED */
2225		}
2226		(*list)->obj_next = NULL;
2227		list = *plist = &(*list)->obj_next;
2228
2229		if (--need == 0)
2230			break;
2231	}
2232	m_infree(class) -= (num - need);
2233
2234	return (num - need);
2235}
2236
2237/*
2238 * Place object(s) back into a composite class's freelist.
2239 */
2240static unsigned int
2241cslab_free(mbuf_class_t class, mcache_obj_t *list, int purged)
2242{
2243	mcache_obj_t *o, *tail;
2244	unsigned int num = 0;
2245	struct mbuf *m, *ms;
2246	mcache_audit_t *mca = NULL;
2247	mcache_obj_t *ref_list = NULL;
2248	mcl_slab_t *clsp, *nsp;
2249	void *cl;
2250	mbuf_class_t cl_class;
2251
2252	ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2253	VERIFY(class != MC_MBUF_16KCL || njcl > 0);
2254	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2255
2256	if (class == MC_MBUF_CL) {
2257		cl_class = MC_CL;
2258	} else if (class == MC_MBUF_BIGCL) {
2259		cl_class = MC_BIGCL;
2260	} else {
2261		VERIFY(class == MC_MBUF_16KCL);
2262		cl_class = MC_16KCL;
2263	}
2264
2265	o = tail = list;
2266
2267	while ((m = ms = (struct mbuf *)o) != NULL) {
2268		mcache_obj_t *rfa, *nexto = o->obj_next;
2269
2270		/* Do the mbuf sanity checks */
2271		if (mclaudit != NULL) {
2272			mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2273			if (mclverify) {
2274				mcache_audit_free_verify(mca, m, 0,
2275				    m_maxsize(MC_MBUF));
2276			}
2277			ms = MCA_SAVED_MBUF_PTR(mca);
2278		}
2279
2280		/* Do the cluster sanity checks */
2281		cl = ms->m_ext.ext_buf;
2282		clsp = slab_get(cl);
2283		if (mclverify) {
2284			size_t size = m_maxsize(cl_class);
2285			mcache_audit_free_verify(mcl_audit_buf2mca(cl_class,
2286			    (mcache_obj_t *)cl), cl, 0, size);
2287		}
2288		VERIFY(ms->m_type == MT_FREE);
2289		VERIFY(ms->m_flags == M_EXT);
2290		VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2291		if (cl_class == MC_CL) {
2292			VERIFY(clsp->sl_refcnt >= 1 &&
2293			    clsp->sl_refcnt <= NCLPBG);
2294		} else {
2295			VERIFY(clsp->sl_refcnt == 1);
2296		}
2297		if (cl_class == MC_16KCL) {
2298			int k;
2299			for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2300				nsp = nsp->sl_next;
2301				/* Next slab must already be present */
2302				VERIFY(nsp != NULL);
2303				VERIFY(nsp->sl_refcnt == 1);
2304			}
2305		}
2306
2307		/*
2308		 * If we're asked to purge, restore the actual mbuf using the
2309		 * contents of the shadow structure (if auditing is enabled)
2310		 * and clear the EXTF_COMPOSITE flag from the mbuf, as we are
2311		 * about to free it and the attached cluster into their caches.
2312		 */
2313		if (purged) {
2314			/* Restore constructed mbuf fields */
2315			if (mclaudit != NULL)
2316				mcl_audit_restore_mbuf(m, mca, TRUE);
2317
2318			MEXT_REF(m) = 0;
2319			MEXT_FLAGS(m) = 0;
2320
2321			rfa = (mcache_obj_t *)(void *)MEXT_RFA(m);
2322			rfa->obj_next = ref_list;
2323			ref_list = rfa;
2324			MEXT_RFA(m) = NULL;
2325
2326			m->m_type = MT_FREE;
2327			m->m_flags = m->m_len = 0;
2328			m->m_next = m->m_nextpkt = NULL;
2329
2330			/* Save mbuf fields and make auditing happy */
2331			if (mclaudit != NULL)
2332				mcl_audit_mbuf(mca, o, FALSE, FALSE);
2333
2334			VERIFY(m_total(class) > 0);
2335			m_total(class)--;
2336
2337			/* Free the mbuf */
2338			o->obj_next = NULL;
2339			slab_free(MC_MBUF, o);
2340
2341			/* And free the cluster */
2342			((mcache_obj_t *)cl)->obj_next = NULL;
2343			if (class == MC_MBUF_CL)
2344				slab_free(MC_CL, cl);
2345			else if (class == MC_MBUF_BIGCL)
2346				slab_free(MC_BIGCL, cl);
2347			else
2348				slab_free(MC_16KCL, cl);
2349		}
2350
2351		++num;
2352		tail = o;
2353		o = nexto;
2354	}
2355
2356	if (!purged) {
2357		tail->obj_next = m_cobjlist(class);
2358		m_cobjlist(class) = list;
2359		m_infree(class) += num;
2360	} else if (ref_list != NULL) {
2361		mcache_free_ext(ref_cache, ref_list);
2362	}
2363
2364	return (num);
2365}
2366
2367/*
2368 * Common allocator for composite objects called by the CPU cache layer
2369 * during an allocation request whenever there is no available element in
2370 * the bucket layer.  It returns one or more composite elements from the
2371 * appropriate global freelist.  If the freelist is empty, it will attempt
2372 * to obtain the rudimentary objects from their caches and construct them
2373 * into composite mbuf + cluster objects.
2374 */
2375static unsigned int
2376mbuf_cslab_alloc(void *arg, mcache_obj_t ***plist, unsigned int needed,
2377    int wait)
2378{
2379	mbuf_class_t class = (mbuf_class_t)arg;
2380	mbuf_class_t cl_class = 0;
2381	unsigned int num = 0, cnum = 0, want = needed;
2382	mcache_obj_t *ref_list = NULL;
2383	mcache_obj_t *mp_list = NULL;
2384	mcache_obj_t *clp_list = NULL;
2385	mcache_obj_t **list;
2386	struct ext_ref *rfa;
2387	struct mbuf *m;
2388	void *cl;
2389
2390	ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2391	ASSERT(needed > 0);
2392
2393	VERIFY(class != MC_MBUF_16KCL || njcl > 0);
2394
2395	/* There should not be any slab for this class */
2396	VERIFY(m_slab_cnt(class) == 0 &&
2397	    m_slablist(class).tqh_first == NULL &&
2398	    m_slablist(class).tqh_last == NULL);
2399
2400	lck_mtx_lock(mbuf_mlock);
2401
2402	/* Try using the freelist first */
2403	num = cslab_alloc(class, plist, needed);
2404	list = *plist;
2405	if (num == needed) {
2406		m_alloc_cnt(class) += num;
2407		lck_mtx_unlock(mbuf_mlock);
2408		return (needed);
2409	}
2410
2411	lck_mtx_unlock(mbuf_mlock);
2412
2413	/*
2414	 * We could not satisfy the request using the freelist alone;
2415	 * allocate from the appropriate rudimentary caches and use
2416	 * whatever we can get to construct the composite objects.
2417	 */
2418	needed -= num;
2419
2420	/*
2421	 * Mark these allocation requests as coming from a composite cache.
2422	 * Also, if the caller is willing to be blocked, mark the request
2423	 * with MCR_FAILOK so that we don't end up sleeping at the mbuf
2424	 * slab layer waiting for individual objects when one or more
2425	 * already-constructed composite objects are available.
2426	 */
2427	wait |= MCR_COMP;
2428	if (!(wait & MCR_NOSLEEP))
2429		wait |= MCR_FAILOK;
2430
2431	/* allocate mbufs */
2432	needed = mcache_alloc_ext(m_cache(MC_MBUF), &mp_list, needed, wait);
2433	if (needed == 0) {
2434		ASSERT(mp_list == NULL);
2435		goto fail;
2436	}
2437
2438	/* allocate clusters */
2439	if (class == MC_MBUF_CL) {
2440		cl_class = MC_CL;
2441	} else if (class == MC_MBUF_BIGCL) {
2442		cl_class = MC_BIGCL;
2443	} else {
2444		VERIFY(class == MC_MBUF_16KCL);
2445		cl_class = MC_16KCL;
2446	}
2447	needed = mcache_alloc_ext(m_cache(cl_class), &clp_list, needed, wait);
2448	if (needed == 0) {
2449		ASSERT(clp_list == NULL);
2450		goto fail;
2451	}
2452
2453	needed = mcache_alloc_ext(ref_cache, &ref_list, needed, wait);
2454	if (needed == 0) {
2455		ASSERT(ref_list == NULL);
2456		goto fail;
2457	}
2458
2459	/*
2460	 * By this time "needed" is MIN(mbuf, cluster, ref).  Any
2461	 * leftovers will be freed accordingly before we return to the caller.
2462	 */
2463	for (cnum = 0; cnum < needed; cnum++) {
2464		struct mbuf *ms;
2465
2466		m = ms = (struct mbuf *)mp_list;
2467		mp_list = mp_list->obj_next;
2468
2469		cl = clp_list;
2470		clp_list = clp_list->obj_next;
2471		((mcache_obj_t *)cl)->obj_next = NULL;
2472
2473		rfa = (struct ext_ref *)ref_list;
2474		ref_list = ref_list->obj_next;
2475		((mcache_obj_t *)(void *)rfa)->obj_next = NULL;
2476
2477		/*
2478		 * If auditing is enabled, construct the shadow mbuf
2479		 * in the audit structure instead of in the actual one.
2480		 * mbuf_cslab_audit() will take care of restoring the
2481		 * contents after the integrity check.
2482		 */
2483		if (mclaudit != NULL) {
2484			mcache_audit_t *mca, *cl_mca;
2485
2486			lck_mtx_lock(mbuf_mlock);
2487			mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2488			ms = MCA_SAVED_MBUF_PTR(mca);
2489			cl_mca = mcl_audit_buf2mca(MC_CL, (mcache_obj_t *)cl);
2490
2491			/*
2492			 * Pair them up.  Note that this is done at the time
2493			 * the mbuf+cluster objects are constructed.  This
2494			 * information should be treated as a "best effort"
2495			 * debugging hint since more than one mbuf can refer
2496			 * to a cluster.  In that case, the cluster might not
2497			 * be freed along with the mbuf it was paired with.
2498			 */
2499			mca->mca_uptr = cl_mca;
2500			cl_mca->mca_uptr = mca;
2501
2502			ASSERT(mca->mca_uflags & MB_SCVALID);
2503			ASSERT(!(cl_mca->mca_uflags & MB_SCVALID));
2504			lck_mtx_unlock(mbuf_mlock);
2505
2506			/* Technically, they are in the freelist */
2507			if (mclverify) {
2508				size_t size;
2509
2510				mcache_set_pattern(MCACHE_FREE_PATTERN, m,
2511				    m_maxsize(MC_MBUF));
2512
2513				if (class == MC_MBUF_CL)
2514					size = m_maxsize(MC_CL);
2515				else if (class == MC_MBUF_BIGCL)
2516					size = m_maxsize(MC_BIGCL);
2517				else
2518					size = m_maxsize(MC_16KCL);
2519
2520				mcache_set_pattern(MCACHE_FREE_PATTERN, cl,
2521				    size);
2522			}
2523		}
2524
2525		MBUF_INIT(ms, 0, MT_FREE);
2526		if (class == MC_MBUF_16KCL) {
2527			MBUF_16KCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2528		} else if (class == MC_MBUF_BIGCL) {
2529			MBUF_BIGCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2530		} else {
2531			MBUF_CL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2532		}
2533		VERIFY(ms->m_flags == M_EXT);
2534		VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2535
2536		*list = (mcache_obj_t *)m;
2537		(*list)->obj_next = NULL;
2538		list = *plist = &(*list)->obj_next;
2539	}
2540
2541fail:
2542	/*
2543	 * Free up what's left of the above.
2544	 */
2545	if (mp_list != NULL)
2546		mcache_free_ext(m_cache(MC_MBUF), mp_list);
2547	if (clp_list != NULL)
2548		mcache_free_ext(m_cache(cl_class), clp_list);
2549	if (ref_list != NULL)
2550		mcache_free_ext(ref_cache, ref_list);
2551
2552	lck_mtx_lock(mbuf_mlock);
2553	if (num > 0 || cnum > 0) {
2554		m_total(class) += cnum;
2555		VERIFY(m_total(class) <= m_maxlimit(class));
2556		m_alloc_cnt(class) += num + cnum;
2557	}
2558	if ((num + cnum) < want)
2559		m_fail_cnt(class) += (want - (num + cnum));
2560	lck_mtx_unlock(mbuf_mlock);
2561
2562	return (num + cnum);
2563}
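
/*
 * Worked example of the construction pipeline above (all figures are
 * hypothetical): for a request of 8 MC_MBUF_CL objects where the
 * composite freelist yields num == 3, the remaining needed == 5 is
 * narrowed step by step, e.g.
 *
 *	mcache_alloc_ext(m_cache(MC_MBUF), ...)	returns 5 mbufs
 *	mcache_alloc_ext(m_cache(MC_CL), ...)	returns 4 clusters
 *	mcache_alloc_ext(ref_cache, ...)	returns 4 ext_ref structs
 *
 * so cnum == 4 composites are built, 3 + 4 == 7 objects are returned,
 * the surplus mbuf is released at the "fail:" label, and m_fail_cnt()
 * is charged for the 1 object short of "want".
 */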
2564
2565/*
2566 * Common de-allocator for composite objects called by the CPU cache
2567 * layer when one or more elements need to be returned to the appropriate
2568 * global freelist.
2569 */
2570static void
2571mbuf_cslab_free(void *arg, mcache_obj_t *list, int purged)
2572{
2573	mbuf_class_t class = (mbuf_class_t)arg;
2574	unsigned int num;
2575	int w;
2576
2577	ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2578
2579	lck_mtx_lock(mbuf_mlock);
2580
2581	num = cslab_free(class, list, purged);
2582	m_free_cnt(class) += num;
2583
2584	if ((w = mb_waiters) > 0)
2585		mb_waiters = 0;
2586
2587	lck_mtx_unlock(mbuf_mlock);
2588
2589	if (w != 0)
2590		wakeup(mb_waitchan);
2591}
2592
2593/*
2594 * Common auditor for composite objects called by the CPU cache layer
2595 * during an allocation or free request.  For the former, this is called
2596 * after the objects are obtained from either the bucket or slab layer
2597 * and before they are returned to the caller.  For the latter, this is
2598 * called immediately during free and before placing the objects into
2599 * the bucket or slab layer.
2600 */
2601static void
2602mbuf_cslab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
2603{
2604	mbuf_class_t class = (mbuf_class_t)arg;
2605	mcache_audit_t *mca;
2606	struct mbuf *m, *ms;
2607	mcl_slab_t *clsp, *nsp;
2608	size_t size;
2609	void *cl;
2610
2611	ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2612
2613	while ((m = ms = (struct mbuf *)list) != NULL) {
2614		lck_mtx_lock(mbuf_mlock);
2615		/* Do the mbuf sanity checks and record its transaction */
2616		mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2617		mcl_audit_mbuf(mca, m, TRUE, alloc);
2618		if (mcltrace)
2619			mcache_buffer_log(mca, m, m_cache(class), &mb_start);
2620
2621		if (alloc)
2622			mca->mca_uflags |= MB_COMP_INUSE;
2623		else
2624			mca->mca_uflags &= ~MB_COMP_INUSE;
2625
2626		/*
2627		 * Use the shadow mbuf in the audit structure if we are
2628		 * freeing, since the contents of the actual mbuf have been
2629		 * pattern-filled by the above call to mcl_audit_mbuf().
2630		 */
2631		if (!alloc && mclverify)
2632			ms = MCA_SAVED_MBUF_PTR(mca);
2633
2634		/* Do the cluster sanity checks and record its transaction */
2635		cl = ms->m_ext.ext_buf;
2636		clsp = slab_get(cl);
2637		VERIFY(ms->m_flags == M_EXT && cl != NULL);
2638		VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2639		if (class == MC_MBUF_CL)
2640			VERIFY(clsp->sl_refcnt >= 1 &&
2641			    clsp->sl_refcnt <= NCLPBG);
2642		else
2643			VERIFY(clsp->sl_refcnt == 1);
2644
2645		if (class == MC_MBUF_16KCL) {
2646			int k;
2647			for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2648				nsp = nsp->sl_next;
2649				/* Next slab must already be present */
2650				VERIFY(nsp != NULL);
2651				VERIFY(nsp->sl_refcnt == 1);
2652			}
2653		}
2654
2655		mca = mcl_audit_buf2mca(MC_CL, cl);
2656		if (class == MC_MBUF_CL)
2657			size = m_maxsize(MC_CL);
2658		else if (class == MC_MBUF_BIGCL)
2659			size = m_maxsize(MC_BIGCL);
2660		else
2661			size = m_maxsize(MC_16KCL);
2662		mcl_audit_cluster(mca, cl, size, alloc, FALSE);
2663		if (mcltrace)
2664			mcache_buffer_log(mca, cl, m_cache(class), &mb_start);
2665
2666		if (alloc)
2667			mca->mca_uflags |= MB_COMP_INUSE;
2668		else
2669			mca->mca_uflags &= ~MB_COMP_INUSE;
2670		lck_mtx_unlock(mbuf_mlock);
2671
2672		list = list->obj_next;
2673	}
2674}
2675
2676/*
2677 * Allocate some number of mbuf clusters and place on cluster freelist.
2678 */
2679static int
2680m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize)
2681{
2682	int i;
2683	vm_size_t size = 0;
2684	int numpages = 0, large_buffer = (bufsize == m_maxsize(MC_16KCL));
2685	vm_offset_t page = 0;
2686	mcache_audit_t *mca_list = NULL;
2687	mcache_obj_t *con_list = NULL;
2688	mcl_slab_t *sp;
2689
2690	VERIFY(bufsize == m_maxsize(MC_BIGCL) ||
2691	    bufsize == m_maxsize(MC_16KCL));
2692
2693	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2694
2695	/*
2696	 * Multiple threads may attempt to populate the cluster map one
2697	 * after another.  Since we drop the lock below prior to acquiring
2698	 * the physical page(s), our view of the cluster map may no longer
2699	 * be accurate, and we could end up over-committing the pages beyond
2700	 * the maximum allowed for each class.  To prevent this, the entire
2701	 * operation (including the page mapping) is serialized.
2702	 */
2703	while (mb_clalloc_busy) {
2704		mb_clalloc_waiters++;
2705		(void) msleep(mb_clalloc_waitchan, mbuf_mlock,
2706		    (PZERO-1), "m_clalloc", NULL);
2707		lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2708	}
2709
2710	/* We are busy now; tell everyone else to go away */
2711	mb_clalloc_busy = TRUE;
2712
2713	/*
2714	 * Honor the caller's wish to block or not block.  We have a way
2715	 * to grow the pool asynchronously using the mbuf worker thread.
2716	 */
2717	i = m_howmany(num, bufsize);
2718	if (i == 0 || (wait & M_DONTWAIT))
2719		goto out;
2720
2721	lck_mtx_unlock(mbuf_mlock);
2722
2723	size = round_page(i * bufsize);
2724	page = kmem_mb_alloc(mb_map, size, large_buffer);
2725
2726	/*
2727	 * If we asked for "n" 16KB physically contiguous chunks
2728	 * and didn't get them, then try again without the contiguity
2729	 * restriction.
2730	 */
2731	if (large_buffer && page == 0)
2732		page = kmem_mb_alloc(mb_map, size, 0);
2733
2734	if (page == 0) {
2735		if (bufsize == m_maxsize(MC_BIGCL)) {
2736			/* If that failed, try for one page (4KB requests only) */
2737			size = NBPG;
2738			page = kmem_mb_alloc(mb_map, size, 0);
2739		}
2740
2741		if (page == 0) {
2742			lck_mtx_lock(mbuf_mlock);
2743			goto out;
2744		}
2745	}
2746
2747	VERIFY(IS_P2ALIGNED(page, NBPG));
2748	numpages = size / NBPG;
2749
2750	/* If auditing is enabled, allocate the audit structures now */
2751	if (mclaudit != NULL) {
2752		int needed;
2753
2754		/*
2755		 * Yes, I realize this is a waste of memory for clusters
2756		 * that never get transformed into mbufs, as we may end
2757		 * up with NMBPBG-1 unused audit structures per cluster.
2758		 * But doing so tremendously simplifies the allocation
2759		 * strategy, since at this point we are not holding the
2760		 * mbuf lock and the caller is okay to be blocked.
2761		 */
2762		if (bufsize == m_maxsize(MC_BIGCL)) {
2763			needed = numpages * NMBPBG;
2764
2765			i = mcache_alloc_ext(mcl_audit_con_cache,
2766			    &con_list, needed, MCR_SLEEP);
2767
2768			VERIFY(con_list != NULL && i == needed);
2769		} else {
2770			needed = numpages / NSLABSP16KB;
2771		}
2772
2773		i = mcache_alloc_ext(mcache_audit_cache,
2774		    (mcache_obj_t **)&mca_list, needed, MCR_SLEEP);
2775
2776		VERIFY(mca_list != NULL && i == needed);
2777	}
2778
2779	lck_mtx_lock(mbuf_mlock);
2780
2781	for (i = 0; i < numpages; i++, page += NBPG) {
2782		ppnum_t offset = ((char *)page - (char *)mbutl) / NBPG;
2783		ppnum_t new_page = pmap_find_phys(kernel_pmap, page);
2784		mbuf_class_t class = MC_BIGCL;
2785
2786		/*
2787		 * If there is a mapper, the appropriate I/O page is returned;
2788		 * zero out the page to discard its past contents and prevent
2789		 * exposing leftover kernel memory.
2790		 */
2791		VERIFY(offset < mcl_pages);
2792		if (mcl_paddr_base != 0) {
2793			bzero((void *)(uintptr_t) page, page_size);
2794			new_page = IOMapperInsertPage(mcl_paddr_base,
2795			    offset, new_page);
2796		}
2797		mcl_paddr[offset] = new_page;
2798
2799		/* Pattern-fill this fresh page */
2800		if (mclverify) {
2801			mcache_set_pattern(MCACHE_FREE_PATTERN,
2802			    (caddr_t)page, NBPG);
2803		}
2804		if (bufsize == m_maxsize(MC_BIGCL)) {
2805			union mbigcluster *mbc = (union mbigcluster *)page;
2806
2807			/* One for the entire page */
2808			sp = slab_get(mbc);
2809			if (mclaudit != NULL) {
2810				mcl_audit_init(mbc, &mca_list, &con_list,
2811				    AUDIT_CONTENTS_SIZE, NMBPBG);
2812			}
2813			VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
2814			slab_init(sp, MC_BIGCL, SLF_MAPPED,
2815			    mbc, mbc, bufsize, 0, 1);
2816
2817			/* Insert this slab */
2818			slab_insert(sp, MC_BIGCL);
2819
2820			/* Update stats now since slab_get() drops the lock */
2821			mbstat.m_bigclfree = ++m_infree(MC_BIGCL) +
2822			    m_infree(MC_MBUF_BIGCL);
2823			mbstat.m_bigclusters = ++m_total(MC_BIGCL);
2824			VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
2825			class = MC_BIGCL;
2826		} else if ((i % NSLABSP16KB) == 0) {
2827			union m16kcluster *m16kcl = (union m16kcluster *)page;
2828			mcl_slab_t *nsp;
2829			int k;
2830
2831			VERIFY(njcl > 0);
2832			/* One for the entire 16KB */
2833			sp = slab_get(m16kcl);
2834			if (mclaudit != NULL)
2835				mcl_audit_init(m16kcl, &mca_list, NULL, 0, 1);
2836
2837			VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
2838			slab_init(sp, MC_16KCL, SLF_MAPPED,
2839			    m16kcl, m16kcl, bufsize, 0, 1);
2840
2841			/*
2842			 * 2nd-Nth page's slab is part of the first one,
2843			 * where N is NSLABSP16KB.
2844			 */
2845			for (k = 1; k < NSLABSP16KB; k++) {
2846				nsp = slab_get(((union mbigcluster *)page) + k);
2847				VERIFY(nsp->sl_refcnt == 0 &&
2848				    nsp->sl_flags == 0);
2849				slab_init(nsp, MC_16KCL,
2850				    SLF_MAPPED | SLF_PARTIAL,
2851				    m16kcl, NULL, 0, 0, 0);
2852			}
2853
2854			/* Insert this slab */
2855			slab_insert(sp, MC_16KCL);
2856
2857			/* Update stats now since slab_get() drops the lock */
2858			m_infree(MC_16KCL)++;
2859			m_total(MC_16KCL)++;
2860			VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
2861			class = MC_16KCL;
2862		}
2863		if (!mb_peak_newreport && mbuf_report_usage(class))
2864			mb_peak_newreport = TRUE;
2865	}
2866	VERIFY(mca_list == NULL && con_list == NULL);
2867
2868	/* We're done; let others enter */
2869	mb_clalloc_busy = FALSE;
2870	if (mb_clalloc_waiters > 0) {
2871		mb_clalloc_waiters = 0;
2872		wakeup(mb_clalloc_waitchan);
2873	}
2874
2875	if (bufsize == m_maxsize(MC_BIGCL))
2876		return (numpages);
2877
2878	VERIFY(bufsize == m_maxsize(MC_16KCL));
2879	return (numpages / NSLABSP16KB);
2880
2881out:
2882	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2883
2884	/* We're done; let others enter */
2885	mb_clalloc_busy = FALSE;
2886	if (mb_clalloc_waiters > 0) {
2887		mb_clalloc_waiters = 0;
2888		wakeup(mb_clalloc_waitchan);
2889	}
2890
2891	/*
2892	 * When non-blocking, we kick the worker thread if we have to grow
2893	 * the pool or if the number of free clusters is less than requested.
2894	 */
2895	if (bufsize == m_maxsize(MC_BIGCL)) {
2896		if (i > 0) {
2897			/*
2898			 * Remember total number of 4KB clusters needed
2899			 * at this time.
2900			 */
2901			i += m_total(MC_BIGCL);
2902			if (i > mbuf_expand_big) {
2903				mbuf_expand_big = i;
2904				if (mbuf_worker_ready)
2905					wakeup((caddr_t)&mbuf_worker_run);
2906			}
2907		}
2908
2909		if (m_infree(MC_BIGCL) >= num)
2910			return (1);
2911	} else {
2912		if (i > 0) {
2913			/*
2914			 * Remember total number of 16KB clusters needed
2915			 * at this time.
2916			 */
2917			i += m_total(MC_16KCL);
2918			if (i > mbuf_expand_16k) {
2919				mbuf_expand_16k = i;
2920				if (mbuf_worker_ready)
2921					wakeup((caddr_t)&mbuf_worker_run);
2922			}
2923		}
2924
2925		if (m_infree(MC_16KCL) >= num)
2926			return (1);
2927	}
2928	return (0);
2929}
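
/*
 * Note on the page accounting in m_clalloc() (assuming NBPG == 4096,
 * which is not defined in this excerpt): a 4KB request creates one
 * MC_BIGCL slab per page and returns "numpages" directly, whereas a
 * 16KB request consumes NSLABSP16KB == 4 pages per cluster and returns
 * numpages / NSLABSP16KB.  For example, mapping 8 pages yields either
 * 8 big clusters or 2 jumbo clusters.
 */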
2930
2931/*
2932 * Populate the global freelist of the corresponding buffer class.
2933 */
2934static int
2935freelist_populate(mbuf_class_t class, unsigned int num, int wait)
2936{
2937	mcache_obj_t *o = NULL;
2938	int i, numpages = 0, count;
2939
2940	VERIFY(class == MC_MBUF || class == MC_CL || class == MC_BIGCL ||
2941	    class == MC_16KCL);
2942
2943	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2944
2945	switch (class) {
2946	case MC_MBUF:
2947	case MC_CL:
2948	case MC_BIGCL:
2949		numpages = (num * m_size(class) + NBPG - 1) / NBPG;
2950		i = m_clalloc(numpages, wait, m_maxsize(MC_BIGCL));
2951
2952		/* Respect the 4KB clusters minimum limit */
2953		if (m_total(MC_BIGCL) == m_maxlimit(MC_BIGCL) &&
2954		    m_infree(MC_BIGCL) <= m_minlimit(MC_BIGCL)) {
2955			if (class != MC_BIGCL || (wait & MCR_COMP))
2956				return (0);
2957		}
2958		if (class == MC_BIGCL)
2959			return (i != 0);
2960		break;
2961
2962	case MC_16KCL:
2963		return (m_clalloc(num, wait, m_maxsize(class)) != 0);
2964		/* NOTREACHED */
2965
2966	default:
2967		VERIFY(0);
2968		/* NOTREACHED */
2969	}
2970
2971	VERIFY(class == MC_MBUF || class == MC_CL);
2972
2973	/* how many objects will we cut the page into? */
2974	int numobj = (class == MC_MBUF ? NMBPBG : NCLPBG);
2975
2976	for (count = 0; count < numpages; count++) {
2977
2978		/* respect totals, minlimit, maxlimit */
2979		if (m_total(MC_BIGCL) <= m_minlimit(MC_BIGCL) ||
2980		    m_total(class) >= m_maxlimit(class))
2981			break;
2982
2983		if ((o = slab_alloc(MC_BIGCL, wait)) == NULL)
2984			break;
2985
2986		struct mbuf *m = (struct mbuf *)o;
2987		union mcluster *c = (union mcluster *)o;
2988		mcl_slab_t *sp = slab_get(o);
2989		mcache_audit_t *mca = NULL;
2990
2991		VERIFY(slab_is_detached(sp) &&
2992		    (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
2993
2994		/*
2995		 * Make sure that the cluster is unmolested
2996		 * while in the freelist.
2997		 */
2998		if (mclverify) {
2999			mca = mcl_audit_buf2mca(MC_BIGCL, o);
3000			mcache_audit_free_verify(mca, o, 0,
3001			    m_maxsize(MC_BIGCL));
3002		}
3003
3004		/* Reinitialize it as an mbuf or 2K slab */
3005		slab_init(sp, class, sp->sl_flags,
3006		    sp->sl_base, NULL, sp->sl_len, 0, numobj);
3007
3008		VERIFY(o == (mcache_obj_t *)sp->sl_base);
3009		VERIFY(sp->sl_head == NULL);
3010
3011		VERIFY(m_total(MC_BIGCL) > 0);
3012		m_total(MC_BIGCL)--;
3013		mbstat.m_bigclusters = m_total(MC_BIGCL);
3014
3015		m_total(class) += numobj;
3016		m_infree(class) += numobj;
3017
3018		VERIFY(m_total(MC_BIGCL) >= m_minlimit(MC_BIGCL));
3019		VERIFY(m_total(class) <= m_maxlimit(class));
3020		if (!mb_peak_newreport && mbuf_report_usage(class))
3021			mb_peak_newreport = TRUE;
3022
3023		i = numobj;
3024		if (class == MC_MBUF) {
3025			mbstat.m_mbufs = m_total(MC_MBUF);
3026			mtype_stat_add(MT_FREE, NMBPBG);
3027			while (i--) {
3028				/*
3029				 * If auditing is enabled, construct the
3030				 * shadow mbuf in the audit structure
3031				 * instead of the actual one.
3032				 * mbuf_slab_audit() will take care of
3033				 * restoring the contents after the
3034				 * integrity check.
3035				 */
3036				if (mclaudit != NULL) {
3037					struct mbuf *ms;
3038					mca = mcl_audit_buf2mca(MC_MBUF,
3039					    (mcache_obj_t *)m);
3040					ms = MCA_SAVED_MBUF_PTR(mca);
3041					ms->m_type = MT_FREE;
3042				} else {
3043					m->m_type = MT_FREE;
3044				}
3045				m->m_next = sp->sl_head;
3046				sp->sl_head = (void *)m++;
3047			}
3048		} else { /* MC_CL */
3049			mbstat.m_clfree =
3050			    m_infree(MC_CL) + m_infree(MC_MBUF_CL);
3051			mbstat.m_clusters = m_total(MC_CL);
3052			while (i--) {
3053				c->mcl_next = sp->sl_head;
3054				sp->sl_head = (void *)c++;
3055			}
3056		}
3057
3058		/* Insert into the mbuf or 2k slab list */
3059		slab_insert(sp, class);
3060
3061		if ((i = mb_waiters) > 0)
3062			mb_waiters = 0;
3063		if (i != 0)
3064			wakeup(mb_waitchan);
3065	}
3066	return (count != 0);
3067}
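
/*
 * Worked example for freelist_populate() (MSIZE == 256 and NBPG == 4096
 * are assumed for the arithmetic, which matches NMBPBG == 16): a request
 * to populate 100 MC_MBUF objects computes
 *
 *	numpages = (100 * 256 + 4095) / 4096 = 7
 *
 * and then carves up to 7 MC_BIGCL slabs into 16 mbufs each, adding at
 * most 112 mbufs to the freelist (fewer if MC_BIGCL reaches its minimum
 * or MC_MBUF reaches its maximum limit first).
 */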
3068
3069/*
3070 * For each class, initialize the freelist to hold m_minlimit() objects.
3071 */
3072static void
3073freelist_init(mbuf_class_t class)
3074{
3075	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3076
3077	VERIFY(class == MC_CL || class == MC_BIGCL);
3078	VERIFY(m_total(class) == 0);
3079	VERIFY(m_minlimit(class) > 0);
3080
3081	while (m_total(class) < m_minlimit(class))
3082		(void) freelist_populate(class, m_minlimit(class), M_WAIT);
3083
3084	VERIFY(m_total(class) >= m_minlimit(class));
3085}
3086
3087/*
3088 * (Inaccurately) check if it might be worth a trip back to the
3089 * mcache layer due to the availability of objects there.  We'll
3090 * end up back here if there's nothing up there.
3091 */
3092static boolean_t
3093mbuf_cached_above(mbuf_class_t class, int wait)
3094{
3095	switch (class) {
3096	case MC_MBUF:
3097		if (wait & MCR_COMP)
3098			return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL)) ||
3099			    !mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL)));
3100		break;
3101
3102	case MC_CL:
3103		if (wait & MCR_COMP)
3104			return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL)));
3105		break;
3106
3107	case MC_BIGCL:
3108		if (wait & MCR_COMP)
3109			return (!mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL)));
3110		break;
3111
3112	case MC_16KCL:
3113		if (wait & MCR_COMP)
3114			return (!mcache_bkt_isempty(m_cache(MC_MBUF_16KCL)));
3115		break;
3116
3117	case MC_MBUF_CL:
3118	case MC_MBUF_BIGCL:
3119	case MC_MBUF_16KCL:
3120		break;
3121
3122	default:
3123		VERIFY(0);
3124		/* NOTREACHED */
3125	}
3126
3127	return (!mcache_bkt_isempty(m_cache(class)));
3128}
3129
3130/*
3131 * If possible, convert constructed objects to raw ones.
3132 */
3133static boolean_t
3134mbuf_steal(mbuf_class_t class, unsigned int num)
3135{
3136	mcache_obj_t *top = NULL;
3137	mcache_obj_t **list = &top;
3138	unsigned int tot = 0;
3139
3140	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3141
3142	switch (class) {
3143	case MC_MBUF:
3144	case MC_CL:
3145	case MC_BIGCL:
3146	case MC_16KCL:
3147		return (FALSE);
3148
3149	case MC_MBUF_CL:
3150	case MC_MBUF_BIGCL:
3151	case MC_MBUF_16KCL:
3152		/* Get the required number of constructed objects if possible */
3153		if (m_infree(class) > m_minlimit(class)) {
3154			tot = cslab_alloc(class, &list,
3155			    MIN(num, m_infree(class)));
3156		}
3157
3158		/* And destroy them to get back the raw objects */
3159		if (top != NULL)
3160			(void) cslab_free(class, top, 1);
3161		break;
3162
3163	default:
3164		VERIFY(0);
3165		/* NOTREACHED */
3166	}
3167
3168	return (tot == num);
3169}
3170
3171static void
3172m_reclaim(mbuf_class_t class, unsigned int num, boolean_t comp)
3173{
3174	int m, bmap = 0;
3175
3176	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3177
3178	VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL));
3179	VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
3180	VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
3181
3182	/*
3183	 * This logic can be made smarter; for now, simply mark
3184	 * all other related classes as potential victims.
3185	 */
3186	switch (class) {
3187	case MC_MBUF:
3188		m_wantpurge(MC_CL)++;
3189		m_wantpurge(MC_BIGCL)++;
3190		m_wantpurge(MC_MBUF_CL)++;
3191		m_wantpurge(MC_MBUF_BIGCL)++;
3192		break;
3193
3194	case MC_CL:
3195		m_wantpurge(MC_MBUF)++;
3196		m_wantpurge(MC_BIGCL)++;
3197		m_wantpurge(MC_MBUF_BIGCL)++;
3198		if (!comp)
3199			m_wantpurge(MC_MBUF_CL)++;
3200		break;
3201
3202	case MC_BIGCL:
3203		m_wantpurge(MC_MBUF)++;
3204		m_wantpurge(MC_CL)++;
3205		m_wantpurge(MC_MBUF_CL)++;
3206		if (!comp)
3207			m_wantpurge(MC_MBUF_BIGCL)++;
3208		break;
3209
3210	case MC_16KCL:
3211		if (!comp)
3212			m_wantpurge(MC_MBUF_16KCL)++;
3213		break;
3214
3215	default:
3216		VERIFY(0);
3217		/* NOTREACHED */
3218	}
3219
3220	/*
3221	 * Run through each marked class and check if we really need to
3222	 * purge (and therefore temporarily disable) the per-CPU cache
3223	 * layer used by the class.  If so, remember the classes since
3224	 * we are going to drop the lock below prior to purging.
3225	 */
3226	for (m = 0; m < NELEM(mbuf_table); m++) {
3227		if (m_wantpurge(m) > 0) {
3228			m_wantpurge(m) = 0;
3229			/*
3230			 * Try hard to steal the required number of objects
3231			 * from the freelists of other mbuf classes.  Only
3232			 * purge and disable the per-CPU cache layer when
3233			 * we don't have enough; it's the last resort.
3234			 */
3235			if (!mbuf_steal(m, num))
3236				bmap |= (1 << m);
3237		}
3238	}
3239
3240	lck_mtx_unlock(mbuf_mlock);
3241
3242	if (bmap != 0) {
3243		/* signal the domains to drain */
3244		net_drain_domains();
3245
3246		/* Sigh; we have no other choices but to ask mcache to purge */
3247		for (m = 0; m < NELEM(mbuf_table); m++) {
3248			if ((bmap & (1 << m)) &&
3249			    mcache_purge_cache(m_cache(m), TRUE)) {
3250				lck_mtx_lock(mbuf_mlock);
3251				m_purge_cnt(m)++;
3252				mbstat.m_drain++;
3253				lck_mtx_unlock(mbuf_mlock);
3254			}
3255		}
3256	} else {
3257		/*
3258		 * Request mcache to reap extra elements from all of its caches;
3259		 * note that all reaps are serialized and happen only at a fixed
3260		 * interval.
3261		 */
3262		mcache_reap();
3263	}
3264	lck_mtx_lock(mbuf_mlock);
3265}
3266
3267static inline struct mbuf *
3268m_get_common(int wait, short type, int hdr)
3269{
3270	struct mbuf *m;
3271	int mcflags = MSLEEPF(wait);
3272
3273	/* Is this due to a non-blocking retry?  If so, then try harder */
3274	if (mcflags & MCR_NOSLEEP)
3275		mcflags |= MCR_TRYHARD;
3276
3277	m = mcache_alloc(m_cache(MC_MBUF), mcflags);
3278	if (m != NULL) {
3279		MBUF_INIT(m, hdr, type);
3280		mtype_stat_inc(type);
3281		mtype_stat_dec(MT_FREE);
3282#if CONFIG_MACF_NET
3283		if (hdr && mac_init_mbuf(m, wait) != 0) {
3284			m_free(m);
3285			return (NULL);
3286		}
3287#endif /* CONFIG_MACF_NET */
3288	}
3289	return (m);
3290}
3291
3292/*
3293 * Space allocation routines; these are also available as macros
3294 * for critical paths.
3295 */
3296#define	_M_GET(wait, type)	m_get_common(wait, type, 0)
3297#define	_M_GETHDR(wait, type)	m_get_common(wait, type, 1)
3298#define	_M_RETRY(wait, type)	_M_GET(wait, type)
3299#define	_M_RETRYHDR(wait, type)	_M_GETHDR(wait, type)
3300#define	_MGET(m, how, type)	((m) = _M_GET(how, type))
3301#define	_MGETHDR(m, how, type)	((m) = _M_GETHDR(how, type))
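
/*
 * Minimal usage sketch for the allocation entry points below (error
 * handling trimmed for brevity):
 *
 *	struct mbuf *m;
 *
 *	m = m_gethdr(M_DONTWAIT, MT_DATA);	(NULL on failure)
 *	if (m != NULL) {
 *		m->m_len = 0;
 *		...
 *		m_free(m);			(returns m->m_next)
 *	}
 *
 * m_retry() and m_retryhdr() are kept for compatibility and map to the
 * same paths via _M_RETRY() and _M_RETRYHDR().
 */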
3302
3303struct mbuf *
3304m_get(int wait, int type)
3305{
3306	return (_M_GET(wait, type));
3307}
3308
3309struct mbuf *
3310m_gethdr(int wait, int type)
3311{
3312	return (_M_GETHDR(wait, type));
3313}
3314
3315struct mbuf *
3316m_retry(int wait, int type)
3317{
3318	return (_M_RETRY(wait, type));
3319}
3320
3321struct mbuf *
3322m_retryhdr(int wait, int type)
3323{
3324	return (_M_RETRYHDR(wait, type));
3325}
3326
3327struct mbuf *
3328m_getclr(int wait, int type)
3329{
3330	struct mbuf *m;
3331
3332	_MGET(m, wait, type);
3333	if (m != NULL)
3334		bzero(MTOD(m, caddr_t), MLEN);
3335	return (m);
3336}
3337
3338struct mbuf *
3339m_free(struct mbuf *m)
3340{
3341	struct mbuf *n = m->m_next;
3342
3343	if (m->m_type == MT_FREE)
3344		panic("m_free: freeing an already freed mbuf");
3345
3346	if (m->m_flags & M_PKTHDR) {
3347		/* Check for scratch area overflow */
3348		m_redzone_verify(m);
3349		/* Free the aux data and tags if there are any */
3350		m_tag_delete_chain(m, NULL);
3351	}
3352
3353	if (m->m_flags & M_EXT) {
3354		u_int32_t refcnt;
3355		u_int32_t composite;
3356
3357		refcnt = m_decref(m);
3358		composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
3359		if (refcnt == 0 && !composite) {
3360			if (m->m_ext.ext_free == NULL) {
3361				mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
3362			} else if (m->m_ext.ext_free == m_bigfree) {
3363				mcache_free(m_cache(MC_BIGCL),
3364				    m->m_ext.ext_buf);
3365			} else if (m->m_ext.ext_free == m_16kfree) {
3366				mcache_free(m_cache(MC_16KCL),
3367				    m->m_ext.ext_buf);
3368			} else {
3369				(*(m->m_ext.ext_free))(m->m_ext.ext_buf,
3370				    m->m_ext.ext_size, m->m_ext.ext_arg);
3371			}
3372			mcache_free(ref_cache, MEXT_RFA(m));
3373			MEXT_RFA(m) = NULL;
3374		} else if (refcnt == 0 && composite) {
3375			VERIFY(m->m_type != MT_FREE);
3376
3377			mtype_stat_dec(m->m_type);
3378			mtype_stat_inc(MT_FREE);
3379
3380			m->m_type = MT_FREE;
3381			m->m_flags = M_EXT;
3382			m->m_len = 0;
3383			m->m_next = m->m_nextpkt = NULL;
3384
3385			MEXT_FLAGS(m) &= ~EXTF_READONLY;
3386
3387			/* "Free" into the intermediate cache */
3388			if (m->m_ext.ext_free == NULL) {
3389				mcache_free(m_cache(MC_MBUF_CL), m);
3390			} else if (m->m_ext.ext_free == m_bigfree) {
3391				mcache_free(m_cache(MC_MBUF_BIGCL), m);
3392			} else {
3393				VERIFY(m->m_ext.ext_free == m_16kfree);
3394				mcache_free(m_cache(MC_MBUF_16KCL), m);
3395			}
3396			return (n);
3397		}
3398	}
3399
3400	if (m->m_type != MT_FREE) {
3401		mtype_stat_dec(m->m_type);
3402		mtype_stat_inc(MT_FREE);
3403	}
3404
3405	m->m_type = MT_FREE;
3406	m->m_flags = m->m_len = 0;
3407	m->m_next = m->m_nextpkt = NULL;
3408
3409	mcache_free(m_cache(MC_MBUF), m);
3410
3411	return (n);
3412}
3413
3414__private_extern__ struct mbuf *
3415m_clattach(struct mbuf *m, int type, caddr_t extbuf,
3416    void (*extfree)(caddr_t, u_int, caddr_t), u_int extsize, caddr_t extarg,
3417    int wait)
3418{
3419	struct ext_ref *rfa = NULL;
3420
3421	if (m == NULL && (m = _M_GETHDR(wait, type)) == NULL)
3422		return (NULL);
3423
3424	if (m->m_flags & M_EXT) {
3425		u_int32_t refcnt;
3426		u_int32_t composite;
3427
3428		refcnt = m_decref(m);
3429		composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
3430		if (refcnt == 0 && !composite) {
3431			if (m->m_ext.ext_free == NULL) {
3432				mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
3433			} else if (m->m_ext.ext_free == m_bigfree) {
3434				mcache_free(m_cache(MC_BIGCL),
3435				    m->m_ext.ext_buf);
3436			} else if (m->m_ext.ext_free == m_16kfree) {
3437				mcache_free(m_cache(MC_16KCL),
3438				    m->m_ext.ext_buf);
3439			} else {
3440				(*(m->m_ext.ext_free))(m->m_ext.ext_buf,
3441				    m->m_ext.ext_size, m->m_ext.ext_arg);
3442			}
3443			/* Re-use the reference structure */
3444			rfa = MEXT_RFA(m);
3445		} else if (refcnt == 0 && composite) {
3446			VERIFY(m->m_type != MT_FREE);
3447
3448			mtype_stat_dec(m->m_type);
3449			mtype_stat_inc(MT_FREE);
3450
3451			m->m_type = MT_FREE;
3452			m->m_flags = M_EXT;
3453			m->m_len = 0;
3454			m->m_next = m->m_nextpkt = NULL;
3455
3456			MEXT_FLAGS(m) &= ~EXTF_READONLY;
3457
3458			/* "Free" into the intermediate cache */
3459			if (m->m_ext.ext_free == NULL) {
3460				mcache_free(m_cache(MC_MBUF_CL), m);
3461			} else if (m->m_ext.ext_free == m_bigfree) {
3462				mcache_free(m_cache(MC_MBUF_BIGCL), m);
3463			} else {
3464				VERIFY(m->m_ext.ext_free == m_16kfree);
3465				mcache_free(m_cache(MC_MBUF_16KCL), m);
3466			}
3467			/*
3468			 * Allocate a new mbuf, since we didn't divorce
3469			 * the composite mbuf + cluster pair above.
3470			 */
3471			if ((m = _M_GETHDR(wait, type)) == NULL)
3472				return (NULL);
3473		}
3474	}
3475
3476	if (rfa == NULL &&
3477	    (rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
3478		m_free(m);
3479		return (NULL);
3480	}
3481
3482	MEXT_INIT(m, extbuf, extsize, extfree, extarg, rfa, 1, 0);
3483
3484	return (m);
3485}
3486
3487/*
 * Perform `fast' allocation of mbuf clusters from a cache of recently-freed
 * clusters.  (If the cache is empty, new clusters are allocated en masse.)
3490 */
3491struct mbuf *
3492m_getcl(int wait, int type, int flags)
3493{
3494	struct mbuf *m;
3495	int mcflags = MSLEEPF(wait);
3496	int hdr = (flags & M_PKTHDR);
3497
3498	/* Is this due to a non-blocking retry?  If so, then try harder */
3499	if (mcflags & MCR_NOSLEEP)
3500		mcflags |= MCR_TRYHARD;
3501
3502	m = mcache_alloc(m_cache(MC_MBUF_CL), mcflags);
3503	if (m != NULL) {
3504		u_int32_t flag;
3505		struct ext_ref *rfa;
3506		void *cl;
3507
3508		VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
3509		cl = m->m_ext.ext_buf;
3510		rfa = MEXT_RFA(m);
3511
3512		ASSERT(cl != NULL && rfa != NULL);
3513		VERIFY(MBUF_IS_COMPOSITE(m) && m->m_ext.ext_free == NULL);
3514
3515		flag = MEXT_FLAGS(m);
3516
3517		MBUF_INIT(m, hdr, type);
3518		MBUF_CL_INIT(m, cl, rfa, 1, flag);
3519
3520		mtype_stat_inc(type);
3521		mtype_stat_dec(MT_FREE);
3522#if CONFIG_MACF_NET
3523		if (hdr && mac_init_mbuf(m, wait) != 0) {
3524			m_freem(m);
3525			return (NULL);
3526		}
#endif /* CONFIG_MACF_NET */
3528	}
3529	return (m);
3530}
3531
/* m_mclget() adds an mbuf cluster to a normal mbuf */
3533struct mbuf *
3534m_mclget(struct mbuf *m, int wait)
3535{
3536	struct ext_ref *rfa;
3537
3538	if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3539		return (m);
3540
3541	m->m_ext.ext_buf = m_mclalloc(wait);
3542	if (m->m_ext.ext_buf != NULL) {
3543		MBUF_CL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3544	} else {
3545		mcache_free(ref_cache, rfa);
3546	}
3547	return (m);
3548}
3549
3550/* Allocate an mbuf cluster */
3551caddr_t
3552m_mclalloc(int wait)
3553{
3554	int mcflags = MSLEEPF(wait);
3555
3556	/* Is this due to a non-blocking retry?  If so, then try harder */
3557	if (mcflags & MCR_NOSLEEP)
3558		mcflags |= MCR_TRYHARD;
3559
3560	return (mcache_alloc(m_cache(MC_CL), mcflags));
3561}
3562
3563/* Free an mbuf cluster */
3564void
3565m_mclfree(caddr_t p)
3566{
3567	mcache_free(m_cache(MC_CL), p);
3568}
3569
3570/*
 * m_mclhasreference() checks if a cluster of an mbuf is referenced by
3572 * another mbuf; see comments in m_incref() regarding EXTF_READONLY.
3573 */
3574int
3575m_mclhasreference(struct mbuf *m)
3576{
3577	if (!(m->m_flags & M_EXT))
3578		return (0);
3579
3580	ASSERT(MEXT_RFA(m) != NULL);
3581
3582	return ((MEXT_FLAGS(m) & EXTF_READONLY) ? 1 : 0);
3583}
3584
3585__private_extern__ caddr_t
3586m_bigalloc(int wait)
3587{
3588	int mcflags = MSLEEPF(wait);
3589
3590	/* Is this due to a non-blocking retry?  If so, then try harder */
3591	if (mcflags & MCR_NOSLEEP)
3592		mcflags |= MCR_TRYHARD;
3593
3594	return (mcache_alloc(m_cache(MC_BIGCL), mcflags));
3595}
3596
3597__private_extern__ void
3598m_bigfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
3599{
3600	mcache_free(m_cache(MC_BIGCL), p);
3601}
3602
/* m_mbigget() adds a 4KB mbuf cluster to a normal mbuf */
3604__private_extern__ struct mbuf *
3605m_mbigget(struct mbuf *m, int wait)
3606{
3607	struct ext_ref *rfa;
3608
3609	if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3610		return (m);
3611
	m->m_ext.ext_buf = m_bigalloc(wait);
3613	if (m->m_ext.ext_buf != NULL) {
3614		MBUF_BIGCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3615	} else {
3616		mcache_free(ref_cache, rfa);
3617	}
3618	return (m);
3619}
3620
3621__private_extern__ caddr_t
3622m_16kalloc(int wait)
3623{
3624	int mcflags = MSLEEPF(wait);
3625
3626	/* Is this due to a non-blocking retry?  If so, then try harder */
3627	if (mcflags & MCR_NOSLEEP)
3628		mcflags |= MCR_TRYHARD;
3629
3630	return (mcache_alloc(m_cache(MC_16KCL), mcflags));
3631}
3632
3633__private_extern__ void
3634m_16kfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
3635{
3636	mcache_free(m_cache(MC_16KCL), p);
3637}
3638
/* m_m16kget() adds a 16KB mbuf cluster to a normal mbuf */
3640__private_extern__ struct mbuf *
3641m_m16kget(struct mbuf *m, int wait)
3642{
3643	struct ext_ref *rfa;
3644
3645	if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3646		return (m);
3647
	m->m_ext.ext_buf = m_16kalloc(wait);
3649	if (m->m_ext.ext_buf != NULL) {
3650		MBUF_16KCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3651	} else {
3652		mcache_free(ref_cache, rfa);
3653	}
3654	return (m);
3655}
3656
3657/*
3658 * "Move" mbuf pkthdr from "from" to "to".
3659 * "from" must have M_PKTHDR set, and "to" must be empty.
3660 */
3661void
3662m_copy_pkthdr(struct mbuf *to, struct mbuf *from)
3663{
3664	VERIFY(from->m_flags & M_PKTHDR);
3665
3666	/* Check for scratch area overflow */
3667	m_redzone_verify(from);
3668
3669	if (to->m_flags & M_PKTHDR) {
3670		/* Check for scratch area overflow */
3671		m_redzone_verify(to);
3672		/* We will be taking over the tags of 'to' */
3673		m_tag_delete_chain(to, NULL);
3674	}
3675	to->m_pkthdr = from->m_pkthdr;		/* especially tags */
3676	m_classifier_init(from, 0);		/* purge classifier info */
3677	m_tag_init(from, 1);			/* purge all tags from src */
3678	m_scratch_init(from);			/* clear src scratch area */
3679	to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
3680	if ((to->m_flags & M_EXT) == 0)
3681		to->m_data = to->m_pktdat;
3682	m_redzone_init(to);			/* setup red zone on dst */
3683}
3684
3685/*
3686 * Duplicate "from"'s mbuf pkthdr in "to".
3687 * "from" must have M_PKTHDR set, and "to" must be empty.
3688 * In particular, this does a deep copy of the packet tags.
3689 */
3690static int
3691m_dup_pkthdr(struct mbuf *to, struct mbuf *from, int how)
3692{
3693	VERIFY(from->m_flags & M_PKTHDR);
3694
3695	/* Check for scratch area overflow */
3696	m_redzone_verify(from);
3697
3698	if (to->m_flags & M_PKTHDR) {
3699		/* Check for scratch area overflow */
3700		m_redzone_verify(to);
3701		/* We will be taking over the tags of 'to' */
3702		m_tag_delete_chain(to, NULL);
3703	}
3704	to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
3705	if ((to->m_flags & M_EXT) == 0)
3706		to->m_data = to->m_pktdat;
3707	to->m_pkthdr = from->m_pkthdr;
3708	m_redzone_init(to);			/* setup red zone on dst */
3709	m_tag_init(to, 0);			/* preserve dst static tags */
3710	return (m_tag_copy_chain(to, from, how));
3711}
3712
3713void
3714m_copy_pftag(struct mbuf *to, struct mbuf *from)
3715{
3716	to->m_pkthdr.pf_mtag = from->m_pkthdr.pf_mtag;
3717#if PF_ECN
3718	to->m_pkthdr.pf_mtag.pftag_hdr = NULL;
3719	to->m_pkthdr.pf_mtag.pftag_flags &= ~(PF_TAG_HDR_INET|PF_TAG_HDR_INET6);
3720#endif /* PF_ECN */
3721}
3722
3723void
3724m_classifier_init(struct mbuf *m, uint32_t pktf_mask)
3725{
3726	VERIFY(m->m_flags & M_PKTHDR);
3727
3728	m->m_pkthdr.pkt_proto = 0;
3729	m->m_pkthdr.pkt_flowsrc = 0;
3730	m->m_pkthdr.pkt_flowid = 0;
3731	m->m_pkthdr.pkt_flags &= pktf_mask;	/* caller-defined mask */
3732	/* preserve service class and interface info for loopback packets */
3733	if (!(m->m_pkthdr.pkt_flags & PKTF_LOOP))
3734		(void) m_set_service_class(m, MBUF_SC_BE);
3735	if (!(m->m_pkthdr.pkt_flags & PKTF_IFAINFO))
3736		m->m_pkthdr.pkt_ifainfo = 0;
3737#if MEASURE_BW
3738	m->m_pkthdr.pkt_bwseq  = 0;
3739#endif /* MEASURE_BW */
3740}
3741
3742void
3743m_copy_classifier(struct mbuf *to, struct mbuf *from)
3744{
3745	VERIFY(to->m_flags & M_PKTHDR);
3746	VERIFY(from->m_flags & M_PKTHDR);
3747
3748	to->m_pkthdr.pkt_proto = from->m_pkthdr.pkt_proto;
3749	to->m_pkthdr.pkt_flowsrc = from->m_pkthdr.pkt_flowsrc;
3750	to->m_pkthdr.pkt_flowid = from->m_pkthdr.pkt_flowid;
3751	to->m_pkthdr.pkt_flags = from->m_pkthdr.pkt_flags;
3752	(void) m_set_service_class(to, from->m_pkthdr.pkt_svc);
3753	to->m_pkthdr.pkt_ifainfo  = from->m_pkthdr.pkt_ifainfo;
3754#if MEASURE_BW
3755	to->m_pkthdr.pkt_bwseq  = from->m_pkthdr.pkt_bwseq;
3756#endif /* MEASURE_BW */
3757}
3758
3759/*
3760 * Return a list of mbuf hdrs that point to clusters.  Try for num_needed;
3761 * if wantall is not set, return whatever number were available.  Set up the
3762 * first num_with_pkthdrs with mbuf hdrs configured as packet headers; these
3763 * are chained on the m_nextpkt field.  Any packets requested beyond this
3764 * are chained onto the last packet header's m_next field.  The size of
3765 * the cluster is controlled by the parameter bufsize.
3766 */
3767__private_extern__ struct mbuf *
3768m_getpackets_internal(unsigned int *num_needed, int num_with_pkthdrs,
3769    int wait, int wantall, size_t bufsize)
3770{
3771	struct mbuf *m;
3772	struct mbuf **np, *top;
3773	unsigned int pnum, needed = *num_needed;
3774	mcache_obj_t *mp_list = NULL;
3775	int mcflags = MSLEEPF(wait);
3776	u_int32_t flag;
3777	struct ext_ref *rfa;
3778	mcache_t *cp;
3779	void *cl;
3780
3781	ASSERT(bufsize == m_maxsize(MC_CL) ||
3782	    bufsize == m_maxsize(MC_BIGCL) ||
3783	    bufsize == m_maxsize(MC_16KCL));
3784
3785	/*
3786	 * Caller must first check for njcl because this
3787	 * routine is internal and not exposed/used via KPI.
3788	 */
3789	VERIFY(bufsize != m_maxsize(MC_16KCL) || njcl > 0);
3790
3791	top = NULL;
3792	np = &top;
3793	pnum = 0;
3794
3795	/*
3796	 * The caller doesn't want all the requested buffers; only some.
3797	 * Try hard to get what we can, but don't block.  This effectively
3798	 * overrides MCR_SLEEP, since this thread will not go to sleep
3799	 * if we can't get all the buffers.
3800	 */
3801	if (!wantall || (mcflags & MCR_NOSLEEP))
3802		mcflags |= MCR_TRYHARD;
3803
3804	/* Allocate the composite mbuf + cluster elements from the cache */
3805	if (bufsize == m_maxsize(MC_CL))
3806		cp = m_cache(MC_MBUF_CL);
3807	else if (bufsize == m_maxsize(MC_BIGCL))
3808		cp = m_cache(MC_MBUF_BIGCL);
3809	else
3810		cp = m_cache(MC_MBUF_16KCL);
3811	needed = mcache_alloc_ext(cp, &mp_list, needed, mcflags);
3812
3813	for (pnum = 0; pnum < needed; pnum++) {
3814		m = (struct mbuf *)mp_list;
3815		mp_list = mp_list->obj_next;
3816
3817		VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
3818		cl = m->m_ext.ext_buf;
3819		rfa = MEXT_RFA(m);
3820
3821		ASSERT(cl != NULL && rfa != NULL);
3822		VERIFY(MBUF_IS_COMPOSITE(m));
3823
3824		flag = MEXT_FLAGS(m);
3825
3826		MBUF_INIT(m, num_with_pkthdrs, MT_DATA);
3827		if (bufsize == m_maxsize(MC_16KCL)) {
3828			MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
3829		} else if (bufsize == m_maxsize(MC_BIGCL)) {
3830			MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
3831		} else {
3832			MBUF_CL_INIT(m, cl, rfa, 1, flag);
3833		}
3834
3835		if (num_with_pkthdrs > 0) {
3836			--num_with_pkthdrs;
3837#if CONFIG_MACF_NET
3838			if (mac_mbuf_label_init(m, wait) != 0) {
3839				m_freem(m);
3840				break;
3841			}
#endif /* CONFIG_MACF_NET */
3843		}
3844
3845		*np = m;
3846		if (num_with_pkthdrs > 0)
3847			np = &m->m_nextpkt;
3848		else
3849			np = &m->m_next;
3850	}
3851	ASSERT(pnum != *num_needed || mp_list == NULL);
3852	if (mp_list != NULL)
3853		mcache_free_ext(cp, mp_list);
3854
3855	if (pnum > 0) {
3856		mtype_stat_add(MT_DATA, pnum);
3857		mtype_stat_sub(MT_FREE, pnum);
3858	}
3859
3860	if (wantall && (pnum != *num_needed)) {
3861		if (top != NULL)
3862			m_freem_list(top);
3863		return (NULL);
3864	}
3865
3866	if (pnum > *num_needed) {
		printf("%s: File a radar related to <rdar://10146739>. "
		    "needed = %u, pnum = %u, num_needed = %u\n",
		    __func__, needed, pnum, *num_needed);
3870	}
3871
3872	*num_needed = pnum;
3873	return (top);
3874}
3875
3876/*
 * Return a list of mbufs linked by m_nextpkt.  Try for numlist, and if
 * wantall is not set, return whatever number were available.  The size of
 * each mbuf in the list is controlled by the parameter packetlen.  Each
 * mbuf of the list may have a chain of mbufs linked by m_next.  Each mbuf
 * in the chain is called a segment.  If maxsegments is not NULL and the
 * value pointed to is not zero, this specifies the maximum number of
 * segments for a chain of mbufs.  If maxsegments is NULL or the value
 * pointed to is zero, the caller does not have any restriction on the
 * number of segments.  The actual number of segments of an mbuf chain is
 * returned in the value pointed to by maxsegments.
3887 */
3888__private_extern__ struct mbuf *
3889m_allocpacket_internal(unsigned int *numlist, size_t packetlen,
3890    unsigned int *maxsegments, int wait, int wantall, size_t wantsize)
3891{
3892	struct mbuf **np, *top, *first = NULL;
3893	size_t bufsize, r_bufsize;
3894	unsigned int num = 0;
3895	unsigned int nsegs = 0;
3896	unsigned int needed, resid;
3897	int mcflags = MSLEEPF(wait);
3898	mcache_obj_t *mp_list = NULL, *rmp_list = NULL;
3899	mcache_t *cp = NULL, *rcp = NULL;
3900
3901	if (*numlist == 0)
3902		return (NULL);
3903
3904	top = NULL;
3905	np = &top;
3906
3907	if (wantsize == 0) {
3908		if (packetlen <= MINCLSIZE) {
3909			bufsize = packetlen;
3910		} else if (packetlen > m_maxsize(MC_CL)) {
3911			/* Use 4KB if jumbo cluster pool isn't available */
3912			if (packetlen <= m_maxsize(MC_BIGCL) || njcl == 0)
3913				bufsize = m_maxsize(MC_BIGCL);
3914			else
3915				bufsize = m_maxsize(MC_16KCL);
3916		} else {
3917			bufsize = m_maxsize(MC_CL);
3918		}
3919	} else if (wantsize == m_maxsize(MC_CL) ||
3920	    wantsize == m_maxsize(MC_BIGCL) ||
3921	    (wantsize == m_maxsize(MC_16KCL) && njcl > 0)) {
3922		bufsize = wantsize;
3923	} else {
3924		return (NULL);
3925	}
3926
3927	if (bufsize <= MHLEN) {
3928		nsegs = 1;
3929	} else if (bufsize <= MINCLSIZE) {
3930		if (maxsegments != NULL && *maxsegments == 1) {
3931			bufsize = m_maxsize(MC_CL);
3932			nsegs = 1;
3933		} else {
3934			nsegs = 2;
3935		}
3936	} else if (bufsize == m_maxsize(MC_16KCL)) {
3937		VERIFY(njcl > 0);
3938		nsegs = ((packetlen - 1) >> (PGSHIFT + 2)) + 1;
3939	} else if (bufsize == m_maxsize(MC_BIGCL)) {
3940		nsegs = ((packetlen - 1) >> PGSHIFT) + 1;
3941	} else {
3942		nsegs = ((packetlen - 1) >> MCLSHIFT) + 1;
3943	}
3944	if (maxsegments != NULL) {
3945		if (*maxsegments && nsegs > *maxsegments) {
3946			*maxsegments = nsegs;
3947			return (NULL);
3948		}
3949		*maxsegments = nsegs;
3950	}
3951
3952	/*
3953	 * The caller doesn't want all the requested buffers; only some.
3954	 * Try hard to get what we can, but don't block.  This effectively
3955	 * overrides MCR_SLEEP, since this thread will not go to sleep
3956	 * if we can't get all the buffers.
3957	 */
3958	if (!wantall || (mcflags & MCR_NOSLEEP))
3959		mcflags |= MCR_TRYHARD;
3960
3961	/*
3962	 * Simple case where all elements in the lists/chains are mbufs.
3963	 * Unless bufsize is greater than MHLEN, each segment chain is made
3964	 * up of exactly 1 mbuf.  Otherwise, each segment chain is made up
3965	 * of 2 mbufs; the second one is used for the residual data, i.e.
3966	 * the remaining data that cannot fit into the first mbuf.
3967	 */
3968	if (bufsize <= MINCLSIZE) {
3969		/* Allocate the elements in one shot from the mbuf cache */
3970		ASSERT(bufsize <= MHLEN || nsegs == 2);
3971		cp = m_cache(MC_MBUF);
3972		needed = mcache_alloc_ext(cp, &mp_list,
3973		    (*numlist) * nsegs, mcflags);
3974
3975		/*
3976		 * The number of elements must be even if we are to use an
3977		 * mbuf (instead of a cluster) to store the residual data.
3978		 * If we couldn't allocate the requested number of mbufs,
3979		 * trim the number down (if it's odd) in order to avoid
3980		 * creating a partial segment chain.
3981		 */
3982		if (bufsize > MHLEN && (needed & 0x1))
3983			needed--;
3984
3985		while (num < needed) {
3986			struct mbuf *m;
3987
3988			m = (struct mbuf *)mp_list;
3989			mp_list = mp_list->obj_next;
3990			ASSERT(m != NULL);
3991
3992			MBUF_INIT(m, 1, MT_DATA);
3993#if CONFIG_MACF_NET
3994			if (mac_init_mbuf(m, wait) != 0) {
3995				m_free(m);
3996				break;
3997			}
#endif /* CONFIG_MACF_NET */
3999			num++;
4000			if (bufsize > MHLEN) {
4001				/* A second mbuf for this segment chain */
4002				m->m_next = (struct mbuf *)mp_list;
4003				mp_list = mp_list->obj_next;
4004				ASSERT(m->m_next != NULL);
4005
4006				MBUF_INIT(m->m_next, 0, MT_DATA);
4007				num++;
4008			}
4009			*np = m;
4010			np = &m->m_nextpkt;
4011		}
4012		ASSERT(num != *numlist || mp_list == NULL);
4013
4014		if (num > 0) {
4015			mtype_stat_add(MT_DATA, num);
4016			mtype_stat_sub(MT_FREE, num);
4017		}
4018		num /= nsegs;
4019
4020		/* We've got them all; return to caller */
4021		if (num == *numlist)
4022			return (top);
4023
4024		goto fail;
4025	}
4026
4027	/*
4028	 * Complex cases where elements are made up of one or more composite
4029	 * mbufs + cluster, depending on packetlen.  Each N-segment chain can
4030	 * be illustrated as follows:
4031	 *
4032	 * [mbuf + cluster 1] [mbuf + cluster 2] ... [mbuf + cluster N]
4033	 *
4034	 * Every composite mbuf + cluster element comes from the intermediate
4035	 * cache (either MC_MBUF_CL or MC_MBUF_BIGCL).  For space efficiency,
	 * the last composite element will come from the MC_MBUF_CL cache,
	 * unless the residual data is larger than 2KB, in which case we use
	 * the big cluster composite cache (MC_MBUF_BIGCL) instead.  Residual
4039	 * data is defined as extra data beyond the first element that cannot
4040	 * fit into the previous element, i.e. there is no residual data if
4041	 * the chain only has 1 segment.
4042	 */
4043	r_bufsize = bufsize;
4044	resid = packetlen > bufsize ? packetlen % bufsize : 0;
4045	if (resid > 0) {
4046		/* There is residual data; figure out the cluster size */
4047		if (wantsize == 0 && packetlen > MINCLSIZE) {
4048			/*
4049			 * Caller didn't request that all of the segments
4050			 * in the chain use the same cluster size; use the
4051			 * smaller of the cluster sizes.
4052			 */
4053			if (njcl > 0 && resid > m_maxsize(MC_BIGCL))
4054				r_bufsize = m_maxsize(MC_16KCL);
4055			else if (resid > m_maxsize(MC_CL))
4056				r_bufsize = m_maxsize(MC_BIGCL);
4057			else
4058				r_bufsize = m_maxsize(MC_CL);
4059		} else {
4060			/* Use the same cluster size as the other segments */
4061			resid = 0;
4062		}
4063	}
4064
4065	needed = *numlist;
4066	if (resid > 0) {
4067		/*
4068		 * Attempt to allocate composite mbuf + cluster elements for
4069		 * the residual data in each chain; record the number of such
4070		 * elements that can be allocated so that we know how many
4071		 * segment chains we can afford to create.
4072		 */
4073		if (r_bufsize <= m_maxsize(MC_CL))
4074			rcp = m_cache(MC_MBUF_CL);
4075		else if (r_bufsize <= m_maxsize(MC_BIGCL))
4076			rcp = m_cache(MC_MBUF_BIGCL);
4077		else
4078			rcp = m_cache(MC_MBUF_16KCL);
4079		needed = mcache_alloc_ext(rcp, &rmp_list, *numlist, mcflags);
4080
4081		if (needed == 0)
4082			goto fail;
4083
4084		/* This is temporarily reduced for calculation */
4085		ASSERT(nsegs > 1);
4086		nsegs--;
4087	}
4088
4089	/*
4090	 * Attempt to allocate the rest of the composite mbuf + cluster
4091	 * elements for the number of segment chains that we need.
4092	 */
4093	if (bufsize <= m_maxsize(MC_CL))
4094		cp = m_cache(MC_MBUF_CL);
4095	else if (bufsize <= m_maxsize(MC_BIGCL))
4096		cp = m_cache(MC_MBUF_BIGCL);
4097	else
4098		cp = m_cache(MC_MBUF_16KCL);
4099	needed = mcache_alloc_ext(cp, &mp_list, needed * nsegs, mcflags);
4100
4101	/* Round it down to avoid creating a partial segment chain */
4102	needed = (needed / nsegs) * nsegs;
4103	if (needed == 0)
4104		goto fail;
4105
4106	if (resid > 0) {
4107		/*
4108		 * We're about to construct the chain(s); take into account
4109		 * the number of segments we have created above to hold the
4110		 * residual data for each chain, as well as restore the
4111		 * original count of segments per chain.
4112		 */
4113		ASSERT(nsegs > 0);
4114		needed += needed / nsegs;
4115		nsegs++;
4116	}
4117
4118	for (;;) {
4119		struct mbuf *m;
4120		u_int32_t flag;
4121		struct ext_ref *rfa;
4122		void *cl;
4123		int pkthdr;
4124
4125		++num;
4126		if (nsegs == 1 || (num % nsegs) != 0 || resid == 0) {
4127			m = (struct mbuf *)mp_list;
4128			mp_list = mp_list->obj_next;
4129		} else {
4130			m = (struct mbuf *)rmp_list;
4131			rmp_list = rmp_list->obj_next;
4132		}
4133		ASSERT(m != NULL);
4134		VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
4135		VERIFY(m->m_ext.ext_free == NULL ||
4136		    m->m_ext.ext_free == m_bigfree ||
4137		    m->m_ext.ext_free == m_16kfree);
4138
4139		cl = m->m_ext.ext_buf;
4140		rfa = MEXT_RFA(m);
4141
4142		ASSERT(cl != NULL && rfa != NULL);
4143		VERIFY(MBUF_IS_COMPOSITE(m));
4144
4145		flag = MEXT_FLAGS(m);
4146
4147		pkthdr = (nsegs == 1 || (num % nsegs) == 1);
4148		if (pkthdr)
4149			first = m;
4150		MBUF_INIT(m, pkthdr, MT_DATA);
4151		if (m->m_ext.ext_free == m_16kfree) {
4152			MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
4153		} else if (m->m_ext.ext_free == m_bigfree) {
4154			MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
4155		} else {
4156			MBUF_CL_INIT(m, cl, rfa, 1, flag);
4157		}
4158#if CONFIG_MACF_NET
4159		if (pkthdr && mac_init_mbuf(m, wait) != 0) {
4160			--num;
4161			m_freem(m);
4162			break;
4163		}
#endif /* CONFIG_MACF_NET */
4165
4166		*np = m;
4167		if ((num % nsegs) == 0)
4168			np = &first->m_nextpkt;
4169		else
4170			np = &m->m_next;
4171
4172		if (num == needed)
4173			break;
4174	}
4175
4176	if (num > 0) {
4177		mtype_stat_add(MT_DATA, num);
4178		mtype_stat_sub(MT_FREE, num);
4179	}
4180
4181	num /= nsegs;
4182
4183	/* We've got them all; return to caller */
4184	if (num == *numlist) {
4185		ASSERT(mp_list == NULL && rmp_list == NULL);
4186		return (top);
4187	}
4188
4189fail:
4190	/* Free up what's left of the above */
4191	if (mp_list != NULL)
4192		mcache_free_ext(cp, mp_list);
4193	if (rmp_list != NULL)
4194		mcache_free_ext(rcp, rmp_list);
4195	if (wantall && top != NULL) {
4196		m_freem(top);
4197		return (NULL);
4198	}
4199	*numlist = num;
4200	return (top);
4201}
4202
4203/*
 * Best effort to get an mbuf cluster + pkthdr.  Used by drivers to allocate
 * packets on the receive ring.
4206 */
4207__private_extern__ struct mbuf *
4208m_getpacket_how(int wait)
4209{
4210	unsigned int num_needed = 1;
4211
4212	return (m_getpackets_internal(&num_needed, 1, wait, 1,
4213	    m_maxsize(MC_CL)));
4214}
4215
4216/*
 * Best effort to get an mbuf cluster + pkthdr.  Used by drivers to allocate
 * packets on the receive ring.
4219 */
4220struct mbuf *
4221m_getpacket(void)
4222{
4223	unsigned int num_needed = 1;
4224
4225	return (m_getpackets_internal(&num_needed, 1, M_WAIT, 1,
4226	    m_maxsize(MC_CL)));
4227}
4228
4229/*
4230 * Return a list of mbuf hdrs that point to clusters.  Try for num_needed;
4231 * if this can't be met, return whatever number were available.  Set up the
4232 * first num_with_pkthdrs with mbuf hdrs configured as packet headers.  These
4233 * are chained on the m_nextpkt field.  Any packets requested beyond this are
4234 * chained onto the last packet header's m_next field.
4235 */
4236struct mbuf *
4237m_getpackets(int num_needed, int num_with_pkthdrs, int how)
4238{
4239	unsigned int n = num_needed;
4240
4241	return (m_getpackets_internal(&n, num_with_pkthdrs, how, 0,
4242	    m_maxsize(MC_CL)));
4243}
4244
4245/*
4246 * Return a list of mbuf hdrs set up as packet hdrs chained together
 * on the m_nextpkt field.
4248 */
4249struct mbuf *
4250m_getpackethdrs(int num_needed, int how)
4251{
4252	struct mbuf *m;
4253	struct mbuf **np, *top;
4254
4255	top = NULL;
4256	np = &top;
4257
4258	while (num_needed--) {
4259		m = _M_RETRYHDR(how, MT_DATA);
4260		if (m == NULL)
4261			break;
4262
4263		*np = m;
4264		np = &m->m_nextpkt;
4265	}
4266
4267	return (top);
4268}
4269
4270/*
4271 * Free an mbuf list (m_nextpkt) while following m_next.  Returns the count
 * of packets freed.  Used by the drivers.
4273 */
4274int
4275m_freem_list(struct mbuf *m)
4276{
4277	struct mbuf *nextpkt;
4278	mcache_obj_t *mp_list = NULL;
4279	mcache_obj_t *mcl_list = NULL;
4280	mcache_obj_t *mbc_list = NULL;
4281	mcache_obj_t *m16k_list = NULL;
4282	mcache_obj_t *m_mcl_list = NULL;
4283	mcache_obj_t *m_mbc_list = NULL;
4284	mcache_obj_t *m_m16k_list = NULL;
4285	mcache_obj_t *ref_list = NULL;
4286	int pktcount = 0;
4287	int mt_free = 0, mt_data = 0, mt_header = 0, mt_soname = 0, mt_tag = 0;
4288
4289	while (m != NULL) {
4290		pktcount++;
4291
4292		nextpkt = m->m_nextpkt;
4293		m->m_nextpkt = NULL;
4294
4295		while (m != NULL) {
4296			struct mbuf *next = m->m_next;
4297			mcache_obj_t *o, *rfa;
4298			u_int32_t refcnt, composite;
4299
4300			if (m->m_type == MT_FREE)
				panic("%s: freeing an already freed mbuf",
				    __func__);
4302
4303			if (m->m_type != MT_FREE)
4304				mt_free++;
4305
4306			if (m->m_flags & M_PKTHDR) {
4307				/* Check for scratch area overflow */
4308				m_redzone_verify(m);
4309				/* Free the aux data and tags if there is any */
4310				m_tag_delete_chain(m, NULL);
4311			}
4312
4313			if (!(m->m_flags & M_EXT))
4314				goto simple_free;
4315
4316			o = (mcache_obj_t *)(void *)m->m_ext.ext_buf;
4317			refcnt = m_decref(m);
4318			composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
4319			if (refcnt == 0 && !composite) {
4320				if (m->m_ext.ext_free == NULL) {
4321					o->obj_next = mcl_list;
4322					mcl_list = o;
4323				} else if (m->m_ext.ext_free == m_bigfree) {
4324					o->obj_next = mbc_list;
4325					mbc_list = o;
4326				} else if (m->m_ext.ext_free == m_16kfree) {
4327					o->obj_next = m16k_list;
4328					m16k_list = o;
4329				} else {
4330					(*(m->m_ext.ext_free))((caddr_t)o,
4331					    m->m_ext.ext_size,
4332					    m->m_ext.ext_arg);
4333				}
4334				rfa = (mcache_obj_t *)(void *)MEXT_RFA(m);
4335				rfa->obj_next = ref_list;
4336				ref_list = rfa;
4337				MEXT_RFA(m) = NULL;
4338			} else if (refcnt == 0 && composite) {
4339				VERIFY(m->m_type != MT_FREE);
4340				/*
4341				 * Amortize the costs of atomic operations
4342				 * by doing them at the end, if possible.
4343				 */
4344				if (m->m_type == MT_DATA)
4345					mt_data++;
4346				else if (m->m_type == MT_HEADER)
4347					mt_header++;
4348				else if (m->m_type == MT_SONAME)
4349					mt_soname++;
4350				else if (m->m_type == MT_TAG)
4351					mt_tag++;
4352				else
4353					mtype_stat_dec(m->m_type);
4354
4355				m->m_type = MT_FREE;
4356				m->m_flags = M_EXT;
4357				m->m_len = 0;
4358				m->m_next = m->m_nextpkt = NULL;
4359
4360				MEXT_FLAGS(m) &= ~EXTF_READONLY;
4361
4362				/* "Free" into the intermediate cache */
4363				o = (mcache_obj_t *)m;
4364				if (m->m_ext.ext_free == NULL) {
4365					o->obj_next = m_mcl_list;
4366					m_mcl_list = o;
4367				} else if (m->m_ext.ext_free == m_bigfree) {
4368					o->obj_next = m_mbc_list;
4369					m_mbc_list = o;
4370				} else {
4371					VERIFY(m->m_ext.ext_free == m_16kfree);
4372					o->obj_next = m_m16k_list;
4373					m_m16k_list = o;
4374				}
4375				m = next;
4376				continue;
4377			}
4378simple_free:
4379			/*
4380			 * Amortize the costs of atomic operations
4381			 * by doing them at the end, if possible.
4382			 */
4383			if (m->m_type == MT_DATA)
4384				mt_data++;
4385			else if (m->m_type == MT_HEADER)
4386				mt_header++;
4387			else if (m->m_type == MT_SONAME)
4388				mt_soname++;
4389			else if (m->m_type == MT_TAG)
4390				mt_tag++;
4391			else if (m->m_type != MT_FREE)
4392				mtype_stat_dec(m->m_type);
4393
4394			m->m_type = MT_FREE;
4395			m->m_flags = m->m_len = 0;
4396			m->m_next = m->m_nextpkt = NULL;
4397
4398			((mcache_obj_t *)m)->obj_next = mp_list;
4399			mp_list = (mcache_obj_t *)m;
4400
4401			m = next;
4402		}
4403
4404		m = nextpkt;
4405	}
4406
4407	if (mt_free > 0)
4408		mtype_stat_add(MT_FREE, mt_free);
4409	if (mt_data > 0)
4410		mtype_stat_sub(MT_DATA, mt_data);
4411	if (mt_header > 0)
4412		mtype_stat_sub(MT_HEADER, mt_header);
4413	if (mt_soname > 0)
4414		mtype_stat_sub(MT_SONAME, mt_soname);
4415	if (mt_tag > 0)
4416		mtype_stat_sub(MT_TAG, mt_tag);
4417
4418	if (mp_list != NULL)
4419		mcache_free_ext(m_cache(MC_MBUF), mp_list);
4420	if (mcl_list != NULL)
4421		mcache_free_ext(m_cache(MC_CL), mcl_list);
4422	if (mbc_list != NULL)
4423		mcache_free_ext(m_cache(MC_BIGCL), mbc_list);
4424	if (m16k_list != NULL)
4425		mcache_free_ext(m_cache(MC_16KCL), m16k_list);
4426	if (m_mcl_list != NULL)
4427		mcache_free_ext(m_cache(MC_MBUF_CL), m_mcl_list);
4428	if (m_mbc_list != NULL)
4429		mcache_free_ext(m_cache(MC_MBUF_BIGCL), m_mbc_list);
4430	if (m_m16k_list != NULL)
4431		mcache_free_ext(m_cache(MC_MBUF_16KCL), m_m16k_list);
4432	if (ref_list != NULL)
4433		mcache_free_ext(ref_cache, ref_list);
4434
4435	return (pktcount);
4436}
4437
4438void
4439m_freem(struct mbuf *m)
4440{
4441	while (m != NULL)
4442		m = m_free(m);
4443}
4444
4445/*
4446 * Mbuffer utility routines.
4447 */
4448
4449/*
4450 * Compute the amount of space available before the current start
4451 * of data in an mbuf.
4452 */
4453int
4454m_leadingspace(struct mbuf *m)
4455{
4456	if (m->m_flags & M_EXT) {
4457		if (MCLHASREFERENCE(m))
4458			return (0);
4459		return (m->m_data - m->m_ext.ext_buf);
4460	}
4461	if (m->m_flags & M_PKTHDR)
4462		return (m->m_data - m->m_pktdat);
4463	return (m->m_data - m->m_dat);
4464}
4465
4466/*
4467 * Compute the amount of space available after the end of data in an mbuf.
4468 */
4469int
4470m_trailingspace(struct mbuf *m)
4471{
4472	if (m->m_flags & M_EXT) {
4473		if (MCLHASREFERENCE(m))
4474			return (0);
4475		return (m->m_ext.ext_buf + m->m_ext.ext_size -
4476		    (m->m_data + m->m_len));
4477	}
4478	return (&m->m_dat[MLEN] - (m->m_data + m->m_len));
4479}
4480
4481/*
4482 * Lesser-used path for M_PREPEND: allocate new mbuf to prepend to chain,
4483 * copy junk along.  Does not adjust packet header length.
4484 */
4485struct mbuf *
4486m_prepend(struct mbuf *m, int len, int how)
4487{
4488	struct mbuf *mn;
4489
4490	_MGET(mn, how, m->m_type);
4491	if (mn == NULL) {
4492		m_freem(m);
4493		return (NULL);
4494	}
4495	if (m->m_flags & M_PKTHDR) {
4496		M_COPY_PKTHDR(mn, m);
4497		m->m_flags &= ~M_PKTHDR;
4498	}
4499	mn->m_next = m;
4500	m = mn;
4501	if (len < MHLEN)
4502		MH_ALIGN(m, len);
4503	m->m_len = len;
4504	return (m);
4505}
4506
4507/*
4508 * Replacement for old M_PREPEND macro: allocate new mbuf to prepend to
4509 * chain, copy junk along, and adjust length.
4510 */
4511struct mbuf *
4512m_prepend_2(struct mbuf *m, int len, int how)
4513{
4514	if (M_LEADINGSPACE(m) >= len) {
4515		m->m_data -= len;
4516		m->m_len += len;
4517	} else {
4518		m = m_prepend(m, len, how);
4519	}
4520	if ((m) && (m->m_flags & M_PKTHDR))
4521		m->m_pkthdr.len += len;
4522	return (m);
4523}
4524
4525/*
4526 * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
 * continuing for "len" bytes.  If len is M_COPYALL, copy to the end of the
 * mbuf chain.  The wait parameter is a choice of M_WAIT/M_DONTWAIT from caller.
4529 */
4530int MCFail;
4531
4532struct mbuf *
4533m_copym_mode(struct mbuf *m, int off0, int len, int wait, uint32_t mode)
4534{
4535	struct mbuf *n, *mhdr = NULL, **np;
4536	int off = off0;
4537	struct mbuf *top;
4538	int copyhdr = 0;
4539
4540	if (off < 0 || len < 0)
4541		panic("m_copym: invalid offset %d or len %d", off, len);
4542
4543	VERIFY((mode != M_COPYM_MUST_COPY_HDR &&
4544	    mode != M_COPYM_MUST_MOVE_HDR) || (m->m_flags & M_PKTHDR));
4545
4546	if ((off == 0 && (m->m_flags & M_PKTHDR)) ||
4547	    mode == M_COPYM_MUST_COPY_HDR || mode == M_COPYM_MUST_MOVE_HDR) {
4548		mhdr = m;
4549		copyhdr = 1;
4550	}
4551
4552	while (off >= m->m_len) {
4553		if (m->m_next == NULL)
4554			panic("m_copym: invalid mbuf chain");
4555		off -= m->m_len;
4556		m = m->m_next;
4557	}
4558	np = &top;
4559	top = NULL;
4560
4561	while (len > 0) {
4562		if (m == NULL) {
4563			if (len != M_COPYALL)
4564				panic("m_copym: len != M_COPYALL");
4565			break;
4566		}
4567
4568		if (copyhdr)
4569			n = _M_RETRYHDR(wait, m->m_type);
4570		else
4571			n = _M_RETRY(wait, m->m_type);
4572		*np = n;
4573
4574		if (n == NULL)
4575			goto nospace;
4576
4577		if (copyhdr != 0) {
4578			if ((mode == M_COPYM_MOVE_HDR) ||
4579			    (mode == M_COPYM_MUST_MOVE_HDR)) {
4580				M_COPY_PKTHDR(n, mhdr);
4581			} else if ((mode == M_COPYM_COPY_HDR) ||
4582			    (mode == M_COPYM_MUST_COPY_HDR)) {
4583				if (m_dup_pkthdr(n, mhdr, wait) == 0)
4584					goto nospace;
4585			}
4586			if (len == M_COPYALL)
4587				n->m_pkthdr.len -= off0;
4588			else
4589				n->m_pkthdr.len = len;
4590			copyhdr = 0;
4591			/*
			 * There is no data to copy from the packet header
			 * mbuf when it is empty or lies before the starting
			 * offset; just move on to the next destination mbuf.
4594			 */
4595			if (mhdr != m) {
4596				np = &n->m_next;
4597				continue;
4598			}
4599		}
4600		n->m_len = MIN(len, (m->m_len - off));
4601		if (m->m_flags & M_EXT) {
4602			n->m_ext = m->m_ext;
4603			m_incref(m);
4604			n->m_data = m->m_data + off;
4605			n->m_flags |= M_EXT;
4606		} else {
4607			/*
4608			 * Limit to the capacity of the destination
4609			 */
4610			if (n->m_flags & M_PKTHDR)
4611				n->m_len = MIN(n->m_len, MHLEN);
4612			else
4613				n->m_len = MIN(n->m_len, MLEN);
4614
4615			if (MTOD(n, char *) + n->m_len > ((char *)n) + MSIZE)
4616				panic("%s n %p copy overflow",
4617					__func__, n);
4618
4619			bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t),
4620			    (unsigned)n->m_len);
4621		}
4622		if (len != M_COPYALL)
4623			len -= n->m_len;
4624		off = 0;
4625		m = m->m_next;
4626		np = &n->m_next;
4627	}
4628
4629	if (top == NULL)
4630		MCFail++;
4631
4632	return (top);
4633nospace:
4634
4635	m_freem(top);
4636	MCFail++;
4637	return (NULL);
4638}
4639
4640
4641struct mbuf *
4642m_copym(struct mbuf *m, int off0, int len, int wait)
4643{
4644	return (m_copym_mode(m, off0, len, wait, M_COPYM_MOVE_HDR));
4645}
4646
4647/*
 * Equivalent to m_copym except that all necessary mbuf hdrs are allocated
 * within this routine.  Also, the last mbuf and offset accessed are passed
 * out and can be passed back in to avoid having to rescan the entire mbuf
 * list (normally hung off of the socket).
4652 */
4653struct mbuf *
4654m_copym_with_hdrs(struct mbuf *m0, int off0, int len0, int wait,
4655    struct mbuf **m_lastm, int *m_off, uint32_t mode)
4656{
4657	struct mbuf *m = m0, *n, **np = NULL;
4658	int off = off0, len = len0;
4659	struct mbuf *top = NULL;
4660	int mcflags = MSLEEPF(wait);
4661	int copyhdr = 0;
4662	int type = 0;
4663	mcache_obj_t *list = NULL;
4664	int needed = 0;
4665
4666	if (off == 0 && (m->m_flags & M_PKTHDR))
4667		copyhdr = 1;
4668
4669	if (m_lastm != NULL && *m_lastm != NULL) {
4670		m = *m_lastm;
4671		off = *m_off;
4672	} else {
4673		while (off >= m->m_len) {
4674			off -= m->m_len;
4675			m = m->m_next;
4676		}
4677	}
4678
4679	n = m;
4680	while (len > 0) {
4681		needed++;
4682		ASSERT(n != NULL);
4683		len -= MIN(len, (n->m_len - ((needed == 1) ? off : 0)));
4684		n = n->m_next;
4685	}
4686	needed++;
4687	len = len0;
4688
4689	/*
4690	 * If the caller doesn't want to be put to sleep, mark it with
4691	 * MCR_TRYHARD so that we may reclaim buffers from other places
4692	 * before giving up.
4693	 */
4694	if (mcflags & MCR_NOSLEEP)
4695		mcflags |= MCR_TRYHARD;
4696
4697	if (mcache_alloc_ext(m_cache(MC_MBUF), &list, needed,
4698	    mcflags) != needed)
4699		goto nospace;
4700
4701	needed = 0;
4702	while (len > 0) {
4703		n = (struct mbuf *)list;
4704		list = list->obj_next;
4705		ASSERT(n != NULL && m != NULL);
4706
4707		type = (top == NULL) ? MT_HEADER : m->m_type;
4708		MBUF_INIT(n, (top == NULL), type);
4709#if CONFIG_MACF_NET
4710		if (top == NULL && mac_mbuf_label_init(n, wait) != 0) {
4711			mtype_stat_inc(MT_HEADER);
4712			mtype_stat_dec(MT_FREE);
4713			m_free(n);
4714			goto nospace;
4715		}
#endif /* CONFIG_MACF_NET */
4717
4718		if (top == NULL) {
4719			top = n;
4720			np = &top->m_next;
4721			continue;
4722		} else {
4723			needed++;
4724			*np = n;
4725		}
4726
4727		if (copyhdr) {
4728			if ((mode == M_COPYM_MOVE_HDR) ||
4729			    (mode == M_COPYM_MUST_MOVE_HDR)) {
4730				M_COPY_PKTHDR(n, m);
4731			} else if ((mode == M_COPYM_COPY_HDR) ||
4732			    (mode == M_COPYM_MUST_COPY_HDR)) {
4733				if (m_dup_pkthdr(n, m, wait) == 0)
4734					goto nospace;
4735			}
4736			n->m_pkthdr.len = len;
4737			copyhdr = 0;
4738		}
4739		n->m_len = MIN(len, (m->m_len - off));
4740
4741		if (m->m_flags & M_EXT) {
4742			n->m_ext = m->m_ext;
4743			m_incref(m);
4744			n->m_data = m->m_data + off;
4745			n->m_flags |= M_EXT;
4746		} else {
4747			if (MTOD(n, char *) + n->m_len > ((char *)n) + MSIZE)
4748				panic("%s n %p copy overflow",
4749					__func__, n);
4750
4751			bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t),
4752			    (unsigned)n->m_len);
4753		}
4754		len -= n->m_len;
4755
4756		if (len == 0) {
4757			if (m_lastm != NULL && m_off != NULL) {
4758				if ((off + n->m_len) == m->m_len) {
4759					*m_lastm = m->m_next;
4760					*m_off  = 0;
4761				} else {
4762					*m_lastm = m;
4763					*m_off  = off + n->m_len;
4764				}
4765			}
4766			break;
4767		}
4768		off = 0;
4769		m = m->m_next;
4770		np = &n->m_next;
4771	}
4772
4773	mtype_stat_inc(MT_HEADER);
4774	mtype_stat_add(type, needed);
4775	mtype_stat_sub(MT_FREE, needed + 1);
4776
4777	ASSERT(list == NULL);
4778	return (top);
4779
4780nospace:
4781	if (list != NULL)
4782		mcache_free_ext(m_cache(MC_MBUF), list);
4783	if (top != NULL)
4784		m_freem(top);
4785	MCFail++;
4786	return (NULL);
4787}
4788
4789/*
4790 * Copy data from an mbuf chain starting "off" bytes from the beginning,
4791 * continuing for "len" bytes, into the indicated buffer.
4792 */
4793void
4794m_copydata(struct mbuf *m, int off, int len, void *vp)
4795{
4796	unsigned count;
4797	char *cp = vp;
4798
4799	if (off < 0 || len < 0)
4800		panic("m_copydata: invalid offset %d or len %d", off, len);
4801
4802	while (off > 0) {
4803		if (m == NULL)
4804			panic("m_copydata: invalid mbuf chain");
4805		if (off < m->m_len)
4806			break;
4807		off -= m->m_len;
4808		m = m->m_next;
4809	}
4810	while (len > 0) {
4811		if (m == NULL)
4812			panic("m_copydata: invalid mbuf chain");
4813		count = MIN(m->m_len - off, len);
4814		bcopy(MTOD(m, caddr_t) + off, cp, count);
4815		len -= count;
4816		cp += count;
4817		off = 0;
4818		m = m->m_next;
4819	}
4820}
4821
4822/*
4823 * Concatenate mbuf chain n to m.  Both chains must be of the same type
 * (e.g. MT_DATA).  The m_pkthdr, if present, is not updated.
4825 */
4826void
4827m_cat(struct mbuf *m, struct mbuf *n)
4828{
4829	while (m->m_next)
4830		m = m->m_next;
4831	while (n) {
4832		if ((m->m_flags & M_EXT) ||
4833		    m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) {
4834			/* just join the two chains */
4835			m->m_next = n;
4836			return;
4837		}
4838		/* splat the data from one into the other */
4839		bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
4840		    (u_int)n->m_len);
4841		m->m_len += n->m_len;
4842		n = m_free(n);
4843	}
4844}
4845
4846void
4847m_adj(struct mbuf *mp, int req_len)
4848{
4849	int len = req_len;
4850	struct mbuf *m;
4851	int count;
4852
4853	if ((m = mp) == NULL)
4854		return;
4855	if (len >= 0) {
4856		/*
4857		 * Trim from head.
4858		 */
4859		while (m != NULL && len > 0) {
4860			if (m->m_len <= len) {
4861				len -= m->m_len;
4862				m->m_len = 0;
4863				m = m->m_next;
4864			} else {
4865				m->m_len -= len;
4866				m->m_data += len;
4867				len = 0;
4868			}
4869		}
4870		m = mp;
4871		if (m->m_flags & M_PKTHDR)
4872			m->m_pkthdr.len -= (req_len - len);
4873	} else {
4874		/*
4875		 * Trim from tail.  Scan the mbuf chain,
4876		 * calculating its length and finding the last mbuf.
4877		 * If the adjustment only affects this mbuf, then just
4878		 * adjust and return.  Otherwise, rescan and truncate
4879		 * after the remaining size.
4880		 */
4881		len = -len;
4882		count = 0;
4883		for (;;) {
4884			count += m->m_len;
4885			if (m->m_next == (struct mbuf *)0)
4886				break;
4887			m = m->m_next;
4888		}
4889		if (m->m_len >= len) {
4890			m->m_len -= len;
4891			m = mp;
4892			if (m->m_flags & M_PKTHDR)
4893				m->m_pkthdr.len -= len;
4894			return;
4895		}
4896		count -= len;
4897		if (count < 0)
4898			count = 0;
4899		/*
4900		 * Correct length for chain is "count".
4901		 * Find the mbuf with last data, adjust its length,
4902		 * and toss data from remaining mbufs on chain.
4903		 */
4904		m = mp;
4905		if (m->m_flags & M_PKTHDR)
4906			m->m_pkthdr.len = count;
4907		for (; m; m = m->m_next) {
4908			if (m->m_len >= count) {
4909				m->m_len = count;
4910				break;
4911			}
4912			count -= m->m_len;
4913		}
4914		while ((m = m->m_next))
4915			m->m_len = 0;
4916	}
4917}
4918
4919/*
 * Rearrange an mbuf chain so that len bytes are contiguous
 * and in the data area of an mbuf (so that mtod and dtom
 * will work for a structure of size len).  Returns the resulting
 * mbuf chain on success, frees it and returns NULL on failure.
4924 * If there is room, it will add up to max_protohdr-len extra bytes to the
4925 * contiguous region in an attempt to avoid being called next time.
4926 */
4927int MPFail;
4928
4929struct mbuf *
4930m_pullup(struct mbuf *n, int len)
4931{
4932	struct mbuf *m;
4933	int count;
4934	int space;
4935
4936	/*
4937	 * If first mbuf has no cluster, and has room for len bytes
4938	 * without shifting current data, pullup into it,
4939	 * otherwise allocate a new mbuf to prepend to the chain.
4940	 */
4941	if ((n->m_flags & M_EXT) == 0 &&
4942	    n->m_data + len < &n->m_dat[MLEN] && n->m_next) {
4943		if (n->m_len >= len)
4944			return (n);
4945		m = n;
4946		n = n->m_next;
4947		len -= m->m_len;
4948	} else {
4949		if (len > MHLEN)
4950			goto bad;
4951		_MGET(m, M_DONTWAIT, n->m_type);
4952		if (m == 0)
4953			goto bad;
4954		m->m_len = 0;
4955		if (n->m_flags & M_PKTHDR) {
4956			M_COPY_PKTHDR(m, n);
4957			n->m_flags &= ~M_PKTHDR;
4958		}
4959	}
4960	space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
4961	do {
4962		count = MIN(MIN(MAX(len, max_protohdr), space), n->m_len);
4963		bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
4964		    (unsigned)count);
4965		len -= count;
4966		m->m_len += count;
4967		n->m_len -= count;
4968		space -= count;
4969		if (n->m_len)
4970			n->m_data += count;
4971		else
4972			n = m_free(n);
4973	} while (len > 0 && n);
4974	if (len > 0) {
4975		(void) m_free(m);
4976		goto bad;
4977	}
4978	m->m_next = n;
4979	return (m);
4980bad:
4981	m_freem(n);
4982	MPFail++;
4983	return (0);
4984}
4985
4986/*
4987 * Like m_pullup(), except a new mbuf is always allocated, and we allow
4988 * the amount of empty space before the data in the new mbuf to be specified
4989 * (in the event that the caller expects to prepend later).
4990 */
4991__private_extern__ int MSFail = 0;
4992
4993__private_extern__ struct mbuf *
4994m_copyup(struct mbuf *n, int len, int dstoff)
4995{
4996	struct mbuf *m;
4997	int count, space;
4998
4999	if (len > (MHLEN - dstoff))
5000		goto bad;
5001	MGET(m, M_DONTWAIT, n->m_type);
5002	if (m == NULL)
5003		goto bad;
5004	m->m_len = 0;
5005	if (n->m_flags & M_PKTHDR) {
5006		m_copy_pkthdr(m, n);
5007		n->m_flags &= ~M_PKTHDR;
5008	}
5009	m->m_data += dstoff;
5010	space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
5011	do {
5012		count = min(min(max(len, max_protohdr), space), n->m_len);
5013		memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t),
5014		    (unsigned)count);
5015		len -= count;
5016		m->m_len += count;
5017		n->m_len -= count;
5018		space -= count;
5019		if (n->m_len)
5020			n->m_data += count;
5021		else
5022			n = m_free(n);
5023	} while (len > 0 && n);
5024	if (len > 0) {
5025		(void) m_free(m);
5026		goto bad;
5027	}
5028	m->m_next = n;
5029	return (m);
5030bad:
5031	m_freem(n);
5032	MSFail++;
5033	return (NULL);
5034}
5035
5036/*
 * Partition an mbuf chain into two pieces, returning the tail --
5038 * all but the first len0 bytes.  In case of failure, it returns NULL and
5039 * attempts to restore the chain to its original state.
5040 */
5041struct mbuf *
5042m_split(struct mbuf *m0, int len0, int wait)
5043{
5044	return (m_split0(m0, len0, wait, 1));
5045}
5046
5047static struct mbuf *
5048m_split0(struct mbuf *m0, int len0, int wait, int copyhdr)
5049{
5050	struct mbuf *m, *n;
5051	unsigned len = len0, remain;
5052
5053	for (m = m0; m && len > m->m_len; m = m->m_next)
5054		len -= m->m_len;
5055	if (m == NULL)
5056		return (NULL);
5057	remain = m->m_len - len;
5058	if (copyhdr && (m0->m_flags & M_PKTHDR)) {
5059		_MGETHDR(n, wait, m0->m_type);
5060		if (n == NULL)
5061			return (NULL);
5062		n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
5063		n->m_pkthdr.len = m0->m_pkthdr.len - len0;
5064		m0->m_pkthdr.len = len0;
5065		if (m->m_flags & M_EXT)
5066			goto extpacket;
5067		if (remain > MHLEN) {
5068			/* m can't be the lead packet */
5069			MH_ALIGN(n, 0);
5070			n->m_next = m_split(m, len, wait);
5071			if (n->m_next == NULL) {
5072				(void) m_free(n);
5073				return (NULL);
5074			} else
5075				return (n);
5076		} else
5077			MH_ALIGN(n, remain);
5078	} else if (remain == 0) {
5079		n = m->m_next;
5080		m->m_next = NULL;
5081		return (n);
5082	} else {
5083		_MGET(n, wait, m->m_type);
5084		if (n == NULL)
5085			return (NULL);
5086		M_ALIGN(n, remain);
5087	}
5088extpacket:
5089	if (m->m_flags & M_EXT) {
5090		n->m_flags |= M_EXT;
5091		n->m_ext = m->m_ext;
5092		m_incref(m);
5093		n->m_data = m->m_data + len;
5094	} else {
5095		bcopy(MTOD(m, caddr_t) + len, MTOD(n, caddr_t), remain);
5096	}
5097	n->m_len = remain;
5098	m->m_len = len;
5099	n->m_next = m->m_next;
5100	m->m_next = NULL;
5101	return (n);
5102}
5103
5104/*
5105 * Routine to copy from device local memory into mbufs.
5106 */
5107struct mbuf *
5108m_devget(char *buf, int totlen, int off0, struct ifnet *ifp,
5109    void (*copy)(const void *, void *, size_t))
5110{
5111	struct mbuf *m;
5112	struct mbuf *top = NULL, **mp = &top;
5113	int off = off0, len;
5114	char *cp;
5115	char *epkt;
5116
5117	cp = buf;
5118	epkt = cp + totlen;
5119	if (off) {
5120		/*
5121		 * If 'off' is non-zero, packet is trailer-encapsulated,
5122		 * so we have to skip the type and length fields.
5123		 */
5124		cp += off + 2 * sizeof (u_int16_t);
5125		totlen -= 2 * sizeof (u_int16_t);
5126	}
5127	_MGETHDR(m, M_DONTWAIT, MT_DATA);
5128	if (m == NULL)
5129		return (NULL);
5130	m->m_pkthdr.rcvif = ifp;
5131	m->m_pkthdr.len = totlen;
5132	m->m_len = MHLEN;
5133
5134	while (totlen > 0) {
5135		if (top != NULL) {
5136			_MGET(m, M_DONTWAIT, MT_DATA);
5137			if (m == NULL) {
5138				m_freem(top);
5139				return (NULL);
5140			}
5141			m->m_len = MLEN;
5142		}
5143		len = MIN(totlen, epkt - cp);
5144		if (len >= MINCLSIZE) {
5145			MCLGET(m, M_DONTWAIT);
5146			if (m->m_flags & M_EXT) {
5147				m->m_len = len = MIN(len, m_maxsize(MC_CL));
5148			} else {
				/* give up when we run out of cluster mbufs */
5150				if (top != NULL)
5151					m_freem(top);
5152				m_freem(m);
5153				return (NULL);
5154			}
5155		} else {
5156			/*
5157			 * Place initial small packet/header at end of mbuf.
5158			 */
5159			if (len < m->m_len) {
5160				if (top == NULL &&
5161				    len + max_linkhdr <= m->m_len)
5162					m->m_data += max_linkhdr;
5163				m->m_len = len;
5164			} else {
5165				len = m->m_len;
5166			}
5167		}
5168		if (copy)
5169			copy(cp, MTOD(m, caddr_t), (unsigned)len);
5170		else
5171			bcopy(cp, MTOD(m, caddr_t), (unsigned)len);
5172		cp += len;
5173		*mp = m;
5174		mp = &m->m_next;
5175		totlen -= len;
5176		if (cp == epkt)
5177			cp = buf;
5178	}
5179	return (top);
5180}
5181
5182#ifndef MBUF_GROWTH_NORMAL_THRESH
5183#define	MBUF_GROWTH_NORMAL_THRESH 25
5184#endif
5185
5186/*
5187 * Cluster freelist allocation check.
5188 */
5189static int
5190m_howmany(int num, size_t bufsize)
5191{
5192	int i = 0, j = 0;
5193	u_int32_t m_mbclusters, m_clusters, m_bigclusters, m_16kclusters;
5194	u_int32_t m_mbfree, m_clfree, m_bigclfree, m_16kclfree;
5195	u_int32_t sumclusters, freeclusters;
5196	u_int32_t percent_pool, percent_kmem;
5197	u_int32_t mb_growth, mb_growth_thresh;
5198
5199	VERIFY(bufsize == m_maxsize(MC_BIGCL) ||
5200	    bufsize == m_maxsize(MC_16KCL));
5201
5202	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
5203
5204	/* Numbers in 2K cluster units */
5205	m_mbclusters = m_total(MC_MBUF) >> NMBPCLSHIFT;
5206	m_clusters = m_total(MC_CL);
5207	m_bigclusters = m_total(MC_BIGCL) << NCLPBGSHIFT;
5208	m_16kclusters = m_total(MC_16KCL);
5209	sumclusters = m_mbclusters + m_clusters + m_bigclusters;
5210
5211	m_mbfree = m_infree(MC_MBUF) >> NMBPCLSHIFT;
5212	m_clfree = m_infree(MC_CL);
5213	m_bigclfree = m_infree(MC_BIGCL) << NCLPBGSHIFT;
5214	m_16kclfree = m_infree(MC_16KCL);
5215	freeclusters = m_mbfree + m_clfree + m_bigclfree;
5216
5217	/* Bail if we've maxed out the mbuf memory map */
5218	if ((bufsize == m_maxsize(MC_BIGCL) && sumclusters >= nclusters) ||
5219	    (njcl > 0 && bufsize == m_maxsize(MC_16KCL) &&
5220	    (m_16kclusters << NCLPJCLSHIFT) >= njcl)) {
5221		return (0);
5222	}
5223
5224	if (bufsize == m_maxsize(MC_BIGCL)) {
5225		/* Under minimum */
5226		if (m_bigclusters < m_minlimit(MC_BIGCL))
5227			return (m_minlimit(MC_BIGCL) - m_bigclusters);
5228
5229		percent_pool =
5230		    ((sumclusters - freeclusters) * 100) / sumclusters;
5231		percent_kmem = (sumclusters * 100) / nclusters;
5232
5233		/*
		 * If a light/normal user, grow conservatively (75%);
		 * if a heavy user, grow aggressively (50%).
5236		 */
5237		if (percent_kmem < MBUF_GROWTH_NORMAL_THRESH)
5238			mb_growth = MB_GROWTH_NORMAL;
5239		else
5240			mb_growth = MB_GROWTH_AGGRESSIVE;
5241
5242		if (percent_kmem < 5) {
5243			/* For initial allocations */
5244			i = num;
5245		} else {
5246			/* Return if >= MBIGCL_LOWAT clusters available */
5247			if (m_infree(MC_BIGCL) >= MBIGCL_LOWAT &&
5248			    m_total(MC_BIGCL) >=
5249			    MBIGCL_LOWAT + m_minlimit(MC_BIGCL))
5250				return (0);
5251
5252			/* Ensure at least num clusters are accessible */
5253			if (num >= m_infree(MC_BIGCL))
5254				i = num - m_infree(MC_BIGCL);
5255			if (num > m_total(MC_BIGCL) - m_minlimit(MC_BIGCL))
5256				j = num - (m_total(MC_BIGCL) -
5257				    m_minlimit(MC_BIGCL));
5258
5259			i = MAX(i, j);
5260
5261			/*
5262			 * Grow pool if percent_pool > 75 (normal growth)
5263			 * or percent_pool > 50 (aggressive growth).
5264			 */
5265			mb_growth_thresh = 100 - (100 / (1 << mb_growth));
5266			if (percent_pool > mb_growth_thresh)
5267				j = ((sumclusters + num) >> mb_growth) -
5268				    freeclusters;
5269			i = MAX(i, j);
5270		}
5271
5272		/* Check to ensure we didn't go over limits */
5273		if (i + m_bigclusters >= m_maxlimit(MC_BIGCL))
5274			i = m_maxlimit(MC_BIGCL) - m_bigclusters;
5275		if ((i << 1) + sumclusters >= nclusters)
5276			i = (nclusters - sumclusters) >> 1;
5277		VERIFY((m_total(MC_BIGCL) + i) <= m_maxlimit(MC_BIGCL));
5278		VERIFY(sumclusters + (i << 1) <= nclusters);
5279
5280	} else { /* 16K CL */
5281		VERIFY(njcl > 0);
5282		/* Under minimum */
5283		if (m_16kclusters < MIN16KCL)
5284			return (MIN16KCL - m_16kclusters);
5285		if (m_16kclfree >= M16KCL_LOWAT)
5286			return (0);
5287
5288		/* Ensure at least num clusters are available */
5289		if (num >= m_16kclfree)
5290			i = num - m_16kclfree;
5291
5292		/* Always grow 16KCL pool aggressively */
5293		if (((m_16kclusters + num) >> 1) > m_16kclfree)
5294			j = ((m_16kclusters + num) >> 1) - m_16kclfree;
5295		i = MAX(i, j);
5296
5297		/* Check to ensure we don't go over limit */
5298		if (i + m_16kclusters >= m_maxlimit(MC_16KCL))
5299			i = m_maxlimit(MC_16KCL) - m_16kclusters;
5300		VERIFY((m_total(MC_16KCL) + i) <= m_maxlimit(MC_16KCL));
5301	}
5302	return (i);
5303}
5304/*
5305 * Return the number of bytes in the mbuf chain, m.
5306 */
5307unsigned int
5308m_length(struct mbuf *m)
5309{
5310	struct mbuf *m0;
5311	unsigned int pktlen;
5312
5313	if (m->m_flags & M_PKTHDR)
5314		return (m->m_pkthdr.len);
5315
5316	pktlen = 0;
5317	for (m0 = m; m0 != NULL; m0 = m0->m_next)
5318		pktlen += m0->m_len;
5319	return (pktlen);
5320}
5321
5322/*
5323 * Copy data from a buffer back into the indicated mbuf chain,
5324 * starting "off" bytes from the beginning, extending the mbuf
5325 * chain if necessary.
5326 */
5327void
5328m_copyback(struct mbuf *m0, int off, int len, const void *cp)
5329{
5330#if DEBUG
5331	struct mbuf *origm = m0;
5332	int error;
5333#endif /* DEBUG */
5334
5335	if (m0 == NULL)
5336		return;
5337
5338#if DEBUG
5339	error =
5340#endif /* DEBUG */
5341	m_copyback0(&m0, off, len, cp,
5342	    M_COPYBACK0_COPYBACK | M_COPYBACK0_EXTEND, M_DONTWAIT);
5343
5344#if DEBUG
5345	if (error != 0 || (m0 != NULL && origm != m0))
5346		panic("m_copyback");
5347#endif /* DEBUG */
5348}
5349
5350struct mbuf *
5351m_copyback_cow(struct mbuf *m0, int off, int len, const void *cp, int how)
5352{
5353	int error;
5354
5355	/* don't support chain expansion */
5356	VERIFY(off + len <= m_length(m0));
5357
5358	error = m_copyback0(&m0, off, len, cp,
5359	    M_COPYBACK0_COPYBACK | M_COPYBACK0_COW, how);
5360	if (error) {
5361		/*
5362		 * no way to recover from partial success.
5363		 * just free the chain.
5364		 */
5365		m_freem(m0);
5366		return (NULL);
5367	}
5368	return (m0);
5369}
5370
5371/*
 * m_makewritable: ensure the specified range is writable.
5373 */
5374int
5375m_makewritable(struct mbuf **mp, int off, int len, int how)
5376{
5377	int error;
5378#if DEBUG
5379	struct mbuf *n;
5380	int origlen, reslen;
5381
5382	origlen = m_length(*mp);
5383#endif /* DEBUG */
5384
5385#if 0 /* M_COPYALL is large enough */
5386	if (len == M_COPYALL)
5387		len = m_length(*mp) - off; /* XXX */
5388#endif
5389
5390	error = m_copyback0(mp, off, len, NULL,
5391	    M_COPYBACK0_PRESERVE | M_COPYBACK0_COW, how);
5392
5393#if DEBUG
5394	reslen = 0;
5395	for (n = *mp; n; n = n->m_next)
5396		reslen += n->m_len;
5397	if (origlen != reslen)
5398		panic("m_makewritable: length changed");
5399	if (((*mp)->m_flags & M_PKTHDR) && reslen != (*mp)->m_pkthdr.len)
		panic("m_makewritable: inconsistent length");
5401#endif /* DEBUG */
5402
5403	return (error);
5404}
5405
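/*
 * Common engine behind m_copyback(), m_copyback_cow() and
 * m_makewritable().  The "flags" argument selects the behavior:
 *
 *	M_COPYBACK0_COPYBACK	copy "len" bytes from "vp" into the chain
 *				starting at "off" (requires vp != NULL)
 *	M_COPYBACK0_PRESERVE	preserve the original data when replacing
 *				mbufs (requires vp == NULL)
 *	M_COPYBACK0_COW		replace read-only (shared) mbufs in the
 *				range with freshly allocated writable ones
 *	M_COPYBACK0_EXTEND	extend the chain if "off + len" runs past
 *				its end; mutually exclusive with COW
 */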
5406static int
5407m_copyback0(struct mbuf **mp0, int off, int len, const void *vp, int flags,
5408    int how)
5409{
5410	int mlen;
5411	struct mbuf *m, *n;
5412	struct mbuf **mp;
5413	int totlen = 0;
5414	const char *cp = vp;
5415
5416	VERIFY(mp0 != NULL);
5417	VERIFY(*mp0 != NULL);
5418	VERIFY((flags & M_COPYBACK0_PRESERVE) == 0 || cp == NULL);
5419	VERIFY((flags & M_COPYBACK0_COPYBACK) == 0 || cp != NULL);
5420
5421	/*
5422	 * we don't bother to update "totlen" in the case of M_COPYBACK0_COW,
5423	 * assuming that M_COPYBACK0_EXTEND and M_COPYBACK0_COW are exclusive.
5424	 */
5425
5426	VERIFY((~flags & (M_COPYBACK0_EXTEND|M_COPYBACK0_COW)) != 0);
5427
5428	mp = mp0;
5429	m = *mp;
5430	while (off > (mlen = m->m_len)) {
5431		off -= mlen;
5432		totlen += mlen;
5433		if (m->m_next == NULL) {
5434			int tspace;
5435extend:
5436			if (!(flags & M_COPYBACK0_EXTEND))
5437				goto out;
5438
5439			/*
5440			 * try to make some space at the end of "m".
5441			 */
5442
5443			mlen = m->m_len;
5444			if (off + len >= MINCLSIZE &&
5445			    !(m->m_flags & M_EXT) && m->m_len == 0) {
5446				MCLGET(m, how);
5447			}
5448			tspace = M_TRAILINGSPACE(m);
5449			if (tspace > 0) {
5450				tspace = MIN(tspace, off + len);
5451				VERIFY(tspace > 0);
5452				bzero(mtod(m, char *) + m->m_len,
5453				    MIN(off, tspace));
5454				m->m_len += tspace;
5455				off += mlen;
5456				totlen -= mlen;
5457				continue;
5458			}
5459
5460			/*
5461			 * need to allocate an mbuf.
5462			 */
5463
5464			if (off + len >= MINCLSIZE) {
5465				n = m_getcl(how, m->m_type, 0);
5466			} else {
5467				n = _M_GET(how, m->m_type);
5468			}
5469			if (n == NULL) {
5470				goto out;
5471			}
5472			n->m_len = 0;
5473			n->m_len = MIN(M_TRAILINGSPACE(n), off + len);
5474			bzero(mtod(n, char *), MIN(n->m_len, off));
5475			m->m_next = n;
5476		}
5477		mp = &m->m_next;
5478		m = m->m_next;
5479	}
5480	while (len > 0) {
5481		mlen = m->m_len - off;
5482		if (mlen != 0 && m_mclhasreference(m)) {
5483			char *datap;
5484			int eatlen;
5485
5486			/*
5487			 * this mbuf is read-only.
5488			 * allocate a new writable mbuf and try again.
5489			 */
5490
5491#if DIAGNOSTIC
5492			if (!(flags & M_COPYBACK0_COW))
5493				panic("m_copyback0: read-only");
5494#endif /* DIAGNOSTIC */
5495
5496			/*
5497			 * if we're going to write into the middle of
5498			 * a mbuf, split it first.
5499			 */
5500			if (off > 0 && len < mlen) {
5501				n = m_split0(m, off, how, 0);
5502				if (n == NULL)
5503					goto enobufs;
5504				m->m_next = n;
5505				mp = &m->m_next;
5506				m = n;
5507				off = 0;
5508				continue;
5509			}
5510
5511			/*
5512			 * XXX TODO coalesce into the trailingspace of
5513			 * the previous mbuf when possible.
5514			 */
5515
5516			/*
5517			 * allocate a new mbuf.  copy packet header if needed.
5518			 */
5519			n = _M_GET(how, m->m_type);
5520			if (n == NULL)
5521				goto enobufs;
5522			if (off == 0 && (m->m_flags & M_PKTHDR)) {
5523				M_COPY_PKTHDR(n, m);
5524				n->m_len = MHLEN;
5525			} else {
5526				if (len >= MINCLSIZE)
5527					MCLGET(n, M_DONTWAIT);
5528				n->m_len =
5529				    (n->m_flags & M_EXT) ? MCLBYTES : MLEN;
5530			}
5531			if (n->m_len > len)
5532				n->m_len = len;
5533
5534			/*
			 * free the region which has been overwritten,
			 * copying data from the old mbufs if requested.
5537			 */
5538			if (flags & M_COPYBACK0_PRESERVE)
5539				datap = mtod(n, char *);
5540			else
5541				datap = NULL;
5542			eatlen = n->m_len;
5543			VERIFY(off == 0 || eatlen >= mlen);
5544			if (off > 0) {
5545				VERIFY(len >= mlen);
5546				m->m_len = off;
5547				m->m_next = n;
5548				if (datap) {
5549					m_copydata(m, off, mlen, datap);
5550					datap += mlen;
5551				}
5552				eatlen -= mlen;
5553				mp = &m->m_next;
5554				m = m->m_next;
5555			}
5556			while (m != NULL && m_mclhasreference(m) &&
5557			    n->m_type == m->m_type && eatlen > 0) {
5558				mlen = MIN(eatlen, m->m_len);
5559				if (datap) {
5560					m_copydata(m, 0, mlen, datap);
5561					datap += mlen;
5562				}
5563				m->m_data += mlen;
5564				m->m_len -= mlen;
5565				eatlen -= mlen;
5566				if (m->m_len == 0)
5567					*mp = m = m_free(m);
5568			}
5569			if (eatlen > 0)
5570				n->m_len -= eatlen;
5571			n->m_next = m;
5572			*mp = m = n;
5573			continue;
5574		}
5575		mlen = MIN(mlen, len);
5576		if (flags & M_COPYBACK0_COPYBACK) {
5577			bcopy(cp, mtod(m, caddr_t) + off, (unsigned)mlen);
5578			cp += mlen;
5579		}
5580		len -= mlen;
5581		mlen += off;
5582		off = 0;
5583		totlen += mlen;
5584		if (len == 0)
5585			break;
5586		if (m->m_next == NULL) {
5587			goto extend;
5588		}
5589		mp = &m->m_next;
5590		m = m->m_next;
5591	}
5592out:
5593	if (((m = *mp0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) {
5594		VERIFY(flags & M_COPYBACK0_EXTEND);
5595		m->m_pkthdr.len = totlen;
5596	}
5597
5598	return (0);
5599
5600enobufs:
5601	return (ENOBUFS);
5602}
5603
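/*
 * Convert a virtual address within the mbuf cluster map to its physical
 * address using the mcl_paddr page table.  Returns 0 if the address is
 * outside the map or if no physical page has been recorded for it.
 */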
5604uint64_t
5605mcl_to_paddr(char *addr)
5606{
5607	vm_offset_t base_phys;
5608
5609	if (!MBUF_IN_MAP(addr))
5610		return (0);
5611	base_phys = mcl_paddr[atop_64(addr - (char *)mbutl)];
5612
5613	if (base_phys == 0)
5614		return (0);
5615	return ((uint64_t)(ptoa_64(base_phys) | ((uint64_t)addr & PAGE_MASK)));
5616}
5617
5618/*
5619 * Dup the mbuf chain passed in.  The whole thing.  No cute additional cruft.
5620 * And really copy the thing.  That way, we don't "precompute" checksums
5621 * for unsuspecting consumers.  Assumption: m->m_nextpkt == 0.  Trick: for
 * small packets, don't dup into a cluster.  That way received packets
5623 * don't take up too much room in the sockbuf (cf. sbspace()).
5624 */
5625int MDFail;
5626
5627struct mbuf *
5628m_dup(struct mbuf *m, int how)
5629{
5630	struct mbuf *n, **np;
5631	struct mbuf *top;
5632	int copyhdr = 0;
5633
5634	np = &top;
5635	top = NULL;
5636	if (m->m_flags & M_PKTHDR)
5637		copyhdr = 1;
5638
5639	/*
5640	 * Quick check: if we have one mbuf and its data fits in an
5641	 *  mbuf with packet header, just copy and go.
5642	 */
5643	if (m->m_next == NULL) {
5644		/* Then just move the data into an mbuf and be done... */
5645		if (copyhdr) {
5646			if (m->m_pkthdr.len <= MHLEN && m->m_len <= MHLEN) {
5647				if ((n = _M_GETHDR(how, m->m_type)) == NULL)
5648					return (NULL);
5649				n->m_len = m->m_len;
5650				m_dup_pkthdr(n, m, how);
5651				bcopy(m->m_data, n->m_data, m->m_len);
5652				return (n);
5653			}
5654		} else if (m->m_len <= MLEN) {
5655			if ((n = _M_GET(how, m->m_type)) == NULL)
5656				return (NULL);
5657			bcopy(m->m_data, n->m_data, m->m_len);
5658			n->m_len = m->m_len;
5659			return (n);
5660		}
5661	}
5662	while (m != NULL) {
5663#if BLUE_DEBUG
5664		kprintf("<%x: %x, %x, %x\n", m, m->m_flags, m->m_len,
5665		    m->m_data);
5666#endif
5667		if (copyhdr)
5668			n = _M_GETHDR(how, m->m_type);
5669		else
5670			n = _M_GET(how, m->m_type);
5671		if (n == NULL)
5672			goto nospace;
5673		if (m->m_flags & M_EXT) {
5674			if (m->m_len <= m_maxsize(MC_CL))
5675				MCLGET(n, how);
5676			else if (m->m_len <= m_maxsize(MC_BIGCL))
5677				n = m_mbigget(n, how);
5678			else if (m->m_len <= m_maxsize(MC_16KCL) && njcl > 0)
5679				n = m_m16kget(n, how);
5680			if (!(n->m_flags & M_EXT)) {
5681				(void) m_free(n);
5682				goto nospace;
5683			}
5684		}
5685		*np = n;
5686		if (copyhdr) {
5687			/* Don't use M_COPY_PKTHDR: preserve m_data */
5688			m_dup_pkthdr(n, m, how);
5689			copyhdr = 0;
5690			if (!(n->m_flags & M_EXT))
5691				n->m_data = n->m_pktdat;
5692		}
5693		n->m_len = m->m_len;
5694		/*
		 * Get the dup on the same boundary as the original.
		 * Assume that the two mbufs have the same offset to the
		 * data area (up to word boundaries).
5698		 */
5699		bcopy(MTOD(m, caddr_t), MTOD(n, caddr_t), (unsigned)n->m_len);
5700		m = m->m_next;
5701		np = &n->m_next;
5702#if BLUE_DEBUG
5703		kprintf(">%x: %x, %x, %x\n", n, n->m_flags, n->m_len,
5704		    n->m_data);
5705#endif
5706	}
5707
5708	if (top == NULL)
5709		MDFail++;
5710	return (top);
5711
5712nospace:
5713	m_freem(top);
5714	MDFail++;
5715	return (NULL);
5716}
5717
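/*
 * True if the mbuf's external data spans more than one page: either it
 * starts on a page boundary and is longer than a page, or it starts
 * mid-page and runs past the next page boundary.
 */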
5718#define	MBUF_MULTIPAGES(m)						\
5719	(((m)->m_flags & M_EXT) &&					\
5720	((IS_P2ALIGNED((m)->m_data, NBPG) && (m)->m_len > NBPG) ||	\
5721	(!IS_P2ALIGNED((m)->m_data, NBPG) &&				\
5722	P2ROUNDUP((m)->m_data, NBPG) < ((uintptr_t)(m)->m_data + (m)->m_len))))
5723
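/*
 * Break up an mbuf whose external data spans multiple pages into a chain
 * of mbufs, each referencing at most one page worth of the same cluster
 * (the cluster's reference count is bumped for every additional mbuf).
 * On success, return the head of the new chain and set *last to its last
 * mbuf; on allocation failure, free everything and return NULL with
 * *last set to NULL.
 */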
5724static struct mbuf *
5725m_expand(struct mbuf *m, struct mbuf **last)
5726{
5727	struct mbuf *top = NULL;
5728	struct mbuf **nm = &top;
5729	uintptr_t data0, data;
5730	unsigned int len0, len;
5731
5732	VERIFY(MBUF_MULTIPAGES(m));
5733	VERIFY(m->m_next == NULL);
5734	data0 = (uintptr_t)m->m_data;
5735	len0 = m->m_len;
5736	*last = top;
5737
5738	for (;;) {
5739		struct mbuf *n;
5740
5741		data = data0;
5742		if (IS_P2ALIGNED(data, NBPG) && len0 > NBPG)
5743			len = NBPG;
5744		else if (!IS_P2ALIGNED(data, NBPG) &&
5745		    P2ROUNDUP(data, NBPG) < (data + len0))
5746			len = P2ROUNDUP(data, NBPG) - data;
5747		else
5748			len = len0;
5749
5750		VERIFY(len > 0);
5751		VERIFY(m->m_flags & M_EXT);
5752		m->m_data = (void *)data;
5753		m->m_len = len;
5754
5755		*nm = *last = m;
5756		nm = &m->m_next;
5757		m->m_next = NULL;
5758
5759		data0 += len;
5760		len0 -= len;
5761		if (len0 == 0)
5762			break;
5763
5764		n = _M_RETRY(M_DONTWAIT, MT_DATA);
5765		if (n == NULL) {
5766			m_freem(top);
5767			top = *last = NULL;
5768			break;
5769		}
5770
5771		n->m_ext = m->m_ext;
5772		m_incref(m);
5773		n->m_flags |= M_EXT;
5774		m = n;
5775	}
5776	return (top);
5777}
5778
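/*
 * Walk a chain and rewrite any mbuf whose data crosses a page boundary
 * (see MBUF_MULTIPAGES) using m_expand(), so that no single mbuf in the
 * resulting chain spans multiple pages.  Returns the rebuilt chain, or
 * NULL (with the original chain freed) if an allocation fails.
 */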
5779struct mbuf *
5780m_normalize(struct mbuf *m)
5781{
5782	struct mbuf *top = NULL;
5783	struct mbuf **nm = &top;
5784	boolean_t expanded = FALSE;
5785
5786	while (m != NULL) {
5787		struct mbuf *n;
5788
5789		n = m->m_next;
5790		m->m_next = NULL;
5791
5792		/* Does the data cross one or more page boundaries? */
5793		if (MBUF_MULTIPAGES(m)) {
5794			struct mbuf *last;
5795			if ((m = m_expand(m, &last)) == NULL) {
5796				m_freem(n);
5797				m_freem(top);
5798				top = NULL;
5799				break;
5800			}
5801			*nm = m;
5802			nm = &last->m_next;
5803			expanded = TRUE;
5804		} else {
5805			*nm = m;
5806			nm = &m->m_next;
5807		}
5808		m = n;
5809	}
5810	if (expanded)
5811		atomic_add_32(&mb_normalized, 1);
5812	return (top);
5813}
5814
5815/*
5816 * Append the specified data to the indicated mbuf chain,
 * extending the mbuf chain if the new data does not fit in
5818 * existing space.
5819 *
5820 * Return 1 if able to complete the job; otherwise 0.
5821 */
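/*
 * For example (illustrative only), given a local buffer "buf" of "n"
 * bytes, m_append(m, n, (caddr_t)buf) copies those bytes onto the end
 * of chain "m", allocating plain mbufs (M_WAITOK) as needed, and
 * returns 0 only if such an allocation fails part-way.
 */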
5822int
5823m_append(struct mbuf *m0, int len, caddr_t cp)
5824{
5825	struct mbuf *m, *n;
5826	int remainder, space;
5827
5828	for (m = m0; m->m_next != NULL; m = m->m_next)
5829		;
5830	remainder = len;
5831	space = M_TRAILINGSPACE(m);
5832	if (space > 0) {
5833		/*
5834		 * Copy into available space.
5835		 */
5836		if (space > remainder)
5837			space = remainder;
5838		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
5839		m->m_len += space;
5840		cp += space, remainder -= space;
5841	}
5842	while (remainder > 0) {
5843		/*
5844		 * Allocate a new mbuf; could check space
5845		 * and allocate a cluster instead.
5846		 */
5847		n = m_get(M_WAITOK, m->m_type);
5848		if (n == NULL)
5849			break;
5850		n->m_len = min(MLEN, remainder);
5851		bcopy(cp, mtod(n, caddr_t), n->m_len);
5852		cp += n->m_len;
5853		remainder -= n->m_len;
5854		m->m_next = n;
5855		m = n;
5856	}
5857	if (m0->m_flags & M_PKTHDR)
5858		m0->m_pkthdr.len += len - remainder;
5859	return (remainder == 0);
5860}
5861
5862struct mbuf *
5863m_last(struct mbuf *m)
5864{
5865	while (m->m_next != NULL)
5866		m = m->m_next;
5867	return (m);
5868}
5869
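/*
 * Recompute m_pkthdr.len by summing m_len over the entire chain and
 * return the result; the leading mbuf must have a packet header.
 */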
5870unsigned int
5871m_fixhdr(struct mbuf *m0)
5872{
5873	u_int len;
5874
5875	VERIFY(m0->m_flags & M_PKTHDR);
5876
5877	len = m_length2(m0, NULL);
5878	m0->m_pkthdr.len = len;
5879	return (len);
5880}
5881
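/*
 * Like m_length(), but always walk the entire chain (ignoring any packet
 * header length) and optionally return the last mbuf via *last.
 */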
5882unsigned int
5883m_length2(struct mbuf *m0, struct mbuf **last)
5884{
5885	struct mbuf *m;
5886	u_int len;
5887
5888	len = 0;
5889	for (m = m0; m != NULL; m = m->m_next) {
5890		len += m->m_len;
5891		if (m->m_next == NULL)
5892			break;
5893	}
5894	if (last != NULL)
5895		*last = m;
5896	return (len);
5897}
5898
5899/*
5900 * Defragment a mbuf chain, returning the shortest possible chain of mbufs
5901 * and clusters.  If allocation fails and this cannot be completed, NULL will
5902 * be returned, but the passed in chain will be unchanged.  Upon success,
5903 * the original chain will be freed, and the new chain will be returned.
5904 *
 * If a chain without a packet header is passed in, the original chain
 * will be returned unharmed.
 *
 * If an offset is specified, the first mbuf in the chain will have a
 * leading space of the amount stated by the "off" parameter.
5910 *
5911 * This routine requires that the m_pkthdr.header field of the original
5912 * mbuf chain is cleared by the caller.
5913 */
5914struct mbuf *
5915m_defrag_offset(struct mbuf *m0, u_int32_t off, int how)
5916{
5917	struct mbuf *m_new = NULL, *m_final = NULL;
5918	int progress = 0, length, pktlen;
5919
5920	if (!(m0->m_flags & M_PKTHDR))
5921		return (m0);
5922
5923	VERIFY(off < MHLEN);
5924	m_fixhdr(m0); /* Needed sanity check */
5925
5926	pktlen = m0->m_pkthdr.len + off;
5927	if (pktlen > MHLEN)
5928		m_final = m_getcl(how, MT_DATA, M_PKTHDR);
5929	else
5930		m_final = m_gethdr(how, MT_DATA);
5931
5932	if (m_final == NULL)
5933		goto nospace;
5934
5935	if (off > 0) {
5936		pktlen -= off;
5937		m_final->m_data += off;
5938	}
5939
5940	/*
5941	 * Caller must have handled the contents pointed to by this
5942	 * pointer before coming here, as otherwise it will point to
5943	 * the original mbuf which will get freed upon success.
5944	 */
5945	VERIFY(m0->m_pkthdr.pkt_hdr == NULL);
5946
5947	if (m_dup_pkthdr(m_final, m0, how) == 0)
5948		goto nospace;
5949
5950	m_new = m_final;
5951
5952	while (progress < pktlen) {
5953		length = pktlen - progress;
5954		if (length > MCLBYTES)
5955			length = MCLBYTES;
5956		length -= ((m_new == m_final) ? off : 0);
5957
5958		if (m_new == NULL) {
5959			if (length > MLEN)
5960				m_new = m_getcl(how, MT_DATA, 0);
5961			else
5962				m_new = m_get(how, MT_DATA);
5963			if (m_new == NULL)
5964				goto nospace;
5965		}
5966
5967		m_copydata(m0, progress, length, mtod(m_new, caddr_t));
5968		progress += length;
5969		m_new->m_len = length;
5970		if (m_new != m_final)
5971			m_cat(m_final, m_new);
5972		m_new = NULL;
5973	}
5974	m_freem(m0);
5975	m0 = m_final;
5976	return (m0);
5977nospace:
5978	if (m_final)
5979		m_freem(m_final);
5980	return (NULL);
5981}
5982
5983struct mbuf *
5984m_defrag(struct mbuf *m0, int how)
5985{
5986	return (m_defrag_offset(m0, 0, how));
5987}
5988
5989void
5990m_mchtype(struct mbuf *m, int t)
5991{
5992	mtype_stat_inc(t);
5993	mtype_stat_dec(m->m_type);
5994	(m)->m_type = t;
5995}
5996
5997void *
5998m_mtod(struct mbuf *m)
5999{
6000	return (MTOD(m, void *));
6001}
6002
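/*
 * Note: this works by rounding the pointer down to an MSIZE boundary,
 * so it is only meaningful when "x" points into an mbuf's own data
 * area, not into an external cluster.
 */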
6003struct mbuf *
6004m_dtom(void *x)
6005{
6006	return ((struct mbuf *)((uintptr_t)(x) & ~(MSIZE-1)));
6007}
6008
6009void
6010m_mcheck(struct mbuf *m)
6011{
6012	_MCHECK(m);
6013}
6014
6015/*
6016 * Return a pointer to mbuf/offset of location in mbuf chain.
6017 */
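/*
 * For example, m_getptr(m, 100, &off) returns the mbuf holding byte 100
 * of chain "m", with *off set to that byte's offset within it; if the
 * chain is exactly 100 bytes long, the last mbuf is returned with *off
 * at the end of its data, and NULL is returned for shorter chains.
 */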
6018struct mbuf *
6019m_getptr(struct mbuf *m, int loc, int *off)
6020{
6021
6022	while (loc >= 0) {
6023		/* Normal end of search. */
6024		if (m->m_len > loc) {
6025			*off = loc;
6026			return (m);
6027		} else {
6028			loc -= m->m_len;
6029			if (m->m_next == NULL) {
6030				if (loc == 0) {
6031					/* Point at the end of valid data. */
6032					*off = m->m_len;
6033					return (m);
6034				}
6035				return (NULL);
6036			}
6037			m = m->m_next;
6038		}
6039	}
6040	return (NULL);
6041}
6042
6043/*
6044 * Inform the corresponding mcache(s) that there's a waiter below.
6045 */
6046static void
6047mbuf_waiter_inc(mbuf_class_t class, boolean_t comp)
6048{
6049	mcache_waiter_inc(m_cache(class));
6050	if (comp) {
6051		if (class == MC_CL) {
6052			mcache_waiter_inc(m_cache(MC_MBUF_CL));
6053		} else if (class == MC_BIGCL) {
6054			mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
6055		} else if (class == MC_16KCL) {
6056			mcache_waiter_inc(m_cache(MC_MBUF_16KCL));
6057		} else {
6058			mcache_waiter_inc(m_cache(MC_MBUF_CL));
6059			mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
6060		}
6061	}
6062}
6063
6064/*
6065 * Inform the corresponding mcache(s) that there's no more waiter below.
6066 */
6067static void
6068mbuf_waiter_dec(mbuf_class_t class, boolean_t comp)
6069{
6070	mcache_waiter_dec(m_cache(class));
6071	if (comp) {
6072		if (class == MC_CL) {
6073			mcache_waiter_dec(m_cache(MC_MBUF_CL));
6074		} else if (class == MC_BIGCL) {
6075			mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
6076		} else if (class == MC_16KCL) {
6077			mcache_waiter_dec(m_cache(MC_MBUF_16KCL));
6078		} else {
6079			mcache_waiter_dec(m_cache(MC_MBUF_CL));
6080			mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
6081		}
6082	}
6083}
6084
6085/*
6086 * Called during slab (blocking and non-blocking) allocation.  If there
 * is at least one waiter, and the time since the first waiter was blocked
 * exceeds the watchdog timeout, panic the system.
6089 */
6090static void
6091mbuf_watchdog(void)
6092{
6093	struct timeval now;
6094	unsigned int since;
6095
6096	if (mb_waiters == 0 || !mb_watchdog)
6097		return;
6098
6099	microuptime(&now);
6100	since = now.tv_sec - mb_wdtstart.tv_sec;
6101	if (since >= MB_WDT_MAXTIME) {
6102		panic_plain("%s: %d waiters stuck for %u secs\n%s", __func__,
6103		    mb_waiters, since, mbuf_dump());
6104		/* NOTREACHED */
6105	}
6106}
6107
6108/*
6109 * Called during blocking allocation.  Returns TRUE if one or more objects
 * are available at the per-CPU cache layer and that the allocation should be
6111 * retried at that level.
6112 */
6113static boolean_t
6114mbuf_sleep(mbuf_class_t class, unsigned int num, int wait)
6115{
6116	boolean_t mcache_retry = FALSE;
6117
6118	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
6119
6120	/* Check if there's anything at the cache layer */
6121	if (mbuf_cached_above(class, wait)) {
6122		mcache_retry = TRUE;
6123		goto done;
6124	}
6125
6126	/* Nothing?  Then try hard to get it from somewhere */
6127	m_reclaim(class, num, (wait & MCR_COMP));
6128
6129	/* We tried hard and got something? */
6130	if (m_infree(class) > 0) {
6131		mbstat.m_wait++;
6132		goto done;
6133	} else if (mbuf_cached_above(class, wait)) {
6134		mbstat.m_wait++;
6135		mcache_retry = TRUE;
6136		goto done;
6137	} else if (wait & MCR_TRYHARD) {
6138		mcache_retry = TRUE;
6139		goto done;
6140	}
6141
6142	/*
6143	 * There's really nothing for us right now; inform the
6144	 * cache(s) that there is a waiter below and go to sleep.
6145	 */
6146	mbuf_waiter_inc(class, (wait & MCR_COMP));
6147
6148	VERIFY(!(wait & MCR_NOSLEEP));
6149
6150	/*
6151	 * If this is the first waiter, arm the watchdog timer.  Otherwise
6152	 * check if we need to panic the system due to watchdog timeout.
6153	 */
6154	if (mb_waiters == 0)
6155		microuptime(&mb_wdtstart);
6156	else
6157		mbuf_watchdog();
6158
6159	mb_waiters++;
6160	(void) msleep(mb_waitchan, mbuf_mlock, (PZERO-1), m_cname(class), NULL);
6161
6162	/* We are now up; stop getting notified until next round */
6163	mbuf_waiter_dec(class, (wait & MCR_COMP));
6164
6165	/* We waited and got something */
6166	if (m_infree(class) > 0) {
6167		mbstat.m_wait++;
6168		goto done;
6169	} else if (mbuf_cached_above(class, wait)) {
6170		mbstat.m_wait++;
6171		mcache_retry = TRUE;
6172	}
6173done:
6174	return (mcache_retry);
6175}
6176
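/*
 * Worker thread, run whenever mbuf_worker_run is signalled.  It grows
 * the 2 KB, 4 KB and 16 KB cluster freelists by the amounts requested
 * via mbuf_expand_mcl, mbuf_expand_big and mbuf_expand_16k and, if it
 * grew anything, ensures there are at least as many mbufs as there are
 * 2 KB and 4 KB clusters combined.
 */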
6177static void
6178mbuf_worker_thread(void)
6179{
6180	int mbuf_expand;
6181
6182	while (1) {
6183		lck_mtx_lock(mbuf_mlock);
6184
6185		mbuf_expand = 0;
6186		if (mbuf_expand_mcl) {
6187			int n;
6188
			/* Adjust to current number of 2 KB clusters in use */
6190			n = mbuf_expand_mcl -
6191			    (m_total(MC_CL) - m_infree(MC_CL));
6192			if ((n + m_total(MC_CL)) > m_maxlimit(MC_CL))
6193				n = m_maxlimit(MC_CL) - m_total(MC_CL);
6194			mbuf_expand_mcl = 0;
6195
6196			if (n > 0 && freelist_populate(MC_CL, n, M_WAIT) > 0)
6197				mbuf_expand++;
6198		}
6199		if (mbuf_expand_big) {
6200			int n;
6201
			/* Adjust to current number of 4 KB clusters in use */
6203			n = mbuf_expand_big -
6204			    (m_total(MC_BIGCL) - m_infree(MC_BIGCL));
6205			if ((n + m_total(MC_BIGCL)) > m_maxlimit(MC_BIGCL))
6206				n = m_maxlimit(MC_BIGCL) - m_total(MC_BIGCL);
6207			mbuf_expand_big = 0;
6208
6209			if (n > 0 && freelist_populate(MC_BIGCL, n, M_WAIT) > 0)
6210				mbuf_expand++;
6211		}
6212		if (mbuf_expand_16k) {
6213			int n;
6214
			/* Adjust to current number of 16 KB clusters in use */
6216			n = mbuf_expand_16k -
6217			    (m_total(MC_16KCL) - m_infree(MC_16KCL));
6218			if ((n + m_total(MC_16KCL)) > m_maxlimit(MC_16KCL))
6219				n = m_maxlimit(MC_16KCL) - m_total(MC_16KCL);
6220			mbuf_expand_16k = 0;
6221
6222			if (n > 0)
6223				(void) freelist_populate(MC_16KCL, n, M_WAIT);
6224		}
6225
6226		/*
6227		 * Because we can run out of memory before filling the mbuf
		 * map, we should not allocate more clusters than there are
6229		 * mbufs -- otherwise we could have a large number of useless
6230		 * clusters allocated.
6231		 */
6232		if (mbuf_expand) {
6233			while (m_total(MC_MBUF) <
6234			    (m_total(MC_BIGCL) + m_total(MC_CL))) {
6235				if (freelist_populate(MC_MBUF, 1, M_WAIT) == 0)
6236					break;
6237			}
6238		}
6239
6240		lck_mtx_unlock(mbuf_mlock);
6241
6242		assert_wait(&mbuf_worker_run, THREAD_UNINT);
6243		(void) thread_block((thread_continue_t)mbuf_worker_thread);
6244	}
6245}
6246
6247static void
6248mbuf_worker_thread_init(void)
6249{
6250	mbuf_worker_ready++;
6251	mbuf_worker_thread();
6252}
6253
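/*
 * Return the slab structure that tracks the given buffer, allocating
 * (and chaining) the slab group for the buffer's region on first use.
 * Called with mbuf_mlock held; the lock is dropped temporarily while
 * the new group is allocated.
 */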
6254static mcl_slab_t *
6255slab_get(void *buf)
6256{
6257	mcl_slabg_t *slg;
6258	unsigned int ix, k;
6259
6260	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
6261
6262	VERIFY(MBUF_IN_MAP(buf));
6263	ix = ((char *)buf - (char *)mbutl) >> MBSHIFT;
6264	VERIFY(ix < maxslabgrp);
6265
6266	if ((slg = slabstbl[ix]) == NULL) {
6267		/*
6268		 * In the current implementation, we never shrink the slabs
6269		 * table; if we attempt to reallocate a cluster group when
		 * it's already allocated, panic since this is a sign of
		 * memory corruption (slabstbl[ix] got nullified).
6272		 */
6273		++slabgrp;
6274		VERIFY(ix < slabgrp);
6275		/*
6276		 * Slabs expansion can only be done single threaded; when
6277		 * we get here, it must be as a result of m_clalloc() which
6278		 * is serialized and therefore mb_clalloc_busy must be set.
6279		 */
6280		VERIFY(mb_clalloc_busy);
6281		lck_mtx_unlock(mbuf_mlock);
6282
6283		/* This is a new buffer; create the slabs group for it */
6284		MALLOC(slg, mcl_slabg_t *, sizeof (*slg), M_TEMP,
6285		    M_WAITOK | M_ZERO);
6286		VERIFY(slg != NULL);
6287
6288		lck_mtx_lock(mbuf_mlock);
6289		/*
6290		 * No other thread could have gone into m_clalloc() after
6291		 * we dropped the lock above, so verify that it's true.
6292		 */
6293		VERIFY(mb_clalloc_busy);
6294
6295		slabstbl[ix] = slg;
6296
6297		/* Chain each slab in the group to its forward neighbor */
6298		for (k = 1; k < NSLABSPMB; k++)
6299			slg->slg_slab[k - 1].sl_next = &slg->slg_slab[k];
6300		VERIFY(slg->slg_slab[NSLABSPMB - 1].sl_next == NULL);
6301
6302		/* And chain the last slab in the previous group to this */
6303		if (ix > 0) {
6304			VERIFY(slabstbl[ix - 1]->
6305			    slg_slab[NSLABSPMB - 1].sl_next == NULL);
6306			slabstbl[ix - 1]->slg_slab[NSLABSPMB - 1].sl_next =
6307			    &slg->slg_slab[0];
6308		}
6309	}
6310
6311	ix = MTOBG(buf) % NSLABSPMB;
6312	VERIFY(ix < NSLABSPMB);
6313
6314	return (&slg->slg_slab[ix]);
6315}
6316
6317static void
6318slab_init(mcl_slab_t *sp, mbuf_class_t class, u_int32_t flags,
6319    void *base, void *head, unsigned int len, int refcnt, int chunks)
6320{
6321	sp->sl_class = class;
6322	sp->sl_flags = flags;
6323	sp->sl_base = base;
6324	sp->sl_head = head;
6325	sp->sl_len = len;
6326	sp->sl_refcnt = refcnt;
6327	sp->sl_chunks = chunks;
6328	slab_detach(sp);
6329}
6330
6331static void
6332slab_insert(mcl_slab_t *sp, mbuf_class_t class)
6333{
6334	VERIFY(slab_is_detached(sp));
6335	m_slab_cnt(class)++;
6336	TAILQ_INSERT_TAIL(&m_slablist(class), sp, sl_link);
6337	sp->sl_flags &= ~SLF_DETACHED;
6338	if (class == MC_16KCL) {
6339		int k;
6340		for (k = 1; k < NSLABSP16KB; k++) {
6341			sp = sp->sl_next;
6342			/* Next slab must already be present */
6343			VERIFY(sp != NULL);
6344			VERIFY(slab_is_detached(sp));
6345			sp->sl_flags &= ~SLF_DETACHED;
6346		}
6347	}
6348}
6349
6350static void
6351slab_remove(mcl_slab_t *sp, mbuf_class_t class)
6352{
6353	VERIFY(!slab_is_detached(sp));
6354	VERIFY(m_slab_cnt(class) > 0);
6355	m_slab_cnt(class)--;
6356	TAILQ_REMOVE(&m_slablist(class), sp, sl_link);
6357	slab_detach(sp);
6358	if (class == MC_16KCL) {
6359		int k;
6360		for (k = 1; k < NSLABSP16KB; k++) {
6361			sp = sp->sl_next;
6362			/* Next slab must already be present */
6363			VERIFY(sp != NULL);
6364			VERIFY(!slab_is_detached(sp));
6365			slab_detach(sp);
6366		}
6367	}
6368}
6369
6370static boolean_t
6371slab_inrange(mcl_slab_t *sp, void *buf)
6372{
6373	return ((uintptr_t)buf >= (uintptr_t)sp->sl_base &&
6374	    (uintptr_t)buf < ((uintptr_t)sp->sl_base + sp->sl_len));
6375}
6376
6377#undef panic
6378
6379static void
6380slab_nextptr_panic(mcl_slab_t *sp, void *addr)
6381{
6382	int i;
6383	unsigned int chunk_len = sp->sl_len / sp->sl_chunks;
6384	uintptr_t buf = (uintptr_t)sp->sl_base;
6385
6386	for (i = 0; i < sp->sl_chunks; i++, buf += chunk_len) {
6387		void *next = ((mcache_obj_t *)buf)->obj_next;
6388		if (next != addr)
6389			continue;
6390		if (!mclverify) {
6391			if (next != NULL && !MBUF_IN_MAP(next)) {
6392				mcache_t *cp = m_cache(sp->sl_class);
6393				panic("%s: %s buffer %p in slab %p modified "
6394				    "after free at offset 0: %p out of range "
6395				    "[%p-%p)\n", __func__, cp->mc_name,
6396				    (void *)buf, sp, next, mbutl, embutl);
6397				/* NOTREACHED */
6398			}
6399		} else {
6400			mcache_audit_t *mca = mcl_audit_buf2mca(sp->sl_class,
6401			    (mcache_obj_t *)buf);
6402			mcl_audit_verify_nextptr(next, mca);
6403		}
6404	}
6405}
6406
6407static void
6408slab_detach(mcl_slab_t *sp)
6409{
6410	sp->sl_link.tqe_next = (mcl_slab_t *)-1;
6411	sp->sl_link.tqe_prev = (mcl_slab_t **)-1;
6412	sp->sl_flags |= SLF_DETACHED;
6413}
6414
6415static boolean_t
6416slab_is_detached(mcl_slab_t *sp)
6417{
6418	return ((intptr_t)sp->sl_link.tqe_next == -1 &&
6419	    (intptr_t)sp->sl_link.tqe_prev == -1 &&
6420	    (sp->sl_flags & SLF_DETACHED));
6421}
6422
6423static void
6424mcl_audit_init(void *buf, mcache_audit_t **mca_list,
6425    mcache_obj_t **con_list, size_t con_size, unsigned int num)
6426{
6427	mcache_audit_t *mca, *mca_tail;
6428	mcache_obj_t *con = NULL;
6429	boolean_t save_contents = (con_list != NULL);
6430	unsigned int i, ix;
6431
6432	ASSERT(num <= NMBPBG);
6433	ASSERT(con_list == NULL || con_size != 0);
6434
6435	ix = MTOBG(buf);
6436	VERIFY(ix < maxclaudit);
6437
6438	/* Make sure we haven't been here before */
6439	for (i = 0; i < NMBPBG; i++)
6440		VERIFY(mclaudit[ix].cl_audit[i] == NULL);
6441
6442	mca = mca_tail = *mca_list;
6443	if (save_contents)
6444		con = *con_list;
6445
6446	for (i = 0; i < num; i++) {
6447		mcache_audit_t *next;
6448
6449		next = mca->mca_next;
6450		bzero(mca, sizeof (*mca));
6451		mca->mca_next = next;
6452		mclaudit[ix].cl_audit[i] = mca;
6453
6454		/* Attach the contents buffer if requested */
6455		if (save_contents) {
6456			mcl_saved_contents_t *msc =
6457			    (mcl_saved_contents_t *)(void *)con;
6458
6459			VERIFY(msc != NULL);
6460			VERIFY(IS_P2ALIGNED(msc, sizeof (u_int64_t)));
6461			VERIFY(con_size == sizeof (*msc));
6462			mca->mca_contents_size = con_size;
6463			mca->mca_contents = msc;
6464			con = con->obj_next;
6465			bzero(mca->mca_contents, mca->mca_contents_size);
6466		}
6467
6468		mca_tail = mca;
6469		mca = mca->mca_next;
6470	}
6471
6472	if (save_contents)
6473		*con_list = con;
6474
6475	*mca_list = mca_tail->mca_next;
6476	mca_tail->mca_next = NULL;
6477}
6478
6479static void
6480mcl_audit_free(void *buf, unsigned int num)
6481{
6482	unsigned int i, ix;
6483	mcache_audit_t *mca, *mca_list;
6484
6485	ix = MTOBG(buf);
6486	VERIFY(ix < maxclaudit);
6487
6488	if (mclaudit[ix].cl_audit[0] != NULL) {
6489		mca_list = mclaudit[ix].cl_audit[0];
6490		for (i = 0; i < num; i++) {
6491			mca = mclaudit[ix].cl_audit[i];
6492			mclaudit[ix].cl_audit[i] = NULL;
6493			if (mca->mca_contents)
6494				mcache_free(mcl_audit_con_cache,
6495				    mca->mca_contents);
6496		}
6497		mcache_free_ext(mcache_audit_cache,
6498		    (mcache_obj_t *)mca_list);
6499	}
6500}
6501
6502/*
6503 * Given an address of a buffer (mbuf/2KB/4KB/16KB), return
6504 * the corresponding audit structure for that buffer.
6505 */
6506static mcache_audit_t *
6507mcl_audit_buf2mca(mbuf_class_t class, mcache_obj_t *o)
6508{
6509	mcache_audit_t *mca = NULL;
6510	int ix = MTOBG(o);
6511
6512	VERIFY(ix < maxclaudit);
6513	VERIFY(IS_P2ALIGNED(o, MIN(m_maxsize(class), NBPG)));
6514
6515	switch (class) {
6516	case MC_MBUF:
6517		/*
6518		 * For the mbuf case, find the index of the page
6519		 * used by the mbuf and use that index to locate the
6520		 * base address of the page.  Then find out the
6521		 * mbuf index relative to the page base and use
6522		 * it to locate the audit structure.
6523		 */
6524		VERIFY(MCLIDX(BGTOM(ix), o) < (int)NMBPBG);
6525		mca = mclaudit[ix].cl_audit[MCLIDX(BGTOM(ix), o)];
6526		break;
6527
6528	case MC_CL:
6529		/*
6530		 * Same thing as above, but for 2KB clusters in a page.
6531		 */
6532		VERIFY(CLBGIDX(BGTOM(ix), o) < (int)NCLPBG);
6533		mca = mclaudit[ix].cl_audit[CLBGIDX(BGTOM(ix), o)];
6534		break;
6535
6536	case MC_BIGCL:
6537	case MC_16KCL:
6538		/*
6539		 * Same as above, but only return the first element.
6540		 */
6541		mca = mclaudit[ix].cl_audit[0];
6542		break;
6543
6544	default:
6545		VERIFY(0);
6546		/* NOTREACHED */
6547	}
6548
6549	return (mca);
6550}
6551
6552static void
6553mcl_audit_mbuf(mcache_audit_t *mca, void *addr, boolean_t composite,
6554    boolean_t alloc)
6555{
6556	struct mbuf *m = addr;
6557	mcache_obj_t *next = ((mcache_obj_t *)m)->obj_next;
6558
6559	VERIFY(mca->mca_contents != NULL &&
6560	    mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
6561
6562	if (mclverify)
6563		mcl_audit_verify_nextptr(next, mca);
6564
6565	if (!alloc) {
6566		/* Save constructed mbuf fields */
6567		mcl_audit_save_mbuf(m, mca);
6568		if (mclverify) {
6569			mcache_set_pattern(MCACHE_FREE_PATTERN, m,
6570			    m_maxsize(MC_MBUF));
6571		}
6572		((mcache_obj_t *)m)->obj_next = next;
6573		return;
6574	}
6575
6576	/* Check if the buffer has been corrupted while in freelist */
6577	if (mclverify) {
6578		mcache_audit_free_verify_set(mca, addr, 0, m_maxsize(MC_MBUF));
6579	}
6580	/* Restore constructed mbuf fields */
6581	mcl_audit_restore_mbuf(m, mca, composite);
6582}
6583
6584static void
6585mcl_audit_restore_mbuf(struct mbuf *m, mcache_audit_t *mca, boolean_t composite)
6586{
6587	struct mbuf *ms = MCA_SAVED_MBUF_PTR(mca);
6588
6589	if (composite) {
6590		struct mbuf *next = m->m_next;
6591		VERIFY(ms->m_flags == M_EXT && MEXT_RFA(ms) != NULL &&
6592		    MBUF_IS_COMPOSITE(ms));
6593		VERIFY(mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
6594		/*
		 * We could have hand-picked the mbuf fields and restored
		 * them individually, but that would be a maintenance
6597		 * headache.  Instead, restore everything that was saved;
6598		 * the mbuf layer will recheck and reinitialize anyway.
6599		 */
6600		bcopy(ms, m, MCA_SAVED_MBUF_SIZE);
6601		m->m_next = next;
6602	} else {
6603		/*
6604		 * For a regular mbuf (no cluster attached) there's nothing
6605		 * to restore other than the type field, which is expected
6606		 * to be MT_FREE.
6607		 */
6608		m->m_type = ms->m_type;
6609	}
6610	_MCHECK(m);
6611}
6612
6613static void
6614mcl_audit_save_mbuf(struct mbuf *m, mcache_audit_t *mca)
6615{
6616	VERIFY(mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
6617	_MCHECK(m);
6618	bcopy(m, MCA_SAVED_MBUF_PTR(mca), MCA_SAVED_MBUF_SIZE);
6619}
6620
6621static void
6622mcl_audit_cluster(mcache_audit_t *mca, void *addr, size_t size, boolean_t alloc,
6623    boolean_t save_next)
6624{
6625	mcache_obj_t *next = ((mcache_obj_t *)addr)->obj_next;
6626
6627	if (!alloc) {
6628		if (mclverify) {
6629			mcache_set_pattern(MCACHE_FREE_PATTERN, addr, size);
6630		}
6631		if (save_next) {
6632			mcl_audit_verify_nextptr(next, mca);
6633			((mcache_obj_t *)addr)->obj_next = next;
6634		}
6635	} else if (mclverify) {
6636		/* Check if the buffer has been corrupted while in freelist */
6637		mcl_audit_verify_nextptr(next, mca);
6638		mcache_audit_free_verify_set(mca, addr, 0, size);
6639	}
6640}
6641
6642static void
6643mcl_audit_scratch(mcache_audit_t *mca)
6644{
6645	void *stack[MCACHE_STACK_DEPTH + 1];
6646	mcl_scratch_audit_t *msa;
6647	struct timeval now;
6648
6649	VERIFY(mca->mca_contents != NULL);
6650	msa = MCA_SAVED_SCRATCH_PTR(mca);
6651
6652	msa->msa_pthread = msa->msa_thread;
6653	msa->msa_thread = current_thread();
6654	bcopy(msa->msa_stack, msa->msa_pstack, sizeof (msa->msa_pstack));
6655	msa->msa_pdepth = msa->msa_depth;
6656	bzero(stack, sizeof (stack));
6657	msa->msa_depth = OSBacktrace(stack, MCACHE_STACK_DEPTH + 1) - 1;
6658	bcopy(&stack[1], msa->msa_stack, sizeof (msa->msa_stack));
6659
6660	msa->msa_ptstamp = msa->msa_tstamp;
6661	microuptime(&now);
6662	/* tstamp is in ms relative to base_ts */
6663	msa->msa_tstamp = ((now.tv_usec - mb_start.tv_usec) / 1000);
6664	if ((now.tv_sec - mb_start.tv_sec) > 0)
6665		msa->msa_tstamp += ((now.tv_sec - mb_start.tv_sec) * 1000);
6666}
6667
6668static void
6669mcl_audit_mcheck_panic(struct mbuf *m)
6670{
6671	mcache_audit_t *mca;
6672
6673	MRANGE(m);
6674	mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
6675
6676	panic("mcl_audit: freed mbuf %p with type 0x%x (instead of 0x%x)\n%s\n",
6677	    m, (u_int16_t)m->m_type, MT_FREE, mcache_dump_mca(mca));
6678	/* NOTREACHED */
6679}
6680
6681static void
6682mcl_audit_verify_nextptr(void *next, mcache_audit_t *mca)
6683{
6684	if (next != NULL && !MBUF_IN_MAP(next) &&
6685	    (next != (void *)MCACHE_FREE_PATTERN || !mclverify)) {
6686		panic("mcl_audit: buffer %p modified after free at offset 0: "
6687		    "%p out of range [%p-%p)\n%s\n",
6688		    mca->mca_addr, next, mbutl, embutl, mcache_dump_mca(mca));
6689		/* NOTREACHED */
6690	}
6691}
6692
6693/* This function turns on mbuf leak detection */
6694static void
6695mleak_activate(void)
6696{
6697	mleak_table.mleak_sample_factor = MLEAK_SAMPLE_FACTOR;
6698	PE_parse_boot_argn("mleak_sample_factor",
6699	    &mleak_table.mleak_sample_factor,
6700	    sizeof (mleak_table.mleak_sample_factor));
6701
6702	if (mleak_table.mleak_sample_factor == 0)
6703		mclfindleak = 0;
6704
6705	if (mclfindleak == 0)
6706		return;
6707
6708	vm_size_t alloc_size =
6709	    mleak_alloc_buckets * sizeof (struct mallocation);
6710	vm_size_t trace_size = mleak_trace_buckets * sizeof (struct mtrace);
6711
6712	MALLOC(mleak_allocations, struct mallocation *, alloc_size,
6713	    M_TEMP, M_WAITOK | M_ZERO);
6714	VERIFY(mleak_allocations != NULL);
6715
6716	MALLOC(mleak_traces, struct mtrace *, trace_size,
6717	    M_TEMP, M_WAITOK | M_ZERO);
6718	VERIFY(mleak_traces != NULL);
6719
6720	MALLOC(mleak_stat, mleak_stat_t *, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES),
6721	    M_TEMP, M_WAITOK | M_ZERO);
6722	VERIFY(mleak_stat != NULL);
6723	mleak_stat->ml_cnt = MLEAK_NUM_TRACES;
6724#ifdef __LP64__
6725	mleak_stat->ml_isaddr64 = 1;
6726#endif /* __LP64__ */
6727}
6728
6729static void
6730mleak_logger(u_int32_t num, mcache_obj_t *addr, boolean_t alloc)
6731{
6732	int temp;
6733
6734	if (mclfindleak == 0)
6735		return;
6736
6737	if (!alloc)
6738		return (mleak_free(addr));
6739
6740	temp = atomic_add_32_ov(&mleak_table.mleak_capture, 1);
6741
6742	if ((temp % mleak_table.mleak_sample_factor) == 0 && addr != NULL) {
6743		uintptr_t bt[MLEAK_STACK_DEPTH];
6744		int logged = fastbacktrace(bt, MLEAK_STACK_DEPTH);
6745		mleak_log(bt, addr, logged, num);
6746	}
6747}
6748
6749/*
6750 * This function records the allocation in the mleak_allocations table
 * and the backtrace in the mleak_traces table.  If the allocation slot
 * is already in use, the old allocation is replaced with the new one,
 * unless it refers to the same trace, in which case we simply return.
 * If the trace slot is occupied by a different trace (a hash collision),
 * we also bail out; if it holds the same trace, its refcount is bumped.
6754 */
6755static boolean_t
6756mleak_log(uintptr_t *bt, mcache_obj_t *addr, uint32_t depth, int num)
6757{
6758	struct mallocation *allocation;
6759	struct mtrace *trace;
6760	uint32_t trace_index;
6761
6762	/* Quit if someone else modifying the tables */
6763	if (!lck_mtx_try_lock_spin(mleak_lock)) {
6764		mleak_table.total_conflicts++;
6765		return (FALSE);
6766	}
6767
6768	allocation = &mleak_allocations[hashaddr((uintptr_t)addr,
6769	    mleak_alloc_buckets)];
6770	trace_index = hashbacktrace(bt, depth, mleak_trace_buckets);
6771	trace = &mleak_traces[trace_index];
6772
6773	VERIFY(allocation <= &mleak_allocations[mleak_alloc_buckets - 1]);
6774	VERIFY(trace <= &mleak_traces[mleak_trace_buckets - 1]);
6775
6776	allocation->hitcount++;
6777	trace->hitcount++;
6778
6779	/*
6780	 * If the allocation bucket we want is occupied
6781	 * and the occupier has the same trace, just bail.
6782	 */
6783	if (allocation->element != NULL &&
6784	    trace_index == allocation->trace_index) {
6785		mleak_table.alloc_collisions++;
6786		lck_mtx_unlock(mleak_lock);
6787		return (TRUE);
6788	}
6789
6790	/*
6791	 * Store the backtrace in the traces array;
6792	 * Size of zero = trace bucket is free.
6793	 */
6794	if (trace->allocs > 0 &&
6795	    bcmp(trace->addr, bt, (depth * sizeof (uintptr_t))) != 0) {
6796		/* Different, unique trace, but the same hash! Bail out. */
6797		trace->collisions++;
6798		mleak_table.trace_collisions++;
6799		lck_mtx_unlock(mleak_lock);
6800		return (TRUE);
6801	} else if (trace->allocs > 0) {
6802		/* Same trace, already added, so increment refcount */
6803		trace->allocs++;
6804	} else {
6805		/* Found an unused trace bucket, so record the trace here */
6806		if (trace->depth != 0) {
6807			/* this slot previously used but not currently in use */
6808			mleak_table.trace_overwrites++;
6809		}
6810		mleak_table.trace_recorded++;
6811		trace->allocs = 1;
6812		memcpy(trace->addr, bt, (depth * sizeof (uintptr_t)));
6813		trace->depth = depth;
6814		trace->collisions = 0;
6815	}
6816
6817	/* Step 2: Store the allocation record in the allocations array */
6818	if (allocation->element != NULL) {
6819		/*
6820		 * Replace an existing allocation.  No need to preserve
6821		 * because only a subset of the allocations are being
6822		 * recorded anyway.
6823		 */
6824		mleak_table.alloc_collisions++;
6825	} else if (allocation->trace_index != 0) {
6826		mleak_table.alloc_overwrites++;
6827	}
6828	allocation->element = addr;
6829	allocation->trace_index = trace_index;
6830	allocation->count = num;
6831	mleak_table.alloc_recorded++;
6832	mleak_table.outstanding_allocs++;
6833
6834	lck_mtx_unlock(mleak_lock);
6835	return (TRUE);
6836}
6837
6838static void
6839mleak_free(mcache_obj_t *addr)
6840{
6841	while (addr != NULL) {
6842		struct mallocation *allocation = &mleak_allocations
6843		    [hashaddr((uintptr_t)addr, mleak_alloc_buckets)];
6844
6845		if (allocation->element == addr &&
6846		    allocation->trace_index < mleak_trace_buckets) {
6847			lck_mtx_lock_spin(mleak_lock);
6848			if (allocation->element == addr &&
6849			    allocation->trace_index < mleak_trace_buckets) {
6850				struct mtrace *trace;
6851				trace = &mleak_traces[allocation->trace_index];
6852				/* allocs = 0 means trace bucket is unused */
6853				if (trace->allocs > 0)
6854					trace->allocs--;
6855				if (trace->allocs == 0)
6856					trace->depth = 0;
6857				/* NULL element means alloc bucket is unused */
6858				allocation->element = NULL;
6859				mleak_table.outstanding_allocs--;
6860			}
6861			lck_mtx_unlock(mleak_lock);
6862		}
6863		addr = addr->obj_next;
6864	}
6865}
6866
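/*
 * Select the MLEAK_NUM_TRACES traces with the most outstanding
 * allocations from mleak_traces[] into mleak_top_trace[], keeping
 * that array sorted in descending order of allocation count.
 */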
6867static void
6868mleak_sort_traces()
6869{
6870	int i, j, k;
6871	struct mtrace *swap;
6872
	for (i = 0; i < MLEAK_NUM_TRACES; i++)
6874		mleak_top_trace[i] = NULL;
6875
	for (i = 0, j = 0; j < MLEAK_NUM_TRACES &&
	    i < mleak_trace_buckets; i++) {
6878		if (mleak_traces[i].allocs <= 0)
6879			continue;
6880
6881		mleak_top_trace[j] = &mleak_traces[i];
6882		for (k = j; k > 0; k--) {
6883			if (mleak_top_trace[k]->allocs <=
6884			    mleak_top_trace[k-1]->allocs)
6885				break;
6886
6887			swap = mleak_top_trace[k-1];
6888			mleak_top_trace[k-1] = mleak_top_trace[k];
6889			mleak_top_trace[k] = swap;
6890		}
6891		j++;
6892	}
6893
6894	j--;
	for (; i < mleak_trace_buckets; i++) {
6896		if (mleak_traces[i].allocs <= mleak_top_trace[j]->allocs)
6897			continue;
6898
6899		mleak_top_trace[j] = &mleak_traces[i];
6900
6901		for (k = j; k > 0; k--) {
6902			if (mleak_top_trace[k]->allocs <=
6903			    mleak_top_trace[k-1]->allocs)
6904				break;
6905
6906			swap = mleak_top_trace[k-1];
6907			mleak_top_trace[k-1] = mleak_top_trace[k];
6908			mleak_top_trace[k] = swap;
6909		}
6910	}
6911}
6912
6913static void
6914mleak_update_stats()
6915{
6916	mleak_trace_stat_t *mltr;
6917	int i;
6918
6919	VERIFY(mleak_stat != NULL);
6920#ifdef __LP64__
6921	VERIFY(mleak_stat->ml_isaddr64);
6922#else
6923	VERIFY(!mleak_stat->ml_isaddr64);
6924#endif /* !__LP64__ */
6925	VERIFY(mleak_stat->ml_cnt == MLEAK_NUM_TRACES);
6926
6927	mleak_sort_traces();
6928
6929	mltr = &mleak_stat->ml_trace[0];
6930	bzero(mltr, sizeof (*mltr) * MLEAK_NUM_TRACES);
6931	for (i = 0; i < MLEAK_NUM_TRACES; i++) {
		int j;
6933
6934		if (mleak_top_trace[i] == NULL ||
6935		    mleak_top_trace[i]->allocs == 0)
6936			continue;
6937
6938		mltr->mltr_collisions	= mleak_top_trace[i]->collisions;
6939		mltr->mltr_hitcount	= mleak_top_trace[i]->hitcount;
6940		mltr->mltr_allocs	= mleak_top_trace[i]->allocs;
6941		mltr->mltr_depth	= mleak_top_trace[i]->depth;
6942
6943		VERIFY(mltr->mltr_depth <= MLEAK_STACK_DEPTH);
6944		for (j = 0; j < mltr->mltr_depth; j++)
6945			mltr->mltr_addr[j] = mleak_top_trace[i]->addr[j];
6946
6947		mltr++;
6948	}
6949}
6950
6951static struct mbtypes {
6952	int		mt_type;
6953	const char	*mt_name;
6954} mbtypes[] = {
6955	{ MT_DATA,	"data" },
6956	{ MT_OOBDATA,	"oob data" },
6957	{ MT_CONTROL,	"ancillary data" },
6958	{ MT_HEADER,	"packet headers" },
6959	{ MT_SOCKET,	"socket structures" },
6960	{ MT_PCB,	"protocol control blocks" },
6961	{ MT_RTABLE,	"routing table entries" },
6962	{ MT_HTABLE,	"IMP host table entries" },
6963	{ MT_ATABLE,	"address resolution tables" },
6964	{ MT_FTABLE,	"fragment reassembly queue headers" },
6965	{ MT_SONAME,	"socket names and addresses" },
6966	{ MT_SOOPTS,	"socket options" },
6967	{ MT_RIGHTS,	"access rights" },
6968	{ MT_IFADDR,	"interface addresses" },
6969	{ MT_TAG,	"packet tags" },
6970	{ 0,		NULL }
6971};
6972
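/*
 * After each snprintf() into mbuf_dump_buf, account for the bytes just
 * written and bail out of mbuf_dump() once the buffer is exhausted.
 */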
6973#define	MBUF_DUMP_BUF_CHK() {	\
6974	clen -= k;		\
6975	if (clen < 1)		\
6976		goto done;	\
6977	c += k;			\
6978}
6979
6980static char *
6981mbuf_dump(void)
6982{
6983	unsigned long totmem = 0, totfree = 0, totmbufs, totused, totpct;
6984	u_int32_t m_mbufs = 0, m_clfree = 0, m_bigclfree = 0;
6985	u_int32_t m_mbufclfree = 0, m_mbufbigclfree = 0;
6986	u_int32_t m_16kclusters = 0, m_16kclfree = 0, m_mbuf16kclfree = 0;
6987	int nmbtypes = sizeof (mbstat.m_mtypes) / sizeof (short);
6988	uint8_t seen[256];
6989	struct mbtypes *mp;
6990	mb_class_stat_t *sp;
6991	mleak_trace_stat_t *mltr;
6992	char *c = mbuf_dump_buf;
6993	int i, k, clen = MBUF_DUMP_BUF_SIZE;
6994
6995	mbuf_dump_buf[0] = '\0';
6996
6997	/* synchronize all statistics in the mbuf table */
6998	mbuf_stat_sync();
6999	mbuf_mtypes_sync(TRUE);
7000
7001	sp = &mb_stat->mbs_class[0];
7002	for (i = 0; i < mb_stat->mbs_cnt; i++, sp++) {
7003		u_int32_t mem;
7004
7005		if (m_class(i) == MC_MBUF) {
7006			m_mbufs = sp->mbcl_active;
7007		} else if (m_class(i) == MC_CL) {
7008			m_clfree = sp->mbcl_total - sp->mbcl_active;
7009		} else if (m_class(i) == MC_BIGCL) {
7010			m_bigclfree = sp->mbcl_total - sp->mbcl_active;
7011		} else if (njcl > 0 && m_class(i) == MC_16KCL) {
7012			m_16kclfree = sp->mbcl_total - sp->mbcl_active;
7013			m_16kclusters = sp->mbcl_total;
7014		} else if (m_class(i) == MC_MBUF_CL) {
7015			m_mbufclfree = sp->mbcl_total - sp->mbcl_active;
7016		} else if (m_class(i) == MC_MBUF_BIGCL) {
7017			m_mbufbigclfree = sp->mbcl_total - sp->mbcl_active;
7018		} else if (njcl > 0 && m_class(i) == MC_MBUF_16KCL) {
7019			m_mbuf16kclfree = sp->mbcl_total - sp->mbcl_active;
7020		}
7021
7022		mem = sp->mbcl_ctotal * sp->mbcl_size;
7023		totmem += mem;
7024		totfree += (sp->mbcl_mc_cached + sp->mbcl_infree) *
7025		    sp->mbcl_size;
7026
7027	}
7028
7029	/* adjust free counts to include composite caches */
7030	m_clfree += m_mbufclfree;
7031	m_bigclfree += m_mbufbigclfree;
7032	m_16kclfree += m_mbuf16kclfree;
7033
7034	totmbufs = 0;
7035	for (mp = mbtypes; mp->mt_name != NULL; mp++)
7036		totmbufs += mbstat.m_mtypes[mp->mt_type];
7037	if (totmbufs > m_mbufs)
7038		totmbufs = m_mbufs;
7039	k = snprintf(c, clen, "%lu/%u mbufs in use:\n", totmbufs, m_mbufs);
7040	MBUF_DUMP_BUF_CHK();
7041
7042	bzero(&seen, sizeof (seen));
7043	for (mp = mbtypes; mp->mt_name != NULL; mp++) {
7044		if (mbstat.m_mtypes[mp->mt_type] != 0) {
7045			seen[mp->mt_type] = 1;
7046			k = snprintf(c, clen, "\t%u mbufs allocated to %s\n",
7047			    mbstat.m_mtypes[mp->mt_type], mp->mt_name);
7048			MBUF_DUMP_BUF_CHK();
7049		}
7050	}
7051	seen[MT_FREE] = 1;
7052	for (i = 0; i < nmbtypes; i++)
7053		if (!seen[i] && mbstat.m_mtypes[i] != 0) {
7054			k = snprintf(c, clen, "\t%u mbufs allocated to "
7055			    "<mbuf type %d>\n", mbstat.m_mtypes[i], i);
7056			MBUF_DUMP_BUF_CHK();
7057		}
7058	if ((m_mbufs - totmbufs) > 0) {
7059		k = snprintf(c, clen, "\t%lu mbufs allocated to caches\n",
7060		    m_mbufs - totmbufs);
7061		MBUF_DUMP_BUF_CHK();
7062	}
7063	k = snprintf(c, clen, "%u/%u mbuf 2KB clusters in use\n"
7064	    "%u/%u mbuf 4KB clusters in use\n",
7065	    (unsigned int)(mbstat.m_clusters - m_clfree),
7066	    (unsigned int)mbstat.m_clusters,
7067	    (unsigned int)(mbstat.m_bigclusters - m_bigclfree),
7068	    (unsigned int)mbstat.m_bigclusters);
7069	MBUF_DUMP_BUF_CHK();
7070
7071	if (njcl > 0) {
7072		k = snprintf(c, clen, "%u/%u mbuf %uKB clusters in use\n",
7073		    m_16kclusters - m_16kclfree, m_16kclusters,
7074		    njclbytes / 1024);
7075		MBUF_DUMP_BUF_CHK();
7076	}
7077	totused = totmem - totfree;
7078	if (totmem == 0) {
7079		totpct = 0;
7080	} else if (totused < (ULONG_MAX / 100)) {
7081		totpct = (totused * 100) / totmem;
7082	} else {
7083		u_long totmem1 = totmem / 100;
7084		u_long totused1 = totused / 100;
7085		totpct = (totused1 * 100) / totmem1;
7086	}
7087	k = snprintf(c, clen, "%lu KB allocated to network (approx. %lu%% "
7088	    "in use)\n", totmem / 1024, totpct);
7089	MBUF_DUMP_BUF_CHK();
7090
7091	/* mbuf leak detection statistics */
7092	mleak_update_stats();
7093
7094	k = snprintf(c, clen, "\nmbuf leak detection table:\n");
7095	MBUF_DUMP_BUF_CHK();
7096	k = snprintf(c, clen, "\ttotal captured: %u (one per %u)\n",
7097	    mleak_table.mleak_capture / mleak_table.mleak_sample_factor,
7098	    mleak_table.mleak_sample_factor);
7099	MBUF_DUMP_BUF_CHK();
7100	k = snprintf(c, clen, "\ttotal allocs outstanding: %llu\n",
7101	    mleak_table.outstanding_allocs);
7102	MBUF_DUMP_BUF_CHK();
7103	k = snprintf(c, clen, "\tnew hash recorded: %llu allocs, %llu traces\n",
7104	    mleak_table.alloc_recorded, mleak_table.trace_recorded);
7105	MBUF_DUMP_BUF_CHK();
7106	k = snprintf(c, clen, "\thash collisions: %llu allocs, %llu traces\n",
7107	    mleak_table.alloc_collisions, mleak_table.trace_collisions);
7108	MBUF_DUMP_BUF_CHK();
7109	k = snprintf(c, clen, "\toverwrites: %llu allocs, %llu traces\n",
7110	    mleak_table.alloc_overwrites, mleak_table.trace_overwrites);
7111	MBUF_DUMP_BUF_CHK();
7112	k = snprintf(c, clen, "\tlock conflicts: %llu\n\n",
7113	    mleak_table.total_conflicts);
7114	MBUF_DUMP_BUF_CHK();
7115
7116	k = snprintf(c, clen, "top %d outstanding traces:\n",
7117	    mleak_stat->ml_cnt);
7118	MBUF_DUMP_BUF_CHK();
7119	for (i = 0; i < mleak_stat->ml_cnt; i++) {
7120		mltr = &mleak_stat->ml_trace[i];
7121		k = snprintf(c, clen, "[%d] %llu outstanding alloc(s), "
7122		    "%llu hit(s), %llu collision(s)\n", (i + 1),
7123		    mltr->mltr_allocs, mltr->mltr_hitcount,
7124		    mltr->mltr_collisions);
7125		MBUF_DUMP_BUF_CHK();
7126	}
7127
7128	if (mleak_stat->ml_isaddr64)
7129		k = snprintf(c, clen, MB_LEAK_HDR_64);
7130	else
7131		k = snprintf(c, clen, MB_LEAK_HDR_32);
7132	MBUF_DUMP_BUF_CHK();
7133
7134	for (i = 0; i < MLEAK_STACK_DEPTH; i++) {
7135		int j;
7136		k = snprintf(c, clen, "%2d: ", (i + 1));
7137		MBUF_DUMP_BUF_CHK();
7138		for (j = 0; j < mleak_stat->ml_cnt; j++) {
7139			mltr = &mleak_stat->ml_trace[j];
7140			if (i < mltr->mltr_depth) {
7141				if (mleak_stat->ml_isaddr64) {
7142					k = snprintf(c, clen, "0x%0llx  ",
7143					    (uint64_t)VM_KERNEL_UNSLIDE(
7144						mltr->mltr_addr[i]));
7145				} else {
7146					k = snprintf(c, clen,
7147					    "0x%08x  ",
7148					    (uint32_t)VM_KERNEL_UNSLIDE(
7149						mltr->mltr_addr[i]));
7150				}
7151			} else {
7152				if (mleak_stat->ml_isaddr64)
7153					k = snprintf(c, clen,
7154					    MB_LEAK_SPACING_64);
7155				else
7156					k = snprintf(c, clen,
7157					    MB_LEAK_SPACING_32);
7158			}
7159			MBUF_DUMP_BUF_CHK();
7160		}
7161		k = snprintf(c, clen, "\n");
7162		MBUF_DUMP_BUF_CHK();
7163	}
7164done:
7165	return (mbuf_dump_buf);
7166}
7167
7168#undef MBUF_DUMP_BUF_CHK
7169
7170/*
7171 * Convert between a regular and a packet header mbuf.  Caller is responsible
7172 * for setting or clearing M_PKTHDR; this routine does the rest of the work.
7173 */
7174int
7175m_reinit(struct mbuf *m, int hdr)
7176{
7177	int ret = 0;
7178
7179	if (hdr) {
7180		VERIFY(!(m->m_flags & M_PKTHDR));
7181		if (!(m->m_flags & M_EXT) &&
7182		    (m->m_data != m->m_dat || m->m_len > 0)) {
7183			/*
7184			 * If there's no external cluster attached and the
7185			 * mbuf appears to contain user data, we cannot
7186			 * safely convert this to a packet header mbuf,
7187			 * as the packet header structure might overlap
7188			 * with the data.
7189			 */
7190			printf("%s: cannot set M_PKTHDR on altered mbuf %llx, "
7191			    "m_data %llx (expected %llx), "
7192			    "m_len %d (expected 0)\n",
7193			    __func__,
7194			    (uint64_t)VM_KERNEL_ADDRPERM(m),
7195			    (uint64_t)VM_KERNEL_ADDRPERM(m->m_data),
7196			    (uint64_t)VM_KERNEL_ADDRPERM(m->m_dat), m->m_len);
7197			ret = EBUSY;
7198		} else {
7199			VERIFY((m->m_flags & M_EXT) || m->m_data == m->m_dat);
7200			m->m_flags |= M_PKTHDR;
7201			MBUF_INIT_PKTHDR(m);
7202		}
7203	} else {
7204		/* Check for scratch area overflow */
7205		m_redzone_verify(m);
7206		/* Free the aux data and tags if there is any */
7207		m_tag_delete_chain(m, NULL);
7208		m->m_flags &= ~M_PKTHDR;
7209	}
7210
7211	return (ret);
7212}
7213
7214void
7215m_scratch_init(struct mbuf *m)
7216{
7217	struct pkthdr *pkt = &m->m_pkthdr;
7218
7219	VERIFY(m->m_flags & M_PKTHDR);
7220
7221	/* See comments in <rdar://problem/14040693> */
7222	if (pkt->pkt_flags & PKTF_PRIV_GUARDED) {
7223		panic_plain("Invalid attempt to modify guarded module-private "
7224		    "area: mbuf %p, pkt_flags 0x%x\n", m, pkt->pkt_flags);
7225		/* NOTREACHED */
7226	}
7227
7228	bzero(&pkt->pkt_mpriv, sizeof (pkt->pkt_mpriv));
7229}
7230
7231/*
7232 * This routine is reserved for mbuf_get_driver_scratch(); clients inside
 * xnu that intend to utilize the module-private area should directly
7234 * refer to the pkt_mpriv structure in the pkthdr.  They are also expected
7235 * to set and clear PKTF_PRIV_GUARDED, while owning the packet and prior
7236 * to handing it off to another module, respectively.
7237 */
7238u_int32_t
7239m_scratch_get(struct mbuf *m, u_int8_t **p)
7240{
7241	struct pkthdr *pkt = &m->m_pkthdr;
7242
7243	VERIFY(m->m_flags & M_PKTHDR);
7244
7245	/* See comments in <rdar://problem/14040693> */
7246	if (pkt->pkt_flags & PKTF_PRIV_GUARDED) {
7247		panic_plain("Invalid attempt to access guarded module-private "
7248		    "area: mbuf %p, pkt_flags 0x%x\n", m, pkt->pkt_flags);
7249		/* NOTREACHED */
7250	}
7251
7252	if (mcltrace) {
7253		mcache_audit_t *mca;
7254
7255		lck_mtx_lock(mbuf_mlock);
7256		mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
7257		if (mca->mca_uflags & MB_SCVALID)
7258			mcl_audit_scratch(mca);
7259		lck_mtx_unlock(mbuf_mlock);
7260	}
7261
7262	*p = (u_int8_t *)&pkt->pkt_mpriv;
7263	return (sizeof (pkt->pkt_mpriv));
7264}
7265
7266static void
7267m_redzone_init(struct mbuf *m)
7268{
7269	VERIFY(m->m_flags & M_PKTHDR);
7270	/*
	 * Each mbuf has a unique red zone pattern, which is an XOR
7272	 * of the red zone cookie and the address of the mbuf.
7273	 */
7274	m->m_pkthdr.redzone = ((u_int32_t)(uintptr_t)m) ^ mb_redzone_cookie;
7275}
7276
7277static void
7278m_redzone_verify(struct mbuf *m)
7279{
7280	u_int32_t mb_redzone;
7281
7282	VERIFY(m->m_flags & M_PKTHDR);
7283
7284	mb_redzone = ((u_int32_t)(uintptr_t)m) ^ mb_redzone_cookie;
7285	if (m->m_pkthdr.redzone != mb_redzone) {
7286		panic("mbuf %p redzone violation with value 0x%x "
7287		    "(instead of 0x%x, using cookie 0x%x)\n",
7288		    m, m->m_pkthdr.redzone, mb_redzone, mb_redzone_cookie);
7289		/* NOTREACHED */
7290	}
7291}
7292
7293/*
 * Send a report of mbuf usage if the usage is at least 6% of the max limit
 * or if there has been at least a 3% increase since the last report.
 *
 * The values 6% and 3% are approximations (1/16 and 1/32, respectively)
 * chosen so that we can do simple arithmetic with shift operations.
7299 */
7300static boolean_t
7301mbuf_report_usage(mbuf_class_t cl)
7302{
7303	/* if a report is already in progress, nothing to do */
7304	if (mb_peak_newreport)
7305		return (TRUE);
7306
7307	if (m_total(cl) > m_peak(cl) &&
7308	    m_total(cl) >= (m_maxlimit(cl) >> 4) &&
7309	    (m_total(cl) - m_peak(cl)) >= (m_peak(cl) >> 5))
7310		return (TRUE);
7311	return (FALSE);
7312}
7313
7314__private_extern__ void
7315mbuf_report_peak_usage(void)
7316{
7317	int i = 0;
7318	u_int64_t uptime;
7319	struct nstat_sysinfo_data ns_data;
7320	uint32_t memreleased = 0;
7321
7322	uptime = net_uptime();
7323	lck_mtx_lock(mbuf_mlock);
7324
7325	/* Generate an initial report after 1 week of uptime */
7326	if (!mb_peak_firstreport &&
7327	    uptime > MBUF_PEAK_FIRST_REPORT_THRESHOLD) {
7328		mb_peak_newreport = TRUE;
7329		mb_peak_firstreport = TRUE;
7330	}
7331
7332	if (!mb_peak_newreport) {
7333		lck_mtx_unlock(mbuf_mlock);
7334		return;
7335	}
7336
7337	/*
	 * Since a report is being generated before 1 week of uptime
	 * has elapsed, we do not need to force another one later.
7340	 */
7341	if (uptime < MBUF_PEAK_FIRST_REPORT_THRESHOLD)
7342		mb_peak_firstreport = TRUE;
7343
7344	for (i = 0; i < NELEM(mbuf_table); i++) {
7345		m_peak(m_class(i)) = m_total(m_class(i));
7346		memreleased += m_release_cnt(i);
7347	}
7348	mb_peak_newreport = FALSE;
7349	lck_mtx_unlock(mbuf_mlock);
7350
7351	bzero(&ns_data, sizeof(ns_data));
7352	ns_data.flags = NSTAT_SYSINFO_MBUF_STATS;
7353	ns_data.u.mb_stats.total_256b = m_peak(MC_MBUF);
7354	ns_data.u.mb_stats.total_2kb = m_peak(MC_CL);
7355	ns_data.u.mb_stats.total_4kb = m_peak(MC_BIGCL);
7356	ns_data.u.mb_stats.sbmb_total = total_sbmb_cnt_peak;
7357	ns_data.u.mb_stats.sb_atmbuflimit = sbmb_limreached;
7358	ns_data.u.mb_stats.draincnt = mbstat.m_drain;
7359	ns_data.u.mb_stats.memreleased = memreleased;
7360
7361	nstat_sysinfo_send_data(&ns_data);
7362}
7363
7364/*
7365 * Called by the VM when there's memory pressure.
7366 */
7367__private_extern__ void
7368m_drain(void)
7369{
7370	mbuf_class_t mc;
7371	mcl_slab_t *sp, *sp_tmp, *nsp;
7372	unsigned int num, k, interval, released = 0;
7373	unsigned int total_mem = 0, use_mem = 0;
7374	boolean_t ret, purge_caches = FALSE;
7375	ppnum_t offset;
7376	mcache_obj_t *obj;
7377	float per;
7378	static uint64_t last_drain = 0;
7379	static unsigned char scratch[32];
7380	static ppnum_t scratch_pa = 0;
7381
7382	if (mb_drain_maxint == 0 || mb_waiters)
7383		return;
7384	if (scratch_pa == 0) {
7385		bzero(scratch, sizeof(scratch));
7386		scratch_pa = pmap_find_phys(kernel_pmap, (addr64_t)scratch);
7387		VERIFY(scratch_pa);
7388	} else if (mclverify) {
7389		/*
7390		 * Panic if a driver wrote to our scratch memory.
7391		 */
7392		for (k = 0; k < sizeof(scratch); k++)
7393			if (scratch[k])
7394				panic("suspect DMA to freed address");
7395	}
7396	/*
7397	 * Don't free memory too often as that could cause excessive
	 * waiting times for mbufs.  Also purge the caches if this drain
	 * request comes within five drain intervals of the previous one.
7400	 */
7401	lck_mtx_lock(mbuf_mlock);
7402	if (last_drain == 0) {
7403		last_drain = net_uptime();
7404		lck_mtx_unlock(mbuf_mlock);
7405		return;
7406	}
7407	interval = net_uptime() - last_drain;
7408	if (interval <= mb_drain_maxint) {
7409		lck_mtx_unlock(mbuf_mlock);
7410		return;
7411	}
7412	if (interval <= mb_drain_maxint * 5)
7413		purge_caches = TRUE;
7414	last_drain = net_uptime();
	/*
	 * Don't free any memory if 60% or more of the buffer memory is
	 * currently in active use.
	 */
7418	for (mc = 0; mc < NELEM(mbuf_table); mc++) {
7419		total_mem += m_total(mc) * m_maxsize(mc);
7420		use_mem += m_active(mc) * m_maxsize(mc);
7421	}
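	/*
	 * per is the fraction of allocated buffer memory currently in
	 * use; e.g. 48MB active out of 64MB total gives 0.75, which is
	 * above the 0.6 cutoff below, so nothing is freed.
	 */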
7422	per = (float)use_mem / (float)total_mem;
7423	if (per >= 0.6) {
7424		lck_mtx_unlock(mbuf_mlock);
7425		return;
7426	}
	/*
	 * Purge all the caches.  This effectively disables caching for
	 * a few seconds, but the mbuf worker thread will re-enable them
	 * later.
	 */
7432	if (purge_caches == TRUE)
7433		for (mc = 0; mc < NELEM(mbuf_table); mc++) {
7434			if (m_total(mc) < m_avgtotal(mc))
7435				continue;
7436			lck_mtx_unlock(mbuf_mlock);
7437			ret = mcache_purge_cache(m_cache(mc), FALSE);
7438			lck_mtx_lock(mbuf_mlock);
7439			if (ret == TRUE)
7440				m_purge_cnt(mc)++;
7441		}
	/*
	 * Move objects from the composite-class freelists back to the
	 * rudimentary slab lists, but only while a class's total is
	 * below its running average, and keep at least 10% of the
	 * average total (plus the class minimum) on the freelist.
	 */
7447	for (mc = 0; mc < NELEM(mbuf_table); mc++) {
7448		while (m_cobjlist(mc) &&
7449		    m_total(mc) < m_avgtotal(mc) &&
7450		    m_infree(mc) > 0.1 * m_avgtotal(mc) + m_minlimit(mc)) {
7451			obj = m_cobjlist(mc);
7452			m_cobjlist(mc) = obj->obj_next;
7453			obj->obj_next = NULL;
7454			num = cslab_free(mc, obj, 1);
7455			VERIFY(num == 1);
7456			m_free_cnt(mc)++;
7457			m_infree(mc)--;
7458			/* cslab_free() handles m_total */
7459		}
7460	}
	/*
	 * Free unused slabs back to the VM, walking each class's slab
	 * list backwards in an attempt to reduce fragmentation.  Stop
	 * for a class once its total drops below the running average or
	 * its freelist falls to 10% of that average plus the class
	 * minimum.
	 */
7467	for (mc = NELEM(mbuf_table) - 1; (int)mc >= 0; mc--) {
7468		TAILQ_FOREACH_SAFE(sp, &m_slablist(mc), sl_link, sp_tmp) {
7469			/*
7470			 * Process only unused slabs occupying memory.
7471			 */
7472			if (sp->sl_refcnt != 0 || sp->sl_len == 0 ||
7473			    sp->sl_base == NULL)
7474				continue;
7475			if (m_total(mc) < m_avgtotal(mc) ||
7476			    m_infree(mc) < 0.1 * m_avgtotal(mc) + m_minlimit(mc))
7477				break;
7478			slab_remove(sp, mc);
7479			switch (mc) {
7480			case MC_MBUF:
7481				m_infree(mc) -= NMBPBG;
7482				m_total(mc) -= NMBPBG;
7483				if (mclaudit != NULL)
7484					mcl_audit_free(sp->sl_base, NMBPBG);
7485				break;
7486			case MC_CL:
7487				m_infree(mc) -= NCLPBG;
7488				m_total(mc) -= NCLPBG;
7489				if (mclaudit != NULL)
7490					mcl_audit_free(sp->sl_base, NMBPBG);
7491				break;
7492			case MC_BIGCL:
7493				m_infree(mc)--;
7494				m_total(mc)--;
7495				if (mclaudit != NULL)
7496					mcl_audit_free(sp->sl_base, NMBPBG);
7497				break;
7498			case MC_16KCL:
7499				m_infree(mc)--;
7500				m_total(mc)--;
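				/*
				 * A 16KB cluster spans NSLABSP16KB slabs;
				 * verify and reinitialize the trailing slabs
				 * before the leading slab's memory is freed
				 * below.
				 */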
7501				for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
7502					nsp = nsp->sl_next;
7503					VERIFY(nsp->sl_refcnt == 0 &&
7504					    nsp->sl_base != NULL &&
7505					    nsp->sl_len == 0);
7506					slab_init(nsp, 0, 0, NULL, NULL, 0, 0,
7507					    0);
7508					nsp->sl_flags = 0;
7509				}
7510				if (mclaudit != NULL)
7511					mcl_audit_free(sp->sl_base, 1);
7512				break;
7513			default:
7514				/*
7515				 * The composite classes have their own
7516				 * freelist (m_cobjlist), so we only
7517				 * process rudimentary classes here.
7518				 */
7519				VERIFY(0);
7520			}
7521			m_release_cnt(mc) += m_size(mc);
7522			released += m_size(mc);
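			/*
			 * Compute the page index of this slab's base within
			 * the mbuf cluster submap (relative to mbutl) so the
			 * corresponding IOMapper entry can be redirected.
			 */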
7523			offset = ((char *)sp->sl_base - (char *)mbutl) / NBPG;
7524			/*
7525			 * Make sure the IOMapper points to a valid, but
7526			 * bogus, address.  This should prevent further DMA
7527			 * accesses to freed memory.
7528			 */
7529			IOMapperInsertPage(mcl_paddr_base, offset, scratch_pa);
7530			mcl_paddr[offset] = 0;
7531			kmem_free(mb_map, (vm_offset_t)sp->sl_base,
7532			    sp->sl_len);
7533			slab_init(sp, 0, 0, NULL, NULL, 0, 0, 0);
7534			sp->sl_flags = 0;
7535		}
7536	}
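	/*
	 * Update the global statistics so they reflect the buffers just
	 * released back to the VM.
	 */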
7537	mbstat.m_drain++;
7538	mbstat.m_bigclusters = m_total(MC_BIGCL);
7539	mbstat.m_clusters = m_total(MC_CL);
7540	mbstat.m_mbufs = m_total(MC_MBUF);
7541	mbuf_stat_sync();
7542	mbuf_mtypes_sync(TRUE);
7543	lck_mtx_unlock(mbuf_mlock);
7544}
7545
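/*
 * Handler for the kern.ipc.mb_drain_force sysctl: writing a nonzero
 * value triggers an immediate mbuf garbage collection pass via m_drain().
 */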
7546static int
7547m_drain_force_sysctl SYSCTL_HANDLER_ARGS
7548{
7549#pragma unused(arg1, arg2)
7550	int val = 0, err;
7551
7552	err = sysctl_handle_int(oidp, &val, 0, req);
7553	if (err != 0 || req->newptr == USER_ADDR_NULL)
7554		return (err);
7555	if (val)
7556		m_drain();
7557
7558	return (err);
7559}
7560
7561SYSCTL_DECL(_kern_ipc);
7562SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat,
7563    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
7564    0, 0, mbstat_sysctl, "S,mbstat", "");
7565SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_stat,
7566    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
7567    0, 0, mb_stat_sysctl, "S,mb_stat", "");
7568SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_top_trace,
7569    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
7570    0, 0, mleak_top_trace_sysctl, "S,mb_top_trace", "");
7571SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_table,
7572    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
7573    0, 0, mleak_table_sysctl, "S,mleak_table", "");
7574SYSCTL_INT(_kern_ipc, OID_AUTO, mleak_sample_factor,
7575    CTLFLAG_RW | CTLFLAG_LOCKED, &mleak_table.mleak_sample_factor, 0, "");
7576SYSCTL_INT(_kern_ipc, OID_AUTO, mb_normalized,
7577    CTLFLAG_RD | CTLFLAG_LOCKED, &mb_normalized, 0, "");
7578SYSCTL_INT(_kern_ipc, OID_AUTO, mb_watchdog,
7579    CTLFLAG_RW | CTLFLAG_LOCKED, &mb_watchdog, 0, "");
7580SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_drain_force,
7581    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, NULL, 0,
7582    m_drain_force_sysctl, "I",
7583    "Forces the mbuf garbage collection to run");
7584SYSCTL_INT(_kern_ipc, OID_AUTO, mb_drain_maxint,
7585    CTLFLAG_RW | CTLFLAG_LOCKED, &mb_drain_maxint, 0,
    "Minimum time interval between garbage collection runs");
7587