/*
 * Copyright (c) 1998-2014 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1982, 1986, 1988, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_mbuf.c	8.2 (Berkeley) 1/4/94
 */
/*
 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
 * support for mandatory and extensible security protections.  This notice
 * is included in support of clause 2.2 (b) of the Apple Public License,
 * Version 2.0.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/protosw.h>
#include <sys/domain.h>
#include <sys/queue.h>
#include <sys/proc.h>

#include <dev/random/randomdev.h>

#include <kern/kern_types.h>
#include <kern/simple_lock.h>
#include <kern/queue.h>
#include <kern/sched_prim.h>
#include <kern/cpu_number.h>
#include <kern/zalloc.h>

#include <libkern/OSAtomic.h>
#include <libkern/OSDebug.h>
#include <libkern/libkern.h>

#include <IOKit/IOMapper.h>

#include <machine/limits.h>
#include <machine/machine_routines.h>

#if CONFIG_MACF_NET
#include <security/mac_framework.h>
#endif /* CONFIG_MACF_NET */

#include <sys/mcache.h>
#include <net/ntstat.h>

/*
 * MBUF IMPLEMENTATION NOTES.
 *
 * There is a total of 5 per-CPU caches:
 *
 * MC_MBUF:
 *	This is a cache of rudimentary objects of MSIZE in size; each
 *	object represents an mbuf structure.  This cache preserves only
 *	the m_type field of the mbuf during its transactions.
 *
 * MC_CL:
 *	This is a cache of rudimentary objects of MCLBYTES in size; each
 *	object represents an mcluster structure.  This cache does not
 *	preserve the contents of the objects during its transactions.
 *
 * MC_BIGCL:
 *	This is a cache of rudimentary objects of MBIGCLBYTES in size; each
 *	object represents an mbigcluster structure.  This cache does not
 *	preserve the contents of the objects during its transactions.
 *
 * MC_MBUF_CL:
 *	This is a cache of mbufs each having a cluster attached to it.
 *	It is backed by the MC_MBUF and MC_CL rudimentary caches.  Several
 *	fields of the mbuf related to the external cluster are preserved
 *	during transactions.
 *
 * MC_MBUF_BIGCL:
 *	This is a cache of mbufs each having a big cluster attached to it.
 *	It is backed by the MC_MBUF and MC_BIGCL rudimentary caches.  Several
 *	fields of the mbuf related to the external cluster are preserved
 *	during transactions.
 *
 * OBJECT ALLOCATION:
 *
 * Allocation requests are handled first at the per-CPU (mcache) layer
 * before falling back to the slab layer.  Performance is optimal when
 * the request is satisfied at the CPU layer because global data/lock
 * never gets accessed.  When the slab layer is entered for allocation,
 * the slab freelist will be checked first for available objects before
 * the VM backing store is invoked.  Slab layer operations are serialized
 * for all of the caches as the mbuf global lock is held most of the time.
 * Allocation paths are different depending on the class of objects:
 *
 * a. Rudimentary object:
 *
 *	{ m_get_common(), m_clattach(), m_mclget(),
 *	  m_mclalloc(), m_bigalloc(), m_copym_with_hdrs(),
 *	  composite object allocation }
 *
 *             |                              ^
 *             |                              |
 *             |                              +------------------+
 *             v                                                 |
 *     mcache_alloc/mcache_alloc_ext()              mbuf_slab_audit()
 *             |                                                 ^
 *             v                                                 |
 *        [CPU cache] ----------> (found?) ----------------------+
 *             |                                                 |
 *             v                                                 |
 *      mbuf_slab_alloc()                                        |
 *             |                                                 |
 *             v                                                 |
 *  +----> [freelist] ----------> (found?) ----------------------+
 *  |          |
 *  |          v
 *  |      m_clalloc()
 *  |          |
 *  |          v
 *  +---<<--- kmem_mb_alloc()
 *
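 * For example (informal sketch, not a verbatim quote of the code below):
 * a simple mbuf allocation such as m_get_common() boils down to a single
 * CPU-layer call, with the layers above entered only on a cache miss:
 *
 *	m = mcache_alloc(m_cache(MC_MBUF), MSLEEPF(wait));
 *	if (m != NULL)
 *		MBUF_INIT(m, hdr, type);
 *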
 * b. Composite object:
 *
 *	{ m_getpackets_internal(), m_allocpacket_internal() }
 *
 *             |                              ^
 *             |                              |
 *             |                 +------ (done) ---------+
 *             v                                         |
 *     mcache_alloc/mcache_alloc_ext()        mbuf_cslab_audit()
 *             |                                         ^
 *             v                                         |
 *        [CPU cache] ----------> (found?) --------------+
 *             |                                         |
 *             v                                         |
 *     mbuf_cslab_alloc()                                |
 *             |                                         |
 *             v                                         |
 *         [freelist] ----------> (found?) --------------+
 *             |                                         |
 *             v                                         |
 *     (rudimentary object)                              |
 *     mcache_alloc/mcache_alloc_ext() ------>>----------+
 *
 * Auditing notes: If auditing is enabled, buffers will be subjected to
 * integrity checks by the audit routine.  This is done by verifying their
 * contents against the DEADBEEF (free) pattern before returning them to
 * the caller.  As part of this step, the routine will also record the
 * transaction and pattern-fill the buffers with the BADDCAFE (uninitialized)
 * pattern.  It will also restore any constructed data structure fields if
 * necessary.
 *
 * OBJECT DEALLOCATION:
 *
 * Freeing an object simply involves placing it into the CPU cache; this
 * pollutes the cache to the benefit of subsequent allocations.  The slab
 * layer will only be entered if the object is to be purged out of the cache.
 * During normal operations, this happens only when the CPU layer resizes
 * its bucket while it's adjusting to the allocation load.  Deallocation
 * paths are different depending on the class of objects:
 *
 * a. Rudimentary object:
 *
 *	{ m_free(), m_freem_list(), composite object deallocation }
 *
 *             |                              ^
 *             |                              |
 *             |                 +------ (done) ---------+
 *             v                                         |
 *     mcache_free/mcache_free_ext()                     |
 *             |                                         |
 *             v                                         |
 *      mbuf_slab_audit()                                |
 *             |                                         |
 *             v                                         |
 *        [CPU cache] -----> (not purging?) -------------+
 *             |                                         |
 *             v                                         |
 *      mbuf_slab_free()                                 |
 *             |                                         |
 *             v                                         |
 *         [freelist] ----------->>----------------------+
 *     (objects never get purged to VM)
 *
 * b. Composite object:
 *
 *	{ m_free(), m_freem_list() }
 *
 *             |                              ^
 *             |                              |
 *             |                 +------ (done) ---------+
 *             v                                         |
 *     mcache_free/mcache_free_ext()                     |
 *             |                                         |
 *             v                                         |
 *     mbuf_cslab_audit()                                |
 *             |                                         |
 *             v                                         |
 *        [CPU cache] -----> (not purging?) -------------+
 *             |                                         |
 *             v                                         |
 *     mbuf_cslab_free()                                 |
 *             |                                         |
 *             v                                         |
 *         [freelist] -----> (not purging?) -------------+
 *             |                                         |
 *             v                                         |
 *     (rudimentary object)                              |
 *     mcache_free/mcache_free_ext() ------->>-----------+
 *
 * Auditing notes: If auditing is enabled, the audit routine will save
 * any constructed data structure fields (if necessary) before filling the
 * contents of the buffers with the DEADBEEF (free) pattern and recording
 * the transaction.  Buffers that are freed (whether at the CPU or slab
 * layer) are expected to contain the free pattern.
 *
 * DEBUGGING:
 *
 * Debugging can be enabled by adding "mbuf_debug=0x3" to boot-args; this
 * translates to the mcache flags (MCF_VERIFY | MCF_AUDIT).  Additionally,
 * the CPU layer cache can be disabled by setting the MCF_NOCPUCACHE flag,
 * i.e. modify the boot argument parameter to "mbuf_debug=0x13".  Leak
 * detection may also be disabled by setting the MCF_NOLEAKLOG flag, e.g.
 * "mbuf_debug=0x113".  Note that debugging consumes more CPU and memory.
 *
 * Each object is associated with exactly one mcache_audit_t structure that
 * contains the information related to its last buffer transaction.  Given
 * an address of an object, the audit structure can be retrieved by finding
 * the position of the object relative to the base address of the cluster:
 *
 *      +------------+                        +=============+
 *      | mbuf addr  |                        | mclaudit[i] |
 *      +------------+                        +=============+
 *             |                              | cl_audit[0] |
 *      i = MTOBG(addr)                       +-------------+
 *             |                       +----->| cl_audit[1] | -----> mcache_audit_t
 *      b = BGTOM(i)                   |      +-------------+
 *             |                       |      |     ...     |
 *      x = MCLIDX(b, addr)            |      +-------------+
 *             |                       |      | cl_audit[7] |
 *      +-----------------+            |      +-------------+
 *                (e.g. x == 1)
 *
 * The mclaudit[] array is allocated at initialization time, but its contents
 * get populated when the corresponding cluster is created.  Because a page
 * can be turned into NMBPBG mbufs, we preserve enough space for the mbufs
 * so that there is a 1-to-1 mapping between them.  A page that never gets
 * (or has not yet been) turned into mbufs will use only cl_audit[0], with
 * the remaining entries unused.  For a 16KB cluster, only one entry from
 * the first page is allocated and used for the entire object.
 */

/* TODO: should be in header file */
/* kernel translator */
extern vm_offset_t kmem_mb_alloc(vm_map_t, int, int);
extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va);
extern vm_map_t mb_map;		/* special map */

/* Global lock */
decl_lck_mtx_data(static, mbuf_mlock_data);
static lck_mtx_t *mbuf_mlock = &mbuf_mlock_data;
static lck_attr_t *mbuf_mlock_attr;
static lck_grp_t *mbuf_mlock_grp;
static lck_grp_attr_t *mbuf_mlock_grp_attr;

/* Back-end (common) layer */
static void *mbuf_worker_run;	/* wait channel for worker thread */
static int mbuf_worker_ready;	/* worker thread is runnable */
static int mbuf_expand_mcl;	/* number of cluster creation requests */
static int mbuf_expand_big;	/* number of big cluster creation requests */
static int mbuf_expand_16k;	/* number of 16KB cluster creation requests */
static int ncpu;		/* number of CPUs */
static ppnum_t *mcl_paddr;	/* Array of cluster physical addresses */
static ppnum_t mcl_pages;	/* Size of array (# physical pages) */
static ppnum_t mcl_paddr_base;	/* Handle returned by IOMapper::iovmAlloc() */
static mcache_t *ref_cache;	/* Cache of cluster reference & flags */
static mcache_t *mcl_audit_con_cache;	/* Audit contents cache */
static unsigned int mbuf_debug;	/* patchable mbuf mcache flags */
static unsigned int mb_normalized; /* number of packets "normalized" */

#define	MB_GROWTH_AGGRESSIVE	1	/* Threshold: 1/2 of total */
#define	MB_GROWTH_NORMAL	2	/* Threshold: 3/4 of total */

typedef enum {
	MC_MBUF = 0,	/* Regular mbuf */
	MC_CL,		/* Cluster */
	MC_BIGCL,	/* Large (4KB) cluster */
	MC_16KCL,	/* Jumbo (16KB) cluster */
	MC_MBUF_CL,	/* mbuf + cluster */
	MC_MBUF_BIGCL,	/* mbuf + large (4KB) cluster */
	MC_MBUF_16KCL	/* mbuf + jumbo (16KB) cluster */
} mbuf_class_t;

#define	MBUF_CLASS_MIN		MC_MBUF
#define	MBUF_CLASS_MAX		MC_MBUF_16KCL
#define	MBUF_CLASS_LAST		MC_16KCL
#define	MBUF_CLASS_VALID(c) \
	((int)(c) >= MBUF_CLASS_MIN && (int)(c) <= MBUF_CLASS_MAX)
#define	MBUF_CLASS_COMPOSITE(c) \
	((int)(c) > MBUF_CLASS_LAST)
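
/*
 * For example, MBUF_CLASS_COMPOSITE(MC_MBUF_CL) is true, whereas
 * MBUF_CLASS_COMPOSITE(MC_16KCL) is false: MC_16KCL is still a rudimentary
 * class, being MBUF_CLASS_LAST, the last non-composite entry in the enum.
 */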


/*
 * mbuf specific mcache allocation request flags.
 */
#define	MCR_COMP	MCR_USR1 /* for MC_MBUF_{CL,BIGCL,16KCL} caches */

/*
 * Per-cluster slab structure.
 *
 * A slab is a cluster control structure that contains one or more object
 * chunks; the available chunks are chained in the slab's freelist (sl_head).
 * Each time a chunk is taken out of the slab, the slab's reference count
 * gets incremented.  When all chunks have been taken out, the empty slab
 * gets removed (SLF_DETACHED) from the class's slab list.  A chunk that is
 * returned to a slab causes the slab's reference count to be decremented;
 * it also causes the slab to be reinserted back into the class's slab list,
 * if it's not already there.
 *
 * Compartmentalizing the object chunks into slabs allows us to easily
 * merge one or more slabs together when the adjacent slabs are idle, as
 * well as to convert or move a slab from one class to another; e.g. the
 * mbuf cluster slab can be converted to a regular cluster slab when all
 * mbufs in the slab have been freed.
 *
 * A slab may also span multiple clusters for chunks larger than a
 * cluster's size.  In this case, only the slab of the first cluster is
 * used.  The rest of the slabs are marked with SLF_PARTIAL to indicate
 * that they are part of the larger slab.
 *
 * Each slab controls a page of memory.
 */
typedef struct mcl_slab {
	struct mcl_slab	*sl_next;	/* neighboring slab */
	u_int8_t	sl_class;	/* controlling mbuf class */
	int8_t		sl_refcnt;	/* outstanding allocations */
	int8_t		sl_chunks;	/* chunks (bufs) in this slab */
	u_int16_t	sl_flags;	/* slab flags (see below) */
	u_int16_t	sl_len;		/* slab length */
	void		*sl_base;	/* base of allocated memory */
	void		*sl_head;	/* first free buffer */
	TAILQ_ENTRY(mcl_slab) sl_link;	/* next/prev slab on freelist */
} mcl_slab_t;

#define	SLF_MAPPED	0x0001		/* backed by a mapped page */
#define	SLF_PARTIAL	0x0002		/* part of another slab */
#define	SLF_DETACHED	0x0004		/* not in slab freelist */

/*
 * The array of slabs is broken into groups of arrays per 1MB of kernel
 * memory to reduce the footprint.  Each group is allocated on demand
 * whenever a new piece of memory mapped in from the VM crosses the 1MB
 * boundary.
 */
#define	NSLABSPMB	((1 << MBSHIFT) >> PGSHIFT)	/* 256 slabs/grp */

typedef struct mcl_slabg {
	mcl_slab_t	slg_slab[NSLABSPMB];	/* group of slabs */
} mcl_slabg_t;

/*
 * Number of slabs needed to control a 16KB cluster object.
 */
#define	NSLABSP16KB	(M16KCLBYTES >> PGSHIFT)

/*
 * Per-cluster audit structure.
 */
typedef struct {
	mcache_audit_t	*cl_audit[NMBPBG];	/* array of audits */
} mcl_audit_t;

typedef struct {
	struct thread	*msa_thread;	/* thread doing transaction */
	struct thread	*msa_pthread;	/* previous transaction thread */
	uint32_t	msa_tstamp;	/* transaction timestamp (ms) */
	uint32_t	msa_ptstamp;	/* prev transaction timestamp (ms) */
	uint16_t	msa_depth;	/* pc stack depth */
	uint16_t	msa_pdepth;	/* previous transaction pc stack */
	void		*msa_stack[MCACHE_STACK_DEPTH];
	void		*msa_pstack[MCACHE_STACK_DEPTH];
} mcl_scratch_audit_t;

typedef struct {
	/*
	 * Size of data from the beginning of an mbuf that covers the m_hdr,
	 * pkthdr and m_ext structures.  If auditing is enabled, we allocate
	 * a shadow mbuf structure of this size inside each audit structure,
	 * and the contents of the real mbuf get copied into it when the mbuf
	 * is freed.  This allows us to pattern-fill the mbuf for integrity
	 * check, and to preserve any constructed mbuf fields (e.g. the mbuf
	 * + cluster cache case).  Note that we don't save the contents of
	 * clusters when they are freed; we simply pattern-fill them.
	 */
	u_int8_t		sc_mbuf[(MSIZE - _MHLEN) + sizeof (_m_ext_t)];
	mcl_scratch_audit_t	sc_scratch __attribute__((aligned(8)));
} mcl_saved_contents_t;

#define	AUDIT_CONTENTS_SIZE	(sizeof (mcl_saved_contents_t))

#define	MCA_SAVED_MBUF_PTR(_mca)					\
	((struct mbuf *)(void *)((mcl_saved_contents_t *)		\
	(_mca)->mca_contents)->sc_mbuf)
#define	MCA_SAVED_MBUF_SIZE						\
	(sizeof (((mcl_saved_contents_t *)0)->sc_mbuf))
#define	MCA_SAVED_SCRATCH_PTR(_mca)					\
	(&((mcl_saved_contents_t *)(_mca)->mca_contents)->sc_scratch)

/*
 * mbuf specific mcache audit flags
 */
#define	MB_INUSE	0x01	/* object has not been returned to slab */
#define	MB_COMP_INUSE	0x02	/* object has not been returned to cslab */
#define	MB_SCVALID	0x04	/* object has valid saved contents */

/*
 * Each of the following two arrays holds up to nmbclusters elements.
 */
static mcl_audit_t *mclaudit;	/* array of cluster audit information */
static unsigned int maxclaudit;	/* max # of entries in audit table */
static mcl_slabg_t **slabstbl;	/* cluster slabs table */
static unsigned int maxslabgrp;	/* max # of entries in slabs table */
static unsigned int slabgrp;	/* # of entries in slabs table */

/* Globals */
int nclusters;			/* # of clusters for non-jumbo (legacy) sizes */
int njcl;			/* # of clusters for jumbo sizes */
int njclbytes;			/* size of a jumbo cluster */
union mbigcluster *mbutl;	/* first mapped cluster address */
union mbigcluster *embutl;	/* ending virtual address of mclusters */
int _max_linkhdr;		/* largest link-level header */
int _max_protohdr;		/* largest protocol header */
int max_hdr;			/* largest link+protocol header */
int max_datalen;		/* MHLEN - max_hdr */

static boolean_t mclverify;	/* debug: pattern-checking */
static boolean_t mcltrace;	/* debug: stack tracing */
static boolean_t mclfindleak;	/* debug: leak detection */
static boolean_t mclexpleak;	/* debug: expose leak info to user space */

static struct timeval mb_start;	/* beginning of time */

/* mbuf leak detection variables */
static struct mleak_table mleak_table;
static mleak_stat_t *mleak_stat;

#define	MLEAK_STAT_SIZE(n) \
	((size_t)(&((mleak_stat_t *)0)->ml_trace[n]))

struct mallocation {
	mcache_obj_t *element;	/* the alloc'ed element, NULL if unused */
	u_int32_t trace_index;	/* mtrace index for corresponding backtrace */
	u_int32_t count;	/* How many objects were requested */
	u_int64_t hitcount;	/* for determining hash effectiveness */
};

struct mtrace {
	u_int64_t	collisions;
	u_int64_t	hitcount;
	u_int64_t	allocs;
	u_int64_t	depth;
	uintptr_t	addr[MLEAK_STACK_DEPTH];
};

/* Size must be a power of two for the zhash to be able to just mask off bits */
#define	MLEAK_ALLOCATION_MAP_NUM	512
#define	MLEAK_TRACE_MAP_NUM		256

/*
 * Sample factor for how often to record a trace.  This is overridable
 * via the boot-arg mleak_sample_factor.
 */
#define	MLEAK_SAMPLE_FACTOR		500

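/*
 * For example, booting with "mleak_sample_factor=1" records a backtrace
 * for every allocation instead of roughly one in every 500, which can be
 * useful (at a noticeable CPU cost) when the default sampling keeps
 * missing the leaking call site.
 */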
/*
 * Number of top leakers recorded.
 */
#define	MLEAK_NUM_TRACES		5

#define	MB_LEAK_SPACING_64 " "
#define	MB_LEAK_SPACING_32 " "


#define	MB_LEAK_HDR_32 "\n\
    trace [1] trace [2] trace [3] trace [4] trace [5] \n\
    ---------- ---------- ---------- ---------- ---------- \n\
"

#define	MB_LEAK_HDR_64 "\n\
    trace [1] trace [2] trace [3] \
    trace [4] trace [5] \n\
    ------------------ ------------------ ------------------ \
    ------------------ ------------------ \n\
"

static uint32_t mleak_alloc_buckets = MLEAK_ALLOCATION_MAP_NUM;
static uint32_t mleak_trace_buckets = MLEAK_TRACE_MAP_NUM;

/* Hashmaps of allocations and their corresponding traces */
static struct mallocation *mleak_allocations;
static struct mtrace *mleak_traces;
static struct mtrace *mleak_top_trace[MLEAK_NUM_TRACES];

/* Lock to protect mleak tables from concurrent modification */
decl_lck_mtx_data(static, mleak_lock_data);
static lck_mtx_t *mleak_lock = &mleak_lock_data;
static lck_attr_t *mleak_lock_attr;
static lck_grp_t *mleak_lock_grp;
static lck_grp_attr_t *mleak_lock_grp_attr;

extern u_int32_t high_sb_max;

/* The minimum number of objects that are allocated, to start. */
#define	MINCL		32
#define	MINBIGCL	(MINCL >> 1)
#define	MIN16KCL	(MINCL >> 2)

/* Low watermarks (only map in pages once free counts go below) */
#define	MBIGCL_LOWAT	MINBIGCL
#define	M16KCL_LOWAT	MIN16KCL

typedef struct {
	mbuf_class_t	mtbl_class;	/* class type */
	mcache_t	*mtbl_cache;	/* mcache for this buffer class */
	TAILQ_HEAD(mcl_slhead, mcl_slab) mtbl_slablist;	/* slab list */
	mcache_obj_t	*mtbl_cobjlist;	/* composite objects freelist */
	mb_class_stat_t	*mtbl_stats;	/* statistics fetchable via sysctl */
	u_int32_t	mtbl_maxsize;	/* maximum buffer size */
	int		mtbl_minlimit;	/* minimum allowed */
	int		mtbl_maxlimit;	/* maximum allowed */
	u_int32_t	mtbl_wantpurge;	/* purge during next reclaim */
	uint32_t	mtbl_avgtotal;	/* average total on iOS */
} mbuf_table_t;

#define	m_class(c)	mbuf_table[c].mtbl_class
#define	m_cache(c)	mbuf_table[c].mtbl_cache
#define	m_slablist(c)	mbuf_table[c].mtbl_slablist
#define	m_cobjlist(c)	mbuf_table[c].mtbl_cobjlist
#define	m_maxsize(c)	mbuf_table[c].mtbl_maxsize
#define	m_minlimit(c)	mbuf_table[c].mtbl_minlimit
#define	m_maxlimit(c)	mbuf_table[c].mtbl_maxlimit
#define	m_wantpurge(c)	mbuf_table[c].mtbl_wantpurge
#define	m_avgtotal(c)	mbuf_table[c].mtbl_avgtotal
#define	m_cname(c)	mbuf_table[c].mtbl_stats->mbcl_cname
#define	m_size(c)	mbuf_table[c].mtbl_stats->mbcl_size
#define	m_total(c)	mbuf_table[c].mtbl_stats->mbcl_total
#define	m_active(c)	mbuf_table[c].mtbl_stats->mbcl_active
#define	m_infree(c)	mbuf_table[c].mtbl_stats->mbcl_infree
#define	m_slab_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_slab_cnt
#define	m_alloc_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_alloc_cnt
#define	m_free_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_free_cnt
#define	m_notified(c)	mbuf_table[c].mtbl_stats->mbcl_notified
#define	m_purge_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_purge_cnt
#define	m_fail_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_fail_cnt
#define	m_ctotal(c)	mbuf_table[c].mtbl_stats->mbcl_ctotal
#define	m_peak(c)	mbuf_table[c].mtbl_stats->mbcl_peak_reported
#define	m_release_cnt(c) mbuf_table[c].mtbl_stats->mbcl_release_cnt

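/*
 * For example, m_total(MC_CL) expands to
 * mbuf_table[MC_CL].mtbl_stats->mbcl_total; per-class counters are meant
 * to be read and updated through these accessors rather than by
 * dereferencing mbuf_table directly.
 */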
static mbuf_table_t mbuf_table[] = {
	/*
	 * The caches for mbufs, regular clusters and big clusters.
	 * The average total values were based on data gathered from
	 * actual usage patterns on iOS.
	 */
	{ MC_MBUF, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_MBUF)),
	    NULL, NULL, 0, 0, 0, 0, 3000 },
	{ MC_CL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_CL)),
	    NULL, NULL, 0, 0, 0, 0, 2000 },
	{ MC_BIGCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_BIGCL)),
	    NULL, NULL, 0, 0, 0, 0, 1000 },
	{ MC_16KCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_16KCL)),
	    NULL, NULL, 0, 0, 0, 0, 1000 },
	/*
	 * The following are special caches; they serve as intermediate
	 * caches backed by the above rudimentary caches.  Each object
	 * in the cache is an mbuf with a cluster attached to it.  Unlike
	 * the above caches, these intermediate caches do not directly
	 * deal with the slab structures; instead, the constructed
	 * cached elements are simply stored in the freelists.
	 */
	{ MC_MBUF_CL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 2000 },
	{ MC_MBUF_BIGCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 1000 },
	{ MC_MBUF_16KCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 1000 },
};

#define	NELEM(a)	(sizeof (a) / sizeof ((a)[0]))

static void	*mb_waitchan = &mbuf_table;	/* wait channel for all caches */
static int	mb_waiters;			/* number of waiters */

boolean_t mb_peak_newreport = FALSE;
boolean_t mb_peak_firstreport = FALSE;

/* generate a report by default after 1 week of uptime */
#define	MBUF_PEAK_FIRST_REPORT_THRESHOLD	604800

#define	MB_WDT_MAXTIME	10		/* # of secs before watchdog panic */
static struct timeval mb_wdtstart;	/* watchdog start timestamp */
static char *mbuf_dump_buf;

#define	MBUF_DUMP_BUF_SIZE	2048

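/*
 * Informally: these back the mbuf watchdog and the debugging dump.  If
 * allocations have been stuck for more than MB_WDT_MAXTIME seconds
 * (measured from mb_wdtstart), the watchdog treats the allocator as
 * wedged and panics, including the mbuf_dump() summary (formatted into
 * mbuf_dump_buf) in the panic string.
 */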
/*
 * The mbuf watchdog is enabled by default on embedded platforms.  It is
 * also toggleable via the kern.ipc.mb_watchdog sysctl.
 * Garbage collection is also enabled by default on embedded platforms.
 * mb_drain_maxint controls the amount of time to wait (in seconds) between
 * consecutive calls to m_drain().
 */
static unsigned int mb_watchdog = 0;
static unsigned int mb_drain_maxint = 0;

/* Red zone */
static u_int32_t mb_redzone_cookie;
static void m_redzone_init(struct mbuf *);
static void m_redzone_verify(struct mbuf *m);

/* The following are used to serialize m_clalloc() */
static boolean_t mb_clalloc_busy;
static void *mb_clalloc_waitchan = &mb_clalloc_busy;
static int mb_clalloc_waiters;

static void mbuf_mtypes_sync(boolean_t);
static int mbstat_sysctl SYSCTL_HANDLER_ARGS;
static void mbuf_stat_sync(void);
static int mb_stat_sysctl SYSCTL_HANDLER_ARGS;
static int mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS;
static int mleak_table_sysctl SYSCTL_HANDLER_ARGS;
static char *mbuf_dump(void);
static void mbuf_table_init(void);
static inline void m_incref(struct mbuf *);
static inline u_int32_t m_decref(struct mbuf *);
static int m_clalloc(const u_int32_t, const int, const u_int32_t);
static void mbuf_worker_thread_init(void);
static mcache_obj_t *slab_alloc(mbuf_class_t, int);
static void slab_free(mbuf_class_t, mcache_obj_t *);
static unsigned int mbuf_slab_alloc(void *, mcache_obj_t ***,
    unsigned int, int);
static void mbuf_slab_free(void *, mcache_obj_t *, int);
static void mbuf_slab_audit(void *, mcache_obj_t *, boolean_t);
static void mbuf_slab_notify(void *, u_int32_t);
static unsigned int cslab_alloc(mbuf_class_t, mcache_obj_t ***,
    unsigned int);
static unsigned int cslab_free(mbuf_class_t, mcache_obj_t *, int);
static unsigned int mbuf_cslab_alloc(void *, mcache_obj_t ***,
    unsigned int, int);
static void mbuf_cslab_free(void *, mcache_obj_t *, int);
static void mbuf_cslab_audit(void *, mcache_obj_t *, boolean_t);
static int freelist_populate(mbuf_class_t, unsigned int, int);
static void freelist_init(mbuf_class_t);
static boolean_t mbuf_cached_above(mbuf_class_t, int);
static boolean_t mbuf_steal(mbuf_class_t, unsigned int);
static void m_reclaim(mbuf_class_t, unsigned int, boolean_t);
static int m_howmany(int, size_t);
static void mbuf_worker_thread(void);
static void mbuf_watchdog(void);
static boolean_t mbuf_sleep(mbuf_class_t, unsigned int, int);

static void mcl_audit_init(void *, mcache_audit_t **, mcache_obj_t **,
    size_t, unsigned int);
static void mcl_audit_free(void *, unsigned int);
static mcache_audit_t *mcl_audit_buf2mca(mbuf_class_t, mcache_obj_t *);
static void mcl_audit_mbuf(mcache_audit_t *, void *, boolean_t, boolean_t);
static void mcl_audit_cluster(mcache_audit_t *, void *, size_t, boolean_t,
    boolean_t);
static void mcl_audit_restore_mbuf(struct mbuf *, mcache_audit_t *, boolean_t);
static void mcl_audit_save_mbuf(struct mbuf *, mcache_audit_t *);
static void mcl_audit_scratch(mcache_audit_t *);
static void mcl_audit_mcheck_panic(struct mbuf *);
static void mcl_audit_verify_nextptr(void *, mcache_audit_t *);

static void mleak_activate(void);
static void mleak_logger(u_int32_t, mcache_obj_t *, boolean_t);
static boolean_t mleak_log(uintptr_t *, mcache_obj_t *, uint32_t, int);
static void mleak_free(mcache_obj_t *);
static void mleak_sort_traces(void);
static void mleak_update_stats(void);

static mcl_slab_t *slab_get(void *);
static void slab_init(mcl_slab_t *, mbuf_class_t, u_int32_t,
    void *, void *, unsigned int, int, int);
static void slab_insert(mcl_slab_t *, mbuf_class_t);
static void slab_remove(mcl_slab_t *, mbuf_class_t);
static boolean_t slab_inrange(mcl_slab_t *, void *);
static void slab_nextptr_panic(mcl_slab_t *, void *);
static void slab_detach(mcl_slab_t *);
static boolean_t slab_is_detached(mcl_slab_t *);

static int m_copyback0(struct mbuf **, int, int, const void *, int, int);
static struct mbuf *m_split0(struct mbuf *, int, int, int);
__private_extern__ void mbuf_report_peak_usage(void);
static boolean_t mbuf_report_usage(mbuf_class_t);

/* flags for m_copyback0 */
#define	M_COPYBACK0_COPYBACK	0x0001	/* copyback from cp */
#define	M_COPYBACK0_PRESERVE	0x0002	/* preserve original data */
#define	M_COPYBACK0_COW		0x0004	/* do copy-on-write */
#define	M_COPYBACK0_EXTEND	0x0008	/* extend chain */

/*
 * This flag is set for all mbufs that come out of and into the composite
 * mbuf + cluster caches, i.e. MC_MBUF_CL and MC_MBUF_BIGCL.  mbufs that
 * are marked with such a flag have clusters attached to them, and will be
 * treated differently when they are freed; instead of being placed back
 * into the mbuf and cluster freelists, the composite mbuf + cluster objects
 * are placed back into the appropriate composite cache's freelist, and the
 * actual freeing is deferred until the composite objects are purged.  At
 * such a time, this flag will be cleared from the mbufs and the objects
 * will be freed into their own separate freelists.
 */
#define	EXTF_COMPOSITE	0x1

/*
 * This flag indicates that the external cluster is read-only, i.e. it is
 * or was referred to by more than one mbuf.  Once set, this flag is never
 * cleared.
 */
#define	EXTF_READONLY	0x2
#define	EXTF_MASK	(EXTF_COMPOSITE | EXTF_READONLY)

#define	MEXT_RFA(m)	((m)->m_ext.ext_refflags)
#define	MEXT_REF(m)	(MEXT_RFA(m)->refcnt)
#define	MEXT_FLAGS(m)	(MEXT_RFA(m)->flags)
#define	MBUF_IS_COMPOSITE(m)	\
	(MEXT_REF(m) == 0 && (MEXT_FLAGS(m) & EXTF_MASK) == EXTF_COMPOSITE)

/*
 * Macros used to verify the integrity of the mbuf.
 */
#define	_MCHECK(m) {							\
	if ((m)->m_type != MT_FREE) {					\
		if (mclaudit == NULL)					\
			panic("MCHECK: m_type=%d m=%p",			\
			    (u_int16_t)(m)->m_type, m);			\
		else							\
			mcl_audit_mcheck_panic(m);			\
	}								\
}

#define	MBUF_IN_MAP(addr)						\
	((void *)(addr) >= (void *)mbutl && (void *)(addr) < (void *)embutl)

#define	MRANGE(addr) {							\
	if (!MBUF_IN_MAP(addr))						\
		panic("MRANGE: address out of range 0x%p", addr);	\
}

/*
 * Macro version of mtod.
 */
#define	MTOD(m, t)	((t)((m)->m_data))

/*
 * Macros to obtain (4KB) cluster index and base cluster address.
 */

#define	MTOBG(x)	(((char *)(x) - (char *)mbutl) >> MBIGCLSHIFT)
#define	BGTOM(x)	((union mbigcluster *)(mbutl + (x)))

/*
 * Macro to find the mbuf index relative to a base.
 */
#define	MCLIDX(c, m)	(((char *)(m) - (char *)(c)) >> MSIZESHIFT)

/*
 * Same thing for 2KB cluster index.
 */
#define	CLBGIDX(c, m)	(((char *)(m) - (char *)(c)) >> MCLSHIFT)

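/*
 * For example (informal sketch): these macros combine to locate the audit
 * structure for an mbuf address, following the mclaudit[] diagram near the
 * top of this file:
 *
 *	ix = MTOBG(addr);
 *	mca = mclaudit[ix].cl_audit[MCLIDX(BGTOM(ix), addr)];
 *
 * which is essentially what mcl_audit_buf2mca() does for MC_MBUF objects.
 */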
/*
 * Macros used during mbuf and cluster initialization.
 */
#define	MBUF_INIT_PKTHDR(m) {						\
	(m)->m_pkthdr.rcvif = NULL;					\
	(m)->m_pkthdr.pkt_hdr = NULL;					\
	(m)->m_pkthdr.len = 0;						\
	(m)->m_pkthdr.csum_flags = 0;					\
	(m)->m_pkthdr.csum_data = 0;					\
	(m)->m_pkthdr.vlan_tag = 0;					\
	m_classifier_init(m, 0);					\
	m_tag_init(m, 1);						\
	m_scratch_init(m);						\
	m_redzone_init(m);						\
}

#define	MBUF_INIT(m, pkthdr, type) {					\
	_MCHECK(m);							\
	(m)->m_next = (m)->m_nextpkt = NULL;				\
	(m)->m_len = 0;							\
	(m)->m_type = type;						\
	if ((pkthdr) == 0) {						\
		(m)->m_data = (m)->m_dat;				\
		(m)->m_flags = 0;					\
	} else {							\
		(m)->m_data = (m)->m_pktdat;				\
		(m)->m_flags = M_PKTHDR;				\
		MBUF_INIT_PKTHDR(m);					\
	}								\
}

#define	MEXT_INIT(m, buf, size, free, arg, rfa, ref, flag) {		\
	(m)->m_data = (m)->m_ext.ext_buf = (buf);			\
	(m)->m_flags |= M_EXT;						\
	(m)->m_ext.ext_size = (size);					\
	(m)->m_ext.ext_free = (free);					\
	(m)->m_ext.ext_arg = (arg);					\
	(m)->m_ext.ext_refs.forward = (m)->m_ext.ext_refs.backward =	\
	    &(m)->m_ext.ext_refs;					\
	MEXT_RFA(m) = (rfa);						\
	MEXT_REF(m) = (ref);						\
	MEXT_FLAGS(m) = (flag);						\
}

#define	MBUF_CL_INIT(m, buf, rfa, ref, flag)	\
	MEXT_INIT(m, buf, m_maxsize(MC_CL), NULL, NULL, rfa, ref, flag)

#define	MBUF_BIGCL_INIT(m, buf, rfa, ref, flag)	\
	MEXT_INIT(m, buf, m_maxsize(MC_BIGCL), m_bigfree, NULL, rfa, ref, flag)

#define	MBUF_16KCL_INIT(m, buf, rfa, ref, flag)	\
	MEXT_INIT(m, buf, m_maxsize(MC_16KCL), m_16kfree, NULL, rfa, ref, flag)

/*
 * Macro to convert BSD malloc sleep flag to mcache's
 */
#define	MSLEEPF(f)	((!((f) & M_DONTWAIT)) ? MCR_SLEEP : MCR_NOSLEEP)

/*
 * The structure that holds all mbuf class statistics exportable via sysctl.
 * Similar to the mbstat structure, the mb_stat structure is protected by
 * the global mbuf lock.  It contains additional information about the
 * classes that allows for a more accurate view of the state of the
 * allocator.
 */
struct mb_stat *mb_stat;
struct omb_stat *omb_stat;	/* For backwards compatibility */

#define	MB_STAT_SIZE(n) \
	((size_t)(&((mb_stat_t *)0)->mbs_class[n]))
#define	OMB_STAT_SIZE(n) \
	((size_t)(&((struct omb_stat *)0)->mbs_class[n]))

/*
 * The legacy structure holding all of the mbuf allocation statistics.
 * The actual statistics used by the kernel are stored in the mbuf_table
 * instead, and are updated atomically while the global mbuf lock is held.
 * They are mirrored in mbstat to support legacy applications (e.g. netstat).
 * Unlike before, the kernel no longer relies on the contents of mbstat for
 * its operations (e.g. cluster expansion) because the structure is exposed
 * outside the kernel and could possibly be modified, which would make it
 * unsafe.  With the exception of the mbstat.m_mtypes array (see below),
 * all of the statistics are updated as they change.
 */
struct mbstat mbstat;

#define	MBSTAT_MTYPES_MAX \
	(sizeof (mbstat.m_mtypes) / sizeof (mbstat.m_mtypes[0]))

/*
 * Allocation statistics related to mbuf types (up to MT_MAX-1) are updated
 * atomically and stored in a per-CPU structure which is lock-free; this is
 * done in order to avoid writing to the global mbstat data structure which
 * would cause false sharing.  During a sysctl request for kern.ipc.mbstat,
 * the statistics across all CPUs are converged into the mbstat.m_mtypes
 * array and returned to the application.  Any updates for types greater
Any updates for types greater or 915 * equal than MT_MAX would be done atomically to the mbstat; this slows down 916 * performance but is okay since the kernel uses only up to MT_MAX-1 while 917 * anything beyond that (up to type 255) is considered a corner case. 918 */ 919typedef struct { 920 unsigned int cpu_mtypes[MT_MAX]; 921} __attribute__((aligned(MAX_CPU_CACHE_LINE_SIZE), packed)) mtypes_cpu_t; 922 923typedef struct { 924 mtypes_cpu_t mbs_cpu[1]; 925} mbuf_mtypes_t; 926 927static mbuf_mtypes_t *mbuf_mtypes; /* per-CPU statistics */ 928 929#define MBUF_MTYPES_SIZE(n) \ 930 ((size_t)(&((mbuf_mtypes_t *)0)->mbs_cpu[n])) 931 932#define MTYPES_CPU(p) \ 933 ((mtypes_cpu_t *)(void *)((char *)(p) + MBUF_MTYPES_SIZE(cpu_number()))) 934 935#define mtype_stat_add(type, n) { \ 936 if ((unsigned)(type) < MT_MAX) { \ 937 mtypes_cpu_t *mbs = MTYPES_CPU(mbuf_mtypes); \ 938 atomic_add_32(&mbs->cpu_mtypes[type], n); \ 939 } else if ((unsigned)(type) < (unsigned)MBSTAT_MTYPES_MAX) { \ 940 atomic_add_16((int16_t *)&mbstat.m_mtypes[type], n); \ 941 } \ 942} 943 944#define mtype_stat_sub(t, n) mtype_stat_add(t, -(n)) 945#define mtype_stat_inc(t) mtype_stat_add(t, 1) 946#define mtype_stat_dec(t) mtype_stat_sub(t, 1) 947 948static void 949mbuf_mtypes_sync(boolean_t locked) 950{ 951 int m, n; 952 mtypes_cpu_t mtc; 953 954 if (locked) 955 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); 956 957 bzero(&mtc, sizeof (mtc)); 958 for (m = 0; m < ncpu; m++) { 959 mtypes_cpu_t *scp = &mbuf_mtypes->mbs_cpu[m]; 960 mtypes_cpu_t temp; 961 962 bcopy(&scp->cpu_mtypes, &temp.cpu_mtypes, 963 sizeof (temp.cpu_mtypes)); 964 965 for (n = 0; n < MT_MAX; n++) 966 mtc.cpu_mtypes[n] += temp.cpu_mtypes[n]; 967 } 968 if (!locked) 969 lck_mtx_lock(mbuf_mlock); 970 for (n = 0; n < MT_MAX; n++) 971 mbstat.m_mtypes[n] = mtc.cpu_mtypes[n]; 972 if (!locked) 973 lck_mtx_unlock(mbuf_mlock); 974} 975 976static int 977mbstat_sysctl SYSCTL_HANDLER_ARGS 978{ 979#pragma unused(oidp, arg1, arg2) 980 mbuf_mtypes_sync(FALSE); 981 982 return (SYSCTL_OUT(req, &mbstat, sizeof (mbstat))); 983} 984 985static void 986mbuf_stat_sync(void) 987{ 988 mb_class_stat_t *sp; 989 mcache_cpu_t *ccp; 990 mcache_t *cp; 991 int k, m, bktsize; 992 993 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); 994 995 for (k = 0; k < NELEM(mbuf_table); k++) { 996 cp = m_cache(k); 997 ccp = &cp->mc_cpu[0]; 998 bktsize = ccp->cc_bktsize; 999 sp = mbuf_table[k].mtbl_stats; 1000 1001 if (cp->mc_flags & MCF_NOCPUCACHE) 1002 sp->mbcl_mc_state = MCS_DISABLED; 1003 else if (cp->mc_purge_cnt > 0) 1004 sp->mbcl_mc_state = MCS_PURGING; 1005 else if (bktsize == 0) 1006 sp->mbcl_mc_state = MCS_OFFLINE; 1007 else 1008 sp->mbcl_mc_state = MCS_ONLINE; 1009 1010 sp->mbcl_mc_cached = 0; 1011 for (m = 0; m < ncpu; m++) { 1012 ccp = &cp->mc_cpu[m]; 1013 if (ccp->cc_objs > 0) 1014 sp->mbcl_mc_cached += ccp->cc_objs; 1015 if (ccp->cc_pobjs > 0) 1016 sp->mbcl_mc_cached += ccp->cc_pobjs; 1017 } 1018 sp->mbcl_mc_cached += (cp->mc_full.bl_total * bktsize); 1019 sp->mbcl_active = sp->mbcl_total - sp->mbcl_mc_cached - 1020 sp->mbcl_infree; 1021 1022 sp->mbcl_mc_waiter_cnt = cp->mc_waiter_cnt; 1023 sp->mbcl_mc_wretry_cnt = cp->mc_wretry_cnt; 1024 sp->mbcl_mc_nwretry_cnt = cp->mc_nwretry_cnt; 1025 1026 /* Calculate total count specific to each class */ 1027 sp->mbcl_ctotal = sp->mbcl_total; 1028 switch (m_class(k)) { 1029 case MC_MBUF: 1030 /* Deduct mbufs used in composite caches */ 1031 sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) + 1032 m_total(MC_MBUF_BIGCL)); 1033 break; 1034 1035 case MC_CL: 1036 /* 
			sp->mbcl_ctotal -= m_total(MC_MBUF_CL);
			break;

		case MC_BIGCL:
			/* Deduct clusters used in composite cache */
			sp->mbcl_ctotal -= m_total(MC_MBUF_BIGCL);
			break;

		case MC_16KCL:
			/* Deduct clusters used in composite cache */
			sp->mbcl_ctotal -= m_total(MC_MBUF_16KCL);
			break;

		default:
			break;
		}
	}
}

static int
mb_stat_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	void *statp;
	int k, statsz, proc64 = proc_is64bit(req->p);

	lck_mtx_lock(mbuf_mlock);
	mbuf_stat_sync();

	if (!proc64) {
		struct omb_class_stat *oc;
		struct mb_class_stat *c;

		omb_stat->mbs_cnt = mb_stat->mbs_cnt;
		oc = &omb_stat->mbs_class[0];
		c = &mb_stat->mbs_class[0];
		for (k = 0; k < omb_stat->mbs_cnt; k++, oc++, c++) {
			(void) snprintf(oc->mbcl_cname, sizeof (oc->mbcl_cname),
			    "%s", c->mbcl_cname);
			oc->mbcl_size = c->mbcl_size;
			oc->mbcl_total = c->mbcl_total;
			oc->mbcl_active = c->mbcl_active;
			oc->mbcl_infree = c->mbcl_infree;
			oc->mbcl_slab_cnt = c->mbcl_slab_cnt;
			oc->mbcl_alloc_cnt = c->mbcl_alloc_cnt;
			oc->mbcl_free_cnt = c->mbcl_free_cnt;
			oc->mbcl_notified = c->mbcl_notified;
			oc->mbcl_purge_cnt = c->mbcl_purge_cnt;
			oc->mbcl_fail_cnt = c->mbcl_fail_cnt;
			oc->mbcl_ctotal = c->mbcl_ctotal;
			oc->mbcl_release_cnt = c->mbcl_release_cnt;
			oc->mbcl_mc_state = c->mbcl_mc_state;
			oc->mbcl_mc_cached = c->mbcl_mc_cached;
			oc->mbcl_mc_waiter_cnt = c->mbcl_mc_waiter_cnt;
			oc->mbcl_mc_wretry_cnt = c->mbcl_mc_wretry_cnt;
			oc->mbcl_mc_nwretry_cnt = c->mbcl_mc_nwretry_cnt;
		}
		statp = omb_stat;
		statsz = OMB_STAT_SIZE(NELEM(mbuf_table));
	} else {
		statp = mb_stat;
		statsz = MB_STAT_SIZE(NELEM(mbuf_table));
	}

	lck_mtx_unlock(mbuf_mlock);

	return (SYSCTL_OUT(req, statp, statsz));
}

static int
mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	int i;

	/* Ensure leak tracing turned on */
	if (!mclfindleak || !mclexpleak)
		return (ENXIO);

	lck_mtx_lock(mleak_lock);
	mleak_update_stats();
	i = SYSCTL_OUT(req, mleak_stat, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES));
	lck_mtx_unlock(mleak_lock);

	return (i);
}

static int
mleak_table_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	int i = 0;

	/* Ensure leak tracing turned on */
	if (!mclfindleak || !mclexpleak)
		return (ENXIO);

	lck_mtx_lock(mleak_lock);
	i = SYSCTL_OUT(req, &mleak_table, sizeof (mleak_table));
	lck_mtx_unlock(mleak_lock);

	return (i);
}

static inline void
m_incref(struct mbuf *m)
{
	UInt32 old, new;
	volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m);

	do {
		old = *addr;
		new = old + 1;
		ASSERT(new != 0);
	} while (!OSCompareAndSwap(old, new, addr));

	/*
	 * If cluster is shared, mark it with (sticky) EXTF_READONLY;
	 * we don't clear the flag when the refcount goes back to 1
	 * to simplify code calling m_mclhasreference().
	 */
	if (new > 1 && !(MEXT_FLAGS(m) & EXTF_READONLY))
		(void) OSBitOrAtomic(EXTF_READONLY, &MEXT_FLAGS(m));
}

static inline u_int32_t
m_decref(struct mbuf *m)
{
	UInt32 old, new;
	volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m);

	do {
		old = *addr;
		new = old - 1;
		ASSERT(old != 0);
	} while (!OSCompareAndSwap(old, new, addr));

	return (new);
}

static void
mbuf_table_init(void)
{
	unsigned int b, c, s;
	int m;

	MALLOC(omb_stat, struct omb_stat *, OMB_STAT_SIZE(NELEM(mbuf_table)),
	    M_TEMP, M_WAITOK | M_ZERO);
	VERIFY(omb_stat != NULL);

	MALLOC(mb_stat, mb_stat_t *, MB_STAT_SIZE(NELEM(mbuf_table)),
	    M_TEMP, M_WAITOK | M_ZERO);
	VERIFY(mb_stat != NULL);

	mb_stat->mbs_cnt = NELEM(mbuf_table);
	for (m = 0; m < NELEM(mbuf_table); m++)
		mbuf_table[m].mtbl_stats = &mb_stat->mbs_class[m];

#if CONFIG_MBUF_JUMBO
	/*
	 * Set aside 1/3 of the mbuf cluster map for jumbo clusters; we do
	 * this only on platforms where the jumbo cluster pool is enabled.
	 */
	njcl = nmbclusters / 3;
	njclbytes = M16KCLBYTES;
#endif /* CONFIG_MBUF_JUMBO */

	/*
	 * nclusters holds both the 2KB and 4KB pools, so ensure it's
	 * a multiple of 4KB clusters.
	 */
	nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPBG);
	if (njcl > 0) {
		/*
		 * Each jumbo cluster takes 8 2KB clusters, so make
		 * sure that the pool size is evenly divisible by 8;
		 * njcl is in 2KB unit, hence treated as such.
		 */
		njcl = P2ROUNDDOWN(nmbclusters - nclusters, 8);

		/* Update nclusters with rounded down value of njcl */
		nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPBG);
	}

	/*
	 * njcl is valid only on platforms with 16KB jumbo clusters, where
	 * it is configured to 1/3 of the pool size.  On these platforms,
	 * the remainder is used for 2KB and 4KB clusters.  On platforms
	 * without 16KB jumbo clusters, the entire pool is used for both
	 * 2KB and 4KB clusters.  A 4KB cluster can either be split into
	 * 16 mbufs, or into 2 2KB clusters.
	 *
	 *  +---+---+------------ ... -----------+------- ... -------+
	 *  | c | b |              s             |        njcl       |
	 *  +---+---+------------ ... -----------+------- ... -------+
	 *
	 * 1/32th of the shared region is reserved for pure 2KB and 4KB
	 * clusters (1/64th each.)
	 */
	c = P2ROUNDDOWN((nclusters >> 6), 2);		/* in 2KB unit */
	b = P2ROUNDDOWN((nclusters >> (6 + NCLPBGSHIFT)), 2); /* in 4KB unit */
	s = nclusters - (c + (b << NCLPBGSHIFT));	/* in 2KB unit */

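	/*
	 * Worked example (illustrative, assuming a 64 MB pool with
	 * nmbclusters == 32768, njcl == 0 and 2 clusters per 4KB big
	 * cluster): c == 512, b == 256 (i.e. 512 in 2KB units) and
	 * s == 31744.
	 */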
	/*
	 * 1/64th (c) is reserved for 2KB clusters.
	 */
	m_minlimit(MC_CL) = c;
	m_maxlimit(MC_CL) = s + c;			/* in 2KB unit */
	m_maxsize(MC_CL) = m_size(MC_CL) = MCLBYTES;
	(void) snprintf(m_cname(MC_CL), MAX_MBUF_CNAME, "cl");

	/*
	 * Another 1/64th (b) of the map is reserved for 4KB clusters.
	 * It cannot be turned into 2KB clusters or mbufs.
	 */
	m_minlimit(MC_BIGCL) = b;
	m_maxlimit(MC_BIGCL) = (s >> NCLPBGSHIFT) + b;	/* in 4KB unit */
	m_maxsize(MC_BIGCL) = m_size(MC_BIGCL) = MBIGCLBYTES;
	(void) snprintf(m_cname(MC_BIGCL), MAX_MBUF_CNAME, "bigcl");

	/*
	 * The remaining 31/32ths (s) are all-purpose (mbufs, 2KB, or 4KB)
	 */
	m_minlimit(MC_MBUF) = 0;
	m_maxlimit(MC_MBUF) = (s << NMBPCLSHIFT);	/* in mbuf unit */
	m_maxsize(MC_MBUF) = m_size(MC_MBUF) = MSIZE;
	(void) snprintf(m_cname(MC_MBUF), MAX_MBUF_CNAME, "mbuf");

	/*
	 * Set limits for the composite classes.
	 */
	m_minlimit(MC_MBUF_CL) = 0;
	m_maxlimit(MC_MBUF_CL) = m_maxlimit(MC_CL);
	m_maxsize(MC_MBUF_CL) = MCLBYTES;
	m_size(MC_MBUF_CL) = m_size(MC_MBUF) + m_size(MC_CL);
	(void) snprintf(m_cname(MC_MBUF_CL), MAX_MBUF_CNAME, "mbuf_cl");

	m_minlimit(MC_MBUF_BIGCL) = 0;
	m_maxlimit(MC_MBUF_BIGCL) = m_maxlimit(MC_BIGCL);
	m_maxsize(MC_MBUF_BIGCL) = MBIGCLBYTES;
	m_size(MC_MBUF_BIGCL) = m_size(MC_MBUF) + m_size(MC_BIGCL);
	(void) snprintf(m_cname(MC_MBUF_BIGCL), MAX_MBUF_CNAME, "mbuf_bigcl");

	/*
	 * And for jumbo classes.
	 */
	m_minlimit(MC_16KCL) = 0;
	m_maxlimit(MC_16KCL) = (njcl >> NCLPJCLSHIFT);	/* in 16KB unit */
	m_maxsize(MC_16KCL) = m_size(MC_16KCL) = M16KCLBYTES;
	(void) snprintf(m_cname(MC_16KCL), MAX_MBUF_CNAME, "16kcl");

	m_minlimit(MC_MBUF_16KCL) = 0;
	m_maxlimit(MC_MBUF_16KCL) = m_maxlimit(MC_16KCL);
	m_maxsize(MC_MBUF_16KCL) = M16KCLBYTES;
	m_size(MC_MBUF_16KCL) = m_size(MC_MBUF) + m_size(MC_16KCL);
	(void) snprintf(m_cname(MC_MBUF_16KCL), MAX_MBUF_CNAME, "mbuf_16kcl");

	/*
	 * Initialize the legacy mbstat structure.
	 */
	bzero(&mbstat, sizeof (mbstat));
	mbstat.m_msize = m_maxsize(MC_MBUF);
	mbstat.m_mclbytes = m_maxsize(MC_CL);
	mbstat.m_minclsize = MINCLSIZE;
	mbstat.m_mlen = MLEN;
	mbstat.m_mhlen = MHLEN;
	mbstat.m_bigmclbytes = m_maxsize(MC_BIGCL);
}

#if defined(__LP64__)
typedef struct ncl_tbl {
	uint64_t nt_maxmem;	/* memory (sane) size */
	uint32_t nt_mbpool;	/* mbuf pool size */
} ncl_tbl_t;

/* Non-server */
static ncl_tbl_t ncl_table[] = {
	{ (1ULL << GBSHIFT)       /*  1 GB */, (64 << MBSHIFT)  /*  64 MB */ },
	{ (1ULL << (GBSHIFT + 3)) /*  8 GB */, (96 << MBSHIFT)  /*  96 MB */ },
	{ (1ULL << (GBSHIFT + 4)) /* 16 GB */, (128 << MBSHIFT) /* 128 MB */ },
	{ 0, 0 }
};

/* Server */
static ncl_tbl_t ncl_table_srv[] = {
	{ (1ULL << GBSHIFT)       /*  1 GB */, (96 << MBSHIFT)  /*  96 MB */ },
	{ (1ULL << (GBSHIFT + 2)) /*  4 GB */, (128 << MBSHIFT) /* 128 MB */ },
	{ (1ULL << (GBSHIFT + 3)) /*  8 GB */, (160 << MBSHIFT) /* 160 MB */ },
	{ (1ULL << (GBSHIFT + 4)) /* 16 GB */, (192 << MBSHIFT) /* 192 MB */ },
	{ (1ULL << (GBSHIFT + 5)) /* 32 GB */, (256 << MBSHIFT) /* 256 MB */ },
	{ (1ULL << (GBSHIFT + 6)) /* 64 GB */, (384 << MBSHIFT) /* 384 MB */ },
	{ 0, 0 }
};
#endif /* __LP64__ */
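
/*
 * For example (illustrative): a non-server configuration with 12 GB of
 * memory is below the 16 GB row but not below the 8 GB row, so
 * mbuf_default_ncl() below settles on the 96 MB pool, i.e.
 * (96 << MBSHIFT) >> MCLSHIFT == 49152 clusters.
 */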

__private_extern__ unsigned int
mbuf_default_ncl(int server, uint64_t mem)
{
#if !defined(__LP64__)
#pragma unused(server)
	unsigned int n;
	/*
	 * 32-bit kernel (default to 64MB of mbuf pool for >= 1GB RAM).
	 */
	if ((n = ((mem / 16) / MCLBYTES)) > 32768)
		n = 32768;
#else
	unsigned int n, i;
	ncl_tbl_t *tbl = (server ? ncl_table_srv : ncl_table);
	/*
	 * 64-bit kernel (mbuf pool size based on table).
	 */
	n = tbl[0].nt_mbpool;
	for (i = 0; tbl[i].nt_mbpool != 0; i++) {
		if (mem < tbl[i].nt_maxmem)
			break;
		n = tbl[i].nt_mbpool;
	}
	n >>= MCLSHIFT;
#endif /* !__LP64__ */
	return (n);
}

__private_extern__ void
mbinit(void)
{
	unsigned int m;
	unsigned int initmcl = 0;
	void *buf;
	thread_t thread = THREAD_NULL;

	microuptime(&mb_start);

	/*
	 * These MBUF_ values must be equal to their private counterparts.
	 */
	_CASSERT(MBUF_EXT == M_EXT);
	_CASSERT(MBUF_PKTHDR == M_PKTHDR);
	_CASSERT(MBUF_EOR == M_EOR);
	_CASSERT(MBUF_LOOP == M_LOOP);
	_CASSERT(MBUF_BCAST == M_BCAST);
	_CASSERT(MBUF_MCAST == M_MCAST);
	_CASSERT(MBUF_FRAG == M_FRAG);
	_CASSERT(MBUF_FIRSTFRAG == M_FIRSTFRAG);
	_CASSERT(MBUF_LASTFRAG == M_LASTFRAG);
	_CASSERT(MBUF_PROMISC == M_PROMISC);
	_CASSERT(MBUF_HASFCS == M_HASFCS);

	_CASSERT(MBUF_TYPE_FREE == MT_FREE);
	_CASSERT(MBUF_TYPE_DATA == MT_DATA);
	_CASSERT(MBUF_TYPE_HEADER == MT_HEADER);
	_CASSERT(MBUF_TYPE_SOCKET == MT_SOCKET);
	_CASSERT(MBUF_TYPE_PCB == MT_PCB);
	_CASSERT(MBUF_TYPE_RTABLE == MT_RTABLE);
	_CASSERT(MBUF_TYPE_HTABLE == MT_HTABLE);
	_CASSERT(MBUF_TYPE_ATABLE == MT_ATABLE);
	_CASSERT(MBUF_TYPE_SONAME == MT_SONAME);
	_CASSERT(MBUF_TYPE_SOOPTS == MT_SOOPTS);
	_CASSERT(MBUF_TYPE_FTABLE == MT_FTABLE);
	_CASSERT(MBUF_TYPE_RIGHTS == MT_RIGHTS);
	_CASSERT(MBUF_TYPE_IFADDR == MT_IFADDR);
	_CASSERT(MBUF_TYPE_CONTROL == MT_CONTROL);
	_CASSERT(MBUF_TYPE_OOBDATA == MT_OOBDATA);

	_CASSERT(MBUF_TSO_IPV4 == CSUM_TSO_IPV4);
	_CASSERT(MBUF_TSO_IPV6 == CSUM_TSO_IPV6);
	_CASSERT(MBUF_CSUM_REQ_SUM16 == CSUM_PARTIAL);
	_CASSERT(MBUF_CSUM_TCP_SUM16 == MBUF_CSUM_REQ_SUM16);
	_CASSERT(MBUF_CSUM_REQ_IP == CSUM_IP);
	_CASSERT(MBUF_CSUM_REQ_TCP == CSUM_TCP);
	_CASSERT(MBUF_CSUM_REQ_UDP == CSUM_UDP);
	_CASSERT(MBUF_CSUM_REQ_TCPIPV6 == CSUM_TCPIPV6);
	_CASSERT(MBUF_CSUM_REQ_UDPIPV6 == CSUM_UDPIPV6);
	_CASSERT(MBUF_CSUM_DID_IP == CSUM_IP_CHECKED);
	_CASSERT(MBUF_CSUM_IP_GOOD == CSUM_IP_VALID);
	_CASSERT(MBUF_CSUM_DID_DATA == CSUM_DATA_VALID);
	_CASSERT(MBUF_CSUM_PSEUDO_HDR == CSUM_PSEUDO_HDR);

	_CASSERT(MBUF_WAITOK == M_WAIT);
	_CASSERT(MBUF_DONTWAIT == M_DONTWAIT);
	_CASSERT(MBUF_COPYALL == M_COPYALL);

	_CASSERT(MBUF_SC2TC(MBUF_SC_BK_SYS) == MBUF_TC_BK);
	_CASSERT(MBUF_SC2TC(MBUF_SC_BK) == MBUF_TC_BK);
	_CASSERT(MBUF_SC2TC(MBUF_SC_BE) == MBUF_TC_BE);
	_CASSERT(MBUF_SC2TC(MBUF_SC_RD) == MBUF_TC_BE);
	_CASSERT(MBUF_SC2TC(MBUF_SC_OAM) == MBUF_TC_BE);
	_CASSERT(MBUF_SC2TC(MBUF_SC_AV) == MBUF_TC_VI);
	_CASSERT(MBUF_SC2TC(MBUF_SC_RV) == MBUF_TC_VI);
	_CASSERT(MBUF_SC2TC(MBUF_SC_VI) == MBUF_TC_VI);
	_CASSERT(MBUF_SC2TC(MBUF_SC_VO) == MBUF_TC_VO);
	_CASSERT(MBUF_SC2TC(MBUF_SC_CTL) == MBUF_TC_VO);

	_CASSERT(MBUF_TC2SCVAL(MBUF_TC_BK) == SCVAL_BK);
	_CASSERT(MBUF_TC2SCVAL(MBUF_TC_BE) == SCVAL_BE);
	_CASSERT(MBUF_TC2SCVAL(MBUF_TC_VI) == SCVAL_VI);
	_CASSERT(MBUF_TC2SCVAL(MBUF_TC_VO) == SCVAL_VO);

	/* Module specific scratch space (32-bit alignment requirement) */
	_CASSERT(!(offsetof(struct mbuf, m_pkthdr.pkt_mpriv) %
	    sizeof (uint32_t)));

	/* Initialize random red zone cookie value */
	_CASSERT(sizeof (mb_redzone_cookie) ==
	    sizeof (((struct pkthdr *)0)->redzone));
	read_random(&mb_redzone_cookie, sizeof (mb_redzone_cookie));

	/* Make sure we don't save more than we should */
	_CASSERT(MCA_SAVED_MBUF_SIZE <= sizeof (struct mbuf));

	if (nmbclusters == 0)
		nmbclusters = NMBCLUSTERS;

	/* This should be a sane (at least even) value by now */
	VERIFY(nmbclusters != 0 && !(nmbclusters & 0x1));

	/* Setup the mbuf table */
	mbuf_table_init();

	/* Global lock for common layer */
	mbuf_mlock_grp_attr = lck_grp_attr_alloc_init();
	mbuf_mlock_grp = lck_grp_alloc_init("mbuf", mbuf_mlock_grp_attr);
	mbuf_mlock_attr = lck_attr_alloc_init();
	lck_mtx_init(mbuf_mlock, mbuf_mlock_grp, mbuf_mlock_attr);

	/*
	 * Allocate cluster slabs table:
	 *
	 *	maxslabgrp = (N * 2048) / (1024 * 1024)
	 *
	 * Where N is nmbclusters rounded up to the nearest 512.  This yields
	 * mcl_slabg_t units, each one representing 1 MB of memory.
	 */
	maxslabgrp =
	    (P2ROUNDUP(nmbclusters, (MBSIZE >> 11)) << MCLSHIFT) >> MBSHIFT;
	MALLOC(slabstbl, mcl_slabg_t **, maxslabgrp * sizeof (mcl_slabg_t *),
	    M_TEMP, M_WAITOK | M_ZERO);
	VERIFY(slabstbl != NULL);

	/*
	 * Allocate audit structures, if needed:
	 *
	 *	maxclaudit = (maxslabgrp * 1024 * 1024) / 4096
	 *
	 * This yields mcl_audit_t units, each one representing a page.
	 */
	PE_parse_boot_argn("mbuf_debug", &mbuf_debug, sizeof (mbuf_debug));
	mbuf_debug |= mcache_getflags();
	if (mbuf_debug & MCF_DEBUG) {
		maxclaudit = ((maxslabgrp << MBSHIFT) >> PGSHIFT);
		MALLOC(mclaudit, mcl_audit_t *, maxclaudit * sizeof (*mclaudit),
		    M_TEMP, M_WAITOK | M_ZERO);
		VERIFY(mclaudit != NULL);

		mcl_audit_con_cache = mcache_create("mcl_audit_contents",
		    AUDIT_CONTENTS_SIZE, sizeof (u_int64_t), 0, MCR_SLEEP);
		VERIFY(mcl_audit_con_cache != NULL);
	}
	mclverify = (mbuf_debug & MCF_VERIFY);
	mcltrace = (mbuf_debug & MCF_TRACE);
	mclfindleak = !(mbuf_debug & MCF_NOLEAKLOG);
	mclexpleak = mclfindleak && (mbuf_debug & MCF_EXPLEAKLOG);

	/* Enable mbuf leak logging, with a lock to protect the tables */

	mleak_lock_grp_attr = lck_grp_attr_alloc_init();
	mleak_lock_grp = lck_grp_alloc_init("mleak_lock", mleak_lock_grp_attr);
	mleak_lock_attr = lck_attr_alloc_init();
	lck_mtx_init(mleak_lock, mleak_lock_grp, mleak_lock_attr);

	mleak_activate();

	/* Calculate the number of pages assigned to the cluster pool */
	mcl_pages = (nmbclusters * MCLBYTES) / CLBYTES;
	MALLOC(mcl_paddr, ppnum_t *, mcl_pages * sizeof (ppnum_t),
	    M_TEMP, M_WAITOK);
	VERIFY(mcl_paddr != NULL);

	/* Register with the I/O Bus mapper */
	mcl_paddr_base = IOMapperIOVMAlloc(mcl_pages);
	bzero((char *)mcl_paddr, mcl_pages * sizeof (ppnum_t));

	embutl = (union mbigcluster *)
	    ((void *)((unsigned char *)mbutl + (nmbclusters * MCLBYTES)));
	VERIFY((((char *)embutl - (char *)mbutl) % MBIGCLBYTES) == 0);

	/* Prime up the freelist */
	PE_parse_boot_argn("initmcl", &initmcl, sizeof (initmcl));
	if (initmcl != 0) {
		initmcl >>= NCLPBGSHIFT;	/* become a 4K unit */
		if (initmcl > m_maxlimit(MC_BIGCL))
			initmcl = m_maxlimit(MC_BIGCL);
	}
	if (initmcl < m_minlimit(MC_BIGCL))
		initmcl = m_minlimit(MC_BIGCL);

	lck_mtx_lock(mbuf_mlock);

	/*
	 * For classes with non-zero minimum limits, populate their freelists
	 * so that m_total(class) is at least m_minlimit(class).
1538 */ 1539 VERIFY(m_total(MC_BIGCL) == 0 && m_minlimit(MC_BIGCL) != 0); 1540 freelist_populate(m_class(MC_BIGCL), initmcl, M_WAIT); 1541 VERIFY(m_total(MC_BIGCL) >= m_minlimit(MC_BIGCL)); 1542 freelist_init(m_class(MC_CL)); 1543 1544 for (m = 0; m < NELEM(mbuf_table); m++) { 1545 /* Make sure we didn't miss any */ 1546 VERIFY(m_minlimit(m_class(m)) == 0 || 1547 m_total(m_class(m)) >= m_minlimit(m_class(m))); 1548 1549 /* populate the initial sizes and report from there on */ 1550 m_peak(m_class(m)) = m_total(m_class(m)); 1551 } 1552 mb_peak_newreport = FALSE; 1553 1554 lck_mtx_unlock(mbuf_mlock); 1555 1556 (void) kernel_thread_start((thread_continue_t)mbuf_worker_thread_init, 1557 NULL, &thread); 1558 thread_deallocate(thread); 1559 1560 ref_cache = mcache_create("mext_ref", sizeof (struct ext_ref), 1561 0, 0, MCR_SLEEP); 1562 1563 /* Create the cache for each class */ 1564 for (m = 0; m < NELEM(mbuf_table); m++) { 1565 void *allocfunc, *freefunc, *auditfunc, *logfunc; 1566 u_int32_t flags; 1567 1568 flags = mbuf_debug; 1569 if (m_class(m) == MC_MBUF_CL || m_class(m) == MC_MBUF_BIGCL || 1570 m_class(m) == MC_MBUF_16KCL) { 1571 allocfunc = mbuf_cslab_alloc; 1572 freefunc = mbuf_cslab_free; 1573 auditfunc = mbuf_cslab_audit; 1574 logfunc = mleak_logger; 1575 } else { 1576 allocfunc = mbuf_slab_alloc; 1577 freefunc = mbuf_slab_free; 1578 auditfunc = mbuf_slab_audit; 1579 logfunc = mleak_logger; 1580 } 1581 1582 /* 1583 * Disable per-CPU caches for jumbo classes if there 1584 * is no jumbo cluster pool available in the system. 1585 * The cache itself is still created (but will never 1586 * be populated) since it simplifies the code. 1587 */ 1588 if ((m_class(m) == MC_MBUF_16KCL || m_class(m) == MC_16KCL) && 1589 njcl == 0) 1590 flags |= MCF_NOCPUCACHE; 1591 1592 if (!mclfindleak) 1593 flags |= MCF_NOLEAKLOG; 1594 1595 m_cache(m) = mcache_create_ext(m_cname(m), m_maxsize(m), 1596 allocfunc, freefunc, auditfunc, logfunc, mbuf_slab_notify, 1597 (void *)(uintptr_t)m, flags, MCR_SLEEP); 1598 } 1599 1600 /* 1601 * Allocate structure for per-CPU statistics that's aligned 1602 * on the CPU cache boundary; this code assumes that we never 1603 * uninitialize this framework, since the original address 1604 * before alignment is not saved. 1605 */ 1606 ncpu = ml_get_max_cpus(); 1607 MALLOC(buf, void *, MBUF_MTYPES_SIZE(ncpu) + CPU_CACHE_LINE_SIZE, 1608 M_TEMP, M_WAITOK); 1609 VERIFY(buf != NULL); 1610 1611 mbuf_mtypes = (mbuf_mtypes_t *)P2ROUNDUP((intptr_t)buf, 1612 CPU_CACHE_LINE_SIZE); 1613 bzero(mbuf_mtypes, MBUF_MTYPES_SIZE(ncpu)); 1614 1615 /* 1616 * Set the max limit on sb_max to be 1/16 th of the size of 1617 * memory allocated for mbuf clusters. 
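	 *
	 * Illustrative arithmetic (example values, not defaults): with
	 * nmbclusters = 32768 and 2KB clusters (MCLSHIFT == 11), the
	 * cluster pool is 32768 << 11 = 64MB and high_sb_max becomes
	 * 32768 << (11 - 4) = 4MB, i.e. 1/16th of the pool.  The
	 * adjustment below only kicks in when the existing sb_max
	 * exceeds high_sb_max.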
1618 */ 1619 high_sb_max = (nmbclusters << (MCLSHIFT - 4)); 1620 if (high_sb_max < sb_max) { 1621 /* sb_max is too large for this configuration, scale it down */ 1622 if (high_sb_max > (1 << MBSHIFT)) { 1623 /* We have atleast 16 M of mbuf pool */ 1624 sb_max = high_sb_max; 1625 } else if ((nmbclusters << MCLSHIFT) > (1 << MBSHIFT)) { 1626 /* 1627 * If we have more than 1M of mbufpool, cap the size of 1628 * max sock buf at 1M 1629 */ 1630 sb_max = high_sb_max = (1 << MBSHIFT); 1631 } else { 1632 sb_max = high_sb_max; 1633 } 1634 } 1635 1636 /* allocate space for mbuf_dump_buf */ 1637 MALLOC(mbuf_dump_buf, char *, MBUF_DUMP_BUF_SIZE, M_TEMP, M_WAITOK); 1638 VERIFY(mbuf_dump_buf != NULL); 1639 1640 if (mbuf_debug & MCF_DEBUG) { 1641 printf("%s: MLEN %d, MHLEN %d\n", __func__, 1642 (int)_MLEN, (int)_MHLEN); 1643 } 1644 1645 printf("%s: done [%d MB total pool size, (%d/%d) split]\n", __func__, 1646 (nmbclusters << MCLSHIFT) >> MBSHIFT, 1647 (nclusters << MCLSHIFT) >> MBSHIFT, 1648 (njcl << MCLSHIFT) >> MBSHIFT); 1649} 1650 1651/* 1652 * Obtain a slab of object(s) from the class's freelist. 1653 */ 1654static mcache_obj_t * 1655slab_alloc(mbuf_class_t class, int wait) 1656{ 1657 mcl_slab_t *sp; 1658 mcache_obj_t *buf; 1659 1660 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); 1661 1662 VERIFY(class != MC_16KCL || njcl > 0); 1663 1664 /* This should always be NULL for us */ 1665 VERIFY(m_cobjlist(class) == NULL); 1666 1667 /* 1668 * Treat composite objects as having longer lifespan by using 1669 * a slab from the reverse direction, in hoping that this could 1670 * reduce the probability of fragmentation for slabs that hold 1671 * more than one buffer chunks (e.g. mbuf slabs). For other 1672 * slabs, this probably doesn't make much of a difference. 1673 */ 1674 if ((class == MC_MBUF || class == MC_CL) && (wait & MCR_COMP)) 1675 sp = (mcl_slab_t *)TAILQ_LAST(&m_slablist(class), mcl_slhead); 1676 else 1677 sp = (mcl_slab_t *)TAILQ_FIRST(&m_slablist(class)); 1678 1679 if (sp == NULL) { 1680 VERIFY(m_infree(class) == 0 && m_slab_cnt(class) == 0); 1681 /* The slab list for this class is empty */ 1682 return (NULL); 1683 } 1684 1685 VERIFY(m_infree(class) > 0); 1686 VERIFY(!slab_is_detached(sp)); 1687 VERIFY(sp->sl_class == class && 1688 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED); 1689 buf = sp->sl_head; 1690 VERIFY(slab_inrange(sp, buf) && sp == slab_get(buf)); 1691 1692 if (class == MC_MBUF) { 1693 sp->sl_head = buf->obj_next; 1694 VERIFY(sp->sl_head != NULL || sp->sl_refcnt == (NMBPBG - 1)); 1695 } else if (class == MC_CL) { 1696 sp->sl_head = buf->obj_next; 1697 VERIFY(sp->sl_head != NULL || sp->sl_refcnt == (NCLPBG - 1)); 1698 } else { 1699 sp->sl_head = NULL; 1700 } 1701 if (sp->sl_head != NULL && !slab_inrange(sp, sp->sl_head)) { 1702 slab_nextptr_panic(sp, sp->sl_head); 1703 /* In case sl_head is in the map but not in the slab */ 1704 VERIFY(slab_inrange(sp, sp->sl_head)); 1705 /* NOTREACHED */ 1706 } 1707 1708 /* Increment slab reference */ 1709 sp->sl_refcnt++; 1710 1711 if (mclaudit != NULL) { 1712 mcache_audit_t *mca = mcl_audit_buf2mca(class, buf); 1713 mca->mca_uflags = 0; 1714 /* Save contents on mbuf objects only */ 1715 if (class == MC_MBUF) 1716 mca->mca_uflags |= MB_SCVALID; 1717 } 1718 1719 if (class == MC_CL) { 1720 mbstat.m_clfree = (--m_infree(MC_CL)) + m_infree(MC_MBUF_CL); 1721 /* 1722 * A 2K cluster slab can have at most NCLPBG references. 
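		 * (NCLPBG is the number of 2KB clusters carved out of a
		 * single 4KB cluster slab; with 2KB clusters on a 4KB
		 * big cluster that works out to 2.)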
1723 */ 1724 VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NCLPBG && 1725 sp->sl_chunks == NCLPBG && 1726 sp->sl_len == m_maxsize(MC_BIGCL)); 1727 VERIFY(sp->sl_refcnt < NCLPBG || sp->sl_head == NULL); 1728 } else if (class == MC_BIGCL) { 1729 mbstat.m_bigclfree = (--m_infree(MC_BIGCL)) + 1730 m_infree(MC_MBUF_BIGCL); 1731 /* 1732 * A 4K cluster slab can have at most 1 reference. 1733 */ 1734 VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 && 1735 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL); 1736 } else if (class == MC_16KCL) { 1737 mcl_slab_t *nsp; 1738 int k; 1739 1740 --m_infree(MC_16KCL); 1741 VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 && 1742 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL); 1743 /* 1744 * Increment 2nd-Nth slab reference, where N is NSLABSP16KB. 1745 * A 16KB big cluster takes NSLABSP16KB slabs, each having at 1746 * most 1 reference. 1747 */ 1748 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) { 1749 nsp = nsp->sl_next; 1750 /* Next slab must already be present */ 1751 VERIFY(nsp != NULL); 1752 nsp->sl_refcnt++; 1753 VERIFY(!slab_is_detached(nsp)); 1754 VERIFY(nsp->sl_class == MC_16KCL && 1755 nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) && 1756 nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 && 1757 nsp->sl_len == 0 && nsp->sl_base == sp->sl_base && 1758 nsp->sl_head == NULL); 1759 } 1760 } else { 1761 VERIFY(class == MC_MBUF); 1762 --m_infree(MC_MBUF); 1763 /* 1764 * If auditing is turned on, this check is 1765 * deferred until later in mbuf_slab_audit(). 1766 */ 1767 if (mclaudit == NULL) 1768 _MCHECK((struct mbuf *)buf); 1769 /* 1770 * Since we have incremented the reference count above, 1771 * an mbuf slab (formerly a 4KB cluster slab that was cut 1772 * up into mbufs) must have a reference count between 1 1773 * and NMBPBG at this point. 1774 */ 1775 VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NMBPBG && 1776 sp->sl_chunks == NMBPBG && 1777 sp->sl_len == m_maxsize(MC_BIGCL)); 1778 VERIFY(sp->sl_refcnt < NMBPBG || sp->sl_head == NULL); 1779 } 1780 1781 /* If empty, remove this slab from the class's freelist */ 1782 if (sp->sl_head == NULL) { 1783 VERIFY(class != MC_MBUF || sp->sl_refcnt == NMBPBG); 1784 VERIFY(class != MC_CL || sp->sl_refcnt == NCLPBG); 1785 slab_remove(sp, class); 1786 } 1787 1788 return (buf); 1789} 1790 1791/* 1792 * Place a slab of object(s) back into a class's slab list. 1793 */ 1794static void 1795slab_free(mbuf_class_t class, mcache_obj_t *buf) 1796{ 1797 mcl_slab_t *sp; 1798 1799 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); 1800 1801 VERIFY(class != MC_16KCL || njcl > 0); 1802 VERIFY(buf->obj_next == NULL); 1803 sp = slab_get(buf); 1804 VERIFY(sp->sl_class == class && slab_inrange(sp, buf) && 1805 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED); 1806 1807 /* Decrement slab reference */ 1808 sp->sl_refcnt--; 1809 1810 if (class == MC_CL) { 1811 VERIFY(IS_P2ALIGNED(buf, MCLBYTES)); 1812 /* 1813 * A slab that has been splitted for 2KB clusters can have 1814 * at most 1 outstanding reference at this point. 1815 */ 1816 VERIFY(sp->sl_refcnt >= 0 && sp->sl_refcnt <= (NCLPBG - 1) && 1817 sp->sl_chunks == NCLPBG && 1818 sp->sl_len == m_maxsize(MC_BIGCL)); 1819 VERIFY(sp->sl_refcnt < (NCLPBG - 1) || 1820 (slab_is_detached(sp) && sp->sl_head == NULL)); 1821 } else if (class == MC_BIGCL) { 1822 VERIFY(IS_P2ALIGNED(buf, MCLBYTES)); 1823 /* 1824 * A 4KB cluster slab can have at most 1 reference 1825 * which must be 0 at this point. 
		 */
		VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
		    sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
		VERIFY(slab_is_detached(sp));
	} else if (class == MC_16KCL) {
		mcl_slab_t *nsp;
		int k;
		/*
		 * A 16KB cluster takes NSLABSP16KB slabs, all of which
		 * must now have 0 references.
		 */
		VERIFY(IS_P2ALIGNED(buf, MBIGCLBYTES));
		VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
		    sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
		VERIFY(slab_is_detached(sp));
		for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
			nsp = nsp->sl_next;
			/* Next slab must already be present */
			VERIFY(nsp != NULL);
			nsp->sl_refcnt--;
			VERIFY(slab_is_detached(nsp));
			VERIFY(nsp->sl_class == MC_16KCL &&
			    (nsp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) &&
			    nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 &&
			    nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
			    nsp->sl_head == NULL);
		}
	} else {
		/*
		 * A slab that has been split up for mbufs has at most NMBPBG
		 * reference counts.  Since we have decremented one reference
		 * above, it must now be between 0 and NMBPBG-1.
		 */
		VERIFY(class == MC_MBUF);
		VERIFY(sp->sl_refcnt >= 0 && sp->sl_refcnt <= (NMBPBG - 1) &&
		    sp->sl_chunks == NMBPBG &&
		    sp->sl_len == m_maxsize(MC_BIGCL));
		VERIFY(sp->sl_refcnt < (NMBPBG - 1) ||
		    (slab_is_detached(sp) && sp->sl_head == NULL));
	}

	/*
	 * When auditing is enabled, ensure that the buffer still
	 * contains the free pattern.  Otherwise it got corrupted
	 * while at the CPU cache layer.
	 */
	if (mclaudit != NULL) {
		mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
		if (mclverify) {
			mcache_audit_free_verify(mca, buf, 0, m_maxsize(class));
		}
		mca->mca_uflags &= ~MB_SCVALID;
	}

	if (class == MC_CL) {
		mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
		buf->obj_next = sp->sl_head;
	} else if (class == MC_BIGCL) {
		mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
		    m_infree(MC_MBUF_BIGCL);
	} else if (class == MC_16KCL) {
		++m_infree(MC_16KCL);
	} else {
		++m_infree(MC_MBUF);
		buf->obj_next = sp->sl_head;
	}
	sp->sl_head = buf;

	/*
	 * If a slab has been split into either one which holds 2KB clusters,
	 * or one which holds mbufs, turn it back into one which holds a 4KB
	 * cluster.
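	 *
	 * The conversion below is only attempted when this slab's
	 * reference count has dropped to zero, the donating class is
	 * still above its minimum limit, and MC_BIGCL has room under
	 * its maximum limit; otherwise the slab is simply reinserted
	 * into its own class's slab list.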
1898 */ 1899 if (class == MC_MBUF && sp->sl_refcnt == 0 && 1900 m_total(class) > m_minlimit(class) && 1901 m_total(MC_BIGCL) < m_maxlimit(MC_BIGCL)) { 1902 int i = NMBPBG; 1903 1904 m_total(MC_BIGCL)++; 1905 mbstat.m_bigclusters = m_total(MC_BIGCL); 1906 m_total(MC_MBUF) -= NMBPBG; 1907 mbstat.m_mbufs = m_total(MC_MBUF); 1908 m_infree(MC_MBUF) -= NMBPBG; 1909 mtype_stat_add(MT_FREE, -((unsigned)NMBPBG)); 1910 1911 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL)); 1912 VERIFY(m_total(MC_MBUF) >= m_minlimit(MC_MBUF)); 1913 1914 while (i--) { 1915 struct mbuf *m = sp->sl_head; 1916 VERIFY(m != NULL); 1917 sp->sl_head = m->m_next; 1918 m->m_next = NULL; 1919 } 1920 VERIFY(sp->sl_head == NULL); 1921 1922 /* Remove the slab from the mbuf class's slab list */ 1923 slab_remove(sp, class); 1924 1925 /* Reinitialize it as a 4KB cluster slab */ 1926 slab_init(sp, MC_BIGCL, sp->sl_flags, sp->sl_base, sp->sl_base, 1927 sp->sl_len, 0, 1); 1928 1929 if (mclverify) { 1930 mcache_set_pattern(MCACHE_FREE_PATTERN, 1931 (caddr_t)sp->sl_head, m_maxsize(MC_BIGCL)); 1932 } 1933 mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) + 1934 m_infree(MC_MBUF_BIGCL); 1935 1936 VERIFY(slab_is_detached(sp)); 1937 /* And finally switch class */ 1938 class = MC_BIGCL; 1939 } else if (class == MC_CL && sp->sl_refcnt == 0 && 1940 m_total(class) > m_minlimit(class) && 1941 m_total(MC_BIGCL) < m_maxlimit(MC_BIGCL)) { 1942 int i = NCLPBG; 1943 1944 m_total(MC_BIGCL)++; 1945 mbstat.m_bigclusters = m_total(MC_BIGCL); 1946 m_total(MC_CL) -= NCLPBG; 1947 mbstat.m_clusters = m_total(MC_CL); 1948 m_infree(MC_CL) -= NCLPBG; 1949 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL)); 1950 VERIFY(m_total(MC_CL) >= m_minlimit(MC_CL)); 1951 1952 while (i--) { 1953 union mcluster *c = sp->sl_head; 1954 VERIFY(c != NULL); 1955 sp->sl_head = c->mcl_next; 1956 c->mcl_next = NULL; 1957 } 1958 VERIFY(sp->sl_head == NULL); 1959 1960 /* Remove the slab from the 2KB cluster class's slab list */ 1961 slab_remove(sp, class); 1962 1963 /* Reinitialize it as a 4KB cluster slab */ 1964 slab_init(sp, MC_BIGCL, sp->sl_flags, sp->sl_base, sp->sl_base, 1965 sp->sl_len, 0, 1); 1966 1967 if (mclverify) { 1968 mcache_set_pattern(MCACHE_FREE_PATTERN, 1969 (caddr_t)sp->sl_head, m_maxsize(MC_BIGCL)); 1970 } 1971 mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) + 1972 m_infree(MC_MBUF_BIGCL); 1973 1974 VERIFY(slab_is_detached(sp)); 1975 /* And finally switch class */ 1976 class = MC_BIGCL; 1977 } 1978 1979 /* Reinsert the slab to the class's slab list */ 1980 if (slab_is_detached(sp)) 1981 slab_insert(sp, class); 1982} 1983 1984/* 1985 * Common allocator for rudimentary objects called by the CPU cache layer 1986 * during an allocation request whenever there is no available element in the 1987 * bucket layer. It returns one or more elements from the appropriate global 1988 * freelist. If the freelist is empty, it will attempt to populate it and 1989 * retry the allocation. 
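 *
 * Note that partial success is possible: the function returns the
 * number of elements actually obtained and linked onto the caller's
 * list, which may be fewer than "num" when a non-blocking caller
 * (MCR_NOSLEEP without MCR_TRYHARD) finds the freelist empty.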
1990 */ 1991static unsigned int 1992mbuf_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait) 1993{ 1994 mbuf_class_t class = (mbuf_class_t)arg; 1995 unsigned int need = num; 1996 mcache_obj_t **list = *plist; 1997 1998 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class)); 1999 ASSERT(need > 0); 2000 2001 lck_mtx_lock(mbuf_mlock); 2002 2003 for (;;) { 2004 if ((*list = slab_alloc(class, wait)) != NULL) { 2005 (*list)->obj_next = NULL; 2006 list = *plist = &(*list)->obj_next; 2007 2008 if (--need == 0) { 2009 /* 2010 * If the number of elements in freelist has 2011 * dropped below low watermark, asynchronously 2012 * populate the freelist now rather than doing 2013 * it later when we run out of elements. 2014 */ 2015 if (!mbuf_cached_above(class, wait) && 2016 m_infree(class) < m_total(class) >> 5) { 2017 (void) freelist_populate(class, 1, 2018 M_DONTWAIT); 2019 } 2020 break; 2021 } 2022 } else { 2023 VERIFY(m_infree(class) == 0 || class == MC_CL); 2024 2025 (void) freelist_populate(class, 1, 2026 (wait & MCR_NOSLEEP) ? M_DONTWAIT : M_WAIT); 2027 2028 if (m_infree(class) > 0) 2029 continue; 2030 2031 /* Check if there's anything at the cache layer */ 2032 if (mbuf_cached_above(class, wait)) 2033 break; 2034 2035 /* watchdog checkpoint */ 2036 mbuf_watchdog(); 2037 2038 /* We have nothing and cannot block; give up */ 2039 if (wait & MCR_NOSLEEP) { 2040 if (!(wait & MCR_TRYHARD)) { 2041 m_fail_cnt(class)++; 2042 mbstat.m_drops++; 2043 break; 2044 } 2045 } 2046 2047 /* 2048 * If the freelist is still empty and the caller is 2049 * willing to be blocked, sleep on the wait channel 2050 * until an element is available. Otherwise, if 2051 * MCR_TRYHARD is set, do our best to satisfy the 2052 * request without having to go to sleep. 2053 */ 2054 if (mbuf_worker_ready && 2055 mbuf_sleep(class, need, wait)) 2056 break; 2057 2058 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); 2059 } 2060 } 2061 2062 m_alloc_cnt(class) += num - need; 2063 lck_mtx_unlock(mbuf_mlock); 2064 2065 return (num - need); 2066} 2067 2068/* 2069 * Common de-allocator for rudimentary objects called by the CPU cache 2070 * layer when one or more elements need to be returned to the appropriate 2071 * global freelist. 2072 */ 2073static void 2074mbuf_slab_free(void *arg, mcache_obj_t *list, __unused int purged) 2075{ 2076 mbuf_class_t class = (mbuf_class_t)arg; 2077 mcache_obj_t *nlist; 2078 unsigned int num = 0; 2079 int w; 2080 2081 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class)); 2082 2083 lck_mtx_lock(mbuf_mlock); 2084 2085 for (;;) { 2086 nlist = list->obj_next; 2087 list->obj_next = NULL; 2088 slab_free(class, list); 2089 ++num; 2090 if ((list = nlist) == NULL) 2091 break; 2092 } 2093 m_free_cnt(class) += num; 2094 2095 if ((w = mb_waiters) > 0) 2096 mb_waiters = 0; 2097 2098 lck_mtx_unlock(mbuf_mlock); 2099 2100 if (w != 0) 2101 wakeup(mb_waitchan); 2102} 2103 2104/* 2105 * Common auditor for rudimentary objects called by the CPU cache layer 2106 * during an allocation or free request. For the former, this is called 2107 * after the objects are obtained from either the bucket or slab layer 2108 * and before they are returned to the caller. For the latter, this is 2109 * called immediately during free and before placing the objects into 2110 * the bucket or slab layer. 
2111 */ 2112static void 2113mbuf_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc) 2114{ 2115 mbuf_class_t class = (mbuf_class_t)arg; 2116 mcache_audit_t *mca; 2117 2118 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class)); 2119 2120 while (list != NULL) { 2121 lck_mtx_lock(mbuf_mlock); 2122 mca = mcl_audit_buf2mca(class, list); 2123 2124 /* Do the sanity checks */ 2125 if (class == MC_MBUF) { 2126 mcl_audit_mbuf(mca, list, FALSE, alloc); 2127 ASSERT(mca->mca_uflags & MB_SCVALID); 2128 } else { 2129 mcl_audit_cluster(mca, list, m_maxsize(class), 2130 alloc, TRUE); 2131 ASSERT(!(mca->mca_uflags & MB_SCVALID)); 2132 } 2133 /* Record this transaction */ 2134 if (mcltrace) 2135 mcache_buffer_log(mca, list, m_cache(class), &mb_start); 2136 2137 if (alloc) 2138 mca->mca_uflags |= MB_INUSE; 2139 else 2140 mca->mca_uflags &= ~MB_INUSE; 2141 /* Unpair the object (unconditionally) */ 2142 mca->mca_uptr = NULL; 2143 lck_mtx_unlock(mbuf_mlock); 2144 2145 list = list->obj_next; 2146 } 2147} 2148 2149/* 2150 * Common notify routine for all caches. It is called by mcache when 2151 * one or more objects get freed. We use this indication to trigger 2152 * the wakeup of any sleeping threads so that they can retry their 2153 * allocation requests. 2154 */ 2155static void 2156mbuf_slab_notify(void *arg, u_int32_t reason) 2157{ 2158 mbuf_class_t class = (mbuf_class_t)arg; 2159 int w; 2160 2161 ASSERT(MBUF_CLASS_VALID(class)); 2162 2163 if (reason != MCN_RETRYALLOC) 2164 return; 2165 2166 lck_mtx_lock(mbuf_mlock); 2167 if ((w = mb_waiters) > 0) { 2168 m_notified(class)++; 2169 mb_waiters = 0; 2170 } 2171 lck_mtx_unlock(mbuf_mlock); 2172 2173 if (w != 0) 2174 wakeup(mb_waitchan); 2175} 2176 2177/* 2178 * Obtain object(s) from the composite class's freelist. 2179 */ 2180static unsigned int 2181cslab_alloc(mbuf_class_t class, mcache_obj_t ***plist, unsigned int num) 2182{ 2183 unsigned int need = num; 2184 mcl_slab_t *sp, *clsp, *nsp; 2185 struct mbuf *m; 2186 mcache_obj_t **list = *plist; 2187 void *cl; 2188 2189 VERIFY(need > 0); 2190 VERIFY(class != MC_MBUF_16KCL || njcl > 0); 2191 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); 2192 2193 /* Get what we can from the freelist */ 2194 while ((*list = m_cobjlist(class)) != NULL) { 2195 MRANGE(*list); 2196 2197 m = (struct mbuf *)*list; 2198 sp = slab_get(m); 2199 cl = m->m_ext.ext_buf; 2200 clsp = slab_get(cl); 2201 VERIFY(m->m_flags == M_EXT && cl != NULL); 2202 VERIFY(MEXT_RFA(m) != NULL && MBUF_IS_COMPOSITE(m)); 2203 2204 if (class == MC_MBUF_CL) { 2205 VERIFY(clsp->sl_refcnt >= 1 && 2206 clsp->sl_refcnt <= NCLPBG); 2207 } else { 2208 VERIFY(clsp->sl_refcnt == 1); 2209 } 2210 2211 if (class == MC_MBUF_16KCL) { 2212 int k; 2213 for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) { 2214 nsp = nsp->sl_next; 2215 /* Next slab must already be present */ 2216 VERIFY(nsp != NULL); 2217 VERIFY(nsp->sl_refcnt == 1); 2218 } 2219 } 2220 2221 if ((m_cobjlist(class) = (*list)->obj_next) != NULL && 2222 !MBUF_IN_MAP(m_cobjlist(class))) { 2223 slab_nextptr_panic(sp, m_cobjlist(class)); 2224 /* NOTREACHED */ 2225 } 2226 (*list)->obj_next = NULL; 2227 list = *plist = &(*list)->obj_next; 2228 2229 if (--need == 0) 2230 break; 2231 } 2232 m_infree(class) -= (num - need); 2233 2234 return (num - need); 2235} 2236 2237/* 2238 * Place object(s) back into a composite class's freelist. 
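 *
 * When "purged" is set the composite objects are not kept intact:
 * each mbuf and its attached cluster are separated and returned to
 * their rudimentary classes via slab_free(), while the ext_ref
 * structures are collected on a local list and released back to
 * ref_cache in one mcache_free_ext() call at the end.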
2239 */ 2240static unsigned int 2241cslab_free(mbuf_class_t class, mcache_obj_t *list, int purged) 2242{ 2243 mcache_obj_t *o, *tail; 2244 unsigned int num = 0; 2245 struct mbuf *m, *ms; 2246 mcache_audit_t *mca = NULL; 2247 mcache_obj_t *ref_list = NULL; 2248 mcl_slab_t *clsp, *nsp; 2249 void *cl; 2250 mbuf_class_t cl_class; 2251 2252 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class)); 2253 VERIFY(class != MC_MBUF_16KCL || njcl > 0); 2254 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); 2255 2256 if (class == MC_MBUF_CL) { 2257 cl_class = MC_CL; 2258 } else if (class == MC_MBUF_BIGCL) { 2259 cl_class = MC_BIGCL; 2260 } else { 2261 VERIFY(class == MC_MBUF_16KCL); 2262 cl_class = MC_16KCL; 2263 } 2264 2265 o = tail = list; 2266 2267 while ((m = ms = (struct mbuf *)o) != NULL) { 2268 mcache_obj_t *rfa, *nexto = o->obj_next; 2269 2270 /* Do the mbuf sanity checks */ 2271 if (mclaudit != NULL) { 2272 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m); 2273 if (mclverify) { 2274 mcache_audit_free_verify(mca, m, 0, 2275 m_maxsize(MC_MBUF)); 2276 } 2277 ms = MCA_SAVED_MBUF_PTR(mca); 2278 } 2279 2280 /* Do the cluster sanity checks */ 2281 cl = ms->m_ext.ext_buf; 2282 clsp = slab_get(cl); 2283 if (mclverify) { 2284 size_t size = m_maxsize(cl_class); 2285 mcache_audit_free_verify(mcl_audit_buf2mca(cl_class, 2286 (mcache_obj_t *)cl), cl, 0, size); 2287 } 2288 VERIFY(ms->m_type == MT_FREE); 2289 VERIFY(ms->m_flags == M_EXT); 2290 VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms)); 2291 if (cl_class == MC_CL) { 2292 VERIFY(clsp->sl_refcnt >= 1 && 2293 clsp->sl_refcnt <= NCLPBG); 2294 } else { 2295 VERIFY(clsp->sl_refcnt == 1); 2296 } 2297 if (cl_class == MC_16KCL) { 2298 int k; 2299 for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) { 2300 nsp = nsp->sl_next; 2301 /* Next slab must already be present */ 2302 VERIFY(nsp != NULL); 2303 VERIFY(nsp->sl_refcnt == 1); 2304 } 2305 } 2306 2307 /* 2308 * If we're asked to purge, restore the actual mbuf using 2309 * contents of the shadow structure (if auditing is enabled) 2310 * and clear EXTF_COMPOSITE flag from the mbuf, as we are 2311 * about to free it and the attached cluster into their caches. 
2312 */ 2313 if (purged) { 2314 /* Restore constructed mbuf fields */ 2315 if (mclaudit != NULL) 2316 mcl_audit_restore_mbuf(m, mca, TRUE); 2317 2318 MEXT_REF(m) = 0; 2319 MEXT_FLAGS(m) = 0; 2320 2321 rfa = (mcache_obj_t *)(void *)MEXT_RFA(m); 2322 rfa->obj_next = ref_list; 2323 ref_list = rfa; 2324 MEXT_RFA(m) = NULL; 2325 2326 m->m_type = MT_FREE; 2327 m->m_flags = m->m_len = 0; 2328 m->m_next = m->m_nextpkt = NULL; 2329 2330 /* Save mbuf fields and make auditing happy */ 2331 if (mclaudit != NULL) 2332 mcl_audit_mbuf(mca, o, FALSE, FALSE); 2333 2334 VERIFY(m_total(class) > 0); 2335 m_total(class)--; 2336 2337 /* Free the mbuf */ 2338 o->obj_next = NULL; 2339 slab_free(MC_MBUF, o); 2340 2341 /* And free the cluster */ 2342 ((mcache_obj_t *)cl)->obj_next = NULL; 2343 if (class == MC_MBUF_CL) 2344 slab_free(MC_CL, cl); 2345 else if (class == MC_MBUF_BIGCL) 2346 slab_free(MC_BIGCL, cl); 2347 else 2348 slab_free(MC_16KCL, cl); 2349 } 2350 2351 ++num; 2352 tail = o; 2353 o = nexto; 2354 } 2355 2356 if (!purged) { 2357 tail->obj_next = m_cobjlist(class); 2358 m_cobjlist(class) = list; 2359 m_infree(class) += num; 2360 } else if (ref_list != NULL) { 2361 mcache_free_ext(ref_cache, ref_list); 2362 } 2363 2364 return (num); 2365} 2366 2367/* 2368 * Common allocator for composite objects called by the CPU cache layer 2369 * during an allocation request whenever there is no available element in 2370 * the bucket layer. It returns one or more composite elements from the 2371 * appropriate global freelist. If the freelist is empty, it will attempt 2372 * to obtain the rudimentary objects from their caches and construct them 2373 * into composite mbuf + cluster objects. 2374 */ 2375static unsigned int 2376mbuf_cslab_alloc(void *arg, mcache_obj_t ***plist, unsigned int needed, 2377 int wait) 2378{ 2379 mbuf_class_t class = (mbuf_class_t)arg; 2380 mbuf_class_t cl_class = 0; 2381 unsigned int num = 0, cnum = 0, want = needed; 2382 mcache_obj_t *ref_list = NULL; 2383 mcache_obj_t *mp_list = NULL; 2384 mcache_obj_t *clp_list = NULL; 2385 mcache_obj_t **list; 2386 struct ext_ref *rfa; 2387 struct mbuf *m; 2388 void *cl; 2389 2390 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class)); 2391 ASSERT(needed > 0); 2392 2393 VERIFY(class != MC_MBUF_16KCL || njcl > 0); 2394 2395 /* There should not be any slab for this class */ 2396 VERIFY(m_slab_cnt(class) == 0 && 2397 m_slablist(class).tqh_first == NULL && 2398 m_slablist(class).tqh_last == NULL); 2399 2400 lck_mtx_lock(mbuf_mlock); 2401 2402 /* Try using the freelist first */ 2403 num = cslab_alloc(class, plist, needed); 2404 list = *plist; 2405 if (num == needed) { 2406 m_alloc_cnt(class) += num; 2407 lck_mtx_unlock(mbuf_mlock); 2408 return (needed); 2409 } 2410 2411 lck_mtx_unlock(mbuf_mlock); 2412 2413 /* 2414 * We could not satisfy the request using the freelist alone; 2415 * allocate from the appropriate rudimentary caches and use 2416 * whatever we can get to construct the composite objects. 2417 */ 2418 needed -= num; 2419 2420 /* 2421 * Mark these allocation requests as coming from a composite cache. 2422 * Also, if the caller is willing to be blocked, mark the request 2423 * with MCR_FAILOK such that we don't end up sleeping at the mbuf 2424 * slab layer waiting for the individual object when one or more 2425 * of the already-constructed composite objects are available. 
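	 *
	 * Put differently: a caller that can block would rather be
	 * woken up with an already-constructed composite object (once
	 * one is freed back to this cache) than sleep inside the
	 * rudimentary caches waiting for raw mbufs or clusters.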
2426 */ 2427 wait |= MCR_COMP; 2428 if (!(wait & MCR_NOSLEEP)) 2429 wait |= MCR_FAILOK; 2430 2431 /* allocate mbufs */ 2432 needed = mcache_alloc_ext(m_cache(MC_MBUF), &mp_list, needed, wait); 2433 if (needed == 0) { 2434 ASSERT(mp_list == NULL); 2435 goto fail; 2436 } 2437 2438 /* allocate clusters */ 2439 if (class == MC_MBUF_CL) { 2440 cl_class = MC_CL; 2441 } else if (class == MC_MBUF_BIGCL) { 2442 cl_class = MC_BIGCL; 2443 } else { 2444 VERIFY(class == MC_MBUF_16KCL); 2445 cl_class = MC_16KCL; 2446 } 2447 needed = mcache_alloc_ext(m_cache(cl_class), &clp_list, needed, wait); 2448 if (needed == 0) { 2449 ASSERT(clp_list == NULL); 2450 goto fail; 2451 } 2452 2453 needed = mcache_alloc_ext(ref_cache, &ref_list, needed, wait); 2454 if (needed == 0) { 2455 ASSERT(ref_list == NULL); 2456 goto fail; 2457 } 2458 2459 /* 2460 * By this time "needed" is MIN(mbuf, cluster, ref). Any left 2461 * overs will get freed accordingly before we return to caller. 2462 */ 2463 for (cnum = 0; cnum < needed; cnum++) { 2464 struct mbuf *ms; 2465 2466 m = ms = (struct mbuf *)mp_list; 2467 mp_list = mp_list->obj_next; 2468 2469 cl = clp_list; 2470 clp_list = clp_list->obj_next; 2471 ((mcache_obj_t *)cl)->obj_next = NULL; 2472 2473 rfa = (struct ext_ref *)ref_list; 2474 ref_list = ref_list->obj_next; 2475 ((mcache_obj_t *)(void *)rfa)->obj_next = NULL; 2476 2477 /* 2478 * If auditing is enabled, construct the shadow mbuf 2479 * in the audit structure instead of in the actual one. 2480 * mbuf_cslab_audit() will take care of restoring the 2481 * contents after the integrity check. 2482 */ 2483 if (mclaudit != NULL) { 2484 mcache_audit_t *mca, *cl_mca; 2485 2486 lck_mtx_lock(mbuf_mlock); 2487 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m); 2488 ms = MCA_SAVED_MBUF_PTR(mca); 2489 cl_mca = mcl_audit_buf2mca(MC_CL, (mcache_obj_t *)cl); 2490 2491 /* 2492 * Pair them up. Note that this is done at the time 2493 * the mbuf+cluster objects are constructed. This 2494 * information should be treated as "best effort" 2495 * debugging hint since more than one mbufs can refer 2496 * to a cluster. In that case, the cluster might not 2497 * be freed along with the mbuf it was paired with. 2498 */ 2499 mca->mca_uptr = cl_mca; 2500 cl_mca->mca_uptr = mca; 2501 2502 ASSERT(mca->mca_uflags & MB_SCVALID); 2503 ASSERT(!(cl_mca->mca_uflags & MB_SCVALID)); 2504 lck_mtx_unlock(mbuf_mlock); 2505 2506 /* Technically, they are in the freelist */ 2507 if (mclverify) { 2508 size_t size; 2509 2510 mcache_set_pattern(MCACHE_FREE_PATTERN, m, 2511 m_maxsize(MC_MBUF)); 2512 2513 if (class == MC_MBUF_CL) 2514 size = m_maxsize(MC_CL); 2515 else if (class == MC_MBUF_BIGCL) 2516 size = m_maxsize(MC_BIGCL); 2517 else 2518 size = m_maxsize(MC_16KCL); 2519 2520 mcache_set_pattern(MCACHE_FREE_PATTERN, cl, 2521 size); 2522 } 2523 } 2524 2525 MBUF_INIT(ms, 0, MT_FREE); 2526 if (class == MC_MBUF_16KCL) { 2527 MBUF_16KCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE); 2528 } else if (class == MC_MBUF_BIGCL) { 2529 MBUF_BIGCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE); 2530 } else { 2531 MBUF_CL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE); 2532 } 2533 VERIFY(ms->m_flags == M_EXT); 2534 VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms)); 2535 2536 *list = (mcache_obj_t *)m; 2537 (*list)->obj_next = NULL; 2538 list = *plist = &(*list)->obj_next; 2539 } 2540 2541fail: 2542 /* 2543 * Free up what's left of the above. 
2544 */ 2545 if (mp_list != NULL) 2546 mcache_free_ext(m_cache(MC_MBUF), mp_list); 2547 if (clp_list != NULL) 2548 mcache_free_ext(m_cache(cl_class), clp_list); 2549 if (ref_list != NULL) 2550 mcache_free_ext(ref_cache, ref_list); 2551 2552 lck_mtx_lock(mbuf_mlock); 2553 if (num > 0 || cnum > 0) { 2554 m_total(class) += cnum; 2555 VERIFY(m_total(class) <= m_maxlimit(class)); 2556 m_alloc_cnt(class) += num + cnum; 2557 } 2558 if ((num + cnum) < want) 2559 m_fail_cnt(class) += (want - (num + cnum)); 2560 lck_mtx_unlock(mbuf_mlock); 2561 2562 return (num + cnum); 2563} 2564 2565/* 2566 * Common de-allocator for composite objects called by the CPU cache 2567 * layer when one or more elements need to be returned to the appropriate 2568 * global freelist. 2569 */ 2570static void 2571mbuf_cslab_free(void *arg, mcache_obj_t *list, int purged) 2572{ 2573 mbuf_class_t class = (mbuf_class_t)arg; 2574 unsigned int num; 2575 int w; 2576 2577 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class)); 2578 2579 lck_mtx_lock(mbuf_mlock); 2580 2581 num = cslab_free(class, list, purged); 2582 m_free_cnt(class) += num; 2583 2584 if ((w = mb_waiters) > 0) 2585 mb_waiters = 0; 2586 2587 lck_mtx_unlock(mbuf_mlock); 2588 2589 if (w != 0) 2590 wakeup(mb_waitchan); 2591} 2592 2593/* 2594 * Common auditor for composite objects called by the CPU cache layer 2595 * during an allocation or free request. For the former, this is called 2596 * after the objects are obtained from either the bucket or slab layer 2597 * and before they are returned to the caller. For the latter, this is 2598 * called immediately during free and before placing the objects into 2599 * the bucket or slab layer. 2600 */ 2601static void 2602mbuf_cslab_audit(void *arg, mcache_obj_t *list, boolean_t alloc) 2603{ 2604 mbuf_class_t class = (mbuf_class_t)arg; 2605 mcache_audit_t *mca; 2606 struct mbuf *m, *ms; 2607 mcl_slab_t *clsp, *nsp; 2608 size_t size; 2609 void *cl; 2610 2611 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class)); 2612 2613 while ((m = ms = (struct mbuf *)list) != NULL) { 2614 lck_mtx_lock(mbuf_mlock); 2615 /* Do the mbuf sanity checks and record its transaction */ 2616 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m); 2617 mcl_audit_mbuf(mca, m, TRUE, alloc); 2618 if (mcltrace) 2619 mcache_buffer_log(mca, m, m_cache(class), &mb_start); 2620 2621 if (alloc) 2622 mca->mca_uflags |= MB_COMP_INUSE; 2623 else 2624 mca->mca_uflags &= ~MB_COMP_INUSE; 2625 2626 /* 2627 * Use the shadow mbuf in the audit structure if we are 2628 * freeing, since the contents of the actual mbuf has been 2629 * pattern-filled by the above call to mcl_audit_mbuf(). 
2630 */ 2631 if (!alloc && mclverify) 2632 ms = MCA_SAVED_MBUF_PTR(mca); 2633 2634 /* Do the cluster sanity checks and record its transaction */ 2635 cl = ms->m_ext.ext_buf; 2636 clsp = slab_get(cl); 2637 VERIFY(ms->m_flags == M_EXT && cl != NULL); 2638 VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms)); 2639 if (class == MC_MBUF_CL) 2640 VERIFY(clsp->sl_refcnt >= 1 && 2641 clsp->sl_refcnt <= NCLPBG); 2642 else 2643 VERIFY(clsp->sl_refcnt == 1); 2644 2645 if (class == MC_MBUF_16KCL) { 2646 int k; 2647 for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) { 2648 nsp = nsp->sl_next; 2649 /* Next slab must already be present */ 2650 VERIFY(nsp != NULL); 2651 VERIFY(nsp->sl_refcnt == 1); 2652 } 2653 } 2654 2655 mca = mcl_audit_buf2mca(MC_CL, cl); 2656 if (class == MC_MBUF_CL) 2657 size = m_maxsize(MC_CL); 2658 else if (class == MC_MBUF_BIGCL) 2659 size = m_maxsize(MC_BIGCL); 2660 else 2661 size = m_maxsize(MC_16KCL); 2662 mcl_audit_cluster(mca, cl, size, alloc, FALSE); 2663 if (mcltrace) 2664 mcache_buffer_log(mca, cl, m_cache(class), &mb_start); 2665 2666 if (alloc) 2667 mca->mca_uflags |= MB_COMP_INUSE; 2668 else 2669 mca->mca_uflags &= ~MB_COMP_INUSE; 2670 lck_mtx_unlock(mbuf_mlock); 2671 2672 list = list->obj_next; 2673 } 2674} 2675 2676/* 2677 * Allocate some number of mbuf clusters and place on cluster freelist. 2678 */ 2679static int 2680m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize) 2681{ 2682 int i; 2683 vm_size_t size = 0; 2684 int numpages = 0, large_buffer = (bufsize == m_maxsize(MC_16KCL)); 2685 vm_offset_t page = 0; 2686 mcache_audit_t *mca_list = NULL; 2687 mcache_obj_t *con_list = NULL; 2688 mcl_slab_t *sp; 2689 2690 VERIFY(bufsize == m_maxsize(MC_BIGCL) || 2691 bufsize == m_maxsize(MC_16KCL)); 2692 2693 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); 2694 2695 /* 2696 * Multiple threads may attempt to populate the cluster map one 2697 * after another. Since we drop the lock below prior to acquiring 2698 * the physical page(s), our view of the cluster map may no longer 2699 * be accurate, and we could end up over-committing the pages beyond 2700 * the maximum allowed for each class. To prevent it, this entire 2701 * operation (including the page mapping) is serialized. 2702 */ 2703 while (mb_clalloc_busy) { 2704 mb_clalloc_waiters++; 2705 (void) msleep(mb_clalloc_waitchan, mbuf_mlock, 2706 (PZERO-1), "m_clalloc", NULL); 2707 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); 2708 } 2709 2710 /* We are busy now; tell everyone else to go away */ 2711 mb_clalloc_busy = TRUE; 2712 2713 /* 2714 * Honor the caller's wish to block or not block. We have a way 2715 * to grow the pool asynchronously using the mbuf worker thread. 2716 */ 2717 i = m_howmany(num, bufsize); 2718 if (i == 0 || (wait & M_DONTWAIT)) 2719 goto out; 2720 2721 lck_mtx_unlock(mbuf_mlock); 2722 2723 size = round_page(i * bufsize); 2724 page = kmem_mb_alloc(mb_map, size, large_buffer); 2725 2726 /* 2727 * If we did ask for "n" 16KB physically contiguous chunks 2728 * and didn't get them, then please try again without this 2729 * restriction. 
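	 *
	 * If even the unrestricted allocation fails, a 4KB (MC_BIGCL)
	 * request gets one more chance below with a single page before
	 * we give up and fall through to the "out" path.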
2730 */ 2731 if (large_buffer && page == 0) 2732 page = kmem_mb_alloc(mb_map, size, 0); 2733 2734 if (page == 0) { 2735 if (bufsize == m_maxsize(MC_BIGCL)) { 2736 /* Try for 1 page if failed, only 4KB request */ 2737 size = NBPG; 2738 page = kmem_mb_alloc(mb_map, size, 0); 2739 } 2740 2741 if (page == 0) { 2742 lck_mtx_lock(mbuf_mlock); 2743 goto out; 2744 } 2745 } 2746 2747 VERIFY(IS_P2ALIGNED(page, NBPG)); 2748 numpages = size / NBPG; 2749 2750 /* If auditing is enabled, allocate the audit structures now */ 2751 if (mclaudit != NULL) { 2752 int needed; 2753 2754 /* 2755 * Yes, I realize this is a waste of memory for clusters 2756 * that never get transformed into mbufs, as we may end 2757 * up with NMBPBG-1 unused audit structures per cluster. 2758 * But doing so tremendously simplifies the allocation 2759 * strategy, since at this point we are not holding the 2760 * mbuf lock and the caller is okay to be blocked. 2761 */ 2762 if (bufsize == m_maxsize(MC_BIGCL)) { 2763 needed = numpages * NMBPBG; 2764 2765 i = mcache_alloc_ext(mcl_audit_con_cache, 2766 &con_list, needed, MCR_SLEEP); 2767 2768 VERIFY(con_list != NULL && i == needed); 2769 } else { 2770 needed = numpages / NSLABSP16KB; 2771 } 2772 2773 i = mcache_alloc_ext(mcache_audit_cache, 2774 (mcache_obj_t **)&mca_list, needed, MCR_SLEEP); 2775 2776 VERIFY(mca_list != NULL && i == needed); 2777 } 2778 2779 lck_mtx_lock(mbuf_mlock); 2780 2781 for (i = 0; i < numpages; i++, page += NBPG) { 2782 ppnum_t offset = ((char *)page - (char *)mbutl) / NBPG; 2783 ppnum_t new_page = pmap_find_phys(kernel_pmap, page); 2784 mbuf_class_t class = MC_BIGCL; 2785 2786 /* 2787 * If there is a mapper the appropriate I/O page is returned; 2788 * zero out the page to discard its past contents to prevent 2789 * exposing leftover kernel memory. 2790 */ 2791 VERIFY(offset < mcl_pages); 2792 if (mcl_paddr_base != 0) { 2793 bzero((void *)(uintptr_t) page, page_size); 2794 new_page = IOMapperInsertPage(mcl_paddr_base, 2795 offset, new_page); 2796 } 2797 mcl_paddr[offset] = new_page; 2798 2799 /* Pattern-fill this fresh page */ 2800 if (mclverify) { 2801 mcache_set_pattern(MCACHE_FREE_PATTERN, 2802 (caddr_t)page, NBPG); 2803 } 2804 if (bufsize == m_maxsize(MC_BIGCL)) { 2805 union mbigcluster *mbc = (union mbigcluster *)page; 2806 2807 /* One for the entire page */ 2808 sp = slab_get(mbc); 2809 if (mclaudit != NULL) { 2810 mcl_audit_init(mbc, &mca_list, &con_list, 2811 AUDIT_CONTENTS_SIZE, NMBPBG); 2812 } 2813 VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0); 2814 slab_init(sp, MC_BIGCL, SLF_MAPPED, 2815 mbc, mbc, bufsize, 0, 1); 2816 2817 /* Insert this slab */ 2818 slab_insert(sp, MC_BIGCL); 2819 2820 /* Update stats now since slab_get() drops the lock */ 2821 mbstat.m_bigclfree = ++m_infree(MC_BIGCL) + 2822 m_infree(MC_MBUF_BIGCL); 2823 mbstat.m_bigclusters = ++m_total(MC_BIGCL); 2824 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL)); 2825 class = MC_BIGCL; 2826 } else if ((i % NSLABSP16KB) == 0) { 2827 union m16kcluster *m16kcl = (union m16kcluster *)page; 2828 mcl_slab_t *nsp; 2829 int k; 2830 2831 VERIFY(njcl > 0); 2832 /* One for the entire 16KB */ 2833 sp = slab_get(m16kcl); 2834 if (mclaudit != NULL) 2835 mcl_audit_init(m16kcl, &mca_list, NULL, 0, 1); 2836 2837 VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0); 2838 slab_init(sp, MC_16KCL, SLF_MAPPED, 2839 m16kcl, m16kcl, bufsize, 0, 1); 2840 2841 /* 2842 * 2nd-Nth page's slab is part of the first one, 2843 * where N is NSLABSP16KB. 
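			 *
			 * (With 4KB pages a 16KB cluster spans
			 * NSLABSP16KB == 4 page slabs; only this first
			 * slab carries the real length and chunk counts,
			 * while the trailing ones are initialized below
			 * with SLF_PARTIAL and a zero length.)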
2844 */ 2845 for (k = 1; k < NSLABSP16KB; k++) { 2846 nsp = slab_get(((union mbigcluster *)page) + k); 2847 VERIFY(nsp->sl_refcnt == 0 && 2848 nsp->sl_flags == 0); 2849 slab_init(nsp, MC_16KCL, 2850 SLF_MAPPED | SLF_PARTIAL, 2851 m16kcl, NULL, 0, 0, 0); 2852 } 2853 2854 /* Insert this slab */ 2855 slab_insert(sp, MC_16KCL); 2856 2857 /* Update stats now since slab_get() drops the lock */ 2858 m_infree(MC_16KCL)++; 2859 m_total(MC_16KCL)++; 2860 VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL)); 2861 class = MC_16KCL; 2862 } 2863 if (!mb_peak_newreport && mbuf_report_usage(class)) 2864 mb_peak_newreport = TRUE; 2865 } 2866 VERIFY(mca_list == NULL && con_list == NULL); 2867 2868 /* We're done; let others enter */ 2869 mb_clalloc_busy = FALSE; 2870 if (mb_clalloc_waiters > 0) { 2871 mb_clalloc_waiters = 0; 2872 wakeup(mb_clalloc_waitchan); 2873 } 2874 2875 if (bufsize == m_maxsize(MC_BIGCL)) 2876 return (numpages); 2877 2878 VERIFY(bufsize == m_maxsize(MC_16KCL)); 2879 return (numpages / NSLABSP16KB); 2880 2881out: 2882 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); 2883 2884 /* We're done; let others enter */ 2885 mb_clalloc_busy = FALSE; 2886 if (mb_clalloc_waiters > 0) { 2887 mb_clalloc_waiters = 0; 2888 wakeup(mb_clalloc_waitchan); 2889 } 2890 2891 /* 2892 * When non-blocking we kick a thread if we have to grow the 2893 * pool or if the number of free clusters is less than requested. 2894 */ 2895 if (bufsize == m_maxsize(MC_BIGCL)) { 2896 if (i > 0) { 2897 /* 2898 * Remember total number of 4KB clusters needed 2899 * at this time. 2900 */ 2901 i += m_total(MC_BIGCL); 2902 if (i > mbuf_expand_big) { 2903 mbuf_expand_big = i; 2904 if (mbuf_worker_ready) 2905 wakeup((caddr_t)&mbuf_worker_run); 2906 } 2907 } 2908 2909 if (m_infree(MC_BIGCL) >= num) 2910 return (1); 2911 } else { 2912 if (i > 0) { 2913 /* 2914 * Remember total number of 16KB clusters needed 2915 * at this time. 2916 */ 2917 i += m_total(MC_16KCL); 2918 if (i > mbuf_expand_16k) { 2919 mbuf_expand_16k = i; 2920 if (mbuf_worker_ready) 2921 wakeup((caddr_t)&mbuf_worker_run); 2922 } 2923 } 2924 2925 if (m_infree(MC_16KCL) >= num) 2926 return (1); 2927 } 2928 return (0); 2929} 2930 2931/* 2932 * Populate the global freelist of the corresponding buffer class. 2933 */ 2934static int 2935freelist_populate(mbuf_class_t class, unsigned int num, int wait) 2936{ 2937 mcache_obj_t *o = NULL; 2938 int i, numpages = 0, count; 2939 2940 VERIFY(class == MC_MBUF || class == MC_CL || class == MC_BIGCL || 2941 class == MC_16KCL); 2942 2943 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); 2944 2945 switch (class) { 2946 case MC_MBUF: 2947 case MC_CL: 2948 case MC_BIGCL: 2949 numpages = (num * m_size(class) + NBPG - 1) / NBPG; 2950 i = m_clalloc(numpages, wait, m_maxsize(MC_BIGCL)); 2951 2952 /* Respect the 4KB clusters minimum limit */ 2953 if (m_total(MC_BIGCL) == m_maxlimit(MC_BIGCL) && 2954 m_infree(MC_BIGCL) <= m_minlimit(MC_BIGCL)) { 2955 if (class != MC_BIGCL || (wait & MCR_COMP)) 2956 return (0); 2957 } 2958 if (class == MC_BIGCL) 2959 return (i != 0); 2960 break; 2961 2962 case MC_16KCL: 2963 return (m_clalloc(num, wait, m_maxsize(class)) != 0); 2964 /* NOTREACHED */ 2965 2966 default: 2967 VERIFY(0); 2968 /* NOTREACHED */ 2969 } 2970 2971 VERIFY(class == MC_MBUF || class == MC_CL); 2972 2973 /* how many objects will we cut the page into? */ 2974 int numobj = (class == MC_MBUF ? 
NMBPBG : NCLPBG); 2975 2976 for (count = 0; count < numpages; count++) { 2977 2978 /* respect totals, minlimit, maxlimit */ 2979 if (m_total(MC_BIGCL) <= m_minlimit(MC_BIGCL) || 2980 m_total(class) >= m_maxlimit(class)) 2981 break; 2982 2983 if ((o = slab_alloc(MC_BIGCL, wait)) == NULL) 2984 break; 2985 2986 struct mbuf *m = (struct mbuf *)o; 2987 union mcluster *c = (union mcluster *)o; 2988 mcl_slab_t *sp = slab_get(o); 2989 mcache_audit_t *mca = NULL; 2990 2991 VERIFY(slab_is_detached(sp) && 2992 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED); 2993 2994 /* 2995 * Make sure that the cluster is unmolested 2996 * while in freelist 2997 */ 2998 if (mclverify) { 2999 mca = mcl_audit_buf2mca(MC_BIGCL, o); 3000 mcache_audit_free_verify(mca, o, 0, 3001 m_maxsize(MC_BIGCL)); 3002 } 3003 3004 /* Reinitialize it as an mbuf or 2K slab */ 3005 slab_init(sp, class, sp->sl_flags, 3006 sp->sl_base, NULL, sp->sl_len, 0, numobj); 3007 3008 VERIFY(o == (mcache_obj_t *)sp->sl_base); 3009 VERIFY(sp->sl_head == NULL); 3010 3011 VERIFY(m_total(MC_BIGCL) > 0); 3012 m_total(MC_BIGCL)--; 3013 mbstat.m_bigclusters = m_total(MC_BIGCL); 3014 3015 m_total(class) += numobj; 3016 m_infree(class) += numobj; 3017 3018 VERIFY(m_total(MC_BIGCL) >= m_minlimit(MC_BIGCL)); 3019 VERIFY(m_total(class) <= m_maxlimit(class)); 3020 if (!mb_peak_newreport && mbuf_report_usage(class)) 3021 mb_peak_newreport = TRUE; 3022 3023 i = numobj; 3024 if (class == MC_MBUF) { 3025 mbstat.m_mbufs = m_total(MC_MBUF); 3026 mtype_stat_add(MT_FREE, NMBPBG); 3027 while (i--) { 3028 /* 3029 * If auditing is enabled, construct the 3030 * shadow mbuf in the audit structure 3031 * instead of the actual one. 3032 * mbuf_slab_audit() will take care of 3033 * restoring the contents after the 3034 * integrity check. 3035 */ 3036 if (mclaudit != NULL) { 3037 struct mbuf *ms; 3038 mca = mcl_audit_buf2mca(MC_MBUF, 3039 (mcache_obj_t *)m); 3040 ms = MCA_SAVED_MBUF_PTR(mca); 3041 ms->m_type = MT_FREE; 3042 } else { 3043 m->m_type = MT_FREE; 3044 } 3045 m->m_next = sp->sl_head; 3046 sp->sl_head = (void *)m++; 3047 } 3048 } else { /* MC_CL */ 3049 mbstat.m_clfree = 3050 m_infree(MC_CL) + m_infree(MC_MBUF_CL); 3051 mbstat.m_clusters = m_total(MC_CL); 3052 while (i--) { 3053 c->mcl_next = sp->sl_head; 3054 sp->sl_head = (void *)c++; 3055 } 3056 } 3057 3058 /* Insert into the mbuf or 2k slab list */ 3059 slab_insert(sp, class); 3060 3061 if ((i = mb_waiters) > 0) 3062 mb_waiters = 0; 3063 if (i != 0) 3064 wakeup(mb_waitchan); 3065 } 3066 return (count != 0); 3067} 3068 3069/* 3070 * For each class, initialize the freelist to hold m_minlimit() objects. 3071 */ 3072static void 3073freelist_init(mbuf_class_t class) 3074{ 3075 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); 3076 3077 VERIFY(class == MC_CL || class == MC_BIGCL); 3078 VERIFY(m_total(class) == 0); 3079 VERIFY(m_minlimit(class) > 0); 3080 3081 while (m_total(class) < m_minlimit(class)) 3082 (void) freelist_populate(class, m_minlimit(class), M_WAIT); 3083 3084 VERIFY(m_total(class) >= m_minlimit(class)); 3085} 3086 3087/* 3088 * (Inaccurately) check if it might be worth a trip back to the 3089 * mcache layer due the availability of objects there. We'll 3090 * end up back here if there's nothing up there. 
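 *
 * "Inaccurately" because the bucket occupancy is only sampled as a
 * hint; nothing prevents the caches from draining between this check
 * and the caller's retry, in which case the caller simply winds up
 * back in the slab layer as described above.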
3091 */ 3092static boolean_t 3093mbuf_cached_above(mbuf_class_t class, int wait) 3094{ 3095 switch (class) { 3096 case MC_MBUF: 3097 if (wait & MCR_COMP) 3098 return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL)) || 3099 !mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL))); 3100 break; 3101 3102 case MC_CL: 3103 if (wait & MCR_COMP) 3104 return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL))); 3105 break; 3106 3107 case MC_BIGCL: 3108 if (wait & MCR_COMP) 3109 return (!mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL))); 3110 break; 3111 3112 case MC_16KCL: 3113 if (wait & MCR_COMP) 3114 return (!mcache_bkt_isempty(m_cache(MC_MBUF_16KCL))); 3115 break; 3116 3117 case MC_MBUF_CL: 3118 case MC_MBUF_BIGCL: 3119 case MC_MBUF_16KCL: 3120 break; 3121 3122 default: 3123 VERIFY(0); 3124 /* NOTREACHED */ 3125 } 3126 3127 return (!mcache_bkt_isempty(m_cache(class))); 3128} 3129 3130/* 3131 * If possible, convert constructed objects to raw ones. 3132 */ 3133static boolean_t 3134mbuf_steal(mbuf_class_t class, unsigned int num) 3135{ 3136 mcache_obj_t *top = NULL; 3137 mcache_obj_t **list = ⊤ 3138 unsigned int tot = 0; 3139 3140 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); 3141 3142 switch (class) { 3143 case MC_MBUF: 3144 case MC_CL: 3145 case MC_BIGCL: 3146 case MC_16KCL: 3147 return (FALSE); 3148 3149 case MC_MBUF_CL: 3150 case MC_MBUF_BIGCL: 3151 case MC_MBUF_16KCL: 3152 /* Get the required number of constructed objects if possible */ 3153 if (m_infree(class) > m_minlimit(class)) { 3154 tot = cslab_alloc(class, &list, 3155 MIN(num, m_infree(class))); 3156 } 3157 3158 /* And destroy them to get back the raw objects */ 3159 if (top != NULL) 3160 (void) cslab_free(class, top, 1); 3161 break; 3162 3163 default: 3164 VERIFY(0); 3165 /* NOTREACHED */ 3166 } 3167 3168 return (tot == num); 3169} 3170 3171static void 3172m_reclaim(mbuf_class_t class, unsigned int num, boolean_t comp) 3173{ 3174 int m, bmap = 0; 3175 3176 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); 3177 3178 VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL)); 3179 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL)); 3180 VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL)); 3181 3182 /* 3183 * This logic can be made smarter; for now, simply mark 3184 * all other related classes as potential victims. 3185 */ 3186 switch (class) { 3187 case MC_MBUF: 3188 m_wantpurge(MC_CL)++; 3189 m_wantpurge(MC_BIGCL)++; 3190 m_wantpurge(MC_MBUF_CL)++; 3191 m_wantpurge(MC_MBUF_BIGCL)++; 3192 break; 3193 3194 case MC_CL: 3195 m_wantpurge(MC_MBUF)++; 3196 m_wantpurge(MC_BIGCL)++; 3197 m_wantpurge(MC_MBUF_BIGCL)++; 3198 if (!comp) 3199 m_wantpurge(MC_MBUF_CL)++; 3200 break; 3201 3202 case MC_BIGCL: 3203 m_wantpurge(MC_MBUF)++; 3204 m_wantpurge(MC_CL)++; 3205 m_wantpurge(MC_MBUF_CL)++; 3206 if (!comp) 3207 m_wantpurge(MC_MBUF_BIGCL)++; 3208 break; 3209 3210 case MC_16KCL: 3211 if (!comp) 3212 m_wantpurge(MC_MBUF_16KCL)++; 3213 break; 3214 3215 default: 3216 VERIFY(0); 3217 /* NOTREACHED */ 3218 } 3219 3220 /* 3221 * Run through each marked class and check if we really need to 3222 * purge (and therefore temporarily disable) the per-CPU caches 3223 * layer used by the class. If so, remember the classes since 3224 * we are going to drop the lock below prior to purging. 3225 */ 3226 for (m = 0; m < NELEM(mbuf_table); m++) { 3227 if (m_wantpurge(m) > 0) { 3228 m_wantpurge(m) = 0; 3229 /* 3230 * Try hard to steal the required number of objects 3231 * from the freelist of other mbuf classes. 
Only 3232 * purge and disable the per-CPU caches layer when 3233 * we don't have enough; it's the last resort. 3234 */ 3235 if (!mbuf_steal(m, num)) 3236 bmap |= (1 << m); 3237 } 3238 } 3239 3240 lck_mtx_unlock(mbuf_mlock); 3241 3242 if (bmap != 0) { 3243 /* signal the domains to drain */ 3244 net_drain_domains(); 3245 3246 /* Sigh; we have no other choices but to ask mcache to purge */ 3247 for (m = 0; m < NELEM(mbuf_table); m++) { 3248 if ((bmap & (1 << m)) && 3249 mcache_purge_cache(m_cache(m), TRUE)) { 3250 lck_mtx_lock(mbuf_mlock); 3251 m_purge_cnt(m)++; 3252 mbstat.m_drain++; 3253 lck_mtx_unlock(mbuf_mlock); 3254 } 3255 } 3256 } else { 3257 /* 3258 * Request mcache to reap extra elements from all of its caches; 3259 * note that all reaps are serialized and happen only at a fixed 3260 * interval. 3261 */ 3262 mcache_reap(); 3263 } 3264 lck_mtx_lock(mbuf_mlock); 3265} 3266 3267static inline struct mbuf * 3268m_get_common(int wait, short type, int hdr) 3269{ 3270 struct mbuf *m; 3271 int mcflags = MSLEEPF(wait); 3272 3273 /* Is this due to a non-blocking retry? If so, then try harder */ 3274 if (mcflags & MCR_NOSLEEP) 3275 mcflags |= MCR_TRYHARD; 3276 3277 m = mcache_alloc(m_cache(MC_MBUF), mcflags); 3278 if (m != NULL) { 3279 MBUF_INIT(m, hdr, type); 3280 mtype_stat_inc(type); 3281 mtype_stat_dec(MT_FREE); 3282#if CONFIG_MACF_NET 3283 if (hdr && mac_init_mbuf(m, wait) != 0) { 3284 m_free(m); 3285 return (NULL); 3286 } 3287#endif /* MAC_NET */ 3288 } 3289 return (m); 3290} 3291 3292/* 3293 * Space allocation routines; these are also available as macros 3294 * for critical paths. 3295 */ 3296#define _M_GET(wait, type) m_get_common(wait, type, 0) 3297#define _M_GETHDR(wait, type) m_get_common(wait, type, 1) 3298#define _M_RETRY(wait, type) _M_GET(wait, type) 3299#define _M_RETRYHDR(wait, type) _M_GETHDR(wait, type) 3300#define _MGET(m, how, type) ((m) = _M_GET(how, type)) 3301#define _MGETHDR(m, how, type) ((m) = _M_GETHDR(how, type)) 3302 3303struct mbuf * 3304m_get(int wait, int type) 3305{ 3306 return (_M_GET(wait, type)); 3307} 3308 3309struct mbuf * 3310m_gethdr(int wait, int type) 3311{ 3312 return (_M_GETHDR(wait, type)); 3313} 3314 3315struct mbuf * 3316m_retry(int wait, int type) 3317{ 3318 return (_M_RETRY(wait, type)); 3319} 3320 3321struct mbuf * 3322m_retryhdr(int wait, int type) 3323{ 3324 return (_M_RETRYHDR(wait, type)); 3325} 3326 3327struct mbuf * 3328m_getclr(int wait, int type) 3329{ 3330 struct mbuf *m; 3331 3332 _MGET(m, wait, type); 3333 if (m != NULL) 3334 bzero(MTOD(m, caddr_t), MLEN); 3335 return (m); 3336} 3337 3338struct mbuf * 3339m_free(struct mbuf *m) 3340{ 3341 struct mbuf *n = m->m_next; 3342 3343 if (m->m_type == MT_FREE) 3344 panic("m_free: freeing an already freed mbuf"); 3345 3346 if (m->m_flags & M_PKTHDR) { 3347 /* Check for scratch area overflow */ 3348 m_redzone_verify(m); 3349 /* Free the aux data and tags if there is any */ 3350 m_tag_delete_chain(m, NULL); 3351 } 3352 3353 if (m->m_flags & M_EXT) { 3354 u_int32_t refcnt; 3355 u_int32_t composite; 3356 3357 refcnt = m_decref(m); 3358 composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE); 3359 if (refcnt == 0 && !composite) { 3360 if (m->m_ext.ext_free == NULL) { 3361 mcache_free(m_cache(MC_CL), m->m_ext.ext_buf); 3362 } else if (m->m_ext.ext_free == m_bigfree) { 3363 mcache_free(m_cache(MC_BIGCL), 3364 m->m_ext.ext_buf); 3365 } else if (m->m_ext.ext_free == m_16kfree) { 3366 mcache_free(m_cache(MC_16KCL), 3367 m->m_ext.ext_buf); 3368 } else { 3369 (*(m->m_ext.ext_free))(m->m_ext.ext_buf, 3370 
m->m_ext.ext_size, m->m_ext.ext_arg); 3371 } 3372 mcache_free(ref_cache, MEXT_RFA(m)); 3373 MEXT_RFA(m) = NULL; 3374 } else if (refcnt == 0 && composite) { 3375 VERIFY(m->m_type != MT_FREE); 3376 3377 mtype_stat_dec(m->m_type); 3378 mtype_stat_inc(MT_FREE); 3379 3380 m->m_type = MT_FREE; 3381 m->m_flags = M_EXT; 3382 m->m_len = 0; 3383 m->m_next = m->m_nextpkt = NULL; 3384 3385 MEXT_FLAGS(m) &= ~EXTF_READONLY; 3386 3387 /* "Free" into the intermediate cache */ 3388 if (m->m_ext.ext_free == NULL) { 3389 mcache_free(m_cache(MC_MBUF_CL), m); 3390 } else if (m->m_ext.ext_free == m_bigfree) { 3391 mcache_free(m_cache(MC_MBUF_BIGCL), m); 3392 } else { 3393 VERIFY(m->m_ext.ext_free == m_16kfree); 3394 mcache_free(m_cache(MC_MBUF_16KCL), m); 3395 } 3396 return (n); 3397 } 3398 } 3399 3400 if (m->m_type != MT_FREE) { 3401 mtype_stat_dec(m->m_type); 3402 mtype_stat_inc(MT_FREE); 3403 } 3404 3405 m->m_type = MT_FREE; 3406 m->m_flags = m->m_len = 0; 3407 m->m_next = m->m_nextpkt = NULL; 3408 3409 mcache_free(m_cache(MC_MBUF), m); 3410 3411 return (n); 3412} 3413 3414__private_extern__ struct mbuf * 3415m_clattach(struct mbuf *m, int type, caddr_t extbuf, 3416 void (*extfree)(caddr_t, u_int, caddr_t), u_int extsize, caddr_t extarg, 3417 int wait) 3418{ 3419 struct ext_ref *rfa = NULL; 3420 3421 if (m == NULL && (m = _M_GETHDR(wait, type)) == NULL) 3422 return (NULL); 3423 3424 if (m->m_flags & M_EXT) { 3425 u_int32_t refcnt; 3426 u_int32_t composite; 3427 3428 refcnt = m_decref(m); 3429 composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE); 3430 if (refcnt == 0 && !composite) { 3431 if (m->m_ext.ext_free == NULL) { 3432 mcache_free(m_cache(MC_CL), m->m_ext.ext_buf); 3433 } else if (m->m_ext.ext_free == m_bigfree) { 3434 mcache_free(m_cache(MC_BIGCL), 3435 m->m_ext.ext_buf); 3436 } else if (m->m_ext.ext_free == m_16kfree) { 3437 mcache_free(m_cache(MC_16KCL), 3438 m->m_ext.ext_buf); 3439 } else { 3440 (*(m->m_ext.ext_free))(m->m_ext.ext_buf, 3441 m->m_ext.ext_size, m->m_ext.ext_arg); 3442 } 3443 /* Re-use the reference structure */ 3444 rfa = MEXT_RFA(m); 3445 } else if (refcnt == 0 && composite) { 3446 VERIFY(m->m_type != MT_FREE); 3447 3448 mtype_stat_dec(m->m_type); 3449 mtype_stat_inc(MT_FREE); 3450 3451 m->m_type = MT_FREE; 3452 m->m_flags = M_EXT; 3453 m->m_len = 0; 3454 m->m_next = m->m_nextpkt = NULL; 3455 3456 MEXT_FLAGS(m) &= ~EXTF_READONLY; 3457 3458 /* "Free" into the intermediate cache */ 3459 if (m->m_ext.ext_free == NULL) { 3460 mcache_free(m_cache(MC_MBUF_CL), m); 3461 } else if (m->m_ext.ext_free == m_bigfree) { 3462 mcache_free(m_cache(MC_MBUF_BIGCL), m); 3463 } else { 3464 VERIFY(m->m_ext.ext_free == m_16kfree); 3465 mcache_free(m_cache(MC_MBUF_16KCL), m); 3466 } 3467 /* 3468 * Allocate a new mbuf, since we didn't divorce 3469 * the composite mbuf + cluster pair above. 3470 */ 3471 if ((m = _M_GETHDR(wait, type)) == NULL) 3472 return (NULL); 3473 } 3474 } 3475 3476 if (rfa == NULL && 3477 (rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) { 3478 m_free(m); 3479 return (NULL); 3480 } 3481 3482 MEXT_INIT(m, extbuf, extsize, extfree, extarg, rfa, 1, 0); 3483 3484 return (m); 3485} 3486 3487/* 3488 * Perform `fast' allocation mbuf clusters from a cache of recently-freed 3489 * clusters. (If the cache is empty, new clusters are allocated en-masse.) 3490 */ 3491struct mbuf * 3492m_getcl(int wait, int type, int flags) 3493{ 3494 struct mbuf *m; 3495 int mcflags = MSLEEPF(wait); 3496 int hdr = (flags & M_PKTHDR); 3497 3498 /* Is this due to a non-blocking retry? 
If so, then try harder */ 3499 if (mcflags & MCR_NOSLEEP) 3500 mcflags |= MCR_TRYHARD; 3501 3502 m = mcache_alloc(m_cache(MC_MBUF_CL), mcflags); 3503 if (m != NULL) { 3504 u_int32_t flag; 3505 struct ext_ref *rfa; 3506 void *cl; 3507 3508 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT); 3509 cl = m->m_ext.ext_buf; 3510 rfa = MEXT_RFA(m); 3511 3512 ASSERT(cl != NULL && rfa != NULL); 3513 VERIFY(MBUF_IS_COMPOSITE(m) && m->m_ext.ext_free == NULL); 3514 3515 flag = MEXT_FLAGS(m); 3516 3517 MBUF_INIT(m, hdr, type); 3518 MBUF_CL_INIT(m, cl, rfa, 1, flag); 3519 3520 mtype_stat_inc(type); 3521 mtype_stat_dec(MT_FREE); 3522#if CONFIG_MACF_NET 3523 if (hdr && mac_init_mbuf(m, wait) != 0) { 3524 m_freem(m); 3525 return (NULL); 3526 } 3527#endif /* MAC_NET */ 3528 } 3529 return (m); 3530} 3531 3532/* m_mclget() add an mbuf cluster to a normal mbuf */ 3533struct mbuf * 3534m_mclget(struct mbuf *m, int wait) 3535{ 3536 struct ext_ref *rfa; 3537 3538 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) 3539 return (m); 3540 3541 m->m_ext.ext_buf = m_mclalloc(wait); 3542 if (m->m_ext.ext_buf != NULL) { 3543 MBUF_CL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0); 3544 } else { 3545 mcache_free(ref_cache, rfa); 3546 } 3547 return (m); 3548} 3549 3550/* Allocate an mbuf cluster */ 3551caddr_t 3552m_mclalloc(int wait) 3553{ 3554 int mcflags = MSLEEPF(wait); 3555 3556 /* Is this due to a non-blocking retry? If so, then try harder */ 3557 if (mcflags & MCR_NOSLEEP) 3558 mcflags |= MCR_TRYHARD; 3559 3560 return (mcache_alloc(m_cache(MC_CL), mcflags)); 3561} 3562 3563/* Free an mbuf cluster */ 3564void 3565m_mclfree(caddr_t p) 3566{ 3567 mcache_free(m_cache(MC_CL), p); 3568} 3569 3570/* 3571 * mcl_hasreference() checks if a cluster of an mbuf is referenced by 3572 * another mbuf; see comments in m_incref() regarding EXTF_READONLY. 3573 */ 3574int 3575m_mclhasreference(struct mbuf *m) 3576{ 3577 if (!(m->m_flags & M_EXT)) 3578 return (0); 3579 3580 ASSERT(MEXT_RFA(m) != NULL); 3581 3582 return ((MEXT_FLAGS(m) & EXTF_READONLY) ? 1 : 0); 3583} 3584 3585__private_extern__ caddr_t 3586m_bigalloc(int wait) 3587{ 3588 int mcflags = MSLEEPF(wait); 3589 3590 /* Is this due to a non-blocking retry? If so, then try harder */ 3591 if (mcflags & MCR_NOSLEEP) 3592 mcflags |= MCR_TRYHARD; 3593 3594 return (mcache_alloc(m_cache(MC_BIGCL), mcflags)); 3595} 3596 3597__private_extern__ void 3598m_bigfree(caddr_t p, __unused u_int size, __unused caddr_t arg) 3599{ 3600 mcache_free(m_cache(MC_BIGCL), p); 3601} 3602 3603/* m_mbigget() add an 4KB mbuf cluster to a normal mbuf */ 3604__private_extern__ struct mbuf * 3605m_mbigget(struct mbuf *m, int wait) 3606{ 3607 struct ext_ref *rfa; 3608 3609 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) 3610 return (m); 3611 3612 m->m_ext.ext_buf = m_bigalloc(wait); 3613 if (m->m_ext.ext_buf != NULL) { 3614 MBUF_BIGCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0); 3615 } else { 3616 mcache_free(ref_cache, rfa); 3617 } 3618 return (m); 3619} 3620 3621__private_extern__ caddr_t 3622m_16kalloc(int wait) 3623{ 3624 int mcflags = MSLEEPF(wait); 3625 3626 /* Is this due to a non-blocking retry? 
If so, then try harder */ 3627 if (mcflags & MCR_NOSLEEP) 3628 mcflags |= MCR_TRYHARD; 3629 3630 return (mcache_alloc(m_cache(MC_16KCL), mcflags)); 3631} 3632 3633__private_extern__ void 3634m_16kfree(caddr_t p, __unused u_int size, __unused caddr_t arg) 3635{ 3636 mcache_free(m_cache(MC_16KCL), p); 3637} 3638 3639/* m_m16kget() add a 16KB mbuf cluster to a normal mbuf */ 3640__private_extern__ struct mbuf * 3641m_m16kget(struct mbuf *m, int wait) 3642{ 3643 struct ext_ref *rfa; 3644 3645 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) 3646 return (m); 3647 3648 m->m_ext.ext_buf = m_16kalloc(wait); 3649 if (m->m_ext.ext_buf != NULL) { 3650 MBUF_16KCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0); 3651 } else { 3652 mcache_free(ref_cache, rfa); 3653 } 3654 return (m); 3655} 3656 3657/* 3658 * "Move" mbuf pkthdr from "from" to "to". 3659 * "from" must have M_PKTHDR set, and "to" must be empty. 3660 */ 3661void 3662m_copy_pkthdr(struct mbuf *to, struct mbuf *from) 3663{ 3664 VERIFY(from->m_flags & M_PKTHDR); 3665 3666 /* Check for scratch area overflow */ 3667 m_redzone_verify(from); 3668 3669 if (to->m_flags & M_PKTHDR) { 3670 /* Check for scratch area overflow */ 3671 m_redzone_verify(to); 3672 /* We will be taking over the tags of 'to' */ 3673 m_tag_delete_chain(to, NULL); 3674 } 3675 to->m_pkthdr = from->m_pkthdr; /* especially tags */ 3676 m_classifier_init(from, 0); /* purge classifier info */ 3677 m_tag_init(from, 1); /* purge all tags from src */ 3678 m_scratch_init(from); /* clear src scratch area */ 3679 to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT); 3680 if ((to->m_flags & M_EXT) == 0) 3681 to->m_data = to->m_pktdat; 3682 m_redzone_init(to); /* setup red zone on dst */ 3683} 3684 3685/* 3686 * Duplicate "from"'s mbuf pkthdr in "to". 3687 * "from" must have M_PKTHDR set, and "to" must be empty. 3688 * In particular, this does a deep copy of the packet tags. 
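 *
 * Illustrative use (a sketch only, mirroring how m_copym_mode() below
 * calls this; "n", "m" and "wait" are hypothetical caller variables):
 * unlike m_copy_pkthdr(), which moves the tag chain off its source,
 * this leaves "from" intact and can fail if tag allocation fails:
 *
 *	if (m_dup_pkthdr(n, m, wait) == 0)
 *		goto nospace;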
3689 */ 3690static int 3691m_dup_pkthdr(struct mbuf *to, struct mbuf *from, int how) 3692{ 3693 VERIFY(from->m_flags & M_PKTHDR); 3694 3695 /* Check for scratch area overflow */ 3696 m_redzone_verify(from); 3697 3698 if (to->m_flags & M_PKTHDR) { 3699 /* Check for scratch area overflow */ 3700 m_redzone_verify(to); 3701 /* We will be taking over the tags of 'to' */ 3702 m_tag_delete_chain(to, NULL); 3703 } 3704 to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT); 3705 if ((to->m_flags & M_EXT) == 0) 3706 to->m_data = to->m_pktdat; 3707 to->m_pkthdr = from->m_pkthdr; 3708 m_redzone_init(to); /* setup red zone on dst */ 3709 m_tag_init(to, 0); /* preserve dst static tags */ 3710 return (m_tag_copy_chain(to, from, how)); 3711} 3712 3713void 3714m_copy_pftag(struct mbuf *to, struct mbuf *from) 3715{ 3716 to->m_pkthdr.pf_mtag = from->m_pkthdr.pf_mtag; 3717#if PF_ECN 3718 to->m_pkthdr.pf_mtag.pftag_hdr = NULL; 3719 to->m_pkthdr.pf_mtag.pftag_flags &= ~(PF_TAG_HDR_INET|PF_TAG_HDR_INET6); 3720#endif /* PF_ECN */ 3721} 3722 3723void 3724m_classifier_init(struct mbuf *m, uint32_t pktf_mask) 3725{ 3726 VERIFY(m->m_flags & M_PKTHDR); 3727 3728 m->m_pkthdr.pkt_proto = 0; 3729 m->m_pkthdr.pkt_flowsrc = 0; 3730 m->m_pkthdr.pkt_flowid = 0; 3731 m->m_pkthdr.pkt_flags &= pktf_mask; /* caller-defined mask */ 3732 /* preserve service class and interface info for loopback packets */ 3733 if (!(m->m_pkthdr.pkt_flags & PKTF_LOOP)) 3734 (void) m_set_service_class(m, MBUF_SC_BE); 3735 if (!(m->m_pkthdr.pkt_flags & PKTF_IFAINFO)) 3736 m->m_pkthdr.pkt_ifainfo = 0; 3737#if MEASURE_BW 3738 m->m_pkthdr.pkt_bwseq = 0; 3739#endif /* MEASURE_BW */ 3740} 3741 3742void 3743m_copy_classifier(struct mbuf *to, struct mbuf *from) 3744{ 3745 VERIFY(to->m_flags & M_PKTHDR); 3746 VERIFY(from->m_flags & M_PKTHDR); 3747 3748 to->m_pkthdr.pkt_proto = from->m_pkthdr.pkt_proto; 3749 to->m_pkthdr.pkt_flowsrc = from->m_pkthdr.pkt_flowsrc; 3750 to->m_pkthdr.pkt_flowid = from->m_pkthdr.pkt_flowid; 3751 to->m_pkthdr.pkt_flags = from->m_pkthdr.pkt_flags; 3752 (void) m_set_service_class(to, from->m_pkthdr.pkt_svc); 3753 to->m_pkthdr.pkt_ifainfo = from->m_pkthdr.pkt_ifainfo; 3754#if MEASURE_BW 3755 to->m_pkthdr.pkt_bwseq = from->m_pkthdr.pkt_bwseq; 3756#endif /* MEASURE_BW */ 3757} 3758 3759/* 3760 * Return a list of mbuf hdrs that point to clusters. Try for num_needed; 3761 * if wantall is not set, return whatever number were available. Set up the 3762 * first num_with_pkthdrs with mbuf hdrs configured as packet headers; these 3763 * are chained on the m_nextpkt field. Any packets requested beyond this 3764 * are chained onto the last packet header's m_next field. The size of 3765 * the cluster is controlled by the parameter bufsize. 3766 */ 3767__private_extern__ struct mbuf * 3768m_getpackets_internal(unsigned int *num_needed, int num_with_pkthdrs, 3769 int wait, int wantall, size_t bufsize) 3770{ 3771 struct mbuf *m; 3772 struct mbuf **np, *top; 3773 unsigned int pnum, needed = *num_needed; 3774 mcache_obj_t *mp_list = NULL; 3775 int mcflags = MSLEEPF(wait); 3776 u_int32_t flag; 3777 struct ext_ref *rfa; 3778 mcache_t *cp; 3779 void *cl; 3780 3781 ASSERT(bufsize == m_maxsize(MC_CL) || 3782 bufsize == m_maxsize(MC_BIGCL) || 3783 bufsize == m_maxsize(MC_16KCL)); 3784 3785 /* 3786 * Caller must first check for njcl because this 3787 * routine is internal and not exposed/used via KPI. 
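 *
 * For illustration only (hypothetical caller code, not taken from this
 * file), such a check could look like:
 *
 *	bufsize = (njcl > 0 && pktlen > m_maxsize(MC_BIGCL)) ?
 *	    m_maxsize(MC_16KCL) : m_maxsize(MC_BIGCL);
 *	m = m_getpackets_internal(&cnt, 1, M_DONTWAIT, 0, bufsize);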
3788 */ 3789 VERIFY(bufsize != m_maxsize(MC_16KCL) || njcl > 0); 3790 3791 top = NULL; 3792 np = &top; 3793 pnum = 0; 3794 3795 /* 3796 * The caller doesn't want all the requested buffers; only some. 3797 * Try hard to get what we can, but don't block. This effectively 3798 * overrides MCR_SLEEP, since this thread will not go to sleep 3799 * if we can't get all the buffers. 3800 */ 3801 if (!wantall || (mcflags & MCR_NOSLEEP)) 3802 mcflags |= MCR_TRYHARD; 3803 3804 /* Allocate the composite mbuf + cluster elements from the cache */ 3805 if (bufsize == m_maxsize(MC_CL)) 3806 cp = m_cache(MC_MBUF_CL); 3807 else if (bufsize == m_maxsize(MC_BIGCL)) 3808 cp = m_cache(MC_MBUF_BIGCL); 3809 else 3810 cp = m_cache(MC_MBUF_16KCL); 3811 needed = mcache_alloc_ext(cp, &mp_list, needed, mcflags); 3812 3813 for (pnum = 0; pnum < needed; pnum++) { 3814 m = (struct mbuf *)mp_list; 3815 mp_list = mp_list->obj_next; 3816 3817 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT); 3818 cl = m->m_ext.ext_buf; 3819 rfa = MEXT_RFA(m); 3820 3821 ASSERT(cl != NULL && rfa != NULL); 3822 VERIFY(MBUF_IS_COMPOSITE(m)); 3823 3824 flag = MEXT_FLAGS(m); 3825 3826 MBUF_INIT(m, num_with_pkthdrs, MT_DATA); 3827 if (bufsize == m_maxsize(MC_16KCL)) { 3828 MBUF_16KCL_INIT(m, cl, rfa, 1, flag); 3829 } else if (bufsize == m_maxsize(MC_BIGCL)) { 3830 MBUF_BIGCL_INIT(m, cl, rfa, 1, flag); 3831 } else { 3832 MBUF_CL_INIT(m, cl, rfa, 1, flag); 3833 } 3834 3835 if (num_with_pkthdrs > 0) { 3836 --num_with_pkthdrs; 3837#if CONFIG_MACF_NET 3838 if (mac_mbuf_label_init(m, wait) != 0) { 3839 m_freem(m); 3840 break; 3841 } 3842#endif /* MAC_NET */ 3843 } 3844 3845 *np = m; 3846 if (num_with_pkthdrs > 0) 3847 np = &m->m_nextpkt; 3848 else 3849 np = &m->m_next; 3850 } 3851 ASSERT(pnum != *num_needed || mp_list == NULL); 3852 if (mp_list != NULL) 3853 mcache_free_ext(cp, mp_list); 3854 3855 if (pnum > 0) { 3856 mtype_stat_add(MT_DATA, pnum); 3857 mtype_stat_sub(MT_FREE, pnum); 3858 } 3859 3860 if (wantall && (pnum != *num_needed)) { 3861 if (top != NULL) 3862 m_freem_list(top); 3863 return (NULL); 3864 } 3865 3866 if (pnum > *num_needed) { 3867 printf("%s: File a radar related to <rdar://10146739>. \ 3868 needed = %u, pnum = %u, num_needed = %u \n", 3869 __func__, needed, pnum, *num_needed); 3870 } 3871 3872 *num_needed = pnum; 3873 return (top); 3874} 3875 3876/* 3877 * Return a list of mbufs linked by m_nextpkt. Try for numlist, and if 3878 * wantall is not set, return whatever number were available. The size of 3879 * each mbuf in the list is controlled by the parameter packetlen. Each 3880 * mbuf of the list may have a chain of mbufs linked by m_next. Each mbuf 3881 * in the chain is called a segment. If maxsegments is not null and the 3882 * value pointed to is not null, this specifies the maximum number of segments 3883 * for a chain of mbufs. If maxsegments is zero or the value pointed to 3884 * is zero the caller does not have any restriction on the number of segments. 3885 * The actual number of segments of a mbuf chain is returned in the value 3886 * pointed to by maxsegments.
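 *
 * Illustrative call (hypothetical values): to build up to four 1500-byte
 * packets with at most two segments per chain, a caller might do:
 *
 *	unsigned int cnt = 4, maxseg = 2;
 *	struct mbuf *list;
 *
 *	list = m_allocpacket_internal(&cnt, 1500, &maxseg, M_DONTWAIT, 0, 0);
 *
 * On return, cnt holds the number of chains actually built and maxseg
 * the number of segments used per chain.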
3887 */ 3888__private_extern__ struct mbuf * 3889m_allocpacket_internal(unsigned int *numlist, size_t packetlen, 3890 unsigned int *maxsegments, int wait, int wantall, size_t wantsize) 3891{ 3892 struct mbuf **np, *top, *first = NULL; 3893 size_t bufsize, r_bufsize; 3894 unsigned int num = 0; 3895 unsigned int nsegs = 0; 3896 unsigned int needed, resid; 3897 int mcflags = MSLEEPF(wait); 3898 mcache_obj_t *mp_list = NULL, *rmp_list = NULL; 3899 mcache_t *cp = NULL, *rcp = NULL; 3900 3901 if (*numlist == 0) 3902 return (NULL); 3903 3904 top = NULL; 3905 np = &top; 3906 3907 if (wantsize == 0) { 3908 if (packetlen <= MINCLSIZE) { 3909 bufsize = packetlen; 3910 } else if (packetlen > m_maxsize(MC_CL)) { 3911 /* Use 4KB if jumbo cluster pool isn't available */ 3912 if (packetlen <= m_maxsize(MC_BIGCL) || njcl == 0) 3913 bufsize = m_maxsize(MC_BIGCL); 3914 else 3915 bufsize = m_maxsize(MC_16KCL); 3916 } else { 3917 bufsize = m_maxsize(MC_CL); 3918 } 3919 } else if (wantsize == m_maxsize(MC_CL) || 3920 wantsize == m_maxsize(MC_BIGCL) || 3921 (wantsize == m_maxsize(MC_16KCL) && njcl > 0)) { 3922 bufsize = wantsize; 3923 } else { 3924 return (NULL); 3925 } 3926 3927 if (bufsize <= MHLEN) { 3928 nsegs = 1; 3929 } else if (bufsize <= MINCLSIZE) { 3930 if (maxsegments != NULL && *maxsegments == 1) { 3931 bufsize = m_maxsize(MC_CL); 3932 nsegs = 1; 3933 } else { 3934 nsegs = 2; 3935 } 3936 } else if (bufsize == m_maxsize(MC_16KCL)) { 3937 VERIFY(njcl > 0); 3938 nsegs = ((packetlen - 1) >> (PGSHIFT + 2)) + 1; 3939 } else if (bufsize == m_maxsize(MC_BIGCL)) { 3940 nsegs = ((packetlen - 1) >> PGSHIFT) + 1; 3941 } else { 3942 nsegs = ((packetlen - 1) >> MCLSHIFT) + 1; 3943 } 3944 if (maxsegments != NULL) { 3945 if (*maxsegments && nsegs > *maxsegments) { 3946 *maxsegments = nsegs; 3947 return (NULL); 3948 } 3949 *maxsegments = nsegs; 3950 } 3951 3952 /* 3953 * The caller doesn't want all the requested buffers; only some. 3954 * Try hard to get what we can, but don't block. This effectively 3955 * overrides MCR_SLEEP, since this thread will not go to sleep 3956 * if we can't get all the buffers. 3957 */ 3958 if (!wantall || (mcflags & MCR_NOSLEEP)) 3959 mcflags |= MCR_TRYHARD; 3960 3961 /* 3962 * Simple case where all elements in the lists/chains are mbufs. 3963 * Unless bufsize is greater than MHLEN, each segment chain is made 3964 * up of exactly 1 mbuf. Otherwise, each segment chain is made up 3965 * of 2 mbufs; the second one is used for the residual data, i.e. 3966 * the remaining data that cannot fit into the first mbuf. 3967 */ 3968 if (bufsize <= MINCLSIZE) { 3969 /* Allocate the elements in one shot from the mbuf cache */ 3970 ASSERT(bufsize <= MHLEN || nsegs == 2); 3971 cp = m_cache(MC_MBUF); 3972 needed = mcache_alloc_ext(cp, &mp_list, 3973 (*numlist) * nsegs, mcflags); 3974 3975 /* 3976 * The number of elements must be even if we are to use an 3977 * mbuf (instead of a cluster) to store the residual data. 3978 * If we couldn't allocate the requested number of mbufs, 3979 * trim the number down (if it's odd) in order to avoid 3980 * creating a partial segment chain.
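 *
 * For example (illustrative numbers): when bufsize is larger than MHLEN,
 * every chain needs exactly two mbufs, so if the cache hands back an odd
 * count such as 7 we drop it to 6 and build 3 complete chains rather
 * than leave a dangling half-chain.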
3981 */ 3982 if (bufsize > MHLEN && (needed & 0x1)) 3983 needed--; 3984 3985 while (num < needed) { 3986 struct mbuf *m; 3987 3988 m = (struct mbuf *)mp_list; 3989 mp_list = mp_list->obj_next; 3990 ASSERT(m != NULL); 3991 3992 MBUF_INIT(m, 1, MT_DATA); 3993#if CONFIG_MACF_NET 3994 if (mac_init_mbuf(m, wait) != 0) { 3995 m_free(m); 3996 break; 3997 } 3998#endif /* MAC_NET */ 3999 num++; 4000 if (bufsize > MHLEN) { 4001 /* A second mbuf for this segment chain */ 4002 m->m_next = (struct mbuf *)mp_list; 4003 mp_list = mp_list->obj_next; 4004 ASSERT(m->m_next != NULL); 4005 4006 MBUF_INIT(m->m_next, 0, MT_DATA); 4007 num++; 4008 } 4009 *np = m; 4010 np = &m->m_nextpkt; 4011 } 4012 ASSERT(num != *numlist || mp_list == NULL); 4013 4014 if (num > 0) { 4015 mtype_stat_add(MT_DATA, num); 4016 mtype_stat_sub(MT_FREE, num); 4017 } 4018 num /= nsegs; 4019 4020 /* We've got them all; return to caller */ 4021 if (num == *numlist) 4022 return (top); 4023 4024 goto fail; 4025 } 4026 4027 /* 4028 * Complex cases where elements are made up of one or more composite 4029 * mbufs + cluster, depending on packetlen. Each N-segment chain can 4030 * be illustrated as follows: 4031 * 4032 * [mbuf + cluster 1] [mbuf + cluster 2] ... [mbuf + cluster N] 4033 * 4034 * Every composite mbuf + cluster element comes from the intermediate 4035 * cache (either MC_MBUF_CL or MC_MBUF_BIGCL). For space efficiency, 4036 * the last composite element will come from the MC_MBUF_CL cache, 4037 * unless the residual data is larger than 2KB where we use the 4038 * big cluster composite cache (MC_MBUF_BIGCL) instead. Residual 4039 * data is defined as extra data beyond the first element that cannot 4040 * fit into the previous element, i.e. there is no residual data if 4041 * the chain only has 1 segment. 4042 */ 4043 r_bufsize = bufsize; 4044 resid = packetlen > bufsize ? packetlen % bufsize : 0; 4045 if (resid > 0) { 4046 /* There is residual data; figure out the cluster size */ 4047 if (wantsize == 0 && packetlen > MINCLSIZE) { 4048 /* 4049 * Caller didn't request that all of the segments 4050 * in the chain use the same cluster size; use the 4051 * smaller of the cluster sizes. 4052 */ 4053 if (njcl > 0 && resid > m_maxsize(MC_BIGCL)) 4054 r_bufsize = m_maxsize(MC_16KCL); 4055 else if (resid > m_maxsize(MC_CL)) 4056 r_bufsize = m_maxsize(MC_BIGCL); 4057 else 4058 r_bufsize = m_maxsize(MC_CL); 4059 } else { 4060 /* Use the same cluster size as the other segments */ 4061 resid = 0; 4062 } 4063 } 4064 4065 needed = *numlist; 4066 if (resid > 0) { 4067 /* 4068 * Attempt to allocate composite mbuf + cluster elements for 4069 * the residual data in each chain; record the number of such 4070 * elements that can be allocated so that we know how many 4071 * segment chains we can afford to create. 4072 */ 4073 if (r_bufsize <= m_maxsize(MC_CL)) 4074 rcp = m_cache(MC_MBUF_CL); 4075 else if (r_bufsize <= m_maxsize(MC_BIGCL)) 4076 rcp = m_cache(MC_MBUF_BIGCL); 4077 else 4078 rcp = m_cache(MC_MBUF_16KCL); 4079 needed = mcache_alloc_ext(rcp, &rmp_list, *numlist, mcflags); 4080 4081 if (needed == 0) 4082 goto fail; 4083 4084 /* This is temporarily reduced for calculation */ 4085 ASSERT(nsegs > 1); 4086 nsegs--; 4087 } 4088 4089 /* 4090 * Attempt to allocate the rest of the composite mbuf + cluster 4091 * elements for the number of segment chains that we need. 
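 *
 * Worked example (illustrative sizes, assuming 4KB pages, 4KB big
 * clusters and 16KB jumbo clusters): packetlen = 20KB with wantsize = 0
 * gives bufsize = 16KB, nsegs = 2 and resid = 4KB; the residual elements
 * were taken from MC_MBUF_BIGCL above, one 16KB composite element per
 * chain is allocated here, and each finished chain ends up carrying
 * 16KB + 4KB of cluster space.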
4092 */ 4093 if (bufsize <= m_maxsize(MC_CL)) 4094 cp = m_cache(MC_MBUF_CL); 4095 else if (bufsize <= m_maxsize(MC_BIGCL)) 4096 cp = m_cache(MC_MBUF_BIGCL); 4097 else 4098 cp = m_cache(MC_MBUF_16KCL); 4099 needed = mcache_alloc_ext(cp, &mp_list, needed * nsegs, mcflags); 4100 4101 /* Round it down to avoid creating a partial segment chain */ 4102 needed = (needed / nsegs) * nsegs; 4103 if (needed == 0) 4104 goto fail; 4105 4106 if (resid > 0) { 4107 /* 4108 * We're about to construct the chain(s); take into account 4109 * the number of segments we have created above to hold the 4110 * residual data for each chain, as well as restore the 4111 * original count of segments per chain. 4112 */ 4113 ASSERT(nsegs > 0); 4114 needed += needed / nsegs; 4115 nsegs++; 4116 } 4117 4118 for (;;) { 4119 struct mbuf *m; 4120 u_int32_t flag; 4121 struct ext_ref *rfa; 4122 void *cl; 4123 int pkthdr; 4124 4125 ++num; 4126 if (nsegs == 1 || (num % nsegs) != 0 || resid == 0) { 4127 m = (struct mbuf *)mp_list; 4128 mp_list = mp_list->obj_next; 4129 } else { 4130 m = (struct mbuf *)rmp_list; 4131 rmp_list = rmp_list->obj_next; 4132 } 4133 ASSERT(m != NULL); 4134 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT); 4135 VERIFY(m->m_ext.ext_free == NULL || 4136 m->m_ext.ext_free == m_bigfree || 4137 m->m_ext.ext_free == m_16kfree); 4138 4139 cl = m->m_ext.ext_buf; 4140 rfa = MEXT_RFA(m); 4141 4142 ASSERT(cl != NULL && rfa != NULL); 4143 VERIFY(MBUF_IS_COMPOSITE(m)); 4144 4145 flag = MEXT_FLAGS(m); 4146 4147 pkthdr = (nsegs == 1 || (num % nsegs) == 1); 4148 if (pkthdr) 4149 first = m; 4150 MBUF_INIT(m, pkthdr, MT_DATA); 4151 if (m->m_ext.ext_free == m_16kfree) { 4152 MBUF_16KCL_INIT(m, cl, rfa, 1, flag); 4153 } else if (m->m_ext.ext_free == m_bigfree) { 4154 MBUF_BIGCL_INIT(m, cl, rfa, 1, flag); 4155 } else { 4156 MBUF_CL_INIT(m, cl, rfa, 1, flag); 4157 } 4158#if CONFIG_MACF_NET 4159 if (pkthdr && mac_init_mbuf(m, wait) != 0) { 4160 --num; 4161 m_freem(m); 4162 break; 4163 } 4164#endif /* MAC_NET */ 4165 4166 *np = m; 4167 if ((num % nsegs) == 0) 4168 np = &first->m_nextpkt; 4169 else 4170 np = &m->m_next; 4171 4172 if (num == needed) 4173 break; 4174 } 4175 4176 if (num > 0) { 4177 mtype_stat_add(MT_DATA, num); 4178 mtype_stat_sub(MT_FREE, num); 4179 } 4180 4181 num /= nsegs; 4182 4183 /* We've got them all; return to caller */ 4184 if (num == *numlist) { 4185 ASSERT(mp_list == NULL && rmp_list == NULL); 4186 return (top); 4187 } 4188 4189fail: 4190 /* Free up what's left of the above */ 4191 if (mp_list != NULL) 4192 mcache_free_ext(cp, mp_list); 4193 if (rmp_list != NULL) 4194 mcache_free_ext(rcp, rmp_list); 4195 if (wantall && top != NULL) { 4196 m_freem(top); 4197 return (NULL); 4198 } 4199 *numlist = num; 4200 return (top); 4201} 4202 4203/* 4204 * Best effort to get a mbuf cluster + pkthdr. Used by drivers to allocate 4205 * packets on the receive ring. 4206 */ 4207__private_extern__ struct mbuf * 4208m_getpacket_how(int wait) 4209{ 4210 unsigned int num_needed = 1; 4211 4212 return (m_getpackets_internal(&num_needed, 1, wait, 1, 4213 m_maxsize(MC_CL))); 4214} 4215 4216/* 4217 * Best effort to get a mbuf cluster + pkthdr. Used by drivers to allocate 4218 * packets on the receive ring. 4219 */ 4220struct mbuf * 4221m_getpacket(void) 4222{ 4223 unsigned int num_needed = 1; 4224 4225 return (m_getpackets_internal(&num_needed, 1, M_WAIT, 1, 4226 m_maxsize(MC_CL))); 4227} 4228 4229/* 4230 * Return a list of mbuf hdrs that point to clusters.
Try for num_needed; 4231 * if this can't be met, return whatever number were available. Set up the 4232 * first num_with_pkthdrs with mbuf hdrs configured as packet headers. These 4233 * are chained on the m_nextpkt field. Any packets requested beyond this are 4234 * chained onto the last packet header's m_next field. 4235 */ 4236struct mbuf * 4237m_getpackets(int num_needed, int num_with_pkthdrs, int how) 4238{ 4239 unsigned int n = num_needed; 4240 4241 return (m_getpackets_internal(&n, num_with_pkthdrs, how, 0, 4242 m_maxsize(MC_CL))); 4243} 4244 4245/* 4246 * Return a list of mbuf hdrs set up as packet hdrs chained together 4247 * on the m_nextpkt field. 4248 */ 4249struct mbuf * 4250m_getpackethdrs(int num_needed, int how) 4251{ 4252 struct mbuf *m; 4253 struct mbuf **np, *top; 4254 4255 top = NULL; 4256 np = &top; 4257 4258 while (num_needed--) { 4259 m = _M_RETRYHDR(how, MT_DATA); 4260 if (m == NULL) 4261 break; 4262 4263 *np = m; 4264 np = &m->m_nextpkt; 4265 } 4266 4267 return (top); 4268} 4269 4270/* 4271 * Free an mbuf list (m_nextpkt) while following m_next. Returns the 4272 * number of packets freed. Used by the drivers. 4273 */ 4274int 4275m_freem_list(struct mbuf *m) 4276{ 4277 struct mbuf *nextpkt; 4278 mcache_obj_t *mp_list = NULL; 4279 mcache_obj_t *mcl_list = NULL; 4280 mcache_obj_t *mbc_list = NULL; 4281 mcache_obj_t *m16k_list = NULL; 4282 mcache_obj_t *m_mcl_list = NULL; 4283 mcache_obj_t *m_mbc_list = NULL; 4284 mcache_obj_t *m_m16k_list = NULL; 4285 mcache_obj_t *ref_list = NULL; 4286 int pktcount = 0; 4287 int mt_free = 0, mt_data = 0, mt_header = 0, mt_soname = 0, mt_tag = 0; 4288 4289 while (m != NULL) { 4290 pktcount++; 4291 4292 nextpkt = m->m_nextpkt; 4293 m->m_nextpkt = NULL; 4294 4295 while (m != NULL) { 4296 struct mbuf *next = m->m_next; 4297 mcache_obj_t *o, *rfa; 4298 u_int32_t refcnt, composite; 4299 4300 if (m->m_type == MT_FREE) 4301 panic("m_free: freeing an already freed mbuf"); 4302 4303 if (m->m_type != MT_FREE) 4304 mt_free++; 4305 4306 if (m->m_flags & M_PKTHDR) { 4307 /* Check for scratch area overflow */ 4308 m_redzone_verify(m); 4309 /* Free the aux data and tags if there is any */ 4310 m_tag_delete_chain(m, NULL); 4311 } 4312 4313 if (!(m->m_flags & M_EXT)) 4314 goto simple_free; 4315 4316 o = (mcache_obj_t *)(void *)m->m_ext.ext_buf; 4317 refcnt = m_decref(m); 4318 composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE); 4319 if (refcnt == 0 && !composite) { 4320 if (m->m_ext.ext_free == NULL) { 4321 o->obj_next = mcl_list; 4322 mcl_list = o; 4323 } else if (m->m_ext.ext_free == m_bigfree) { 4324 o->obj_next = mbc_list; 4325 mbc_list = o; 4326 } else if (m->m_ext.ext_free == m_16kfree) { 4327 o->obj_next = m16k_list; 4328 m16k_list = o; 4329 } else { 4330 (*(m->m_ext.ext_free))((caddr_t)o, 4331 m->m_ext.ext_size, 4332 m->m_ext.ext_arg); 4333 } 4334 rfa = (mcache_obj_t *)(void *)MEXT_RFA(m); 4335 rfa->obj_next = ref_list; 4336 ref_list = rfa; 4337 MEXT_RFA(m) = NULL; 4338 } else if (refcnt == 0 && composite) { 4339 VERIFY(m->m_type != MT_FREE); 4340 /* 4341 * Amortize the costs of atomic operations 4342 * by doing them at the end, if possible.
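 *
 * (For example, freeing a chain of 100 MT_DATA mbufs results in a single
 * mtype_stat_sub(MT_DATA, 100) after the loop instead of 100 separate
 * atomic updates.)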
4343 */ 4344 if (m->m_type == MT_DATA) 4345 mt_data++; 4346 else if (m->m_type == MT_HEADER) 4347 mt_header++; 4348 else if (m->m_type == MT_SONAME) 4349 mt_soname++; 4350 else if (m->m_type == MT_TAG) 4351 mt_tag++; 4352 else 4353 mtype_stat_dec(m->m_type); 4354 4355 m->m_type = MT_FREE; 4356 m->m_flags = M_EXT; 4357 m->m_len = 0; 4358 m->m_next = m->m_nextpkt = NULL; 4359 4360 MEXT_FLAGS(m) &= ~EXTF_READONLY; 4361 4362 /* "Free" into the intermediate cache */ 4363 o = (mcache_obj_t *)m; 4364 if (m->m_ext.ext_free == NULL) { 4365 o->obj_next = m_mcl_list; 4366 m_mcl_list = o; 4367 } else if (m->m_ext.ext_free == m_bigfree) { 4368 o->obj_next = m_mbc_list; 4369 m_mbc_list = o; 4370 } else { 4371 VERIFY(m->m_ext.ext_free == m_16kfree); 4372 o->obj_next = m_m16k_list; 4373 m_m16k_list = o; 4374 } 4375 m = next; 4376 continue; 4377 } 4378simple_free: 4379 /* 4380 * Amortize the costs of atomic operations 4381 * by doing them at the end, if possible. 4382 */ 4383 if (m->m_type == MT_DATA) 4384 mt_data++; 4385 else if (m->m_type == MT_HEADER) 4386 mt_header++; 4387 else if (m->m_type == MT_SONAME) 4388 mt_soname++; 4389 else if (m->m_type == MT_TAG) 4390 mt_tag++; 4391 else if (m->m_type != MT_FREE) 4392 mtype_stat_dec(m->m_type); 4393 4394 m->m_type = MT_FREE; 4395 m->m_flags = m->m_len = 0; 4396 m->m_next = m->m_nextpkt = NULL; 4397 4398 ((mcache_obj_t *)m)->obj_next = mp_list; 4399 mp_list = (mcache_obj_t *)m; 4400 4401 m = next; 4402 } 4403 4404 m = nextpkt; 4405 } 4406 4407 if (mt_free > 0) 4408 mtype_stat_add(MT_FREE, mt_free); 4409 if (mt_data > 0) 4410 mtype_stat_sub(MT_DATA, mt_data); 4411 if (mt_header > 0) 4412 mtype_stat_sub(MT_HEADER, mt_header); 4413 if (mt_soname > 0) 4414 mtype_stat_sub(MT_SONAME, mt_soname); 4415 if (mt_tag > 0) 4416 mtype_stat_sub(MT_TAG, mt_tag); 4417 4418 if (mp_list != NULL) 4419 mcache_free_ext(m_cache(MC_MBUF), mp_list); 4420 if (mcl_list != NULL) 4421 mcache_free_ext(m_cache(MC_CL), mcl_list); 4422 if (mbc_list != NULL) 4423 mcache_free_ext(m_cache(MC_BIGCL), mbc_list); 4424 if (m16k_list != NULL) 4425 mcache_free_ext(m_cache(MC_16KCL), m16k_list); 4426 if (m_mcl_list != NULL) 4427 mcache_free_ext(m_cache(MC_MBUF_CL), m_mcl_list); 4428 if (m_mbc_list != NULL) 4429 mcache_free_ext(m_cache(MC_MBUF_BIGCL), m_mbc_list); 4430 if (m_m16k_list != NULL) 4431 mcache_free_ext(m_cache(MC_MBUF_16KCL), m_m16k_list); 4432 if (ref_list != NULL) 4433 mcache_free_ext(ref_cache, ref_list); 4434 4435 return (pktcount); 4436} 4437 4438void 4439m_freem(struct mbuf *m) 4440{ 4441 while (m != NULL) 4442 m = m_free(m); 4443} 4444 4445/* 4446 * Mbuffer utility routines. 4447 */ 4448 4449/* 4450 * Compute the amount of space available before the current start 4451 * of data in an mbuf. 4452 */ 4453int 4454m_leadingspace(struct mbuf *m) 4455{ 4456 if (m->m_flags & M_EXT) { 4457 if (MCLHASREFERENCE(m)) 4458 return (0); 4459 return (m->m_data - m->m_ext.ext_buf); 4460 } 4461 if (m->m_flags & M_PKTHDR) 4462 return (m->m_data - m->m_pktdat); 4463 return (m->m_data - m->m_dat); 4464} 4465 4466/* 4467 * Compute the amount of space available after the end of data in an mbuf. 4468 */ 4469int 4470m_trailingspace(struct mbuf *m) 4471{ 4472 if (m->m_flags & M_EXT) { 4473 if (MCLHASREFERENCE(m)) 4474 return (0); 4475 return (m->m_ext.ext_buf + m->m_ext.ext_size - 4476 (m->m_data + m->m_len)); 4477 } 4478 return (&m->m_dat[MLEN] - (m->m_data + m->m_len)); 4479} 4480 4481/* 4482 * Lesser-used path for M_PREPEND: allocate new mbuf to prepend to chain, 4483 * copy junk along. 
Does not adjust packet header length. 4484 */ 4485struct mbuf * 4486m_prepend(struct mbuf *m, int len, int how) 4487{ 4488 struct mbuf *mn; 4489 4490 _MGET(mn, how, m->m_type); 4491 if (mn == NULL) { 4492 m_freem(m); 4493 return (NULL); 4494 } 4495 if (m->m_flags & M_PKTHDR) { 4496 M_COPY_PKTHDR(mn, m); 4497 m->m_flags &= ~M_PKTHDR; 4498 } 4499 mn->m_next = m; 4500 m = mn; 4501 if (len < MHLEN) 4502 MH_ALIGN(m, len); 4503 m->m_len = len; 4504 return (m); 4505} 4506 4507/* 4508 * Replacement for old M_PREPEND macro: allocate new mbuf to prepend to 4509 * chain, copy junk along, and adjust length. 4510 */ 4511struct mbuf * 4512m_prepend_2(struct mbuf *m, int len, int how) 4513{ 4514 if (M_LEADINGSPACE(m) >= len) { 4515 m->m_data -= len; 4516 m->m_len += len; 4517 } else { 4518 m = m_prepend(m, len, how); 4519 } 4520 if ((m) && (m->m_flags & M_PKTHDR)) 4521 m->m_pkthdr.len += len; 4522 return (m); 4523} 4524 4525/* 4526 * Make a copy of an mbuf chain starting "off0" bytes from the beginning, 4527 * continuing for "len" bytes. If len is M_COPYALL, copy to end of mbuf. 4528 * The wait parameter is a choice of M_WAIT/M_DONTWAIT from caller. 4529 */ 4530int MCFail; 4531 4532struct mbuf * 4533m_copym_mode(struct mbuf *m, int off0, int len, int wait, uint32_t mode) 4534{ 4535 struct mbuf *n, *mhdr = NULL, **np; 4536 int off = off0; 4537 struct mbuf *top; 4538 int copyhdr = 0; 4539 4540 if (off < 0 || len < 0) 4541 panic("m_copym: invalid offset %d or len %d", off, len); 4542 4543 VERIFY((mode != M_COPYM_MUST_COPY_HDR && 4544 mode != M_COPYM_MUST_MOVE_HDR) || (m->m_flags & M_PKTHDR)); 4545 4546 if ((off == 0 && (m->m_flags & M_PKTHDR)) || 4547 mode == M_COPYM_MUST_COPY_HDR || mode == M_COPYM_MUST_MOVE_HDR) { 4548 mhdr = m; 4549 copyhdr = 1; 4550 } 4551 4552 while (off >= m->m_len) { 4553 if (m->m_next == NULL) 4554 panic("m_copym: invalid mbuf chain"); 4555 off -= m->m_len; 4556 m = m->m_next; 4557 } 4558 np = ⊤ 4559 top = NULL; 4560 4561 while (len > 0) { 4562 if (m == NULL) { 4563 if (len != M_COPYALL) 4564 panic("m_copym: len != M_COPYALL"); 4565 break; 4566 } 4567 4568 if (copyhdr) 4569 n = _M_RETRYHDR(wait, m->m_type); 4570 else 4571 n = _M_RETRY(wait, m->m_type); 4572 *np = n; 4573 4574 if (n == NULL) 4575 goto nospace; 4576 4577 if (copyhdr != 0) { 4578 if ((mode == M_COPYM_MOVE_HDR) || 4579 (mode == M_COPYM_MUST_MOVE_HDR)) { 4580 M_COPY_PKTHDR(n, mhdr); 4581 } else if ((mode == M_COPYM_COPY_HDR) || 4582 (mode == M_COPYM_MUST_COPY_HDR)) { 4583 if (m_dup_pkthdr(n, mhdr, wait) == 0) 4584 goto nospace; 4585 } 4586 if (len == M_COPYALL) 4587 n->m_pkthdr.len -= off0; 4588 else 4589 n->m_pkthdr.len = len; 4590 copyhdr = 0; 4591 /* 4592 * There is data to copy from the packet header mbuf 4593 * if it is empty or it is before the starting offset 4594 */ 4595 if (mhdr != m) { 4596 np = &n->m_next; 4597 continue; 4598 } 4599 } 4600 n->m_len = MIN(len, (m->m_len - off)); 4601 if (m->m_flags & M_EXT) { 4602 n->m_ext = m->m_ext; 4603 m_incref(m); 4604 n->m_data = m->m_data + off; 4605 n->m_flags |= M_EXT; 4606 } else { 4607 /* 4608 * Limit to the capacity of the destination 4609 */ 4610 if (n->m_flags & M_PKTHDR) 4611 n->m_len = MIN(n->m_len, MHLEN); 4612 else 4613 n->m_len = MIN(n->m_len, MLEN); 4614 4615 if (MTOD(n, char *) + n->m_len > ((char *)n) + MSIZE) 4616 panic("%s n %p copy overflow", 4617 __func__, n); 4618 4619 bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t), 4620 (unsigned)n->m_len); 4621 } 4622 if (len != M_COPYALL) 4623 len -= n->m_len; 4624 off = 0; 4625 m = m->m_next; 4626 np = &n->m_next; 
4627 } 4628 4629 if (top == NULL) 4630 MCFail++; 4631 4632 return (top); 4633nospace: 4634 4635 m_freem(top); 4636 MCFail++; 4637 return (NULL); 4638} 4639 4640 4641struct mbuf * 4642m_copym(struct mbuf *m, int off0, int len, int wait) 4643{ 4644 return (m_copym_mode(m, off0, len, wait, M_COPYM_MOVE_HDR)); 4645} 4646 4647/* 4648 * Equivalent to m_copym except that all necessary mbuf hdrs are allocated 4649 * within this routine also, the last mbuf and offset accessed are passed 4650 * out and can be passed back in to avoid having to rescan the entire mbuf 4651 * list (normally hung off of the socket) 4652 */ 4653struct mbuf * 4654m_copym_with_hdrs(struct mbuf *m0, int off0, int len0, int wait, 4655 struct mbuf **m_lastm, int *m_off, uint32_t mode) 4656{ 4657 struct mbuf *m = m0, *n, **np = NULL; 4658 int off = off0, len = len0; 4659 struct mbuf *top = NULL; 4660 int mcflags = MSLEEPF(wait); 4661 int copyhdr = 0; 4662 int type = 0; 4663 mcache_obj_t *list = NULL; 4664 int needed = 0; 4665 4666 if (off == 0 && (m->m_flags & M_PKTHDR)) 4667 copyhdr = 1; 4668 4669 if (m_lastm != NULL && *m_lastm != NULL) { 4670 m = *m_lastm; 4671 off = *m_off; 4672 } else { 4673 while (off >= m->m_len) { 4674 off -= m->m_len; 4675 m = m->m_next; 4676 } 4677 } 4678 4679 n = m; 4680 while (len > 0) { 4681 needed++; 4682 ASSERT(n != NULL); 4683 len -= MIN(len, (n->m_len - ((needed == 1) ? off : 0))); 4684 n = n->m_next; 4685 } 4686 needed++; 4687 len = len0; 4688 4689 /* 4690 * If the caller doesn't want to be put to sleep, mark it with 4691 * MCR_TRYHARD so that we may reclaim buffers from other places 4692 * before giving up. 4693 */ 4694 if (mcflags & MCR_NOSLEEP) 4695 mcflags |= MCR_TRYHARD; 4696 4697 if (mcache_alloc_ext(m_cache(MC_MBUF), &list, needed, 4698 mcflags) != needed) 4699 goto nospace; 4700 4701 needed = 0; 4702 while (len > 0) { 4703 n = (struct mbuf *)list; 4704 list = list->obj_next; 4705 ASSERT(n != NULL && m != NULL); 4706 4707 type = (top == NULL) ? 
MT_HEADER : m->m_type; 4708 MBUF_INIT(n, (top == NULL), type); 4709#if CONFIG_MACF_NET 4710 if (top == NULL && mac_mbuf_label_init(n, wait) != 0) { 4711 mtype_stat_inc(MT_HEADER); 4712 mtype_stat_dec(MT_FREE); 4713 m_free(n); 4714 goto nospace; 4715 } 4716#endif /* MAC_NET */ 4717 4718 if (top == NULL) { 4719 top = n; 4720 np = &top->m_next; 4721 continue; 4722 } else { 4723 needed++; 4724 *np = n; 4725 } 4726 4727 if (copyhdr) { 4728 if ((mode == M_COPYM_MOVE_HDR) || 4729 (mode == M_COPYM_MUST_MOVE_HDR)) { 4730 M_COPY_PKTHDR(n, m); 4731 } else if ((mode == M_COPYM_COPY_HDR) || 4732 (mode == M_COPYM_MUST_COPY_HDR)) { 4733 if (m_dup_pkthdr(n, m, wait) == 0) 4734 goto nospace; 4735 } 4736 n->m_pkthdr.len = len; 4737 copyhdr = 0; 4738 } 4739 n->m_len = MIN(len, (m->m_len - off)); 4740 4741 if (m->m_flags & M_EXT) { 4742 n->m_ext = m->m_ext; 4743 m_incref(m); 4744 n->m_data = m->m_data + off; 4745 n->m_flags |= M_EXT; 4746 } else { 4747 if (MTOD(n, char *) + n->m_len > ((char *)n) + MSIZE) 4748 panic("%s n %p copy overflow", 4749 __func__, n); 4750 4751 bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t), 4752 (unsigned)n->m_len); 4753 } 4754 len -= n->m_len; 4755 4756 if (len == 0) { 4757 if (m_lastm != NULL && m_off != NULL) { 4758 if ((off + n->m_len) == m->m_len) { 4759 *m_lastm = m->m_next; 4760 *m_off = 0; 4761 } else { 4762 *m_lastm = m; 4763 *m_off = off + n->m_len; 4764 } 4765 } 4766 break; 4767 } 4768 off = 0; 4769 m = m->m_next; 4770 np = &n->m_next; 4771 } 4772 4773 mtype_stat_inc(MT_HEADER); 4774 mtype_stat_add(type, needed); 4775 mtype_stat_sub(MT_FREE, needed + 1); 4776 4777 ASSERT(list == NULL); 4778 return (top); 4779 4780nospace: 4781 if (list != NULL) 4782 mcache_free_ext(m_cache(MC_MBUF), list); 4783 if (top != NULL) 4784 m_freem(top); 4785 MCFail++; 4786 return (NULL); 4787} 4788 4789/* 4790 * Copy data from an mbuf chain starting "off" bytes from the beginning, 4791 * continuing for "len" bytes, into the indicated buffer. 4792 */ 4793void 4794m_copydata(struct mbuf *m, int off, int len, void *vp) 4795{ 4796 unsigned count; 4797 char *cp = vp; 4798 4799 if (off < 0 || len < 0) 4800 panic("m_copydata: invalid offset %d or len %d", off, len); 4801 4802 while (off > 0) { 4803 if (m == NULL) 4804 panic("m_copydata: invalid mbuf chain"); 4805 if (off < m->m_len) 4806 break; 4807 off -= m->m_len; 4808 m = m->m_next; 4809 } 4810 while (len > 0) { 4811 if (m == NULL) 4812 panic("m_copydata: invalid mbuf chain"); 4813 count = MIN(m->m_len - off, len); 4814 bcopy(MTOD(m, caddr_t) + off, cp, count); 4815 len -= count; 4816 cp += count; 4817 off = 0; 4818 m = m->m_next; 4819 } 4820} 4821 4822/* 4823 * Concatenate mbuf chain n to m. Both chains must be of the same type 4824 * (e.g. MT_DATA). Any m_pkthdr is not updated. 4825 */ 4826void 4827m_cat(struct mbuf *m, struct mbuf *n) 4828{ 4829 while (m->m_next) 4830 m = m->m_next; 4831 while (n) { 4832 if ((m->m_flags & M_EXT) || 4833 m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) { 4834 /* just join the two chains */ 4835 m->m_next = n; 4836 return; 4837 } 4838 /* splat the data from one into the other */ 4839 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len, 4840 (u_int)n->m_len); 4841 m->m_len += n->m_len; 4842 n = m_free(n); 4843 } 4844} 4845 4846void 4847m_adj(struct mbuf *mp, int req_len) 4848{ 4849 int len = req_len; 4850 struct mbuf *m; 4851 int count; 4852 4853 if ((m = mp) == NULL) 4854 return; 4855 if (len >= 0) { 4856 /* 4857 * Trim from head. 
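 *
 * For instance, m_adj(m0, 14) would strip a 14-byte link-layer header
 * from the front of the chain; a negative length (e.g. m_adj(m0, -4))
 * trims from the tail instead and is handled in the else branch below.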
4858 */ 4859 while (m != NULL && len > 0) { 4860 if (m->m_len <= len) { 4861 len -= m->m_len; 4862 m->m_len = 0; 4863 m = m->m_next; 4864 } else { 4865 m->m_len -= len; 4866 m->m_data += len; 4867 len = 0; 4868 } 4869 } 4870 m = mp; 4871 if (m->m_flags & M_PKTHDR) 4872 m->m_pkthdr.len -= (req_len - len); 4873 } else { 4874 /* 4875 * Trim from tail. Scan the mbuf chain, 4876 * calculating its length and finding the last mbuf. 4877 * If the adjustment only affects this mbuf, then just 4878 * adjust and return. Otherwise, rescan and truncate 4879 * after the remaining size. 4880 */ 4881 len = -len; 4882 count = 0; 4883 for (;;) { 4884 count += m->m_len; 4885 if (m->m_next == (struct mbuf *)0) 4886 break; 4887 m = m->m_next; 4888 } 4889 if (m->m_len >= len) { 4890 m->m_len -= len; 4891 m = mp; 4892 if (m->m_flags & M_PKTHDR) 4893 m->m_pkthdr.len -= len; 4894 return; 4895 } 4896 count -= len; 4897 if (count < 0) 4898 count = 0; 4899 /* 4900 * Correct length for chain is "count". 4901 * Find the mbuf with last data, adjust its length, 4902 * and toss data from remaining mbufs on chain. 4903 */ 4904 m = mp; 4905 if (m->m_flags & M_PKTHDR) 4906 m->m_pkthdr.len = count; 4907 for (; m; m = m->m_next) { 4908 if (m->m_len >= count) { 4909 m->m_len = count; 4910 break; 4911 } 4912 count -= m->m_len; 4913 } 4914 while ((m = m->m_next)) 4915 m->m_len = 0; 4916 } 4917} 4918 4919/* 4920 * Rearrange an mbuf chain so that len bytes are contiguous 4921 * and in the data area of an mbuf (so that mtod and dtom 4922 * will work for a structure of size len). Returns the resulting 4923 * mbuf chain on success, frees it and returns null on failure. 4924 * If there is room, it will add up to max_protohdr-len extra bytes to the 4925 * contiguous region in an attempt to avoid being called next time. 4926 */ 4927int MPFail; 4928 4929struct mbuf * 4930m_pullup(struct mbuf *n, int len) 4931{ 4932 struct mbuf *m; 4933 int count; 4934 int space; 4935 4936 /* 4937 * If first mbuf has no cluster, and has room for len bytes 4938 * without shifting current data, pullup into it, 4939 * otherwise allocate a new mbuf to prepend to the chain. 4940 */ 4941 if ((n->m_flags & M_EXT) == 0 && 4942 n->m_data + len < &n->m_dat[MLEN] && n->m_next) { 4943 if (n->m_len >= len) 4944 return (n); 4945 m = n; 4946 n = n->m_next; 4947 len -= m->m_len; 4948 } else { 4949 if (len > MHLEN) 4950 goto bad; 4951 _MGET(m, M_DONTWAIT, n->m_type); 4952 if (m == 0) 4953 goto bad; 4954 m->m_len = 0; 4955 if (n->m_flags & M_PKTHDR) { 4956 M_COPY_PKTHDR(m, n); 4957 n->m_flags &= ~M_PKTHDR; 4958 } 4959 } 4960 space = &m->m_dat[MLEN] - (m->m_data + m->m_len); 4961 do { 4962 count = MIN(MIN(MAX(len, max_protohdr), space), n->m_len); 4963 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len, 4964 (unsigned)count); 4965 len -= count; 4966 m->m_len += count; 4967 n->m_len -= count; 4968 space -= count; 4969 if (n->m_len) 4970 n->m_data += count; 4971 else 4972 n = m_free(n); 4973 } while (len > 0 && n); 4974 if (len > 0) { 4975 (void) m_free(m); 4976 goto bad; 4977 } 4978 m->m_next = n; 4979 return (m); 4980bad: 4981 m_freem(n); 4982 MPFail++; 4983 return (0); 4984} 4985 4986/* 4987 * Like m_pullup(), except a new mbuf is always allocated, and we allow 4988 * the amount of empty space before the data in the new mbuf to be specified 4989 * (in the event that the caller expects to prepend later).
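 *
 * Illustrative use (hypothetical caller; struct ip is only an example
 * payload header): a protocol that wants a contiguous header now but
 * expects to prepend a link-layer header later might do:
 *
 *	m = m_copyup(m, sizeof (struct ip), max_linkhdr);
 *	if (m == NULL)
 *		return (ENOBUFS);
 *
 * On failure the original chain has already been freed, per the bad:
 * label below.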
4990 */ 4991__private_extern__ int MSFail = 0; 4992 4993__private_extern__ struct mbuf * 4994m_copyup(struct mbuf *n, int len, int dstoff) 4995{ 4996 struct mbuf *m; 4997 int count, space; 4998 4999 if (len > (MHLEN - dstoff)) 5000 goto bad; 5001 MGET(m, M_DONTWAIT, n->m_type); 5002 if (m == NULL) 5003 goto bad; 5004 m->m_len = 0; 5005 if (n->m_flags & M_PKTHDR) { 5006 m_copy_pkthdr(m, n); 5007 n->m_flags &= ~M_PKTHDR; 5008 } 5009 m->m_data += dstoff; 5010 space = &m->m_dat[MLEN] - (m->m_data + m->m_len); 5011 do { 5012 count = min(min(max(len, max_protohdr), space), n->m_len); 5013 memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t), 5014 (unsigned)count); 5015 len -= count; 5016 m->m_len += count; 5017 n->m_len -= count; 5018 space -= count; 5019 if (n->m_len) 5020 n->m_data += count; 5021 else 5022 n = m_free(n); 5023 } while (len > 0 && n); 5024 if (len > 0) { 5025 (void) m_free(m); 5026 goto bad; 5027 } 5028 m->m_next = n; 5029 return (m); 5030bad: 5031 m_freem(n); 5032 MSFail++; 5033 return (NULL); 5034} 5035 5036/* 5037 * Partition an mbuf chain in two pieces, returning the tail -- 5038 * all but the first len0 bytes. In case of failure, it returns NULL and 5039 * attempts to restore the chain to its original state. 5040 */ 5041struct mbuf * 5042m_split(struct mbuf *m0, int len0, int wait) 5043{ 5044 return (m_split0(m0, len0, wait, 1)); 5045} 5046 5047static struct mbuf * 5048m_split0(struct mbuf *m0, int len0, int wait, int copyhdr) 5049{ 5050 struct mbuf *m, *n; 5051 unsigned len = len0, remain; 5052 5053 for (m = m0; m && len > m->m_len; m = m->m_next) 5054 len -= m->m_len; 5055 if (m == NULL) 5056 return (NULL); 5057 remain = m->m_len - len; 5058 if (copyhdr && (m0->m_flags & M_PKTHDR)) { 5059 _MGETHDR(n, wait, m0->m_type); 5060 if (n == NULL) 5061 return (NULL); 5062 n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif; 5063 n->m_pkthdr.len = m0->m_pkthdr.len - len0; 5064 m0->m_pkthdr.len = len0; 5065 if (m->m_flags & M_EXT) 5066 goto extpacket; 5067 if (remain > MHLEN) { 5068 /* m can't be the lead packet */ 5069 MH_ALIGN(n, 0); 5070 n->m_next = m_split(m, len, wait); 5071 if (n->m_next == NULL) { 5072 (void) m_free(n); 5073 return (NULL); 5074 } else 5075 return (n); 5076 } else 5077 MH_ALIGN(n, remain); 5078 } else if (remain == 0) { 5079 n = m->m_next; 5080 m->m_next = NULL; 5081 return (n); 5082 } else { 5083 _MGET(n, wait, m->m_type); 5084 if (n == NULL) 5085 return (NULL); 5086 M_ALIGN(n, remain); 5087 } 5088extpacket: 5089 if (m->m_flags & M_EXT) { 5090 n->m_flags |= M_EXT; 5091 n->m_ext = m->m_ext; 5092 m_incref(m); 5093 n->m_data = m->m_data + len; 5094 } else { 5095 bcopy(MTOD(m, caddr_t) + len, MTOD(n, caddr_t), remain); 5096 } 5097 n->m_len = remain; 5098 m->m_len = len; 5099 n->m_next = m->m_next; 5100 m->m_next = NULL; 5101 return (n); 5102} 5103 5104/* 5105 * Routine to copy from device local memory into mbufs. 5106 */ 5107struct mbuf * 5108m_devget(char *buf, int totlen, int off0, struct ifnet *ifp, 5109 void (*copy)(const void *, void *, size_t)) 5110{ 5111 struct mbuf *m; 5112 struct mbuf *top = NULL, **mp = ⊤ 5113 int off = off0, len; 5114 char *cp; 5115 char *epkt; 5116 5117 cp = buf; 5118 epkt = cp + totlen; 5119 if (off) { 5120 /* 5121 * If 'off' is non-zero, packet is trailer-encapsulated, 5122 * so we have to skip the type and length fields. 
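 *
 * (Trailer encapsulation is essentially historical; a typical driver
 * passes off = 0, e.g. m_devget(frame, totlen, 0, ifp, NULL) with frame
 * pointing at its receive buffer, in which case nothing is skipped here.)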
5123 */ 5124 cp += off + 2 * sizeof (u_int16_t); 5125 totlen -= 2 * sizeof (u_int16_t); 5126 } 5127 _MGETHDR(m, M_DONTWAIT, MT_DATA); 5128 if (m == NULL) 5129 return (NULL); 5130 m->m_pkthdr.rcvif = ifp; 5131 m->m_pkthdr.len = totlen; 5132 m->m_len = MHLEN; 5133 5134 while (totlen > 0) { 5135 if (top != NULL) { 5136 _MGET(m, M_DONTWAIT, MT_DATA); 5137 if (m == NULL) { 5138 m_freem(top); 5139 return (NULL); 5140 } 5141 m->m_len = MLEN; 5142 } 5143 len = MIN(totlen, epkt - cp); 5144 if (len >= MINCLSIZE) { 5145 MCLGET(m, M_DONTWAIT); 5146 if (m->m_flags & M_EXT) { 5147 m->m_len = len = MIN(len, m_maxsize(MC_CL)); 5148 } else { 5149 /* give up when it's out of cluster mbufs */ 5150 if (top != NULL) 5151 m_freem(top); 5152 m_freem(m); 5153 return (NULL); 5154 } 5155 } else { 5156 /* 5157 * Place initial small packet/header at end of mbuf. 5158 */ 5159 if (len < m->m_len) { 5160 if (top == NULL && 5161 len + max_linkhdr <= m->m_len) 5162 m->m_data += max_linkhdr; 5163 m->m_len = len; 5164 } else { 5165 len = m->m_len; 5166 } 5167 } 5168 if (copy) 5169 copy(cp, MTOD(m, caddr_t), (unsigned)len); 5170 else 5171 bcopy(cp, MTOD(m, caddr_t), (unsigned)len); 5172 cp += len; 5173 *mp = m; 5174 mp = &m->m_next; 5175 totlen -= len; 5176 if (cp == epkt) 5177 cp = buf; 5178 } 5179 return (top); 5180} 5181 5182#ifndef MBUF_GROWTH_NORMAL_THRESH 5183#define MBUF_GROWTH_NORMAL_THRESH 25 5184#endif 5185 5186/* 5187 * Cluster freelist allocation check. 5188 */ 5189static int 5190m_howmany(int num, size_t bufsize) 5191{ 5192 int i = 0, j = 0; 5193 u_int32_t m_mbclusters, m_clusters, m_bigclusters, m_16kclusters; 5194 u_int32_t m_mbfree, m_clfree, m_bigclfree, m_16kclfree; 5195 u_int32_t sumclusters, freeclusters; 5196 u_int32_t percent_pool, percent_kmem; 5197 u_int32_t mb_growth, mb_growth_thresh; 5198 5199 VERIFY(bufsize == m_maxsize(MC_BIGCL) || 5200 bufsize == m_maxsize(MC_16KCL)); 5201 5202 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); 5203 5204 /* Numbers in 2K cluster units */ 5205 m_mbclusters = m_total(MC_MBUF) >> NMBPCLSHIFT; 5206 m_clusters = m_total(MC_CL); 5207 m_bigclusters = m_total(MC_BIGCL) << NCLPBGSHIFT; 5208 m_16kclusters = m_total(MC_16KCL); 5209 sumclusters = m_mbclusters + m_clusters + m_bigclusters; 5210 5211 m_mbfree = m_infree(MC_MBUF) >> NMBPCLSHIFT; 5212 m_clfree = m_infree(MC_CL); 5213 m_bigclfree = m_infree(MC_BIGCL) << NCLPBGSHIFT; 5214 m_16kclfree = m_infree(MC_16KCL); 5215 freeclusters = m_mbfree + m_clfree + m_bigclfree; 5216 5217 /* Bail if we've maxed out the mbuf memory map */ 5218 if ((bufsize == m_maxsize(MC_BIGCL) && sumclusters >= nclusters) || 5219 (njcl > 0 && bufsize == m_maxsize(MC_16KCL) && 5220 (m_16kclusters << NCLPJCLSHIFT) >= njcl)) { 5221 return (0); 5222 } 5223 5224 if (bufsize == m_maxsize(MC_BIGCL)) { 5225 /* Under minimum */ 5226 if (m_bigclusters < m_minlimit(MC_BIGCL)) 5227 return (m_minlimit(MC_BIGCL) - m_bigclusters); 5228 5229 percent_pool = 5230 ((sumclusters - freeclusters) * 100) / sumclusters; 5231 percent_kmem = (sumclusters * 100) / nclusters; 5232 5233 /* 5234 * If a light/normal user, grow conservatively (75%) 5235 * If a heavy user, grow aggressively (50%) 5236 */ 5237 if (percent_kmem < MBUF_GROWTH_NORMAL_THRESH) 5238 mb_growth = MB_GROWTH_NORMAL; 5239 else 5240 mb_growth = MB_GROWTH_AGGRESSIVE; 5241 5242 if (percent_kmem < 5) { 5243 /* For initial allocations */ 5244 i = num; 5245 } else { 5246 /* Return if >= MBIGCL_LOWAT clusters available */ 5247 if (m_infree(MC_BIGCL) >= MBIGCL_LOWAT && 5248 m_total(MC_BIGCL) >= 5249 MBIGCL_LOWAT + 
m_minlimit(MC_BIGCL)) 5250 return (0); 5251 5252 /* Ensure at least num clusters are accessible */ 5253 if (num >= m_infree(MC_BIGCL)) 5254 i = num - m_infree(MC_BIGCL); 5255 if (num > m_total(MC_BIGCL) - m_minlimit(MC_BIGCL)) 5256 j = num - (m_total(MC_BIGCL) - 5257 m_minlimit(MC_BIGCL)); 5258 5259 i = MAX(i, j); 5260 5261 /* 5262 * Grow pool if percent_pool > 75 (normal growth) 5263 * or percent_pool > 50 (aggressive growth). 5264 */ 5265 mb_growth_thresh = 100 - (100 / (1 << mb_growth)); 5266 if (percent_pool > mb_growth_thresh) 5267 j = ((sumclusters + num) >> mb_growth) - 5268 freeclusters; 5269 i = MAX(i, j); 5270 } 5271 5272 /* Check to ensure we didn't go over limits */ 5273 if (i + m_bigclusters >= m_maxlimit(MC_BIGCL)) 5274 i = m_maxlimit(MC_BIGCL) - m_bigclusters; 5275 if ((i << 1) + sumclusters >= nclusters) 5276 i = (nclusters - sumclusters) >> 1; 5277 VERIFY((m_total(MC_BIGCL) + i) <= m_maxlimit(MC_BIGCL)); 5278 VERIFY(sumclusters + (i << 1) <= nclusters); 5279 5280 } else { /* 16K CL */ 5281 VERIFY(njcl > 0); 5282 /* Under minimum */ 5283 if (m_16kclusters < MIN16KCL) 5284 return (MIN16KCL - m_16kclusters); 5285 if (m_16kclfree >= M16KCL_LOWAT) 5286 return (0); 5287 5288 /* Ensure at least num clusters are available */ 5289 if (num >= m_16kclfree) 5290 i = num - m_16kclfree; 5291 5292 /* Always grow 16KCL pool aggressively */ 5293 if (((m_16kclusters + num) >> 1) > m_16kclfree) 5294 j = ((m_16kclusters + num) >> 1) - m_16kclfree; 5295 i = MAX(i, j); 5296 5297 /* Check to ensure we don't go over limit */ 5298 if (i + m_16kclusters >= m_maxlimit(MC_16KCL)) 5299 i = m_maxlimit(MC_16KCL) - m_16kclusters; 5300 VERIFY((m_total(MC_16KCL) + i) <= m_maxlimit(MC_16KCL)); 5301 } 5302 return (i); 5303} 5304/* 5305 * Return the number of bytes in the mbuf chain, m. 5306 */ 5307unsigned int 5308m_length(struct mbuf *m) 5309{ 5310 struct mbuf *m0; 5311 unsigned int pktlen; 5312 5313 if (m->m_flags & M_PKTHDR) 5314 return (m->m_pkthdr.len); 5315 5316 pktlen = 0; 5317 for (m0 = m; m0 != NULL; m0 = m0->m_next) 5318 pktlen += m0->m_len; 5319 return (pktlen); 5320} 5321 5322/* 5323 * Copy data from a buffer back into the indicated mbuf chain, 5324 * starting "off" bytes from the beginning, extending the mbuf 5325 * chain if necessary. 5326 */ 5327void 5328m_copyback(struct mbuf *m0, int off, int len, const void *cp) 5329{ 5330#if DEBUG 5331 struct mbuf *origm = m0; 5332 int error; 5333#endif /* DEBUG */ 5334 5335 if (m0 == NULL) 5336 return; 5337 5338#if DEBUG 5339 error = 5340#endif /* DEBUG */ 5341 m_copyback0(&m0, off, len, cp, 5342 M_COPYBACK0_COPYBACK | M_COPYBACK0_EXTEND, M_DONTWAIT); 5343 5344#if DEBUG 5345 if (error != 0 || (m0 != NULL && origm != m0)) 5346 panic("m_copyback"); 5347#endif /* DEBUG */ 5348} 5349 5350struct mbuf * 5351m_copyback_cow(struct mbuf *m0, int off, int len, const void *cp, int how) 5352{ 5353 int error; 5354 5355 /* don't support chain expansion */ 5356 VERIFY(off + len <= m_length(m0)); 5357 5358 error = m_copyback0(&m0, off, len, cp, 5359 M_COPYBACK0_COPYBACK | M_COPYBACK0_COW, how); 5360 if (error) { 5361 /* 5362 * no way to recover from partial success. 5363 * just free the chain. 5364 */ 5365 m_freem(m0); 5366 return (NULL); 5367 } 5368 return (m0); 5369} 5370 5371/* 5372 * m_makewritable: ensure the specified range writable. 
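 *
 * Illustrative use (hypothetical caller; the offset, length and error
 * handling are examples only): before rewriting bytes that may live in
 * a shared, read-only cluster:
 *
 *	if (m_makewritable(&m, off, sizeof (struct tcphdr), M_DONTWAIT) != 0) {
 *		m_freem(m);
 *		return (ENOBUFS);
 *	}
 *
 * On success the range [off, off + len) is backed by writable storage,
 * copied out of any shared clusters via the copy-on-write path in
 * m_copyback0().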
5373 */ 5374int 5375m_makewritable(struct mbuf **mp, int off, int len, int how) 5376{ 5377 int error; 5378#if DEBUG 5379 struct mbuf *n; 5380 int origlen, reslen; 5381 5382 origlen = m_length(*mp); 5383#endif /* DEBUG */ 5384 5385#if 0 /* M_COPYALL is large enough */ 5386 if (len == M_COPYALL) 5387 len = m_length(*mp) - off; /* XXX */ 5388#endif 5389 5390 error = m_copyback0(mp, off, len, NULL, 5391 M_COPYBACK0_PRESERVE | M_COPYBACK0_COW, how); 5392 5393#if DEBUG 5394 reslen = 0; 5395 for (n = *mp; n; n = n->m_next) 5396 reslen += n->m_len; 5397 if (origlen != reslen) 5398 panic("m_makewritable: length changed"); 5399 if (((*mp)->m_flags & M_PKTHDR) && reslen != (*mp)->m_pkthdr.len) 5400 panic("m_makewritable: inconsist"); 5401#endif /* DEBUG */ 5402 5403 return (error); 5404} 5405 5406static int 5407m_copyback0(struct mbuf **mp0, int off, int len, const void *vp, int flags, 5408 int how) 5409{ 5410 int mlen; 5411 struct mbuf *m, *n; 5412 struct mbuf **mp; 5413 int totlen = 0; 5414 const char *cp = vp; 5415 5416 VERIFY(mp0 != NULL); 5417 VERIFY(*mp0 != NULL); 5418 VERIFY((flags & M_COPYBACK0_PRESERVE) == 0 || cp == NULL); 5419 VERIFY((flags & M_COPYBACK0_COPYBACK) == 0 || cp != NULL); 5420 5421 /* 5422 * we don't bother to update "totlen" in the case of M_COPYBACK0_COW, 5423 * assuming that M_COPYBACK0_EXTEND and M_COPYBACK0_COW are exclusive. 5424 */ 5425 5426 VERIFY((~flags & (M_COPYBACK0_EXTEND|M_COPYBACK0_COW)) != 0); 5427 5428 mp = mp0; 5429 m = *mp; 5430 while (off > (mlen = m->m_len)) { 5431 off -= mlen; 5432 totlen += mlen; 5433 if (m->m_next == NULL) { 5434 int tspace; 5435extend: 5436 if (!(flags & M_COPYBACK0_EXTEND)) 5437 goto out; 5438 5439 /* 5440 * try to make some space at the end of "m". 5441 */ 5442 5443 mlen = m->m_len; 5444 if (off + len >= MINCLSIZE && 5445 !(m->m_flags & M_EXT) && m->m_len == 0) { 5446 MCLGET(m, how); 5447 } 5448 tspace = M_TRAILINGSPACE(m); 5449 if (tspace > 0) { 5450 tspace = MIN(tspace, off + len); 5451 VERIFY(tspace > 0); 5452 bzero(mtod(m, char *) + m->m_len, 5453 MIN(off, tspace)); 5454 m->m_len += tspace; 5455 off += mlen; 5456 totlen -= mlen; 5457 continue; 5458 } 5459 5460 /* 5461 * need to allocate an mbuf. 5462 */ 5463 5464 if (off + len >= MINCLSIZE) { 5465 n = m_getcl(how, m->m_type, 0); 5466 } else { 5467 n = _M_GET(how, m->m_type); 5468 } 5469 if (n == NULL) { 5470 goto out; 5471 } 5472 n->m_len = 0; 5473 n->m_len = MIN(M_TRAILINGSPACE(n), off + len); 5474 bzero(mtod(n, char *), MIN(n->m_len, off)); 5475 m->m_next = n; 5476 } 5477 mp = &m->m_next; 5478 m = m->m_next; 5479 } 5480 while (len > 0) { 5481 mlen = m->m_len - off; 5482 if (mlen != 0 && m_mclhasreference(m)) { 5483 char *datap; 5484 int eatlen; 5485 5486 /* 5487 * this mbuf is read-only. 5488 * allocate a new writable mbuf and try again. 5489 */ 5490 5491#if DIAGNOSTIC 5492 if (!(flags & M_COPYBACK0_COW)) 5493 panic("m_copyback0: read-only"); 5494#endif /* DIAGNOSTIC */ 5495 5496 /* 5497 * if we're going to write into the middle of 5498 * a mbuf, split it first. 5499 */ 5500 if (off > 0 && len < mlen) { 5501 n = m_split0(m, off, how, 0); 5502 if (n == NULL) 5503 goto enobufs; 5504 m->m_next = n; 5505 mp = &m->m_next; 5506 m = n; 5507 off = 0; 5508 continue; 5509 } 5510 5511 /* 5512 * XXX TODO coalesce into the trailingspace of 5513 * the previous mbuf when possible. 5514 */ 5515 5516 /* 5517 * allocate a new mbuf. copy packet header if needed. 
5518 */ 5519 n = _M_GET(how, m->m_type); 5520 if (n == NULL) 5521 goto enobufs; 5522 if (off == 0 && (m->m_flags & M_PKTHDR)) { 5523 M_COPY_PKTHDR(n, m); 5524 n->m_len = MHLEN; 5525 } else { 5526 if (len >= MINCLSIZE) 5527 MCLGET(n, M_DONTWAIT); 5528 n->m_len = 5529 (n->m_flags & M_EXT) ? MCLBYTES : MLEN; 5530 } 5531 if (n->m_len > len) 5532 n->m_len = len; 5533 5534 /* 5535 * free the region which has been overwritten. 5536 * copying data from old mbufs if requested. 5537 */ 5538 if (flags & M_COPYBACK0_PRESERVE) 5539 datap = mtod(n, char *); 5540 else 5541 datap = NULL; 5542 eatlen = n->m_len; 5543 VERIFY(off == 0 || eatlen >= mlen); 5544 if (off > 0) { 5545 VERIFY(len >= mlen); 5546 m->m_len = off; 5547 m->m_next = n; 5548 if (datap) { 5549 m_copydata(m, off, mlen, datap); 5550 datap += mlen; 5551 } 5552 eatlen -= mlen; 5553 mp = &m->m_next; 5554 m = m->m_next; 5555 } 5556 while (m != NULL && m_mclhasreference(m) && 5557 n->m_type == m->m_type && eatlen > 0) { 5558 mlen = MIN(eatlen, m->m_len); 5559 if (datap) { 5560 m_copydata(m, 0, mlen, datap); 5561 datap += mlen; 5562 } 5563 m->m_data += mlen; 5564 m->m_len -= mlen; 5565 eatlen -= mlen; 5566 if (m->m_len == 0) 5567 *mp = m = m_free(m); 5568 } 5569 if (eatlen > 0) 5570 n->m_len -= eatlen; 5571 n->m_next = m; 5572 *mp = m = n; 5573 continue; 5574 } 5575 mlen = MIN(mlen, len); 5576 if (flags & M_COPYBACK0_COPYBACK) { 5577 bcopy(cp, mtod(m, caddr_t) + off, (unsigned)mlen); 5578 cp += mlen; 5579 } 5580 len -= mlen; 5581 mlen += off; 5582 off = 0; 5583 totlen += mlen; 5584 if (len == 0) 5585 break; 5586 if (m->m_next == NULL) { 5587 goto extend; 5588 } 5589 mp = &m->m_next; 5590 m = m->m_next; 5591 } 5592out: 5593 if (((m = *mp0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) { 5594 VERIFY(flags & M_COPYBACK0_EXTEND); 5595 m->m_pkthdr.len = totlen; 5596 } 5597 5598 return (0); 5599 5600enobufs: 5601 return (ENOBUFS); 5602} 5603 5604uint64_t 5605mcl_to_paddr(char *addr) 5606{ 5607 vm_offset_t base_phys; 5608 5609 if (!MBUF_IN_MAP(addr)) 5610 return (0); 5611 base_phys = mcl_paddr[atop_64(addr - (char *)mbutl)]; 5612 5613 if (base_phys == 0) 5614 return (0); 5615 return ((uint64_t)(ptoa_64(base_phys) | ((uint64_t)addr & PAGE_MASK))); 5616} 5617 5618/* 5619 * Dup the mbuf chain passed in. The whole thing. No cute additional cruft. 5620 * And really copy the thing. That way, we don't "precompute" checksums 5621 * for unsuspecting consumers. Assumption: m->m_nextpkt == 0. Trick: for 5622 * small packets, don't dup into a cluster. That way received packets 5623 * don't take up too much room in the sockbuf (cf. sbspace()). 5624 */ 5625int MDFail; 5626 5627struct mbuf * 5628m_dup(struct mbuf *m, int how) 5629{ 5630 struct mbuf *n, **np; 5631 struct mbuf *top; 5632 int copyhdr = 0; 5633 5634 np = ⊤ 5635 top = NULL; 5636 if (m->m_flags & M_PKTHDR) 5637 copyhdr = 1; 5638 5639 /* 5640 * Quick check: if we have one mbuf and its data fits in an 5641 * mbuf with packet header, just copy and go. 5642 */ 5643 if (m->m_next == NULL) { 5644 /* Then just move the data into an mbuf and be done... 
*/ 5645 if (copyhdr) { 5646 if (m->m_pkthdr.len <= MHLEN && m->m_len <= MHLEN) { 5647 if ((n = _M_GETHDR(how, m->m_type)) == NULL) 5648 return (NULL); 5649 n->m_len = m->m_len; 5650 m_dup_pkthdr(n, m, how); 5651 bcopy(m->m_data, n->m_data, m->m_len); 5652 return (n); 5653 } 5654 } else if (m->m_len <= MLEN) { 5655 if ((n = _M_GET(how, m->m_type)) == NULL) 5656 return (NULL); 5657 bcopy(m->m_data, n->m_data, m->m_len); 5658 n->m_len = m->m_len; 5659 return (n); 5660 } 5661 } 5662 while (m != NULL) { 5663#if BLUE_DEBUG 5664 kprintf("<%x: %x, %x, %x\n", m, m->m_flags, m->m_len, 5665 m->m_data); 5666#endif 5667 if (copyhdr) 5668 n = _M_GETHDR(how, m->m_type); 5669 else 5670 n = _M_GET(how, m->m_type); 5671 if (n == NULL) 5672 goto nospace; 5673 if (m->m_flags & M_EXT) { 5674 if (m->m_len <= m_maxsize(MC_CL)) 5675 MCLGET(n, how); 5676 else if (m->m_len <= m_maxsize(MC_BIGCL)) 5677 n = m_mbigget(n, how); 5678 else if (m->m_len <= m_maxsize(MC_16KCL) && njcl > 0) 5679 n = m_m16kget(n, how); 5680 if (!(n->m_flags & M_EXT)) { 5681 (void) m_free(n); 5682 goto nospace; 5683 } 5684 } 5685 *np = n; 5686 if (copyhdr) { 5687 /* Don't use M_COPY_PKTHDR: preserve m_data */ 5688 m_dup_pkthdr(n, m, how); 5689 copyhdr = 0; 5690 if (!(n->m_flags & M_EXT)) 5691 n->m_data = n->m_pktdat; 5692 } 5693 n->m_len = m->m_len; 5694 /* 5695 * Get the dup on the same bdry as the original 5696 * Assume that the two mbufs have the same offset to data area 5697 * (up to word boundaries) 5698 */ 5699 bcopy(MTOD(m, caddr_t), MTOD(n, caddr_t), (unsigned)n->m_len); 5700 m = m->m_next; 5701 np = &n->m_next; 5702#if BLUE_DEBUG 5703 kprintf(">%x: %x, %x, %x\n", n, n->m_flags, n->m_len, 5704 n->m_data); 5705#endif 5706 } 5707 5708 if (top == NULL) 5709 MDFail++; 5710 return (top); 5711 5712nospace: 5713 m_freem(top); 5714 MDFail++; 5715 return (NULL); 5716} 5717 5718#define MBUF_MULTIPAGES(m) \ 5719 (((m)->m_flags & M_EXT) && \ 5720 ((IS_P2ALIGNED((m)->m_data, NBPG) && (m)->m_len > NBPG) || \ 5721 (!IS_P2ALIGNED((m)->m_data, NBPG) && \ 5722 P2ROUNDUP((m)->m_data, NBPG) < ((uintptr_t)(m)->m_data + (m)->m_len)))) 5723 5724static struct mbuf * 5725m_expand(struct mbuf *m, struct mbuf **last) 5726{ 5727 struct mbuf *top = NULL; 5728 struct mbuf **nm = ⊤ 5729 uintptr_t data0, data; 5730 unsigned int len0, len; 5731 5732 VERIFY(MBUF_MULTIPAGES(m)); 5733 VERIFY(m->m_next == NULL); 5734 data0 = (uintptr_t)m->m_data; 5735 len0 = m->m_len; 5736 *last = top; 5737 5738 for (;;) { 5739 struct mbuf *n; 5740 5741 data = data0; 5742 if (IS_P2ALIGNED(data, NBPG) && len0 > NBPG) 5743 len = NBPG; 5744 else if (!IS_P2ALIGNED(data, NBPG) && 5745 P2ROUNDUP(data, NBPG) < (data + len0)) 5746 len = P2ROUNDUP(data, NBPG) - data; 5747 else 5748 len = len0; 5749 5750 VERIFY(len > 0); 5751 VERIFY(m->m_flags & M_EXT); 5752 m->m_data = (void *)data; 5753 m->m_len = len; 5754 5755 *nm = *last = m; 5756 nm = &m->m_next; 5757 m->m_next = NULL; 5758 5759 data0 += len; 5760 len0 -= len; 5761 if (len0 == 0) 5762 break; 5763 5764 n = _M_RETRY(M_DONTWAIT, MT_DATA); 5765 if (n == NULL) { 5766 m_freem(top); 5767 top = *last = NULL; 5768 break; 5769 } 5770 5771 n->m_ext = m->m_ext; 5772 m_incref(m); 5773 n->m_flags |= M_EXT; 5774 m = n; 5775 } 5776 return (top); 5777} 5778 5779struct mbuf * 5780m_normalize(struct mbuf *m) 5781{ 5782 struct mbuf *top = NULL; 5783 struct mbuf **nm = ⊤ 5784 boolean_t expanded = FALSE; 5785 5786 while (m != NULL) { 5787 struct mbuf *n; 5788 5789 n = m->m_next; 5790 m->m_next = NULL; 5791 5792 /* Does the data cross one or more page boundaries? 
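 * If it does, the call to m_expand() below splits the mbuf into a chain
 * of segments that each fit within a single page, all sharing the same
 * external cluster (whose reference count is bumped via m_incref()).
 * For example, assuming 4KB pages, a 9KB M_EXT segment starting 1KB
 * into a page would be carved into 3KB + 4KB + 2KB pieces.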
*/ 5793 if (MBUF_MULTIPAGES(m)) { 5794 struct mbuf *last; 5795 if ((m = m_expand(m, &last)) == NULL) { 5796 m_freem(n); 5797 m_freem(top); 5798 top = NULL; 5799 break; 5800 } 5801 *nm = m; 5802 nm = &last->m_next; 5803 expanded = TRUE; 5804 } else { 5805 *nm = m; 5806 nm = &m->m_next; 5807 } 5808 m = n; 5809 } 5810 if (expanded) 5811 atomic_add_32(&mb_normalized, 1); 5812 return (top); 5813} 5814 5815/* 5816 * Append the specified data to the indicated mbuf chain, 5817 * Extend the mbuf chain if the new data does not fit in 5818 * existing space. 5819 * 5820 * Return 1 if able to complete the job; otherwise 0. 5821 */ 5822int 5823m_append(struct mbuf *m0, int len, caddr_t cp) 5824{ 5825 struct mbuf *m, *n; 5826 int remainder, space; 5827 5828 for (m = m0; m->m_next != NULL; m = m->m_next) 5829 ; 5830 remainder = len; 5831 space = M_TRAILINGSPACE(m); 5832 if (space > 0) { 5833 /* 5834 * Copy into available space. 5835 */ 5836 if (space > remainder) 5837 space = remainder; 5838 bcopy(cp, mtod(m, caddr_t) + m->m_len, space); 5839 m->m_len += space; 5840 cp += space, remainder -= space; 5841 } 5842 while (remainder > 0) { 5843 /* 5844 * Allocate a new mbuf; could check space 5845 * and allocate a cluster instead. 5846 */ 5847 n = m_get(M_WAITOK, m->m_type); 5848 if (n == NULL) 5849 break; 5850 n->m_len = min(MLEN, remainder); 5851 bcopy(cp, mtod(n, caddr_t), n->m_len); 5852 cp += n->m_len; 5853 remainder -= n->m_len; 5854 m->m_next = n; 5855 m = n; 5856 } 5857 if (m0->m_flags & M_PKTHDR) 5858 m0->m_pkthdr.len += len - remainder; 5859 return (remainder == 0); 5860} 5861 5862struct mbuf * 5863m_last(struct mbuf *m) 5864{ 5865 while (m->m_next != NULL) 5866 m = m->m_next; 5867 return (m); 5868} 5869 5870unsigned int 5871m_fixhdr(struct mbuf *m0) 5872{ 5873 u_int len; 5874 5875 VERIFY(m0->m_flags & M_PKTHDR); 5876 5877 len = m_length2(m0, NULL); 5878 m0->m_pkthdr.len = len; 5879 return (len); 5880} 5881 5882unsigned int 5883m_length2(struct mbuf *m0, struct mbuf **last) 5884{ 5885 struct mbuf *m; 5886 u_int len; 5887 5888 len = 0; 5889 for (m = m0; m != NULL; m = m->m_next) { 5890 len += m->m_len; 5891 if (m->m_next == NULL) 5892 break; 5893 } 5894 if (last != NULL) 5895 *last = m; 5896 return (len); 5897} 5898 5899/* 5900 * Defragment a mbuf chain, returning the shortest possible chain of mbufs 5901 * and clusters. If allocation fails and this cannot be completed, NULL will 5902 * be returned, but the passed in chain will be unchanged. Upon success, 5903 * the original chain will be freed, and the new chain will be returned. 5904 * 5905 * If a non-packet header is passed in, the original mbuf (chain?) will 5906 * be returned unharmed. 5907 * 5908 * If offset is specfied, the first mbuf in the chain will have a leading 5909 * space of the amount stated by the "off" parameter. 5910 * 5911 * This routine requires that the m_pkthdr.header field of the original 5912 * mbuf chain is cleared by the caller. 
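 *
 * A minimal illustrative sketch (not part of the original source): on
 * success the original chain has been freed, on failure "m" is left
 * intact and the caller decides what to do with it (here it is simply
 * freed):
 *
 *	struct mbuf *m2;
 *
 *	m->m_pkthdr.pkt_hdr = NULL;
 *	if ((m2 = m_defrag_offset(m, 0, M_DONTWAIT)) != NULL)
 *		m = m2;
 *	else
 *		m_freem(m);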
5913 */ 5914struct mbuf * 5915m_defrag_offset(struct mbuf *m0, u_int32_t off, int how) 5916{ 5917 struct mbuf *m_new = NULL, *m_final = NULL; 5918 int progress = 0, length, pktlen; 5919 5920 if (!(m0->m_flags & M_PKTHDR)) 5921 return (m0); 5922 5923 VERIFY(off < MHLEN); 5924 m_fixhdr(m0); /* Needed sanity check */ 5925 5926 pktlen = m0->m_pkthdr.len + off; 5927 if (pktlen > MHLEN) 5928 m_final = m_getcl(how, MT_DATA, M_PKTHDR); 5929 else 5930 m_final = m_gethdr(how, MT_DATA); 5931 5932 if (m_final == NULL) 5933 goto nospace; 5934 5935 if (off > 0) { 5936 pktlen -= off; 5937 m_final->m_data += off; 5938 } 5939 5940 /* 5941 * Caller must have handled the contents pointed to by this 5942 * pointer before coming here, as otherwise it will point to 5943 * the original mbuf which will get freed upon success. 5944 */ 5945 VERIFY(m0->m_pkthdr.pkt_hdr == NULL); 5946 5947 if (m_dup_pkthdr(m_final, m0, how) == 0) 5948 goto nospace; 5949 5950 m_new = m_final; 5951 5952 while (progress < pktlen) { 5953 length = pktlen - progress; 5954 if (length > MCLBYTES) 5955 length = MCLBYTES; 5956 length -= ((m_new == m_final) ? off : 0); 5957 5958 if (m_new == NULL) { 5959 if (length > MLEN) 5960 m_new = m_getcl(how, MT_DATA, 0); 5961 else 5962 m_new = m_get(how, MT_DATA); 5963 if (m_new == NULL) 5964 goto nospace; 5965 } 5966 5967 m_copydata(m0, progress, length, mtod(m_new, caddr_t)); 5968 progress += length; 5969 m_new->m_len = length; 5970 if (m_new != m_final) 5971 m_cat(m_final, m_new); 5972 m_new = NULL; 5973 } 5974 m_freem(m0); 5975 m0 = m_final; 5976 return (m0); 5977nospace: 5978 if (m_final) 5979 m_freem(m_final); 5980 return (NULL); 5981} 5982 5983struct mbuf * 5984m_defrag(struct mbuf *m0, int how) 5985{ 5986 return (m_defrag_offset(m0, 0, how)); 5987} 5988 5989void 5990m_mchtype(struct mbuf *m, int t) 5991{ 5992 mtype_stat_inc(t); 5993 mtype_stat_dec(m->m_type); 5994 (m)->m_type = t; 5995} 5996 5997void * 5998m_mtod(struct mbuf *m) 5999{ 6000 return (MTOD(m, void *)); 6001} 6002 6003struct mbuf * 6004m_dtom(void *x) 6005{ 6006 return ((struct mbuf *)((uintptr_t)(x) & ~(MSIZE-1))); 6007} 6008 6009void 6010m_mcheck(struct mbuf *m) 6011{ 6012 _MCHECK(m); 6013} 6014 6015/* 6016 * Return a pointer to mbuf/offset of location in mbuf chain. 6017 */ 6018struct mbuf * 6019m_getptr(struct mbuf *m, int loc, int *off) 6020{ 6021 6022 while (loc >= 0) { 6023 /* Normal end of search. */ 6024 if (m->m_len > loc) { 6025 *off = loc; 6026 return (m); 6027 } else { 6028 loc -= m->m_len; 6029 if (m->m_next == NULL) { 6030 if (loc == 0) { 6031 /* Point at the end of valid data. */ 6032 *off = m->m_len; 6033 return (m); 6034 } 6035 return (NULL); 6036 } 6037 m = m->m_next; 6038 } 6039 } 6040 return (NULL); 6041} 6042 6043/* 6044 * Inform the corresponding mcache(s) that there's a waiter below. 6045 */ 6046static void 6047mbuf_waiter_inc(mbuf_class_t class, boolean_t comp) 6048{ 6049 mcache_waiter_inc(m_cache(class)); 6050 if (comp) { 6051 if (class == MC_CL) { 6052 mcache_waiter_inc(m_cache(MC_MBUF_CL)); 6053 } else if (class == MC_BIGCL) { 6054 mcache_waiter_inc(m_cache(MC_MBUF_BIGCL)); 6055 } else if (class == MC_16KCL) { 6056 mcache_waiter_inc(m_cache(MC_MBUF_16KCL)); 6057 } else { 6058 mcache_waiter_inc(m_cache(MC_MBUF_CL)); 6059 mcache_waiter_inc(m_cache(MC_MBUF_BIGCL)); 6060 } 6061 } 6062} 6063 6064/* 6065 * Inform the corresponding mcache(s) that there's no more waiter below. 
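 * This undoes mbuf_waiter_inc() above: the waiter count on the class
 * cache is always decremented, and for composite requests the matching
 * composite cache(s) are updated as well.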
6066 */ 6067static void 6068mbuf_waiter_dec(mbuf_class_t class, boolean_t comp) 6069{ 6070 mcache_waiter_dec(m_cache(class)); 6071 if (comp) { 6072 if (class == MC_CL) { 6073 mcache_waiter_dec(m_cache(MC_MBUF_CL)); 6074 } else if (class == MC_BIGCL) { 6075 mcache_waiter_dec(m_cache(MC_MBUF_BIGCL)); 6076 } else if (class == MC_16KCL) { 6077 mcache_waiter_dec(m_cache(MC_MBUF_16KCL)); 6078 } else { 6079 mcache_waiter_dec(m_cache(MC_MBUF_CL)); 6080 mcache_waiter_dec(m_cache(MC_MBUF_BIGCL)); 6081 } 6082 } 6083} 6084 6085/* 6086 * Called during slab (blocking and non-blocking) allocation. If there 6087 * is at least one waiter, and the time since the first waiter is blocked 6088 * is greater than the watchdog timeout, panic the system. 6089 */ 6090static void 6091mbuf_watchdog(void) 6092{ 6093 struct timeval now; 6094 unsigned int since; 6095 6096 if (mb_waiters == 0 || !mb_watchdog) 6097 return; 6098 6099 microuptime(&now); 6100 since = now.tv_sec - mb_wdtstart.tv_sec; 6101 if (since >= MB_WDT_MAXTIME) { 6102 panic_plain("%s: %d waiters stuck for %u secs\n%s", __func__, 6103 mb_waiters, since, mbuf_dump()); 6104 /* NOTREACHED */ 6105 } 6106} 6107 6108/* 6109 * Called during blocking allocation. Returns TRUE if one or more objects 6110 * are available at the per-CPU caches layer and that allocation should be 6111 * retried at that level. 6112 */ 6113static boolean_t 6114mbuf_sleep(mbuf_class_t class, unsigned int num, int wait) 6115{ 6116 boolean_t mcache_retry = FALSE; 6117 6118 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); 6119 6120 /* Check if there's anything at the cache layer */ 6121 if (mbuf_cached_above(class, wait)) { 6122 mcache_retry = TRUE; 6123 goto done; 6124 } 6125 6126 /* Nothing? Then try hard to get it from somewhere */ 6127 m_reclaim(class, num, (wait & MCR_COMP)); 6128 6129 /* We tried hard and got something? */ 6130 if (m_infree(class) > 0) { 6131 mbstat.m_wait++; 6132 goto done; 6133 } else if (mbuf_cached_above(class, wait)) { 6134 mbstat.m_wait++; 6135 mcache_retry = TRUE; 6136 goto done; 6137 } else if (wait & MCR_TRYHARD) { 6138 mcache_retry = TRUE; 6139 goto done; 6140 } 6141 6142 /* 6143 * There's really nothing for us right now; inform the 6144 * cache(s) that there is a waiter below and go to sleep. 6145 */ 6146 mbuf_waiter_inc(class, (wait & MCR_COMP)); 6147 6148 VERIFY(!(wait & MCR_NOSLEEP)); 6149 6150 /* 6151 * If this is the first waiter, arm the watchdog timer. Otherwise 6152 * check if we need to panic the system due to watchdog timeout. 
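 * (mb_wdtstart is sampled only for the first waiter; every later waiter
 * calls mbuf_watchdog(), which panics once the first waiter has been
 * stuck for at least MB_WDT_MAXTIME seconds and mb_watchdog is set.)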
6153 */ 6154 if (mb_waiters == 0) 6155 microuptime(&mb_wdtstart); 6156 else 6157 mbuf_watchdog(); 6158 6159 mb_waiters++; 6160 (void) msleep(mb_waitchan, mbuf_mlock, (PZERO-1), m_cname(class), NULL); 6161 6162 /* We are now up; stop getting notified until next round */ 6163 mbuf_waiter_dec(class, (wait & MCR_COMP)); 6164 6165 /* We waited and got something */ 6166 if (m_infree(class) > 0) { 6167 mbstat.m_wait++; 6168 goto done; 6169 } else if (mbuf_cached_above(class, wait)) { 6170 mbstat.m_wait++; 6171 mcache_retry = TRUE; 6172 } 6173done: 6174 return (mcache_retry); 6175} 6176 6177static void 6178mbuf_worker_thread(void) 6179{ 6180 int mbuf_expand; 6181 6182 while (1) { 6183 lck_mtx_lock(mbuf_mlock); 6184 6185 mbuf_expand = 0; 6186 if (mbuf_expand_mcl) { 6187 int n; 6188 6189 /* Adjust to current number of cluster in use */ 6190 n = mbuf_expand_mcl - 6191 (m_total(MC_CL) - m_infree(MC_CL)); 6192 if ((n + m_total(MC_CL)) > m_maxlimit(MC_CL)) 6193 n = m_maxlimit(MC_CL) - m_total(MC_CL); 6194 mbuf_expand_mcl = 0; 6195 6196 if (n > 0 && freelist_populate(MC_CL, n, M_WAIT) > 0) 6197 mbuf_expand++; 6198 } 6199 if (mbuf_expand_big) { 6200 int n; 6201 6202 /* Adjust to current number of 4 KB cluster in use */ 6203 n = mbuf_expand_big - 6204 (m_total(MC_BIGCL) - m_infree(MC_BIGCL)); 6205 if ((n + m_total(MC_BIGCL)) > m_maxlimit(MC_BIGCL)) 6206 n = m_maxlimit(MC_BIGCL) - m_total(MC_BIGCL); 6207 mbuf_expand_big = 0; 6208 6209 if (n > 0 && freelist_populate(MC_BIGCL, n, M_WAIT) > 0) 6210 mbuf_expand++; 6211 } 6212 if (mbuf_expand_16k) { 6213 int n; 6214 6215 /* Adjust to current number of 16 KB cluster in use */ 6216 n = mbuf_expand_16k - 6217 (m_total(MC_16KCL) - m_infree(MC_16KCL)); 6218 if ((n + m_total(MC_16KCL)) > m_maxlimit(MC_16KCL)) 6219 n = m_maxlimit(MC_16KCL) - m_total(MC_16KCL); 6220 mbuf_expand_16k = 0; 6221 6222 if (n > 0) 6223 (void) freelist_populate(MC_16KCL, n, M_WAIT); 6224 } 6225 6226 /* 6227 * Because we can run out of memory before filling the mbuf 6228 * map, we should not allocate more clusters than they are 6229 * mbufs -- otherwise we could have a large number of useless 6230 * clusters allocated. 6231 */ 6232 if (mbuf_expand) { 6233 while (m_total(MC_MBUF) < 6234 (m_total(MC_BIGCL) + m_total(MC_CL))) { 6235 if (freelist_populate(MC_MBUF, 1, M_WAIT) == 0) 6236 break; 6237 } 6238 } 6239 6240 lck_mtx_unlock(mbuf_mlock); 6241 6242 assert_wait(&mbuf_worker_run, THREAD_UNINT); 6243 (void) thread_block((thread_continue_t)mbuf_worker_thread); 6244 } 6245} 6246 6247static void 6248mbuf_worker_thread_init(void) 6249{ 6250 mbuf_worker_ready++; 6251 mbuf_worker_thread(); 6252} 6253 6254static mcl_slab_t * 6255slab_get(void *buf) 6256{ 6257 mcl_slabg_t *slg; 6258 unsigned int ix, k; 6259 6260 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); 6261 6262 VERIFY(MBUF_IN_MAP(buf)); 6263 ix = ((char *)buf - (char *)mbutl) >> MBSHIFT; 6264 VERIFY(ix < maxslabgrp); 6265 6266 if ((slg = slabstbl[ix]) == NULL) { 6267 /* 6268 * In the current implementation, we never shrink the slabs 6269 * table; if we attempt to reallocate a cluster group when 6270 * it's already allocated, panic since this is a sign of a 6271 * memory corruption (slabstbl[ix] got nullified). 6272 */ 6273 ++slabgrp; 6274 VERIFY(ix < slabgrp); 6275 /* 6276 * Slabs expansion can only be done single threaded; when 6277 * we get here, it must be as a result of m_clalloc() which 6278 * is serialized and therefore mb_clalloc_busy must be set. 
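 * That is also why it is safe to drop mbuf_mlock across the blocking
 * MALLOC() below: no other thread can be expanding the slabs table
 * while mb_clalloc_busy is set, which is re-verified once the lock is
 * reacquired.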
6279 */ 6280 VERIFY(mb_clalloc_busy); 6281 lck_mtx_unlock(mbuf_mlock); 6282 6283 /* This is a new buffer; create the slabs group for it */ 6284 MALLOC(slg, mcl_slabg_t *, sizeof (*slg), M_TEMP, 6285 M_WAITOK | M_ZERO); 6286 VERIFY(slg != NULL); 6287 6288 lck_mtx_lock(mbuf_mlock); 6289 /* 6290 * No other thread could have gone into m_clalloc() after 6291 * we dropped the lock above, so verify that it's true. 6292 */ 6293 VERIFY(mb_clalloc_busy); 6294 6295 slabstbl[ix] = slg; 6296 6297 /* Chain each slab in the group to its forward neighbor */ 6298 for (k = 1; k < NSLABSPMB; k++) 6299 slg->slg_slab[k - 1].sl_next = &slg->slg_slab[k]; 6300 VERIFY(slg->slg_slab[NSLABSPMB - 1].sl_next == NULL); 6301 6302 /* And chain the last slab in the previous group to this */ 6303 if (ix > 0) { 6304 VERIFY(slabstbl[ix - 1]-> 6305 slg_slab[NSLABSPMB - 1].sl_next == NULL); 6306 slabstbl[ix - 1]->slg_slab[NSLABSPMB - 1].sl_next = 6307 &slg->slg_slab[0]; 6308 } 6309 } 6310 6311 ix = MTOBG(buf) % NSLABSPMB; 6312 VERIFY(ix < NSLABSPMB); 6313 6314 return (&slg->slg_slab[ix]); 6315} 6316 6317static void 6318slab_init(mcl_slab_t *sp, mbuf_class_t class, u_int32_t flags, 6319 void *base, void *head, unsigned int len, int refcnt, int chunks) 6320{ 6321 sp->sl_class = class; 6322 sp->sl_flags = flags; 6323 sp->sl_base = base; 6324 sp->sl_head = head; 6325 sp->sl_len = len; 6326 sp->sl_refcnt = refcnt; 6327 sp->sl_chunks = chunks; 6328 slab_detach(sp); 6329} 6330 6331static void 6332slab_insert(mcl_slab_t *sp, mbuf_class_t class) 6333{ 6334 VERIFY(slab_is_detached(sp)); 6335 m_slab_cnt(class)++; 6336 TAILQ_INSERT_TAIL(&m_slablist(class), sp, sl_link); 6337 sp->sl_flags &= ~SLF_DETACHED; 6338 if (class == MC_16KCL) { 6339 int k; 6340 for (k = 1; k < NSLABSP16KB; k++) { 6341 sp = sp->sl_next; 6342 /* Next slab must already be present */ 6343 VERIFY(sp != NULL); 6344 VERIFY(slab_is_detached(sp)); 6345 sp->sl_flags &= ~SLF_DETACHED; 6346 } 6347 } 6348} 6349 6350static void 6351slab_remove(mcl_slab_t *sp, mbuf_class_t class) 6352{ 6353 VERIFY(!slab_is_detached(sp)); 6354 VERIFY(m_slab_cnt(class) > 0); 6355 m_slab_cnt(class)--; 6356 TAILQ_REMOVE(&m_slablist(class), sp, sl_link); 6357 slab_detach(sp); 6358 if (class == MC_16KCL) { 6359 int k; 6360 for (k = 1; k < NSLABSP16KB; k++) { 6361 sp = sp->sl_next; 6362 /* Next slab must already be present */ 6363 VERIFY(sp != NULL); 6364 VERIFY(!slab_is_detached(sp)); 6365 slab_detach(sp); 6366 } 6367 } 6368} 6369 6370static boolean_t 6371slab_inrange(mcl_slab_t *sp, void *buf) 6372{ 6373 return ((uintptr_t)buf >= (uintptr_t)sp->sl_base && 6374 (uintptr_t)buf < ((uintptr_t)sp->sl_base + sp->sl_len)); 6375} 6376 6377#undef panic 6378 6379static void 6380slab_nextptr_panic(mcl_slab_t *sp, void *addr) 6381{ 6382 int i; 6383 unsigned int chunk_len = sp->sl_len / sp->sl_chunks; 6384 uintptr_t buf = (uintptr_t)sp->sl_base; 6385 6386 for (i = 0; i < sp->sl_chunks; i++, buf += chunk_len) { 6387 void *next = ((mcache_obj_t *)buf)->obj_next; 6388 if (next != addr) 6389 continue; 6390 if (!mclverify) { 6391 if (next != NULL && !MBUF_IN_MAP(next)) { 6392 mcache_t *cp = m_cache(sp->sl_class); 6393 panic("%s: %s buffer %p in slab %p modified " 6394 "after free at offset 0: %p out of range " 6395 "[%p-%p)\n", __func__, cp->mc_name, 6396 (void *)buf, sp, next, mbutl, embutl); 6397 /* NOTREACHED */ 6398 } 6399 } else { 6400 mcache_audit_t *mca = mcl_audit_buf2mca(sp->sl_class, 6401 (mcache_obj_t *)buf); 6402 mcl_audit_verify_nextptr(next, mca); 6403 } 6404 } 6405} 6406 6407static void 
6408slab_detach(mcl_slab_t *sp) 6409{ 6410 sp->sl_link.tqe_next = (mcl_slab_t *)-1; 6411 sp->sl_link.tqe_prev = (mcl_slab_t **)-1; 6412 sp->sl_flags |= SLF_DETACHED; 6413} 6414 6415static boolean_t 6416slab_is_detached(mcl_slab_t *sp) 6417{ 6418 return ((intptr_t)sp->sl_link.tqe_next == -1 && 6419 (intptr_t)sp->sl_link.tqe_prev == -1 && 6420 (sp->sl_flags & SLF_DETACHED)); 6421} 6422 6423static void 6424mcl_audit_init(void *buf, mcache_audit_t **mca_list, 6425 mcache_obj_t **con_list, size_t con_size, unsigned int num) 6426{ 6427 mcache_audit_t *mca, *mca_tail; 6428 mcache_obj_t *con = NULL; 6429 boolean_t save_contents = (con_list != NULL); 6430 unsigned int i, ix; 6431 6432 ASSERT(num <= NMBPBG); 6433 ASSERT(con_list == NULL || con_size != 0); 6434 6435 ix = MTOBG(buf); 6436 VERIFY(ix < maxclaudit); 6437 6438 /* Make sure we haven't been here before */ 6439 for (i = 0; i < NMBPBG; i++) 6440 VERIFY(mclaudit[ix].cl_audit[i] == NULL); 6441 6442 mca = mca_tail = *mca_list; 6443 if (save_contents) 6444 con = *con_list; 6445 6446 for (i = 0; i < num; i++) { 6447 mcache_audit_t *next; 6448 6449 next = mca->mca_next; 6450 bzero(mca, sizeof (*mca)); 6451 mca->mca_next = next; 6452 mclaudit[ix].cl_audit[i] = mca; 6453 6454 /* Attach the contents buffer if requested */ 6455 if (save_contents) { 6456 mcl_saved_contents_t *msc = 6457 (mcl_saved_contents_t *)(void *)con; 6458 6459 VERIFY(msc != NULL); 6460 VERIFY(IS_P2ALIGNED(msc, sizeof (u_int64_t))); 6461 VERIFY(con_size == sizeof (*msc)); 6462 mca->mca_contents_size = con_size; 6463 mca->mca_contents = msc; 6464 con = con->obj_next; 6465 bzero(mca->mca_contents, mca->mca_contents_size); 6466 } 6467 6468 mca_tail = mca; 6469 mca = mca->mca_next; 6470 } 6471 6472 if (save_contents) 6473 *con_list = con; 6474 6475 *mca_list = mca_tail->mca_next; 6476 mca_tail->mca_next = NULL; 6477} 6478 6479static void 6480mcl_audit_free(void *buf, unsigned int num) 6481{ 6482 unsigned int i, ix; 6483 mcache_audit_t *mca, *mca_list; 6484 6485 ix = MTOBG(buf); 6486 VERIFY(ix < maxclaudit); 6487 6488 if (mclaudit[ix].cl_audit[0] != NULL) { 6489 mca_list = mclaudit[ix].cl_audit[0]; 6490 for (i = 0; i < num; i++) { 6491 mca = mclaudit[ix].cl_audit[i]; 6492 mclaudit[ix].cl_audit[i] = NULL; 6493 if (mca->mca_contents) 6494 mcache_free(mcl_audit_con_cache, 6495 mca->mca_contents); 6496 } 6497 mcache_free_ext(mcache_audit_cache, 6498 (mcache_obj_t *)mca_list); 6499 } 6500} 6501 6502/* 6503 * Given an address of a buffer (mbuf/2KB/4KB/16KB), return 6504 * the corresponding audit structure for that buffer. 6505 */ 6506static mcache_audit_t * 6507mcl_audit_buf2mca(mbuf_class_t class, mcache_obj_t *o) 6508{ 6509 mcache_audit_t *mca = NULL; 6510 int ix = MTOBG(o); 6511 6512 VERIFY(ix < maxclaudit); 6513 VERIFY(IS_P2ALIGNED(o, MIN(m_maxsize(class), NBPG))); 6514 6515 switch (class) { 6516 case MC_MBUF: 6517 /* 6518 * For the mbuf case, find the index of the page 6519 * used by the mbuf and use that index to locate the 6520 * base address of the page. Then find out the 6521 * mbuf index relative to the page base and use 6522 * it to locate the audit structure. 6523 */ 6524 VERIFY(MCLIDX(BGTOM(ix), o) < (int)NMBPBG); 6525 mca = mclaudit[ix].cl_audit[MCLIDX(BGTOM(ix), o)]; 6526 break; 6527 6528 case MC_CL: 6529 /* 6530 * Same thing as above, but for 2KB clusters in a page. 
6531 */ 6532 VERIFY(CLBGIDX(BGTOM(ix), o) < (int)NCLPBG); 6533 mca = mclaudit[ix].cl_audit[CLBGIDX(BGTOM(ix), o)]; 6534 break; 6535 6536 case MC_BIGCL: 6537 case MC_16KCL: 6538 /* 6539 * Same as above, but only return the first element. 6540 */ 6541 mca = mclaudit[ix].cl_audit[0]; 6542 break; 6543 6544 default: 6545 VERIFY(0); 6546 /* NOTREACHED */ 6547 } 6548 6549 return (mca); 6550} 6551 6552static void 6553mcl_audit_mbuf(mcache_audit_t *mca, void *addr, boolean_t composite, 6554 boolean_t alloc) 6555{ 6556 struct mbuf *m = addr; 6557 mcache_obj_t *next = ((mcache_obj_t *)m)->obj_next; 6558 6559 VERIFY(mca->mca_contents != NULL && 6560 mca->mca_contents_size == AUDIT_CONTENTS_SIZE); 6561 6562 if (mclverify) 6563 mcl_audit_verify_nextptr(next, mca); 6564 6565 if (!alloc) { 6566 /* Save constructed mbuf fields */ 6567 mcl_audit_save_mbuf(m, mca); 6568 if (mclverify) { 6569 mcache_set_pattern(MCACHE_FREE_PATTERN, m, 6570 m_maxsize(MC_MBUF)); 6571 } 6572 ((mcache_obj_t *)m)->obj_next = next; 6573 return; 6574 } 6575 6576 /* Check if the buffer has been corrupted while in freelist */ 6577 if (mclverify) { 6578 mcache_audit_free_verify_set(mca, addr, 0, m_maxsize(MC_MBUF)); 6579 } 6580 /* Restore constructed mbuf fields */ 6581 mcl_audit_restore_mbuf(m, mca, composite); 6582} 6583 6584static void 6585mcl_audit_restore_mbuf(struct mbuf *m, mcache_audit_t *mca, boolean_t composite) 6586{ 6587 struct mbuf *ms = MCA_SAVED_MBUF_PTR(mca); 6588 6589 if (composite) { 6590 struct mbuf *next = m->m_next; 6591 VERIFY(ms->m_flags == M_EXT && MEXT_RFA(ms) != NULL && 6592 MBUF_IS_COMPOSITE(ms)); 6593 VERIFY(mca->mca_contents_size == AUDIT_CONTENTS_SIZE); 6594 /* 6595 * We could have hand-picked the mbuf fields and restore 6596 * them individually, but that will be a maintenance 6597 * headache. Instead, restore everything that was saved; 6598 * the mbuf layer will recheck and reinitialize anyway. 6599 */ 6600 bcopy(ms, m, MCA_SAVED_MBUF_SIZE); 6601 m->m_next = next; 6602 } else { 6603 /* 6604 * For a regular mbuf (no cluster attached) there's nothing 6605 * to restore other than the type field, which is expected 6606 * to be MT_FREE. 
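 * (The _MCHECK() that follows re-checks the type; with auditing
 * enabled, a wrong type is expected to be reported through
 * mcl_audit_mcheck_panic() below.)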
6607 */ 6608 m->m_type = ms->m_type; 6609 } 6610 _MCHECK(m); 6611} 6612 6613static void 6614mcl_audit_save_mbuf(struct mbuf *m, mcache_audit_t *mca) 6615{ 6616 VERIFY(mca->mca_contents_size == AUDIT_CONTENTS_SIZE); 6617 _MCHECK(m); 6618 bcopy(m, MCA_SAVED_MBUF_PTR(mca), MCA_SAVED_MBUF_SIZE); 6619} 6620 6621static void 6622mcl_audit_cluster(mcache_audit_t *mca, void *addr, size_t size, boolean_t alloc, 6623 boolean_t save_next) 6624{ 6625 mcache_obj_t *next = ((mcache_obj_t *)addr)->obj_next; 6626 6627 if (!alloc) { 6628 if (mclverify) { 6629 mcache_set_pattern(MCACHE_FREE_PATTERN, addr, size); 6630 } 6631 if (save_next) { 6632 mcl_audit_verify_nextptr(next, mca); 6633 ((mcache_obj_t *)addr)->obj_next = next; 6634 } 6635 } else if (mclverify) { 6636 /* Check if the buffer has been corrupted while in freelist */ 6637 mcl_audit_verify_nextptr(next, mca); 6638 mcache_audit_free_verify_set(mca, addr, 0, size); 6639 } 6640} 6641 6642static void 6643mcl_audit_scratch(mcache_audit_t *mca) 6644{ 6645 void *stack[MCACHE_STACK_DEPTH + 1]; 6646 mcl_scratch_audit_t *msa; 6647 struct timeval now; 6648 6649 VERIFY(mca->mca_contents != NULL); 6650 msa = MCA_SAVED_SCRATCH_PTR(mca); 6651 6652 msa->msa_pthread = msa->msa_thread; 6653 msa->msa_thread = current_thread(); 6654 bcopy(msa->msa_stack, msa->msa_pstack, sizeof (msa->msa_pstack)); 6655 msa->msa_pdepth = msa->msa_depth; 6656 bzero(stack, sizeof (stack)); 6657 msa->msa_depth = OSBacktrace(stack, MCACHE_STACK_DEPTH + 1) - 1; 6658 bcopy(&stack[1], msa->msa_stack, sizeof (msa->msa_stack)); 6659 6660 msa->msa_ptstamp = msa->msa_tstamp; 6661 microuptime(&now); 6662 /* tstamp is in ms relative to base_ts */ 6663 msa->msa_tstamp = ((now.tv_usec - mb_start.tv_usec) / 1000); 6664 if ((now.tv_sec - mb_start.tv_sec) > 0) 6665 msa->msa_tstamp += ((now.tv_sec - mb_start.tv_sec) * 1000); 6666} 6667 6668static void 6669mcl_audit_mcheck_panic(struct mbuf *m) 6670{ 6671 mcache_audit_t *mca; 6672 6673 MRANGE(m); 6674 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m); 6675 6676 panic("mcl_audit: freed mbuf %p with type 0x%x (instead of 0x%x)\n%s\n", 6677 m, (u_int16_t)m->m_type, MT_FREE, mcache_dump_mca(mca)); 6678 /* NOTREACHED */ 6679} 6680 6681static void 6682mcl_audit_verify_nextptr(void *next, mcache_audit_t *mca) 6683{ 6684 if (next != NULL && !MBUF_IN_MAP(next) && 6685 (next != (void *)MCACHE_FREE_PATTERN || !mclverify)) { 6686 panic("mcl_audit: buffer %p modified after free at offset 0: " 6687 "%p out of range [%p-%p)\n%s\n", 6688 mca->mca_addr, next, mbutl, embutl, mcache_dump_mca(mca)); 6689 /* NOTREACHED */ 6690 } 6691} 6692 6693/* This function turns on mbuf leak detection */ 6694static void 6695mleak_activate(void) 6696{ 6697 mleak_table.mleak_sample_factor = MLEAK_SAMPLE_FACTOR; 6698 PE_parse_boot_argn("mleak_sample_factor", 6699 &mleak_table.mleak_sample_factor, 6700 sizeof (mleak_table.mleak_sample_factor)); 6701 6702 if (mleak_table.mleak_sample_factor == 0) 6703 mclfindleak = 0; 6704 6705 if (mclfindleak == 0) 6706 return; 6707 6708 vm_size_t alloc_size = 6709 mleak_alloc_buckets * sizeof (struct mallocation); 6710 vm_size_t trace_size = mleak_trace_buckets * sizeof (struct mtrace); 6711 6712 MALLOC(mleak_allocations, struct mallocation *, alloc_size, 6713 M_TEMP, M_WAITOK | M_ZERO); 6714 VERIFY(mleak_allocations != NULL); 6715 6716 MALLOC(mleak_traces, struct mtrace *, trace_size, 6717 M_TEMP, M_WAITOK | M_ZERO); 6718 VERIFY(mleak_traces != NULL); 6719 6720 MALLOC(mleak_stat, mleak_stat_t *, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES), 6721 M_TEMP, M_WAITOK | 
M_ZERO); 6722 VERIFY(mleak_stat != NULL); 6723 mleak_stat->ml_cnt = MLEAK_NUM_TRACES; 6724#ifdef __LP64__ 6725 mleak_stat->ml_isaddr64 = 1; 6726#endif /* __LP64__ */ 6727} 6728 6729static void 6730mleak_logger(u_int32_t num, mcache_obj_t *addr, boolean_t alloc) 6731{ 6732 int temp; 6733 6734 if (mclfindleak == 0) 6735 return; 6736 6737 if (!alloc) 6738 return (mleak_free(addr)); 6739 6740 temp = atomic_add_32_ov(&mleak_table.mleak_capture, 1); 6741 6742 if ((temp % mleak_table.mleak_sample_factor) == 0 && addr != NULL) { 6743 uintptr_t bt[MLEAK_STACK_DEPTH]; 6744 int logged = fastbacktrace(bt, MLEAK_STACK_DEPTH); 6745 mleak_log(bt, addr, logged, num); 6746 } 6747} 6748 6749/* 6750 * This function records the allocation in the mleak_allocations table 6751 * and the backtrace in the mleak_traces table; if allocation slot is in use, 6752 * replace old allocation with new one if the trace slot is in use, return 6753 * (or increment refcount if same trace). 6754 */ 6755static boolean_t 6756mleak_log(uintptr_t *bt, mcache_obj_t *addr, uint32_t depth, int num) 6757{ 6758 struct mallocation *allocation; 6759 struct mtrace *trace; 6760 uint32_t trace_index; 6761 6762 /* Quit if someone else modifying the tables */ 6763 if (!lck_mtx_try_lock_spin(mleak_lock)) { 6764 mleak_table.total_conflicts++; 6765 return (FALSE); 6766 } 6767 6768 allocation = &mleak_allocations[hashaddr((uintptr_t)addr, 6769 mleak_alloc_buckets)]; 6770 trace_index = hashbacktrace(bt, depth, mleak_trace_buckets); 6771 trace = &mleak_traces[trace_index]; 6772 6773 VERIFY(allocation <= &mleak_allocations[mleak_alloc_buckets - 1]); 6774 VERIFY(trace <= &mleak_traces[mleak_trace_buckets - 1]); 6775 6776 allocation->hitcount++; 6777 trace->hitcount++; 6778 6779 /* 6780 * If the allocation bucket we want is occupied 6781 * and the occupier has the same trace, just bail. 6782 */ 6783 if (allocation->element != NULL && 6784 trace_index == allocation->trace_index) { 6785 mleak_table.alloc_collisions++; 6786 lck_mtx_unlock(mleak_lock); 6787 return (TRUE); 6788 } 6789 6790 /* 6791 * Store the backtrace in the traces array; 6792 * Size of zero = trace bucket is free. 6793 */ 6794 if (trace->allocs > 0 && 6795 bcmp(trace->addr, bt, (depth * sizeof (uintptr_t))) != 0) { 6796 /* Different, unique trace, but the same hash! Bail out. */ 6797 trace->collisions++; 6798 mleak_table.trace_collisions++; 6799 lck_mtx_unlock(mleak_lock); 6800 return (TRUE); 6801 } else if (trace->allocs > 0) { 6802 /* Same trace, already added, so increment refcount */ 6803 trace->allocs++; 6804 } else { 6805 /* Found an unused trace bucket, so record the trace here */ 6806 if (trace->depth != 0) { 6807 /* this slot previously used but not currently in use */ 6808 mleak_table.trace_overwrites++; 6809 } 6810 mleak_table.trace_recorded++; 6811 trace->allocs = 1; 6812 memcpy(trace->addr, bt, (depth * sizeof (uintptr_t))); 6813 trace->depth = depth; 6814 trace->collisions = 0; 6815 } 6816 6817 /* Step 2: Store the allocation record in the allocations array */ 6818 if (allocation->element != NULL) { 6819 /* 6820 * Replace an existing allocation. No need to preserve 6821 * because only a subset of the allocations are being 6822 * recorded anyway. 
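 * The previous occupant is simply counted as an alloc collision
 * and overwritten below.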
6823 */ 6824 mleak_table.alloc_collisions++; 6825 } else if (allocation->trace_index != 0) { 6826 mleak_table.alloc_overwrites++; 6827 } 6828 allocation->element = addr; 6829 allocation->trace_index = trace_index; 6830 allocation->count = num; 6831 mleak_table.alloc_recorded++; 6832 mleak_table.outstanding_allocs++; 6833 6834 lck_mtx_unlock(mleak_lock); 6835 return (TRUE); 6836} 6837 6838static void 6839mleak_free(mcache_obj_t *addr) 6840{ 6841 while (addr != NULL) { 6842 struct mallocation *allocation = &mleak_allocations 6843 [hashaddr((uintptr_t)addr, mleak_alloc_buckets)]; 6844 6845 if (allocation->element == addr && 6846 allocation->trace_index < mleak_trace_buckets) { 6847 lck_mtx_lock_spin(mleak_lock); 6848 if (allocation->element == addr && 6849 allocation->trace_index < mleak_trace_buckets) { 6850 struct mtrace *trace; 6851 trace = &mleak_traces[allocation->trace_index]; 6852 /* allocs = 0 means trace bucket is unused */ 6853 if (trace->allocs > 0) 6854 trace->allocs--; 6855 if (trace->allocs == 0) 6856 trace->depth = 0; 6857 /* NULL element means alloc bucket is unused */ 6858 allocation->element = NULL; 6859 mleak_table.outstanding_allocs--; 6860 } 6861 lck_mtx_unlock(mleak_lock); 6862 } 6863 addr = addr->obj_next; 6864 } 6865} 6866 6867static void 6868mleak_sort_traces() 6869{ 6870 int i, j, k; 6871 struct mtrace *swap; 6872 6873 for(i = 0; i < MLEAK_NUM_TRACES; i++) 6874 mleak_top_trace[i] = NULL; 6875 6876 for(i = 0, j = 0; j < MLEAK_NUM_TRACES && i < mleak_trace_buckets; i++) 6877 { 6878 if (mleak_traces[i].allocs <= 0) 6879 continue; 6880 6881 mleak_top_trace[j] = &mleak_traces[i]; 6882 for (k = j; k > 0; k--) { 6883 if (mleak_top_trace[k]->allocs <= 6884 mleak_top_trace[k-1]->allocs) 6885 break; 6886 6887 swap = mleak_top_trace[k-1]; 6888 mleak_top_trace[k-1] = mleak_top_trace[k]; 6889 mleak_top_trace[k] = swap; 6890 } 6891 j++; 6892 } 6893 6894 j--; 6895 for(; i < mleak_trace_buckets; i++) { 6896 if (mleak_traces[i].allocs <= mleak_top_trace[j]->allocs) 6897 continue; 6898 6899 mleak_top_trace[j] = &mleak_traces[i]; 6900 6901 for (k = j; k > 0; k--) { 6902 if (mleak_top_trace[k]->allocs <= 6903 mleak_top_trace[k-1]->allocs) 6904 break; 6905 6906 swap = mleak_top_trace[k-1]; 6907 mleak_top_trace[k-1] = mleak_top_trace[k]; 6908 mleak_top_trace[k] = swap; 6909 } 6910 } 6911} 6912 6913static void 6914mleak_update_stats() 6915{ 6916 mleak_trace_stat_t *mltr; 6917 int i; 6918 6919 VERIFY(mleak_stat != NULL); 6920#ifdef __LP64__ 6921 VERIFY(mleak_stat->ml_isaddr64); 6922#else 6923 VERIFY(!mleak_stat->ml_isaddr64); 6924#endif /* !__LP64__ */ 6925 VERIFY(mleak_stat->ml_cnt == MLEAK_NUM_TRACES); 6926 6927 mleak_sort_traces(); 6928 6929 mltr = &mleak_stat->ml_trace[0]; 6930 bzero(mltr, sizeof (*mltr) * MLEAK_NUM_TRACES); 6931 for (i = 0; i < MLEAK_NUM_TRACES; i++) { 6932 int j; 6933 6934 if (mleak_top_trace[i] == NULL || 6935 mleak_top_trace[i]->allocs == 0) 6936 continue; 6937 6938 mltr->mltr_collisions = mleak_top_trace[i]->collisions; 6939 mltr->mltr_hitcount = mleak_top_trace[i]->hitcount; 6940 mltr->mltr_allocs = mleak_top_trace[i]->allocs; 6941 mltr->mltr_depth = mleak_top_trace[i]->depth; 6942 6943 VERIFY(mltr->mltr_depth <= MLEAK_STACK_DEPTH); 6944 for (j = 0; j < mltr->mltr_depth; j++) 6945 mltr->mltr_addr[j] = mleak_top_trace[i]->addr[j]; 6946 6947 mltr++; 6948 } 6949} 6950 6951static struct mbtypes { 6952 int mt_type; 6953 const char *mt_name; 6954} mbtypes[] = { 6955 { MT_DATA, "data" }, 6956 { MT_OOBDATA, "oob data" }, 6957 { MT_CONTROL, "ancillary data" }, 6958 { 
MT_HEADER, "packet headers" }, 6959 { MT_SOCKET, "socket structures" }, 6960 { MT_PCB, "protocol control blocks" }, 6961 { MT_RTABLE, "routing table entries" }, 6962 { MT_HTABLE, "IMP host table entries" }, 6963 { MT_ATABLE, "address resolution tables" }, 6964 { MT_FTABLE, "fragment reassembly queue headers" }, 6965 { MT_SONAME, "socket names and addresses" }, 6966 { MT_SOOPTS, "socket options" }, 6967 { MT_RIGHTS, "access rights" }, 6968 { MT_IFADDR, "interface addresses" }, 6969 { MT_TAG, "packet tags" }, 6970 { 0, NULL } 6971}; 6972 6973#define MBUF_DUMP_BUF_CHK() { \ 6974 clen -= k; \ 6975 if (clen < 1) \ 6976 goto done; \ 6977 c += k; \ 6978} 6979 6980static char * 6981mbuf_dump(void) 6982{ 6983 unsigned long totmem = 0, totfree = 0, totmbufs, totused, totpct; 6984 u_int32_t m_mbufs = 0, m_clfree = 0, m_bigclfree = 0; 6985 u_int32_t m_mbufclfree = 0, m_mbufbigclfree = 0; 6986 u_int32_t m_16kclusters = 0, m_16kclfree = 0, m_mbuf16kclfree = 0; 6987 int nmbtypes = sizeof (mbstat.m_mtypes) / sizeof (short); 6988 uint8_t seen[256]; 6989 struct mbtypes *mp; 6990 mb_class_stat_t *sp; 6991 mleak_trace_stat_t *mltr; 6992 char *c = mbuf_dump_buf; 6993 int i, k, clen = MBUF_DUMP_BUF_SIZE; 6994 6995 mbuf_dump_buf[0] = '\0'; 6996 6997 /* synchronize all statistics in the mbuf table */ 6998 mbuf_stat_sync(); 6999 mbuf_mtypes_sync(TRUE); 7000 7001 sp = &mb_stat->mbs_class[0]; 7002 for (i = 0; i < mb_stat->mbs_cnt; i++, sp++) { 7003 u_int32_t mem; 7004 7005 if (m_class(i) == MC_MBUF) { 7006 m_mbufs = sp->mbcl_active; 7007 } else if (m_class(i) == MC_CL) { 7008 m_clfree = sp->mbcl_total - sp->mbcl_active; 7009 } else if (m_class(i) == MC_BIGCL) { 7010 m_bigclfree = sp->mbcl_total - sp->mbcl_active; 7011 } else if (njcl > 0 && m_class(i) == MC_16KCL) { 7012 m_16kclfree = sp->mbcl_total - sp->mbcl_active; 7013 m_16kclusters = sp->mbcl_total; 7014 } else if (m_class(i) == MC_MBUF_CL) { 7015 m_mbufclfree = sp->mbcl_total - sp->mbcl_active; 7016 } else if (m_class(i) == MC_MBUF_BIGCL) { 7017 m_mbufbigclfree = sp->mbcl_total - sp->mbcl_active; 7018 } else if (njcl > 0 && m_class(i) == MC_MBUF_16KCL) { 7019 m_mbuf16kclfree = sp->mbcl_total - sp->mbcl_active; 7020 } 7021 7022 mem = sp->mbcl_ctotal * sp->mbcl_size; 7023 totmem += mem; 7024 totfree += (sp->mbcl_mc_cached + sp->mbcl_infree) * 7025 sp->mbcl_size; 7026 7027 } 7028 7029 /* adjust free counts to include composite caches */ 7030 m_clfree += m_mbufclfree; 7031 m_bigclfree += m_mbufbigclfree; 7032 m_16kclfree += m_mbuf16kclfree; 7033 7034 totmbufs = 0; 7035 for (mp = mbtypes; mp->mt_name != NULL; mp++) 7036 totmbufs += mbstat.m_mtypes[mp->mt_type]; 7037 if (totmbufs > m_mbufs) 7038 totmbufs = m_mbufs; 7039 k = snprintf(c, clen, "%lu/%u mbufs in use:\n", totmbufs, m_mbufs); 7040 MBUF_DUMP_BUF_CHK(); 7041 7042 bzero(&seen, sizeof (seen)); 7043 for (mp = mbtypes; mp->mt_name != NULL; mp++) { 7044 if (mbstat.m_mtypes[mp->mt_type] != 0) { 7045 seen[mp->mt_type] = 1; 7046 k = snprintf(c, clen, "\t%u mbufs allocated to %s\n", 7047 mbstat.m_mtypes[mp->mt_type], mp->mt_name); 7048 MBUF_DUMP_BUF_CHK(); 7049 } 7050 } 7051 seen[MT_FREE] = 1; 7052 for (i = 0; i < nmbtypes; i++) 7053 if (!seen[i] && mbstat.m_mtypes[i] != 0) { 7054 k = snprintf(c, clen, "\t%u mbufs allocated to " 7055 "<mbuf type %d>\n", mbstat.m_mtypes[i], i); 7056 MBUF_DUMP_BUF_CHK(); 7057 } 7058 if ((m_mbufs - totmbufs) > 0) { 7059 k = snprintf(c, clen, "\t%lu mbufs allocated to caches\n", 7060 m_mbufs - totmbufs); 7061 MBUF_DUMP_BUF_CHK(); 7062 } 7063 k = snprintf(c, clen, "%u/%u mbuf 2KB 
clusters in use\n" 7064 "%u/%u mbuf 4KB clusters in use\n", 7065 (unsigned int)(mbstat.m_clusters - m_clfree), 7066 (unsigned int)mbstat.m_clusters, 7067 (unsigned int)(mbstat.m_bigclusters - m_bigclfree), 7068 (unsigned int)mbstat.m_bigclusters); 7069 MBUF_DUMP_BUF_CHK(); 7070 7071 if (njcl > 0) { 7072 k = snprintf(c, clen, "%u/%u mbuf %uKB clusters in use\n", 7073 m_16kclusters - m_16kclfree, m_16kclusters, 7074 njclbytes / 1024); 7075 MBUF_DUMP_BUF_CHK(); 7076 } 7077 totused = totmem - totfree; 7078 if (totmem == 0) { 7079 totpct = 0; 7080 } else if (totused < (ULONG_MAX / 100)) { 7081 totpct = (totused * 100) / totmem; 7082 } else { 7083 u_long totmem1 = totmem / 100; 7084 u_long totused1 = totused / 100; 7085 totpct = (totused1 * 100) / totmem1; 7086 } 7087 k = snprintf(c, clen, "%lu KB allocated to network (approx. %lu%% " 7088 "in use)\n", totmem / 1024, totpct); 7089 MBUF_DUMP_BUF_CHK(); 7090 7091 /* mbuf leak detection statistics */ 7092 mleak_update_stats(); 7093 7094 k = snprintf(c, clen, "\nmbuf leak detection table:\n"); 7095 MBUF_DUMP_BUF_CHK(); 7096 k = snprintf(c, clen, "\ttotal captured: %u (one per %u)\n", 7097 mleak_table.mleak_capture / mleak_table.mleak_sample_factor, 7098 mleak_table.mleak_sample_factor); 7099 MBUF_DUMP_BUF_CHK(); 7100 k = snprintf(c, clen, "\ttotal allocs outstanding: %llu\n", 7101 mleak_table.outstanding_allocs); 7102 MBUF_DUMP_BUF_CHK(); 7103 k = snprintf(c, clen, "\tnew hash recorded: %llu allocs, %llu traces\n", 7104 mleak_table.alloc_recorded, mleak_table.trace_recorded); 7105 MBUF_DUMP_BUF_CHK(); 7106 k = snprintf(c, clen, "\thash collisions: %llu allocs, %llu traces\n", 7107 mleak_table.alloc_collisions, mleak_table.trace_collisions); 7108 MBUF_DUMP_BUF_CHK(); 7109 k = snprintf(c, clen, "\toverwrites: %llu allocs, %llu traces\n", 7110 mleak_table.alloc_overwrites, mleak_table.trace_overwrites); 7111 MBUF_DUMP_BUF_CHK(); 7112 k = snprintf(c, clen, "\tlock conflicts: %llu\n\n", 7113 mleak_table.total_conflicts); 7114 MBUF_DUMP_BUF_CHK(); 7115 7116 k = snprintf(c, clen, "top %d outstanding traces:\n", 7117 mleak_stat->ml_cnt); 7118 MBUF_DUMP_BUF_CHK(); 7119 for (i = 0; i < mleak_stat->ml_cnt; i++) { 7120 mltr = &mleak_stat->ml_trace[i]; 7121 k = snprintf(c, clen, "[%d] %llu outstanding alloc(s), " 7122 "%llu hit(s), %llu collision(s)\n", (i + 1), 7123 mltr->mltr_allocs, mltr->mltr_hitcount, 7124 mltr->mltr_collisions); 7125 MBUF_DUMP_BUF_CHK(); 7126 } 7127 7128 if (mleak_stat->ml_isaddr64) 7129 k = snprintf(c, clen, MB_LEAK_HDR_64); 7130 else 7131 k = snprintf(c, clen, MB_LEAK_HDR_32); 7132 MBUF_DUMP_BUF_CHK(); 7133 7134 for (i = 0; i < MLEAK_STACK_DEPTH; i++) { 7135 int j; 7136 k = snprintf(c, clen, "%2d: ", (i + 1)); 7137 MBUF_DUMP_BUF_CHK(); 7138 for (j = 0; j < mleak_stat->ml_cnt; j++) { 7139 mltr = &mleak_stat->ml_trace[j]; 7140 if (i < mltr->mltr_depth) { 7141 if (mleak_stat->ml_isaddr64) { 7142 k = snprintf(c, clen, "0x%0llx ", 7143 (uint64_t)VM_KERNEL_UNSLIDE( 7144 mltr->mltr_addr[i])); 7145 } else { 7146 k = snprintf(c, clen, 7147 "0x%08x ", 7148 (uint32_t)VM_KERNEL_UNSLIDE( 7149 mltr->mltr_addr[i])); 7150 } 7151 } else { 7152 if (mleak_stat->ml_isaddr64) 7153 k = snprintf(c, clen, 7154 MB_LEAK_SPACING_64); 7155 else 7156 k = snprintf(c, clen, 7157 MB_LEAK_SPACING_32); 7158 } 7159 MBUF_DUMP_BUF_CHK(); 7160 } 7161 k = snprintf(c, clen, "\n"); 7162 MBUF_DUMP_BUF_CHK(); 7163 } 7164done: 7165 return (mbuf_dump_buf); 7166} 7167 7168#undef MBUF_DUMP_BUF_CHK 7169 7170/* 7171 * Convert between a regular and a packet header mbuf. 
Caller is responsible 7172 * for setting or clearing M_PKTHDR; this routine does the rest of the work. 7173 */ 7174int 7175m_reinit(struct mbuf *m, int hdr) 7176{ 7177 int ret = 0; 7178 7179 if (hdr) { 7180 VERIFY(!(m->m_flags & M_PKTHDR)); 7181 if (!(m->m_flags & M_EXT) && 7182 (m->m_data != m->m_dat || m->m_len > 0)) { 7183 /* 7184 * If there's no external cluster attached and the 7185 * mbuf appears to contain user data, we cannot 7186 * safely convert this to a packet header mbuf, 7187 * as the packet header structure might overlap 7188 * with the data. 7189 */ 7190 printf("%s: cannot set M_PKTHDR on altered mbuf %llx, " 7191 "m_data %llx (expected %llx), " 7192 "m_len %d (expected 0)\n", 7193 __func__, 7194 (uint64_t)VM_KERNEL_ADDRPERM(m), 7195 (uint64_t)VM_KERNEL_ADDRPERM(m->m_data), 7196 (uint64_t)VM_KERNEL_ADDRPERM(m->m_dat), m->m_len); 7197 ret = EBUSY; 7198 } else { 7199 VERIFY((m->m_flags & M_EXT) || m->m_data == m->m_dat); 7200 m->m_flags |= M_PKTHDR; 7201 MBUF_INIT_PKTHDR(m); 7202 } 7203 } else { 7204 /* Check for scratch area overflow */ 7205 m_redzone_verify(m); 7206 /* Free the aux data and tags if there is any */ 7207 m_tag_delete_chain(m, NULL); 7208 m->m_flags &= ~M_PKTHDR; 7209 } 7210 7211 return (ret); 7212} 7213 7214void 7215m_scratch_init(struct mbuf *m) 7216{ 7217 struct pkthdr *pkt = &m->m_pkthdr; 7218 7219 VERIFY(m->m_flags & M_PKTHDR); 7220 7221 /* See comments in <rdar://problem/14040693> */ 7222 if (pkt->pkt_flags & PKTF_PRIV_GUARDED) { 7223 panic_plain("Invalid attempt to modify guarded module-private " 7224 "area: mbuf %p, pkt_flags 0x%x\n", m, pkt->pkt_flags); 7225 /* NOTREACHED */ 7226 } 7227 7228 bzero(&pkt->pkt_mpriv, sizeof (pkt->pkt_mpriv)); 7229} 7230 7231/* 7232 * This routine is reserved for mbuf_get_driver_scratch(); clients inside 7233 * xnu that intend on utilizing the module-private area should directly 7234 * refer to the pkt_mpriv structure in the pkthdr. They are also expected 7235 * to set and clear PKTF_PRIV_GUARDED, while owning the packet and prior 7236 * to handing it off to another module, respectively. 7237 */ 7238u_int32_t 7239m_scratch_get(struct mbuf *m, u_int8_t **p) 7240{ 7241 struct pkthdr *pkt = &m->m_pkthdr; 7242 7243 VERIFY(m->m_flags & M_PKTHDR); 7244 7245 /* See comments in <rdar://problem/14040693> */ 7246 if (pkt->pkt_flags & PKTF_PRIV_GUARDED) { 7247 panic_plain("Invalid attempt to access guarded module-private " 7248 "area: mbuf %p, pkt_flags 0x%x\n", m, pkt->pkt_flags); 7249 /* NOTREACHED */ 7250 } 7251 7252 if (mcltrace) { 7253 mcache_audit_t *mca; 7254 7255 lck_mtx_lock(mbuf_mlock); 7256 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m); 7257 if (mca->mca_uflags & MB_SCVALID) 7258 mcl_audit_scratch(mca); 7259 lck_mtx_unlock(mbuf_mlock); 7260 } 7261 7262 *p = (u_int8_t *)&pkt->pkt_mpriv; 7263 return (sizeof (pkt->pkt_mpriv)); 7264} 7265 7266static void 7267m_redzone_init(struct mbuf *m) 7268{ 7269 VERIFY(m->m_flags & M_PKTHDR); 7270 /* 7271 * Each mbuf has a unique red zone pattern, which is a XOR 7272 * of the red zone cookie and the address of the mbuf. 
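 * m_redzone_verify() recomputes the same value and panics on a
 * mismatch, e.g. when m_reinit() above converts a packet-header
 * mbuf back into a regular one.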
7273 */ 7274 m->m_pkthdr.redzone = ((u_int32_t)(uintptr_t)m) ^ mb_redzone_cookie; 7275} 7276 7277static void 7278m_redzone_verify(struct mbuf *m) 7279{ 7280 u_int32_t mb_redzone; 7281 7282 VERIFY(m->m_flags & M_PKTHDR); 7283 7284 mb_redzone = ((u_int32_t)(uintptr_t)m) ^ mb_redzone_cookie; 7285 if (m->m_pkthdr.redzone != mb_redzone) { 7286 panic("mbuf %p redzone violation with value 0x%x " 7287 "(instead of 0x%x, using cookie 0x%x)\n", 7288 m, m->m_pkthdr.redzone, mb_redzone, mb_redzone_cookie); 7289 /* NOTREACHED */ 7290 } 7291} 7292 7293/* 7294 * Send a report of mbuf usage if the usage is at least 6% of max limit 7295 * or if there has been at least 3% increase since the last report. 7296 * 7297 * The values 6% and 3% are chosen so that we can do simple arithmetic 7298 * with shift operations. 7299 */ 7300static boolean_t 7301mbuf_report_usage(mbuf_class_t cl) 7302{ 7303 /* if a report is already in progress, nothing to do */ 7304 if (mb_peak_newreport) 7305 return (TRUE); 7306 7307 if (m_total(cl) > m_peak(cl) && 7308 m_total(cl) >= (m_maxlimit(cl) >> 4) && 7309 (m_total(cl) - m_peak(cl)) >= (m_peak(cl) >> 5)) 7310 return (TRUE); 7311 return (FALSE); 7312} 7313 7314__private_extern__ void 7315mbuf_report_peak_usage(void) 7316{ 7317 int i = 0; 7318 u_int64_t uptime; 7319 struct nstat_sysinfo_data ns_data; 7320 uint32_t memreleased = 0; 7321 7322 uptime = net_uptime(); 7323 lck_mtx_lock(mbuf_mlock); 7324 7325 /* Generate an initial report after 1 week of uptime */ 7326 if (!mb_peak_firstreport && 7327 uptime > MBUF_PEAK_FIRST_REPORT_THRESHOLD) { 7328 mb_peak_newreport = TRUE; 7329 mb_peak_firstreport = TRUE; 7330 } 7331 7332 if (!mb_peak_newreport) { 7333 lck_mtx_unlock(mbuf_mlock); 7334 return; 7335 } 7336 7337 /* 7338 * Since a report is being generated before 1 week, 7339 * we do not need to force another one later 7340 */ 7341 if (uptime < MBUF_PEAK_FIRST_REPORT_THRESHOLD) 7342 mb_peak_firstreport = TRUE; 7343 7344 for (i = 0; i < NELEM(mbuf_table); i++) { 7345 m_peak(m_class(i)) = m_total(m_class(i)); 7346 memreleased += m_release_cnt(i); 7347 } 7348 mb_peak_newreport = FALSE; 7349 lck_mtx_unlock(mbuf_mlock); 7350 7351 bzero(&ns_data, sizeof(ns_data)); 7352 ns_data.flags = NSTAT_SYSINFO_MBUF_STATS; 7353 ns_data.u.mb_stats.total_256b = m_peak(MC_MBUF); 7354 ns_data.u.mb_stats.total_2kb = m_peak(MC_CL); 7355 ns_data.u.mb_stats.total_4kb = m_peak(MC_BIGCL); 7356 ns_data.u.mb_stats.sbmb_total = total_sbmb_cnt_peak; 7357 ns_data.u.mb_stats.sb_atmbuflimit = sbmb_limreached; 7358 ns_data.u.mb_stats.draincnt = mbstat.m_drain; 7359 ns_data.u.mb_stats.memreleased = memreleased; 7360 7361 nstat_sysinfo_send_data(&ns_data); 7362} 7363 7364/* 7365 * Called by the VM when there's memory pressure. 7366 */ 7367__private_extern__ void 7368m_drain(void) 7369{ 7370 mbuf_class_t mc; 7371 mcl_slab_t *sp, *sp_tmp, *nsp; 7372 unsigned int num, k, interval, released = 0; 7373 unsigned int total_mem = 0, use_mem = 0; 7374 boolean_t ret, purge_caches = FALSE; 7375 ppnum_t offset; 7376 mcache_obj_t *obj; 7377 float per; 7378 static uint64_t last_drain = 0; 7379 static unsigned char scratch[32]; 7380 static ppnum_t scratch_pa = 0; 7381 7382 if (mb_drain_maxint == 0 || mb_waiters) 7383 return; 7384 if (scratch_pa == 0) { 7385 bzero(scratch, sizeof(scratch)); 7386 scratch_pa = pmap_find_phys(kernel_pmap, (addr64_t)scratch); 7387 VERIFY(scratch_pa); 7388 } else if (mclverify) { 7389 /* 7390 * Panic if a driver wrote to our scratch memory. 
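 * The scratch page is what IOMapperInsertPage() is pointed at
 * further down for every page this routine releases, so a late
 * DMA write would show up here instead of corrupting reused
 * memory.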
7391 */ 7392 for (k = 0; k < sizeof(scratch); k++) 7393 if (scratch[k]) 7394 panic("suspect DMA to freed address"); 7395 } 7396 /* 7397 * Don't free memory too often as that could cause excessive 7398 * waiting times for mbufs. Purge caches if we were asked to drain 7399 * in the last 5 minutes. 7400 */ 7401 lck_mtx_lock(mbuf_mlock); 7402 if (last_drain == 0) { 7403 last_drain = net_uptime(); 7404 lck_mtx_unlock(mbuf_mlock); 7405 return; 7406 } 7407 interval = net_uptime() - last_drain; 7408 if (interval <= mb_drain_maxint) { 7409 lck_mtx_unlock(mbuf_mlock); 7410 return; 7411 } 7412 if (interval <= mb_drain_maxint * 5) 7413 purge_caches = TRUE; 7414 last_drain = net_uptime(); 7415 /* 7416 * Don't free any memory if we're using 60% or more. 7417 */ 7418 for (mc = 0; mc < NELEM(mbuf_table); mc++) { 7419 total_mem += m_total(mc) * m_maxsize(mc); 7420 use_mem += m_active(mc) * m_maxsize(mc); 7421 } 7422 per = (float)use_mem / (float)total_mem; 7423 if (per >= 0.6) { 7424 lck_mtx_unlock(mbuf_mlock); 7425 return; 7426 } 7427 /* 7428 * Purge all the caches. This effectively disables 7429 * caching for a few seconds, but the mbuf worker thread will 7430 * re-enable them again. 7431 */ 7432 if (purge_caches == TRUE) 7433 for (mc = 0; mc < NELEM(mbuf_table); mc++) { 7434 if (m_total(mc) < m_avgtotal(mc)) 7435 continue; 7436 lck_mtx_unlock(mbuf_mlock); 7437 ret = mcache_purge_cache(m_cache(mc), FALSE); 7438 lck_mtx_lock(mbuf_mlock); 7439 if (ret == TRUE) 7440 m_purge_cnt(mc)++; 7441 } 7442 /* 7443 * Move the objects from the composite class freelist to 7444 * the rudimentary slabs list, but keep at least 10% of the average 7445 * total in the freelist. 7446 */ 7447 for (mc = 0; mc < NELEM(mbuf_table); mc++) { 7448 while (m_cobjlist(mc) && 7449 m_total(mc) < m_avgtotal(mc) && 7450 m_infree(mc) > 0.1 * m_avgtotal(mc) + m_minlimit(mc)) { 7451 obj = m_cobjlist(mc); 7452 m_cobjlist(mc) = obj->obj_next; 7453 obj->obj_next = NULL; 7454 num = cslab_free(mc, obj, 1); 7455 VERIFY(num == 1); 7456 m_free_cnt(mc)++; 7457 m_infree(mc)--; 7458 /* cslab_free() handles m_total */ 7459 } 7460 } 7461 /* 7462 * Free the buffers present in the slab list up to 10% of the total 7463 * average per class. 7464 * 7465 * We walk the list backwards in an attempt to reduce fragmentation. 7466 */ 7467 for (mc = NELEM(mbuf_table) - 1; (int)mc >= 0; mc--) { 7468 TAILQ_FOREACH_SAFE(sp, &m_slablist(mc), sl_link, sp_tmp) { 7469 /* 7470 * Process only unused slabs occupying memory. 
7471 */ 7472 if (sp->sl_refcnt != 0 || sp->sl_len == 0 || 7473 sp->sl_base == NULL) 7474 continue; 7475 if (m_total(mc) < m_avgtotal(mc) || 7476 m_infree(mc) < 0.1 * m_avgtotal(mc) + m_minlimit(mc)) 7477 break; 7478 slab_remove(sp, mc); 7479 switch (mc) { 7480 case MC_MBUF: 7481 m_infree(mc) -= NMBPBG; 7482 m_total(mc) -= NMBPBG; 7483 if (mclaudit != NULL) 7484 mcl_audit_free(sp->sl_base, NMBPBG); 7485 break; 7486 case MC_CL: 7487 m_infree(mc) -= NCLPBG; 7488 m_total(mc) -= NCLPBG; 7489 if (mclaudit != NULL) 7490 mcl_audit_free(sp->sl_base, NMBPBG); 7491 break; 7492 case MC_BIGCL: 7493 m_infree(mc)--; 7494 m_total(mc)--; 7495 if (mclaudit != NULL) 7496 mcl_audit_free(sp->sl_base, NMBPBG); 7497 break; 7498 case MC_16KCL: 7499 m_infree(mc)--; 7500 m_total(mc)--; 7501 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) { 7502 nsp = nsp->sl_next; 7503 VERIFY(nsp->sl_refcnt == 0 && 7504 nsp->sl_base != NULL && 7505 nsp->sl_len == 0); 7506 slab_init(nsp, 0, 0, NULL, NULL, 0, 0, 7507 0); 7508 nsp->sl_flags = 0; 7509 } 7510 if (mclaudit != NULL) 7511 mcl_audit_free(sp->sl_base, 1); 7512 break; 7513 default: 7514 /* 7515 * The composite classes have their own 7516 * freelist (m_cobjlist), so we only 7517 * process rudimentary classes here. 7518 */ 7519 VERIFY(0); 7520 } 7521 m_release_cnt(mc) += m_size(mc); 7522 released += m_size(mc); 7523 offset = ((char *)sp->sl_base - (char *)mbutl) / NBPG; 7524 /* 7525 * Make sure the IOMapper points to a valid, but 7526 * bogus, address. This should prevent further DMA 7527 * accesses to freed memory. 7528 */ 7529 IOMapperInsertPage(mcl_paddr_base, offset, scratch_pa); 7530 mcl_paddr[offset] = 0; 7531 kmem_free(mb_map, (vm_offset_t)sp->sl_base, 7532 sp->sl_len); 7533 slab_init(sp, 0, 0, NULL, NULL, 0, 0, 0); 7534 sp->sl_flags = 0; 7535 } 7536 } 7537 mbstat.m_drain++; 7538 mbstat.m_bigclusters = m_total(MC_BIGCL); 7539 mbstat.m_clusters = m_total(MC_CL); 7540 mbstat.m_mbufs = m_total(MC_MBUF); 7541 mbuf_stat_sync(); 7542 mbuf_mtypes_sync(TRUE); 7543 lck_mtx_unlock(mbuf_mlock); 7544} 7545 7546static int 7547m_drain_force_sysctl SYSCTL_HANDLER_ARGS 7548{ 7549#pragma unused(arg1, arg2) 7550 int val = 0, err; 7551 7552 err = sysctl_handle_int(oidp, &val, 0, req); 7553 if (err != 0 || req->newptr == USER_ADDR_NULL) 7554 return (err); 7555 if (val) 7556 m_drain(); 7557 7558 return (err); 7559} 7560 7561SYSCTL_DECL(_kern_ipc); 7562SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat, 7563 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 7564 0, 0, mbstat_sysctl, "S,mbstat", ""); 7565SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_stat, 7566 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 7567 0, 0, mb_stat_sysctl, "S,mb_stat", ""); 7568SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_top_trace, 7569 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 7570 0, 0, mleak_top_trace_sysctl, "S,mb_top_trace", ""); 7571SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_table, 7572 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 7573 0, 0, mleak_table_sysctl, "S,mleak_table", ""); 7574SYSCTL_INT(_kern_ipc, OID_AUTO, mleak_sample_factor, 7575 CTLFLAG_RW | CTLFLAG_LOCKED, &mleak_table.mleak_sample_factor, 0, ""); 7576SYSCTL_INT(_kern_ipc, OID_AUTO, mb_normalized, 7577 CTLFLAG_RD | CTLFLAG_LOCKED, &mb_normalized, 0, ""); 7578SYSCTL_INT(_kern_ipc, OID_AUTO, mb_watchdog, 7579 CTLFLAG_RW | CTLFLAG_LOCKED, &mb_watchdog, 0, ""); 7580SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_drain_force, 7581 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, NULL, 0, 7582 m_drain_force_sysctl, "I", 7583 "Forces the mbuf garbage collection to run"); 7584SYSCTL_INT(_kern_ipc, 
OID_AUTO, mb_drain_maxint, 7585 CTLFLAG_RW | CTLFLAG_LOCKED, &mb_drain_maxint, 0, 7586 "Minimum time interval between garbage collection runs"); 7587