/*
 * Copyright (c) 1998-2013 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1982, 1986, 1988, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_mbuf.c	8.2 (Berkeley) 1/4/94
 */
/*
 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
 * support for mandatory and extensible security protections.  This notice
 * is included in support of clause 2.2 (b) of the Apple Public License,
 * Version 2.0.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/protosw.h>
#include <sys/domain.h>
#include <sys/queue.h>
#include <sys/proc.h>

#include <dev/random/randomdev.h>

#include <kern/kern_types.h>
#include <kern/simple_lock.h>
#include <kern/queue.h>
#include <kern/sched_prim.h>
#include <kern/cpu_number.h>
#include <kern/zalloc.h>

#include <libkern/OSAtomic.h>
#include <libkern/OSDebug.h>
#include <libkern/libkern.h>

#include <IOKit/IOMapper.h>

#include <machine/limits.h>
#include <machine/machine_routines.h>

#if CONFIG_MACF_NET
#include <security/mac_framework.h>
#endif /* CONFIG_MACF_NET */

#include <sys/mcache.h>

/*
 * MBUF IMPLEMENTATION NOTES.
 *
 * There are a total of 5 per-CPU caches:
 *
 * MC_MBUF:
 *	This is a cache of rudimentary objects of MSIZE in size; each
 *	object represents an mbuf structure.  This cache preserves only
 *	the m_type field of the mbuf during its transactions.
 *
 * MC_CL:
 *	This is a cache of rudimentary objects of MCLBYTES in size; each
 *	object represents an mcluster structure.  This cache does not
 *	preserve the contents of the objects during its transactions.
 *
 * MC_BIGCL:
 *	This is a cache of rudimentary objects of MBIGCLBYTES in size; each
 *	object represents an mbigcluster structure.  This cache does not
 *	preserve the contents of the objects during its transactions.
 *
 * MC_MBUF_CL:
 *	This is a cache of mbufs each having a cluster attached to it.
 *	It is backed by MC_MBUF and MC_CL rudimentary caches.  Several
 *	fields of the mbuf related to the external cluster are preserved
 *	during transactions.
 *
 * MC_MBUF_BIGCL:
 *	This is a cache of mbufs each having a big cluster attached to it.
 *	It is backed by MC_MBUF and MC_BIGCL rudimentary caches.  Several
 *	fields of the mbuf related to the external cluster are preserved
 *	during transactions.
 *
 * OBJECT ALLOCATION:
 *
 * Allocation requests are handled first at the per-CPU (mcache) layer
 * before falling back to the slab layer.  Performance is optimal when
 * the request is satisfied at the CPU layer because global data/lock
 * never gets accessed.  When the slab layer is entered for allocation,
 * the slab freelist will be checked first for available objects before
 * the VM backing store is invoked.  Slab layer operations are serialized
 * for all of the caches as the mbuf global lock is held most of the time.
 * Allocation paths are different depending on the class of objects:
 *
 * a. Rudimentary object:
 *
 *	{ m_get_common(), m_clattach(), m_mclget(),
 *	  m_mclalloc(), m_bigalloc(), m_copym_with_hdrs(),
 *	  composite object allocation }
 *
 *			|	^
 *			|	|
 *			|	+-----------------------+
 *			v				|
 *	   mcache_alloc/mcache_alloc_ext()	mbuf_slab_audit()
 *			|				^
 *			v				|
 *		   [CPU cache] -------> (found?) -------+
 *			|				|
 *			v				|
 *		 mbuf_slab_alloc()			|
 *			|				|
 *			v				|
 *	+---------> [freelist] -------> (found?) -------+
 *	|		|
 *	|		v
 *	|	    m_clalloc()
 *	|		|
 *	|		v
 *	+---<<---- kmem_mb_alloc()
 *
 * b. Composite object:
 *
 *	{ m_getpackets_internal(), m_allocpacket_internal() }
 *
 *			|	^
 *			|	|
 *			|	+------ (done) ---------+
 *			v				|
 *	   mcache_alloc/mcache_alloc_ext()	mbuf_cslab_audit()
 *			|				^
 *			v				|
 *		   [CPU cache] -------> (found?) -------+
 *			|				|
 *			v				|
 *		 mbuf_cslab_alloc()			|
 *			|				|
 *			v				|
 *		    [freelist] -------> (found?) -------+
 *			|				|
 *			v				|
 *		(rudimentary object)			|
 *	   mcache_alloc/mcache_alloc_ext() ------>>-----+
 *
 * Auditing notes: If auditing is enabled, buffers will be subjected to
 * integrity checks by the audit routine.  This is done by verifying their
 * contents against the DEADBEEF (free) pattern before returning them to
 * the caller.  As part of this step, the routine will also record the
 * transaction and pattern-fill the buffers with the BADDCAFE (uninitialized)
 * pattern.  It will also restore any constructed data structure fields if
 * necessary.
 *
 * OBJECT DEALLOCATION:
 *
 * Freeing an object simply involves placing it into the CPU cache; this
 * pollutes the cache to benefit subsequent allocations.  The slab layer
 * will only be entered if the object is to be purged out of the cache.
 * During normal operations, this happens only when the CPU layer resizes
 * its bucket while it's adjusting to the allocation load.  Deallocation
 * paths are different depending on the class of objects:
 *
 * a. Rudimentary object:
 *
 *	{ m_free(), m_freem_list(), composite object deallocation }
 *
 *			|	^
 *			|	|
 *			|	+------ (done) ---------+
 *			v				|
 *	   mcache_free/mcache_free_ext()		|
 *			|				|
 *			v				|
 *		mbuf_slab_audit()			|
 *			|				|
 *			v				|
 *		   [CPU cache] ---> (not purging?) -----+
 *			|				|
 *			v				|
 *		 mbuf_slab_free()			|
 *			|				|
 *			v				|
 *		    [freelist] ----------->>------------+
 *	 (objects never get purged to VM)
 *
 * b. Composite object:
 *
 *	{ m_free(), m_freem_list() }
 *
 *			|	^
 *			|	|
 *			|	+------ (done) ---------+
 *			v				|
 *	   mcache_free/mcache_free_ext()		|
 *			|				|
 *			v				|
 *		mbuf_cslab_audit()			|
 *			|				|
 *			v				|
 *		   [CPU cache] ---> (not purging?) -----+
 *			|				|
 *			v				|
 *		 mbuf_cslab_free()			|
 *			|				|
 *			v				|
 *		    [freelist] ---> (not purging?) -----+
 *			|				|
 *			v				|
 *		(rudimentary object)			|
 *	   mcache_free/mcache_free_ext() ------->>------+
 *
 * Auditing notes: If auditing is enabled, the audit routine will save
 * any constructed data structure fields (if necessary) before filling the
 * contents of the buffers with the DEADBEEF (free) pattern and recording
 * the transaction.  Buffers that are freed (whether at the CPU or slab
 * layer) are expected to contain the free pattern.
 *
 * DEBUGGING:
 *
 * Debugging can be enabled by adding "mbuf_debug=0x3" to boot-args; this
 * translates to the mcache flags (MCF_VERIFY | MCF_AUDIT).  Additionally,
 * the CPU layer cache can be disabled by setting the MCF_NOCPUCACHE flag,
 * i.e. modify the boot argument parameter to "mbuf_debug=0x13".  Leak
 * detection may also be disabled by setting the MCF_NOLEAKLOG flag, e.g.
 * "mbuf_debug=0x113".  Note that debugging consumes more CPU and memory.
 *
 * Each object is associated with exactly one mcache_audit_t structure that
 * contains the information related to its last buffer transaction.  Given
 * an address of an object, the audit structure can be retrieved by finding
 * the position of the object relative to the base address of the cluster:
 *
 *	+------------+			+=============+
 *	| mbuf addr  |			| mclaudit[i] |
 *	+------------+			+=============+
 *	      |				| cl_audit[0] |
 *	i = MTOBG(addr)			+-------------+
 *	      |			+----->	| cl_audit[1] | -----> mcache_audit_t
 *	b = BGTOM(i)		|	+-------------+
 *	      |			|	|     ...     |
 *	x = MCLIDX(b, addr)	|	+-------------+
 *	      |			|	| cl_audit[7] |
 *	+-----------------+	|	+-------------+
 *		(e.g. x == 1)
 *
 * The mclaudit[] array is allocated at initialization time, but its contents
 * get populated when the corresponding cluster is created.  Because a page
 * can be turned into NMBPBG number of mbufs, we preserve enough space for the
 * mbufs so that there is a 1-to-1 mapping between them.  A page that never
 * gets (or has not yet) turned into mbufs will use only cl_audit[0] with the
 * remaining entries unused.  For a 16KB cluster, only one entry from the
 * first page is allocated and used for the entire object.
 */

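/*
 * Example (illustrative sketch, not part of the allocator): resolving the
 * audit structure for an mbuf at address `addr' follows the diagram above,
 * using the MTOBG, BGTOM and MCLIDX macros defined later in this file:
 *
 *	i = MTOBG(addr);		// index of the containing 4KB cluster
 *	b = BGTOM(i);			// base address of that cluster
 *	x = MCLIDX(b, addr);		// mbuf slot within the cluster
 *	mca = mclaudit[i].cl_audit[x];	// its mcache_audit_t
 */
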
/* TODO: should be in a header file */
/* kernel translator */
extern vm_offset_t kmem_mb_alloc(vm_map_t, int, int);
extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va);
extern vm_map_t mb_map;		/* special map */

/* Global lock */
decl_lck_mtx_data(static, mbuf_mlock_data);
static lck_mtx_t *mbuf_mlock = &mbuf_mlock_data;
static lck_attr_t *mbuf_mlock_attr;
static lck_grp_t *mbuf_mlock_grp;
static lck_grp_attr_t *mbuf_mlock_grp_attr;

/* Back-end (common) layer */
static void *mbuf_worker_run;	/* wait channel for worker thread */
static int mbuf_worker_ready;	/* worker thread is runnable */
static int mbuf_expand_mcl;	/* number of cluster creation requests */
static int mbuf_expand_big;	/* number of big cluster creation requests */
static int mbuf_expand_16k;	/* number of 16KB cluster creation requests */
static int ncpu;		/* number of CPUs */
static ppnum_t *mcl_paddr;	/* Array of cluster physical addresses */
static ppnum_t mcl_pages;	/* Size of array (# physical pages) */
static ppnum_t mcl_paddr_base;	/* Handle returned by IOMapper::iovmAlloc() */
static mcache_t *ref_cache;	/* Cache of cluster reference & flags */
static mcache_t *mcl_audit_con_cache; /* Audit contents cache */
static unsigned int mbuf_debug;	/* patchable mbuf mcache flags */
static unsigned int mb_normalized; /* number of packets "normalized" */

#define	MB_GROWTH_AGGRESSIVE	1	/* Threshold: 1/2 of total */
#define	MB_GROWTH_NORMAL	2	/* Threshold: 3/4 of total */

typedef enum {
	MC_MBUF = 0,	/* Regular mbuf */
	MC_CL,		/* Cluster */
	MC_BIGCL,	/* Large (4KB) cluster */
	MC_16KCL,	/* Jumbo (16KB) cluster */
	MC_MBUF_CL,	/* mbuf + cluster */
	MC_MBUF_BIGCL,	/* mbuf + large (4KB) cluster */
	MC_MBUF_16KCL	/* mbuf + jumbo (16KB) cluster */
} mbuf_class_t;

#define	MBUF_CLASS_MIN		MC_MBUF
#define	MBUF_CLASS_MAX		MC_MBUF_16KCL
#define	MBUF_CLASS_LAST		MC_16KCL
#define	MBUF_CLASS_VALID(c) \
	((int)(c) >= MBUF_CLASS_MIN && (int)(c) <= MBUF_CLASS_MAX)
#define	MBUF_CLASS_COMPOSITE(c) \
	((int)(c) > MBUF_CLASS_LAST)

/*
 * mbuf specific mcache allocation request flags.
 */
#define	MCR_COMP	MCR_USR1 /* for MC_MBUF_{CL,BIGCL,16KCL} caches */

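/*
 * Example (illustrative): allocation and free paths that must treat
 * composite classes specially typically branch on the predicates above:
 *
 *	VERIFY(MBUF_CLASS_VALID(class));
 *	if (MBUF_CLASS_COMPOSITE(class))
 *		... object is an mbuf + cluster pair (cslab routines) ...
 *	else
 *		... rudimentary object (slab routines) ...
 */
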
/*
 * Per-cluster slab structure.
 *
 * A slab is a cluster control structure that contains one or more object
 * chunks; the available chunks are chained in the slab's freelist (sl_head).
 * Each time a chunk is taken out of the slab, the slab's reference count
 * gets incremented.  When all chunks have been taken out, the empty slab
 * gets removed (SLF_DETACHED) from the class's slab list.  A chunk that is
 * returned to a slab causes the slab's reference count to be decremented;
 * it also causes the slab to be reinserted back into the class's slab list,
 * if it's not already done.
 *
 * Compartmentalizing the object chunks into slabs allows us to easily
 * merge one or more slabs together when the adjacent slabs are idle, as
 * well as to convert or move a slab from one class to another; e.g. the
 * mbuf cluster slab can be converted to a regular cluster slab when all
 * mbufs in the slab have been freed.
 *
 * A slab may also span multiple clusters for chunks larger than a
 * cluster's size.  In this case, only the slab of the first cluster is
 * used.  The rest of the slabs are marked with SLF_PARTIAL to indicate
 * that they are part of the larger slab.
 *
 * Each slab controls a page of memory.
 */
typedef struct mcl_slab {
	struct mcl_slab	*sl_next;	/* neighboring slab */
	u_int8_t	sl_class;	/* controlling mbuf class */
	int8_t		sl_refcnt;	/* outstanding allocations */
	int8_t		sl_chunks;	/* chunks (bufs) in this slab */
	u_int16_t	sl_flags;	/* slab flags (see below) */
	u_int16_t	sl_len;		/* slab length */
	void		*sl_base;	/* base of allocated memory */
	void		*sl_head;	/* first free buffer */
	TAILQ_ENTRY(mcl_slab) sl_link;	/* next/prev slab on freelist */
} mcl_slab_t;

#define	SLF_MAPPED	0x0001		/* backed by a mapped page */
#define	SLF_PARTIAL	0x0002		/* part of another slab */
#define	SLF_DETACHED	0x0004		/* not in slab freelist */

/*
 * The array of slabs is broken into groups of arrays per 1MB of kernel
 * memory to reduce the footprint.  Each group is allocated on demand
 * whenever a new piece of memory mapped in from the VM crosses the 1MB
 * boundary.
 */
#define	NSLABSPMB	((1 << MBSHIFT) >> PGSHIFT)	/* 256 slabs/grp */

typedef struct mcl_slabg {
	mcl_slab_t	slg_slab[NSLABSPMB];	/* group of slabs */
} mcl_slabg_t;

/*
 * Number of slabs needed to control a 16KB cluster object.
 */
#define	NSLABSP16KB	(M16KCLBYTES >> PGSHIFT)
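
/*
 * Example (illustrative sketch): locating the slab that controls a given
 * buffer mirrors what slab_get() does later in this file; the 4KB page
 * index selects a 1MB group, and the remainder selects the slab within
 * that group (NSLABSPMB is a power of two, so this is cheap):
 *
 *	ix = MTOBG(buf);			// 4KB page index
 *	slg = slabstbl[ix / NSLABSPMB];		// 1MB slab group
 *	sp = &slg->slg_slab[ix % NSLABSPMB];	// slab for this page
 */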
/*
 * Per-cluster audit structure.
 */
typedef struct {
	mcache_audit_t	*cl_audit[NMBPBG];	/* array of audits */
} mcl_audit_t;

typedef struct {
	struct thread	*msa_thread;	/* thread doing transaction */
	struct thread	*msa_pthread;	/* previous transaction thread */
	uint32_t	msa_tstamp;	/* transaction timestamp (ms) */
	uint32_t	msa_ptstamp;	/* prev transaction timestamp (ms) */
	uint16_t	msa_depth;	/* pc stack depth */
	uint16_t	msa_pdepth;	/* previous transaction pc stack */
	void		*msa_stack[MCACHE_STACK_DEPTH];
	void		*msa_pstack[MCACHE_STACK_DEPTH];
} mcl_scratch_audit_t;

typedef struct {
	/*
	 * Size of data from the beginning of an mbuf that covers m_hdr,
	 * pkthdr and m_ext structures.  If auditing is enabled, we allocate
	 * a shadow mbuf structure of this size inside each audit structure,
	 * and the contents of the real mbuf gets copied into it when the mbuf
	 * is freed.  This allows us to pattern-fill the mbuf for integrity
	 * check, and to preserve any constructed mbuf fields (e.g. mbuf +
	 * cluster cache case).  Note that we don't save the contents of
	 * clusters when they are freed; we simply pattern-fill them.
	 */
	u_int8_t	sc_mbuf[(MSIZE - _MHLEN) + sizeof (_m_ext_t)];
	mcl_scratch_audit_t sc_scratch __attribute__((aligned(8)));
} mcl_saved_contents_t;

#define	AUDIT_CONTENTS_SIZE	(sizeof (mcl_saved_contents_t))

#define	MCA_SAVED_MBUF_PTR(_mca)					\
	((struct mbuf *)(void *)((mcl_saved_contents_t *)		\
	(_mca)->mca_contents)->sc_mbuf)
#define	MCA_SAVED_MBUF_SIZE						\
	(sizeof (((mcl_saved_contents_t *)0)->sc_mbuf))
#define	MCA_SAVED_SCRATCH_PTR(_mca)					\
	(&((mcl_saved_contents_t *)(_mca)->mca_contents)->sc_scratch)

/*
 * mbuf specific mcache audit flags
 */
#define	MB_INUSE	0x01	/* object has not been returned to slab */
#define	MB_COMP_INUSE	0x02	/* object has not been returned to cslab */
#define	MB_SCVALID	0x04	/* object has valid saved contents */

/*
 * Each of the following two arrays holds up to nmbclusters elements.
 */
static mcl_audit_t *mclaudit;	/* array of cluster audit information */
static unsigned int maxclaudit;	/* max # of entries in audit table */
static mcl_slabg_t **slabstbl;	/* cluster slabs table */
static unsigned int maxslabgrp;	/* max # of entries in slabs table */
static unsigned int slabgrp;	/* # of entries in slabs table */

/* Globals */
int nclusters;			/* # of clusters for non-jumbo (legacy) sizes */
int njcl;			/* # of clusters for jumbo sizes */
int njclbytes;			/* size of a jumbo cluster */
union mbigcluster *mbutl;	/* first mapped cluster address */
union mbigcluster *embutl;	/* ending virtual address of mclusters */
int _max_linkhdr;		/* largest link-level header */
int _max_protohdr;		/* largest protocol header */
int max_hdr;			/* largest link+protocol header */
int max_datalen;		/* MHLEN - max_hdr */

static boolean_t mclverify;	/* debug: pattern-checking */
static boolean_t mcltrace;	/* debug: stack tracing */
static boolean_t mclfindleak;	/* debug: leak detection */
static boolean_t mclexpleak;	/* debug: expose leak info to user space */

static struct timeval mb_start;	/* beginning of time */

/* mbuf leak detection variables */
static struct mleak_table mleak_table;
static mleak_stat_t *mleak_stat;

#define	MLEAK_STAT_SIZE(n) \
	((size_t)(&((mleak_stat_t *)0)->ml_trace[n]))

struct mallocation {
	mcache_obj_t *element;	/* the alloc'ed element, NULL if unused */
	u_int32_t trace_index;	/* mtrace index for corresponding backtrace */
	u_int32_t count;	/* How many objects were requested */
	u_int64_t hitcount;	/* for determining hash effectiveness */
};

struct mtrace {
	u_int64_t	collisions;
	u_int64_t	hitcount;
	u_int64_t	allocs;
	u_int64_t	depth;
	uintptr_t	addr[MLEAK_STACK_DEPTH];
};

/* Size must be a power of two for the zhash to be able to just mask off bits */
#define	MLEAK_ALLOCATION_MAP_NUM	512
#define	MLEAK_TRACE_MAP_NUM		256

/*
 * Sample factor for how often to record a trace.  This can be overridden
 * via the boot-arg mleak_sample_factor.
 */
#define	MLEAK_SAMPLE_FACTOR		500
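
/*
 * Example (illustrative): because both map sizes are powers of two, the
 * leak logger can reduce a hash value to a bucket index with a mask rather
 * than a modulo; hash() below stands in for whatever hash is actually used:
 *
 *	ix = hash(addr) & (MLEAK_ALLOCATION_MAP_NUM - 1);
 *	alloc = &mleak_allocations[ix];
 */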
/*
 * Number of top leakers recorded.
 */
#define	MLEAK_NUM_TRACES		5

#define	MB_LEAK_SPACING_64	"                    "
#define	MB_LEAK_SPACING_32	"            "

#define	MB_LEAK_HDR_32	"\n\
    trace [1]   trace [2]   trace [3]   trace [4]   trace [5]  \n\
    ----------  ----------  ----------  ----------  ---------- \n\
"

#define	MB_LEAK_HDR_64	"\n\
    trace [1]           trace [2]           trace [3]       \
        trace [4]           trace [5]      \n\
    ------------------  ------------------  ------------------  \
    ------------------  ------------------ \n\
"

static uint32_t mleak_alloc_buckets = MLEAK_ALLOCATION_MAP_NUM;
static uint32_t mleak_trace_buckets = MLEAK_TRACE_MAP_NUM;

/* Hashmaps of allocations and their corresponding traces */
static struct mallocation *mleak_allocations;
static struct mtrace *mleak_traces;
static struct mtrace *mleak_top_trace[MLEAK_NUM_TRACES];

/* Lock to protect mleak tables from concurrent modification */
decl_lck_mtx_data(static, mleak_lock_data);
static lck_mtx_t *mleak_lock = &mleak_lock_data;
static lck_attr_t *mleak_lock_attr;
static lck_grp_t *mleak_lock_grp;
static lck_grp_attr_t *mleak_lock_grp_attr;

extern u_int32_t high_sb_max;

/* The minimum number of objects that are allocated, to start. */
#define	MINCL		32
#define	MINBIGCL	(MINCL >> 1)
#define	MIN16KCL	(MINCL >> 2)

/* Low watermarks (only map in pages once free counts go below) */
#define	MBIGCL_LOWAT	MINBIGCL
#define	M16KCL_LOWAT	MIN16KCL

typedef struct {
	mbuf_class_t	mtbl_class;	/* class type */
	mcache_t	*mtbl_cache;	/* mcache for this buffer class */
	TAILQ_HEAD(mcl_slhead, mcl_slab) mtbl_slablist; /* slab list */
	mcache_obj_t	*mtbl_cobjlist;	/* composite objects freelist */
	mb_class_stat_t	*mtbl_stats;	/* statistics fetchable via sysctl */
	u_int32_t	mtbl_maxsize;	/* maximum buffer size */
	int		mtbl_minlimit;	/* minimum allowed */
	int		mtbl_maxlimit;	/* maximum allowed */
	u_int32_t	mtbl_wantpurge;	/* purge during next reclaim */
} mbuf_table_t;

#define	m_class(c)	mbuf_table[c].mtbl_class
#define	m_cache(c)	mbuf_table[c].mtbl_cache
#define	m_slablist(c)	mbuf_table[c].mtbl_slablist
#define	m_cobjlist(c)	mbuf_table[c].mtbl_cobjlist
#define	m_maxsize(c)	mbuf_table[c].mtbl_maxsize
#define	m_minlimit(c)	mbuf_table[c].mtbl_minlimit
#define	m_maxlimit(c)	mbuf_table[c].mtbl_maxlimit
#define	m_wantpurge(c)	mbuf_table[c].mtbl_wantpurge
#define	m_cname(c)	mbuf_table[c].mtbl_stats->mbcl_cname
#define	m_size(c)	mbuf_table[c].mtbl_stats->mbcl_size
#define	m_total(c)	mbuf_table[c].mtbl_stats->mbcl_total
#define	m_active(c)	mbuf_table[c].mtbl_stats->mbcl_active
#define	m_infree(c)	mbuf_table[c].mtbl_stats->mbcl_infree
#define	m_slab_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_slab_cnt
#define	m_alloc_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_alloc_cnt
#define	m_free_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_free_cnt
#define	m_notified(c)	mbuf_table[c].mtbl_stats->mbcl_notified
#define	m_purge_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_purge_cnt
#define	m_fail_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_fail_cnt
#define	m_ctotal(c)	mbuf_table[c].mtbl_stats->mbcl_ctotal
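
/*
 * Example (illustrative): the accessor macros above keep call sites terse;
 * e.g. testing whether the 2KB cluster class may still grow:
 *
 *	if (m_total(MC_CL) < m_maxlimit(MC_CL))
 *		... room to create more 2KB clusters ...
 */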
static mbuf_table_t mbuf_table[] = {
	/*
	 * The caches for mbufs, regular clusters and big clusters.
	 */
	{ MC_MBUF, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_MBUF)),
	    NULL, NULL, 0, 0, 0, 0 },
	{ MC_CL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_CL)),
	    NULL, NULL, 0, 0, 0, 0 },
	{ MC_BIGCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_BIGCL)),
	    NULL, NULL, 0, 0, 0, 0 },
	{ MC_16KCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_16KCL)),
	    NULL, NULL, 0, 0, 0, 0 },
	/*
	 * The following are special caches; they serve as intermediate
	 * caches backed by the above rudimentary caches.  Each object
	 * in the cache is an mbuf with a cluster attached to it.  Unlike
	 * the above caches, these intermediate caches do not directly
	 * deal with the slab structures; instead, the constructed
	 * cached elements are simply stored in the freelists.
	 */
	{ MC_MBUF_CL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
	{ MC_MBUF_BIGCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
	{ MC_MBUF_16KCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
};

#define	NELEM(a)	(sizeof (a) / sizeof ((a)[0]))

static void *mb_waitchan = &mbuf_table;	/* wait channel for all caches */
static int mb_waiters;			/* number of waiters */

#define	MB_WDT_MAXTIME	10		/* # of secs before watchdog panic */
static struct timeval mb_wdtstart;	/* watchdog start timestamp */
static char *mbuf_dump_buf;

#define	MBUF_DUMP_BUF_SIZE	2048

/*
 * mbuf watchdog is enabled by default on embedded platforms.  It is
 * also toggleable via the kern.ipc.mb_watchdog sysctl.
 */
static unsigned int mb_watchdog = 0;

/* Red zone */
static u_int32_t mb_redzone_cookie;
static void m_redzone_init(struct mbuf *);
static void m_redzone_verify(struct mbuf *m);

/* The following are used to serialize m_clalloc() */
static boolean_t mb_clalloc_busy;
static void *mb_clalloc_waitchan = &mb_clalloc_busy;
static int mb_clalloc_waiters;

static void mbuf_mtypes_sync(boolean_t);
static int mbstat_sysctl SYSCTL_HANDLER_ARGS;
static void mbuf_stat_sync(void);
static int mb_stat_sysctl SYSCTL_HANDLER_ARGS;
static int mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS;
static int mleak_table_sysctl SYSCTL_HANDLER_ARGS;
static char *mbuf_dump(void);
static void mbuf_table_init(void);
static inline void m_incref(struct mbuf *);
static inline u_int32_t m_decref(struct mbuf *);
static int m_clalloc(const u_int32_t, const int, const u_int32_t);
static void mbuf_worker_thread_init(void);
static mcache_obj_t *slab_alloc(mbuf_class_t, int);
static void slab_free(mbuf_class_t, mcache_obj_t *);
static unsigned int mbuf_slab_alloc(void *, mcache_obj_t ***,
    unsigned int, int);
static void mbuf_slab_free(void *, mcache_obj_t *, int);
static void mbuf_slab_audit(void *, mcache_obj_t *, boolean_t);
static void mbuf_slab_notify(void *, u_int32_t);
static unsigned int cslab_alloc(mbuf_class_t, mcache_obj_t ***,
    unsigned int);
static unsigned int cslab_free(mbuf_class_t, mcache_obj_t *, int);
static unsigned int mbuf_cslab_alloc(void *, mcache_obj_t ***,
    unsigned int, int);
static void mbuf_cslab_free(void *, mcache_obj_t *, int);
static void mbuf_cslab_audit(void *, mcache_obj_t *, boolean_t);
static int freelist_populate(mbuf_class_t, unsigned int, int);
static void freelist_init(mbuf_class_t);
static boolean_t mbuf_cached_above(mbuf_class_t, int);
static boolean_t mbuf_steal(mbuf_class_t, unsigned int);
static void m_reclaim(mbuf_class_t, unsigned int, boolean_t);
static int m_howmany(int, size_t);
static void mbuf_worker_thread(void);
static void mbuf_watchdog(void);
static boolean_t mbuf_sleep(mbuf_class_t, unsigned int, int);

static void mcl_audit_init(void *, mcache_audit_t **, mcache_obj_t **,
    size_t, unsigned int);
static mcache_audit_t *mcl_audit_buf2mca(mbuf_class_t, mcache_obj_t *);
static void mcl_audit_mbuf(mcache_audit_t *, void *, boolean_t, boolean_t);
static void mcl_audit_cluster(mcache_audit_t *, void *, size_t, boolean_t,
    boolean_t);
static void mcl_audit_restore_mbuf(struct mbuf *, mcache_audit_t *, boolean_t);
static void mcl_audit_save_mbuf(struct mbuf *, mcache_audit_t *);
static void mcl_audit_scratch(mcache_audit_t *);
static void mcl_audit_mcheck_panic(struct mbuf *);
static void mcl_audit_verify_nextptr(void *, mcache_audit_t *);

static void mleak_activate(void);
static void mleak_logger(u_int32_t, mcache_obj_t *, boolean_t);
static boolean_t mleak_log(uintptr_t *, mcache_obj_t *, uint32_t, int);
static void mleak_free(mcache_obj_t *);
static void mleak_sort_traces(void);
static void mleak_update_stats(void);

static mcl_slab_t *slab_get(void *);
static void slab_init(mcl_slab_t *, mbuf_class_t, u_int32_t,
    void *, void *, unsigned int, int, int);
static void slab_insert(mcl_slab_t *, mbuf_class_t);
static void slab_remove(mcl_slab_t *, mbuf_class_t);
static boolean_t slab_inrange(mcl_slab_t *, void *);
static void slab_nextptr_panic(mcl_slab_t *, void *);
static void slab_detach(mcl_slab_t *);
static boolean_t slab_is_detached(mcl_slab_t *);

static int m_copyback0(struct mbuf **, int, int, const void *, int, int);
static struct mbuf *m_split0(struct mbuf *, int, int, int);

/* flags for m_copyback0 */
#define	M_COPYBACK0_COPYBACK	0x0001	/* copyback from cp */
#define	M_COPYBACK0_PRESERVE	0x0002	/* preserve original data */
#define	M_COPYBACK0_COW		0x0004	/* do copy-on-write */
#define	M_COPYBACK0_EXTEND	0x0008	/* extend chain */

/*
 * This flag is set for all mbufs that come out of and into the composite
 * mbuf + cluster caches, i.e. MC_MBUF_CL and MC_MBUF_BIGCL.  mbufs that
 * are marked with such a flag have clusters attached to them, and will be
 * treated differently when they are freed; instead of being placed back
 * into the mbuf and cluster freelists, the composite mbuf + cluster objects
 * are placed back into the appropriate composite cache's freelist, and the
 * actual freeing is deferred until the composite objects are purged.  At
 * such a time, this flag will be cleared from the mbufs and the objects
 * will be freed into their own separate freelists.
 */
#define	EXTF_COMPOSITE	0x1

/*
 * This flag indicates that the external cluster is read-only, i.e. it is
 * or was referred to by more than one mbuf.  Once set, this flag is never
 * cleared.
 */
#define	EXTF_READONLY	0x2
#define	EXTF_MASK	(EXTF_COMPOSITE | EXTF_READONLY)

#define	MEXT_RFA(m)	((m)->m_ext.ext_refflags)
#define	MEXT_REF(m)	(MEXT_RFA(m)->refcnt)
#define	MEXT_FLAGS(m)	(MEXT_RFA(m)->flags)
#define	MBUF_IS_COMPOSITE(m)	\
	(MEXT_REF(m) == 0 && (MEXT_FLAGS(m) & EXTF_MASK) == EXTF_COMPOSITE)
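
/*
 * Example (illustrative): the free path can use this predicate to decide
 * where an mbuf with a cluster attached should be returned:
 *
 *	if (MBUF_IS_COMPOSITE(m))
 *		... back to the composite (mbuf + cluster) cache ...
 *	else
 *		... mbuf and cluster freed to their own caches ...
 */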
/*
 * Macros used to verify the integrity of the mbuf.
 */
#define	_MCHECK(m) {							\
	if ((m)->m_type != MT_FREE) {					\
		if (mclaudit == NULL)					\
			panic("MCHECK: m_type=%d m=%p",			\
			    (u_int16_t)(m)->m_type, m);			\
		else							\
			mcl_audit_mcheck_panic(m);			\
	}								\
}

#define	MBUF_IN_MAP(addr)						\
	((void *)(addr) >= (void *)mbutl && (void *)(addr) < (void *)embutl)

#define	MRANGE(addr) {							\
	if (!MBUF_IN_MAP(addr))						\
		panic("MRANGE: address out of range 0x%p", addr);	\
}

/*
 * Macro version of mtod.
 */
#define	MTOD(m, t)	((t)((m)->m_data))

/*
 * Macros to obtain (4KB) cluster index and base cluster address.
 */

#define	MTOBG(x)	(((char *)(x) - (char *)mbutl) >> MBIGCLSHIFT)
#define	BGTOM(x)	((union mbigcluster *)(mbutl + (x)))

/*
 * Macro to find the mbuf index relative to a base.
 */
#define	MCLIDX(c, m)	(((char *)(m) - (char *)(c)) >> MSIZESHIFT)

/*
 * Same thing for 2KB cluster index.
 */
#define	CLBGIDX(c, m)	(((char *)(m) - (char *)(c)) >> MCLSHIFT)

/*
 * Macros used during mbuf and cluster initialization.
 */
#define	MBUF_INIT_PKTHDR(m) {						\
	(m)->m_pkthdr.rcvif = NULL;					\
	(m)->m_pkthdr.pkt_hdr = NULL;					\
	(m)->m_pkthdr.len = 0;						\
	(m)->m_pkthdr.csum_flags = 0;					\
	(m)->m_pkthdr.csum_data = 0;					\
	(m)->m_pkthdr.vlan_tag = 0;					\
	m_classifier_init(m, 0);					\
	m_tag_init(m, 1);						\
	m_scratch_init(m);						\
	m_redzone_init(m);						\
}

#define	MBUF_INIT(m, pkthdr, type) {					\
	_MCHECK(m);							\
	(m)->m_next = (m)->m_nextpkt = NULL;				\
	(m)->m_len = 0;							\
	(m)->m_type = type;						\
	if ((pkthdr) == 0) {						\
		(m)->m_data = (m)->m_dat;				\
		(m)->m_flags = 0;					\
	} else {							\
		(m)->m_data = (m)->m_pktdat;				\
		(m)->m_flags = M_PKTHDR;				\
		MBUF_INIT_PKTHDR(m);					\
	}								\
}

#define	MEXT_INIT(m, buf, size, free, arg, rfa, ref, flag) {		\
	(m)->m_data = (m)->m_ext.ext_buf = (buf);			\
	(m)->m_flags |= M_EXT;						\
	(m)->m_ext.ext_size = (size);					\
	(m)->m_ext.ext_free = (free);					\
	(m)->m_ext.ext_arg = (arg);					\
	(m)->m_ext.ext_refs.forward = (m)->m_ext.ext_refs.backward =	\
	    &(m)->m_ext.ext_refs;					\
	MEXT_RFA(m) = (rfa);						\
	MEXT_REF(m) = (ref);						\
	MEXT_FLAGS(m) = (flag);						\
}

#define	MBUF_CL_INIT(m, buf, rfa, ref, flag)	\
	MEXT_INIT(m, buf, m_maxsize(MC_CL), NULL, NULL, rfa, ref, flag)

#define	MBUF_BIGCL_INIT(m, buf, rfa, ref, flag)	\
	MEXT_INIT(m, buf, m_maxsize(MC_BIGCL), m_bigfree, NULL, rfa, ref, flag)

#define	MBUF_16KCL_INIT(m, buf, rfa, ref, flag)	\
	MEXT_INIT(m, buf, m_maxsize(MC_16KCL), m_16kfree, NULL, rfa, ref, flag)

/*
 * Macro to convert BSD malloc sleep flag to mcache's
 */
#define	MSLEEPF(f)	((!((f) & M_DONTWAIT)) ? MCR_SLEEP : MCR_NOSLEEP)
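
/*
 * Example (illustrative sketch): constructing a packet-header mbuf with a
 * 2KB cluster out of the raw pieces uses the macros above; here `m', `buf'
 * and `rfa' are assumed to have already been obtained from their
 * respective caches:
 *
 *	MBUF_INIT(m, 1, MT_DATA);		// header mbuf, type DATA
 *	MBUF_CL_INIT(m, buf, rfa, 1, 0);	// attach cluster, refcnt 1
 */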
/*
 * The structure that holds all mbuf class statistics exportable via sysctl.
 * Similar to the mbstat structure, the mb_stat structure is protected by
 * the global mbuf lock.  It contains additional information about the
 * classes that allows for a more accurate view of the state of the
 * allocator.
 */
struct mb_stat *mb_stat;
struct omb_stat *omb_stat;	/* For backwards compatibility */

#define	MB_STAT_SIZE(n) \
	((size_t)(&((mb_stat_t *)0)->mbs_class[n]))
#define	OMB_STAT_SIZE(n) \
	((size_t)(&((struct omb_stat *)0)->mbs_class[n]))

/*
 * The legacy structure holding all of the mbuf allocation statistics.
 * The actual statistics used by the kernel are stored in the mbuf_table
 * instead, and are updated atomically while the global mbuf lock is held.
 * They are mirrored in mbstat to support legacy applications (e.g. netstat).
 * Unlike before, the kernel no longer relies on the contents of mbstat for
 * its operations (e.g. cluster expansion) because the structure is exposed
 * to the outside and could possibly be modified, therefore making it unsafe.
 * With the exception of the mbstat.m_mtypes array (see below), all of the
 * statistics are updated as they change.
 */
struct mbstat mbstat;

#define	MBSTAT_MTYPES_MAX \
	(sizeof (mbstat.m_mtypes) / sizeof (mbstat.m_mtypes[0]))

/*
 * Allocation statistics related to mbuf types (up to MT_MAX-1) are updated
 * atomically and stored in a per-CPU structure which is lock-free; this is
 * done in order to avoid writing to the global mbstat data structure which
 * would cause false sharing.  During a sysctl request for kern.ipc.mbstat,
 * the statistics across all CPUs are converged into the mbstat.m_mtypes
 * array and returned to the application.  Any updates for types greater
 * than or equal to MT_MAX are done atomically to the mbstat; this slows
 * down performance but is okay since the kernel uses only up to MT_MAX-1
 * while anything beyond that (up to type 255) is considered a corner case.
 */
typedef struct {
	unsigned int	cpu_mtypes[MT_MAX];
} __attribute__((aligned(MAX_CPU_CACHE_LINE_SIZE), packed)) mtypes_cpu_t;

typedef struct {
	mtypes_cpu_t	mbs_cpu[1];
} mbuf_mtypes_t;

static mbuf_mtypes_t *mbuf_mtypes;	/* per-CPU statistics */

#define	MBUF_MTYPES_SIZE(n) \
	((size_t)(&((mbuf_mtypes_t *)0)->mbs_cpu[n]))

#define	MTYPES_CPU(p) \
	((mtypes_cpu_t *)(void *)((char *)(p) + MBUF_MTYPES_SIZE(cpu_number())))

#define	mtype_stat_add(type, n) {					\
	if ((unsigned)(type) < MT_MAX) {				\
		mtypes_cpu_t *mbs = MTYPES_CPU(mbuf_mtypes);		\
		atomic_add_32(&mbs->cpu_mtypes[type], n);		\
	} else if ((unsigned)(type) < (unsigned)MBSTAT_MTYPES_MAX) {	\
		atomic_add_16((int16_t *)&mbstat.m_mtypes[type], n);	\
	}								\
}

#define	mtype_stat_sub(t, n)	mtype_stat_add(t, -(n))
#define	mtype_stat_inc(t)	mtype_stat_add(t, 1)
#define	mtype_stat_dec(t)	mtype_stat_sub(t, 1)
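
/*
 * Example (illustrative): a typical type transition in the allocator
 * updates two counters; taking a free mbuf and turning it into data:
 *
 *	mtype_stat_inc(MT_DATA);	// one more MT_DATA mbuf
 *	mtype_stat_dec(MT_FREE);	// one less free mbuf
 */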
static void
mbuf_mtypes_sync(boolean_t locked)
{
	int m, n;
	mtypes_cpu_t mtc;

	if (locked)
		lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	bzero(&mtc, sizeof (mtc));
	for (m = 0; m < ncpu; m++) {
		mtypes_cpu_t *scp = &mbuf_mtypes->mbs_cpu[m];
		mtypes_cpu_t temp;

		bcopy(&scp->cpu_mtypes, &temp.cpu_mtypes,
		    sizeof (temp.cpu_mtypes));

		for (n = 0; n < MT_MAX; n++)
			mtc.cpu_mtypes[n] += temp.cpu_mtypes[n];
	}
	if (!locked)
		lck_mtx_lock(mbuf_mlock);
	for (n = 0; n < MT_MAX; n++)
		mbstat.m_mtypes[n] = mtc.cpu_mtypes[n];
	if (!locked)
		lck_mtx_unlock(mbuf_mlock);
}

static int
mbstat_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	mbuf_mtypes_sync(FALSE);

	return (SYSCTL_OUT(req, &mbstat, sizeof (mbstat)));
}

static void
mbuf_stat_sync(void)
{
	mb_class_stat_t *sp;
	mcache_cpu_t *ccp;
	mcache_t *cp;
	int k, m, bktsize;

	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	for (k = 0; k < NELEM(mbuf_table); k++) {
		cp = m_cache(k);
		ccp = &cp->mc_cpu[0];
		bktsize = ccp->cc_bktsize;
		sp = mbuf_table[k].mtbl_stats;

		if (cp->mc_flags & MCF_NOCPUCACHE)
			sp->mbcl_mc_state = MCS_DISABLED;
		else if (cp->mc_purge_cnt > 0)
			sp->mbcl_mc_state = MCS_PURGING;
		else if (bktsize == 0)
			sp->mbcl_mc_state = MCS_OFFLINE;
		else
			sp->mbcl_mc_state = MCS_ONLINE;

		sp->mbcl_mc_cached = 0;
		for (m = 0; m < ncpu; m++) {
			ccp = &cp->mc_cpu[m];
			if (ccp->cc_objs > 0)
				sp->mbcl_mc_cached += ccp->cc_objs;
			if (ccp->cc_pobjs > 0)
				sp->mbcl_mc_cached += ccp->cc_pobjs;
		}
		sp->mbcl_mc_cached += (cp->mc_full.bl_total * bktsize);
		sp->mbcl_active = sp->mbcl_total - sp->mbcl_mc_cached -
		    sp->mbcl_infree;

		sp->mbcl_mc_waiter_cnt = cp->mc_waiter_cnt;
		sp->mbcl_mc_wretry_cnt = cp->mc_wretry_cnt;
		sp->mbcl_mc_nwretry_cnt = cp->mc_nwretry_cnt;

		/* Calculate total count specific to each class */
		sp->mbcl_ctotal = sp->mbcl_total;
		switch (m_class(k)) {
		case MC_MBUF:
			/* Deduct mbufs used in composite caches */
			sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) +
			    m_total(MC_MBUF_BIGCL));
			break;

		case MC_CL:
			/* Deduct clusters used in composite cache */
			sp->mbcl_ctotal -= m_total(MC_MBUF_CL);
			break;

		case MC_BIGCL:
			/* Deduct clusters used in composite cache */
			sp->mbcl_ctotal -= m_total(MC_MBUF_BIGCL);
			break;

		case MC_16KCL:
			/* Deduct clusters used in composite cache */
			sp->mbcl_ctotal -= m_total(MC_MBUF_16KCL);
			break;

		default:
			break;
		}
	}
}

static int
mb_stat_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	void *statp;
	int k, statsz, proc64 = proc_is64bit(req->p);

	lck_mtx_lock(mbuf_mlock);
	mbuf_stat_sync();

	if (!proc64) {
		struct omb_class_stat *oc;
		struct mb_class_stat *c;

		omb_stat->mbs_cnt = mb_stat->mbs_cnt;
		oc = &omb_stat->mbs_class[0];
		c = &mb_stat->mbs_class[0];
		for (k = 0; k < omb_stat->mbs_cnt; k++, oc++, c++) {
			(void) snprintf(oc->mbcl_cname, sizeof (oc->mbcl_cname),
			    "%s", c->mbcl_cname);
			oc->mbcl_size = c->mbcl_size;
			oc->mbcl_total = c->mbcl_total;
			oc->mbcl_active = c->mbcl_active;
			oc->mbcl_infree = c->mbcl_infree;
			oc->mbcl_slab_cnt = c->mbcl_slab_cnt;
			oc->mbcl_alloc_cnt = c->mbcl_alloc_cnt;
			oc->mbcl_free_cnt = c->mbcl_free_cnt;
			oc->mbcl_notified = c->mbcl_notified;
			oc->mbcl_purge_cnt = c->mbcl_purge_cnt;
			oc->mbcl_fail_cnt = c->mbcl_fail_cnt;
			oc->mbcl_ctotal = c->mbcl_ctotal;
			oc->mbcl_mc_state = c->mbcl_mc_state;
			oc->mbcl_mc_cached = c->mbcl_mc_cached;
			oc->mbcl_mc_waiter_cnt = c->mbcl_mc_waiter_cnt;
			oc->mbcl_mc_wretry_cnt = c->mbcl_mc_wretry_cnt;
			oc->mbcl_mc_nwretry_cnt = c->mbcl_mc_nwretry_cnt;
		}
		statp = omb_stat;
		statsz = OMB_STAT_SIZE(NELEM(mbuf_table));
	} else {
		statp = mb_stat;
		statsz = MB_STAT_SIZE(NELEM(mbuf_table));
	}

	lck_mtx_unlock(mbuf_mlock);

	return (SYSCTL_OUT(req, statp, statsz));
}

static int
mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	int i;

	/* Ensure leak tracing turned on */
	if (!mclfindleak || !mclexpleak)
		return (ENXIO);

	lck_mtx_lock(mleak_lock);
	mleak_update_stats();
	i = SYSCTL_OUT(req, mleak_stat, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES));
	lck_mtx_unlock(mleak_lock);

	return (i);
}
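
/*
 * Example (illustrative, from user space): assuming these handlers are
 * registered under kern.ipc (the SYSCTL_PROC declarations live elsewhere
 * in this file), the leak table could be fetched with something like:
 *
 *	struct mleak_table table;
 *	size_t len = sizeof (table);
 *	if (sysctlbyname("kern.ipc.mleak_table", &table, &len,
 *	    NULL, 0) == -1)
 *		err(1, "sysctlbyname");
 */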
static int
mleak_table_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	int i = 0;

	/* Ensure leak tracing turned on */
	if (!mclfindleak || !mclexpleak)
		return (ENXIO);

	lck_mtx_lock(mleak_lock);
	i = SYSCTL_OUT(req, &mleak_table, sizeof (mleak_table));
	lck_mtx_unlock(mleak_lock);

	return (i);
}

static inline void
m_incref(struct mbuf *m)
{
	UInt32 old, new;
	volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m);

	do {
		old = *addr;
		new = old + 1;
		ASSERT(new != 0);
	} while (!OSCompareAndSwap(old, new, addr));

	/*
	 * If cluster is shared, mark it with (sticky) EXTF_READONLY;
	 * we don't clear the flag when the refcount goes back to 1
	 * to simplify code calling m_mclhasreference().
	 */
	if (new > 1 && !(MEXT_FLAGS(m) & EXTF_READONLY))
		(void) OSBitOrAtomic(EXTF_READONLY, &MEXT_FLAGS(m));
}

static inline u_int32_t
m_decref(struct mbuf *m)
{
	UInt32 old, new;
	volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m);

	do {
		old = *addr;
		new = old - 1;
		ASSERT(old != 0);
	} while (!OSCompareAndSwap(old, new, addr));

	return (new);
}
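
/*
 * Example (illustrative): attaching a second mbuf to a shared cluster
 * bumps the reference count and permanently marks the cluster read-only:
 *
 *	m_incref(m);		// refcnt 1 -> 2; EXTF_READONLY now set
 *	...
 *	(void) m_decref(m);	// refcnt 2 -> 1; EXTF_READONLY stays set
 */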
static void
mbuf_table_init(void)
{
	unsigned int b, c, s;
	int m;

	MALLOC(omb_stat, struct omb_stat *, OMB_STAT_SIZE(NELEM(mbuf_table)),
	    M_TEMP, M_WAITOK | M_ZERO);
	VERIFY(omb_stat != NULL);

	MALLOC(mb_stat, mb_stat_t *, MB_STAT_SIZE(NELEM(mbuf_table)),
	    M_TEMP, M_WAITOK | M_ZERO);
	VERIFY(mb_stat != NULL);

	mb_stat->mbs_cnt = NELEM(mbuf_table);
	for (m = 0; m < NELEM(mbuf_table); m++)
		mbuf_table[m].mtbl_stats = &mb_stat->mbs_class[m];

#if CONFIG_MBUF_JUMBO
	/*
	 * Set aside 1/3 of the mbuf cluster map for jumbo clusters; we do
	 * this only on platforms where the jumbo cluster pool is enabled.
	 */
	njcl = nmbclusters / 3;
	njclbytes = M16KCLBYTES;
#endif /* CONFIG_MBUF_JUMBO */

	/*
	 * nclusters holds both the 2KB and 4KB pools, so ensure it's
	 * a multiple of 4KB clusters.
	 */
	nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPBG);
	if (njcl > 0) {
		/*
		 * Each jumbo cluster takes 8 2KB clusters, so make
		 * sure that the pool size is evenly divisible by 8;
		 * njcl is in 2KB unit, hence treated as such.
		 */
		njcl = P2ROUNDDOWN(nmbclusters - nclusters, 8);

		/* Update nclusters with rounded down value of njcl */
		nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPBG);
	}

	/*
	 * njcl is valid only on platforms with 16KB jumbo clusters, where
	 * it is configured to 1/3 of the pool size.  On these platforms,
	 * the remainder is used for 2KB and 4KB clusters.  On platforms
	 * without 16KB jumbo clusters, the entire pool is used for both
	 * 2KB and 4KB clusters.  A 4KB cluster can either be split into
	 * 16 mbufs, or into 2 2KB clusters.
	 *
	 *  +---+---+------------ ... -----------+------- ... -------+
	 *  | c | b |              s             |        njcl       |
	 *  +---+---+------------ ... -----------+------- ... -------+
	 *
	 * 1/32nd of the shared region is reserved for pure 2KB and 4KB
	 * clusters (1/64th each); see the worked example below.
	 */
	c = P2ROUNDDOWN((nclusters >> 6), 2);		/* in 2KB unit */
	b = P2ROUNDDOWN((nclusters >> (6 + NCLPBGSHIFT)), 2); /* in 4KB unit */
	s = nclusters - (c + (b << NCLPBGSHIFT));	/* in 2KB unit */

	/*
	 * 1/64th (c) is reserved for 2KB clusters.
	 */
	m_minlimit(MC_CL) = c;
	m_maxlimit(MC_CL) = s + c;			/* in 2KB unit */
	m_maxsize(MC_CL) = m_size(MC_CL) = MCLBYTES;
	(void) snprintf(m_cname(MC_CL), MAX_MBUF_CNAME, "cl");

	/*
	 * Another 1/64th (b) of the map is reserved for 4KB clusters.
	 * It cannot be turned into 2KB clusters or mbufs.
	 */
	m_minlimit(MC_BIGCL) = b;
	m_maxlimit(MC_BIGCL) = (s >> NCLPBGSHIFT) + b;	/* in 4KB unit */
	m_maxsize(MC_BIGCL) = m_size(MC_BIGCL) = MBIGCLBYTES;
	(void) snprintf(m_cname(MC_BIGCL), MAX_MBUF_CNAME, "bigcl");

	/*
	 * The remaining 31/32nds (s) are all-purpose (mbufs, 2KB, or 4KB).
	 */
	m_minlimit(MC_MBUF) = 0;
	m_maxlimit(MC_MBUF) = (s << NMBPCLSHIFT);	/* in mbuf unit */
	m_maxsize(MC_MBUF) = m_size(MC_MBUF) = MSIZE;
	(void) snprintf(m_cname(MC_MBUF), MAX_MBUF_CNAME, "mbuf");

	/*
	 * Set limits for the composite classes.
	 */
	m_minlimit(MC_MBUF_CL) = 0;
	m_maxlimit(MC_MBUF_CL) = m_maxlimit(MC_CL);
	m_maxsize(MC_MBUF_CL) = MCLBYTES;
	m_size(MC_MBUF_CL) = m_size(MC_MBUF) + m_size(MC_CL);
	(void) snprintf(m_cname(MC_MBUF_CL), MAX_MBUF_CNAME, "mbuf_cl");

	m_minlimit(MC_MBUF_BIGCL) = 0;
	m_maxlimit(MC_MBUF_BIGCL) = m_maxlimit(MC_BIGCL);
	m_maxsize(MC_MBUF_BIGCL) = MBIGCLBYTES;
	m_size(MC_MBUF_BIGCL) = m_size(MC_MBUF) + m_size(MC_BIGCL);
	(void) snprintf(m_cname(MC_MBUF_BIGCL), MAX_MBUF_CNAME, "mbuf_bigcl");

	/*
	 * And for jumbo classes.
	 */
	m_minlimit(MC_16KCL) = 0;
	m_maxlimit(MC_16KCL) = (njcl >> NCLPJCLSHIFT);	/* in 16KB unit */
	m_maxsize(MC_16KCL) = m_size(MC_16KCL) = M16KCLBYTES;
	(void) snprintf(m_cname(MC_16KCL), MAX_MBUF_CNAME, "16kcl");

	m_minlimit(MC_MBUF_16KCL) = 0;
	m_maxlimit(MC_MBUF_16KCL) = m_maxlimit(MC_16KCL);
	m_maxsize(MC_MBUF_16KCL) = M16KCLBYTES;
	m_size(MC_MBUF_16KCL) = m_size(MC_MBUF) + m_size(MC_16KCL);
	(void) snprintf(m_cname(MC_MBUF_16KCL), MAX_MBUF_CNAME, "mbuf_16kcl");

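	/*
	 * Worked example (illustrative): with nclusters = 32768 (in 2KB
	 * units) and njcl = 0, the partitioning above yields:
	 *
	 *	c = P2ROUNDDOWN(32768 >> 6, 2)   = 512   (2KB clusters)
	 *	b = P2ROUNDDOWN(32768 >> 7, 2)   = 256   (4KB clusters)
	 *	s = 32768 - (512 + (256 << 1))   = 31744 (2KB units)
	 *
	 * so up to (31744 << NMBPCLSHIFT) mbufs may be carved out of the
	 * general-purpose region.
	 */
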
	/*
	 * Initialize the legacy mbstat structure.
	 */
	bzero(&mbstat, sizeof (mbstat));
	mbstat.m_msize = m_maxsize(MC_MBUF);
	mbstat.m_mclbytes = m_maxsize(MC_CL);
	mbstat.m_minclsize = MINCLSIZE;
	mbstat.m_mlen = MLEN;
	mbstat.m_mhlen = MHLEN;
	mbstat.m_bigmclbytes = m_maxsize(MC_BIGCL);
}

#if defined(__LP64__)
typedef struct ncl_tbl {
	uint64_t nt_maxmem;	/* memory (sane) size */
	uint32_t nt_mbpool;	/* mbuf pool size */
} ncl_tbl_t;

/* Non-server */
static ncl_tbl_t ncl_table[] = {
	{ (1ULL << GBSHIFT)       /*  1 GB */, (64 << MBSHIFT)  /*  64 MB */ },
	{ (1ULL << (GBSHIFT + 3)) /*  8 GB */, (96 << MBSHIFT)  /*  96 MB */ },
	{ (1ULL << (GBSHIFT + 4)) /* 16 GB */, (128 << MBSHIFT) /* 128 MB */ },
	{ 0, 0 }
};

/* Server */
static ncl_tbl_t ncl_table_srv[] = {
	{ (1ULL << GBSHIFT)       /*  1 GB */, (96 << MBSHIFT)  /*  96 MB */ },
	{ (1ULL << (GBSHIFT + 2)) /*  4 GB */, (128 << MBSHIFT) /* 128 MB */ },
	{ (1ULL << (GBSHIFT + 3)) /*  8 GB */, (160 << MBSHIFT) /* 160 MB */ },
	{ (1ULL << (GBSHIFT + 4)) /* 16 GB */, (192 << MBSHIFT) /* 192 MB */ },
	{ (1ULL << (GBSHIFT + 5)) /* 32 GB */, (256 << MBSHIFT) /* 256 MB */ },
	{ (1ULL << (GBSHIFT + 6)) /* 64 GB */, (384 << MBSHIFT) /* 384 MB */ },
	{ 0, 0 }
};
#endif /* __LP64__ */

__private_extern__ unsigned int
mbuf_default_ncl(int server, uint64_t mem)
{
#if !defined(__LP64__)
#pragma unused(server)
	unsigned int n;
	/*
	 * 32-bit kernel (default to 64MB of mbuf pool for >= 1GB RAM).
	 */
	if ((n = ((mem / 16) / MCLBYTES)) > 32768)
		n = 32768;
#else
	unsigned int n, i;
	ncl_tbl_t *tbl = (server ? ncl_table_srv : ncl_table);
	/*
	 * 64-bit kernel (mbuf pool size based on table).
	 */
	n = tbl[0].nt_mbpool;
	for (i = 0; tbl[i].nt_mbpool != 0; i++) {
		if (mem < tbl[i].nt_maxmem)
			break;
		n = tbl[i].nt_mbpool;
	}
	n >>= MCLSHIFT;
#endif /* !__LP64__ */
	return (n);
}
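
/*
 * Worked example (illustrative): a non-server 64-bit machine with 8GB of
 * memory walks the table until the 16GB row (8GB < 16GB), keeping the
 * 8GB row's pool size, so n = (96 << MBSHIFT) >> MCLSHIFT = 49152
 * clusters, i.e. 96MB worth of 2KB clusters.
 */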
__private_extern__ void
mbinit(void)
{
	unsigned int m;
	unsigned int initmcl = 0;
	void *buf;
	thread_t thread = THREAD_NULL;

	microuptime(&mb_start);

	/*
	 * These MBUF_ values must be equal to their private counterparts.
	 */
	_CASSERT(MBUF_EXT == M_EXT);
	_CASSERT(MBUF_PKTHDR == M_PKTHDR);
	_CASSERT(MBUF_EOR == M_EOR);
	_CASSERT(MBUF_LOOP == M_LOOP);
	_CASSERT(MBUF_BCAST == M_BCAST);
	_CASSERT(MBUF_MCAST == M_MCAST);
	_CASSERT(MBUF_FRAG == M_FRAG);
	_CASSERT(MBUF_FIRSTFRAG == M_FIRSTFRAG);
	_CASSERT(MBUF_LASTFRAG == M_LASTFRAG);
	_CASSERT(MBUF_PROMISC == M_PROMISC);
	_CASSERT(MBUF_HASFCS == M_HASFCS);

	_CASSERT(MBUF_TYPE_FREE == MT_FREE);
	_CASSERT(MBUF_TYPE_DATA == MT_DATA);
	_CASSERT(MBUF_TYPE_HEADER == MT_HEADER);
	_CASSERT(MBUF_TYPE_SOCKET == MT_SOCKET);
	_CASSERT(MBUF_TYPE_PCB == MT_PCB);
	_CASSERT(MBUF_TYPE_RTABLE == MT_RTABLE);
	_CASSERT(MBUF_TYPE_HTABLE == MT_HTABLE);
	_CASSERT(MBUF_TYPE_ATABLE == MT_ATABLE);
	_CASSERT(MBUF_TYPE_SONAME == MT_SONAME);
	_CASSERT(MBUF_TYPE_SOOPTS == MT_SOOPTS);
	_CASSERT(MBUF_TYPE_FTABLE == MT_FTABLE);
	_CASSERT(MBUF_TYPE_RIGHTS == MT_RIGHTS);
	_CASSERT(MBUF_TYPE_IFADDR == MT_IFADDR);
	_CASSERT(MBUF_TYPE_CONTROL == MT_CONTROL);
	_CASSERT(MBUF_TYPE_OOBDATA == MT_OOBDATA);

	_CASSERT(MBUF_TSO_IPV4 == CSUM_TSO_IPV4);
	_CASSERT(MBUF_TSO_IPV6 == CSUM_TSO_IPV6);
	_CASSERT(MBUF_CSUM_REQ_SUM16 == CSUM_PARTIAL);
	_CASSERT(MBUF_CSUM_TCP_SUM16 == MBUF_CSUM_REQ_SUM16);
	_CASSERT(MBUF_CSUM_REQ_IP == CSUM_IP);
	_CASSERT(MBUF_CSUM_REQ_TCP == CSUM_TCP);
	_CASSERT(MBUF_CSUM_REQ_UDP == CSUM_UDP);
	_CASSERT(MBUF_CSUM_REQ_TCPIPV6 == CSUM_TCPIPV6);
	_CASSERT(MBUF_CSUM_REQ_UDPIPV6 == CSUM_UDPIPV6);
	_CASSERT(MBUF_CSUM_DID_IP == CSUM_IP_CHECKED);
	_CASSERT(MBUF_CSUM_IP_GOOD == CSUM_IP_VALID);
	_CASSERT(MBUF_CSUM_DID_DATA == CSUM_DATA_VALID);
	_CASSERT(MBUF_CSUM_PSEUDO_HDR == CSUM_PSEUDO_HDR);

	_CASSERT(MBUF_WAITOK == M_WAIT);
	_CASSERT(MBUF_DONTWAIT == M_DONTWAIT);
	_CASSERT(MBUF_COPYALL == M_COPYALL);

	_CASSERT(MBUF_SC2TC(MBUF_SC_BK_SYS) == MBUF_TC_BK);
	_CASSERT(MBUF_SC2TC(MBUF_SC_BK) == MBUF_TC_BK);
	_CASSERT(MBUF_SC2TC(MBUF_SC_BE) == MBUF_TC_BE);
	_CASSERT(MBUF_SC2TC(MBUF_SC_RD) == MBUF_TC_BE);
	_CASSERT(MBUF_SC2TC(MBUF_SC_OAM) == MBUF_TC_BE);
	_CASSERT(MBUF_SC2TC(MBUF_SC_AV) == MBUF_TC_VI);
	_CASSERT(MBUF_SC2TC(MBUF_SC_RV) == MBUF_TC_VI);
	_CASSERT(MBUF_SC2TC(MBUF_SC_VI) == MBUF_TC_VI);
	_CASSERT(MBUF_SC2TC(MBUF_SC_VO) == MBUF_TC_VO);
	_CASSERT(MBUF_SC2TC(MBUF_SC_CTL) == MBUF_TC_VO);

	_CASSERT(MBUF_TC2SCVAL(MBUF_TC_BK) == SCVAL_BK);
	_CASSERT(MBUF_TC2SCVAL(MBUF_TC_BE) == SCVAL_BE);
	_CASSERT(MBUF_TC2SCVAL(MBUF_TC_VI) == SCVAL_VI);
	_CASSERT(MBUF_TC2SCVAL(MBUF_TC_VO) == SCVAL_VO);

	/* Module specific scratch space (32-bit alignment requirement) */
	_CASSERT(!(offsetof(struct mbuf, m_pkthdr.pkt_mpriv) %
	    sizeof (uint32_t)));

	/* Initialize random red zone cookie value */
	_CASSERT(sizeof (mb_redzone_cookie) ==
	    sizeof (((struct pkthdr *)0)->redzone));
	read_random(&mb_redzone_cookie, sizeof (mb_redzone_cookie));

	/* Make sure we don't save more than we should */
	_CASSERT(MCA_SAVED_MBUF_SIZE <= sizeof (struct mbuf));

	if (nmbclusters == 0)
		nmbclusters = NMBCLUSTERS;

	/* This should be a sane (at least even) value by now */
	VERIFY(nmbclusters != 0 && !(nmbclusters & 0x1));

	/* Setup the mbuf table */
	mbuf_table_init();

	/* Global lock for common layer */
	mbuf_mlock_grp_attr = lck_grp_attr_alloc_init();
	mbuf_mlock_grp = lck_grp_alloc_init("mbuf", mbuf_mlock_grp_attr);
	mbuf_mlock_attr = lck_attr_alloc_init();
	lck_mtx_init(mbuf_mlock, mbuf_mlock_grp, mbuf_mlock_attr);

	/*
	 * Allocate cluster slabs table:
	 *
	 *	maxslabgrp = (N * 2048) / (1024 * 1024)
	 *
	 * Where N is nmbclusters rounded up to the nearest 512.  This yields
	 * mcl_slab_g_t units, each one representing a MB of memory.
	 */
	maxslabgrp =
	    (P2ROUNDUP(nmbclusters, (MBSIZE >> 11)) << MCLSHIFT) >> MBSHIFT;
	MALLOC(slabstbl, mcl_slabg_t **, maxslabgrp * sizeof (mcl_slabg_t *),
	    M_TEMP, M_WAITOK | M_ZERO);
	VERIFY(slabstbl != NULL);
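
	/*
	 * Worked example (illustrative): with nmbclusters = 32768, the 2KB
	 * cluster pool spans 64MB, so maxslabgrp = (32768 << 11) >> 20 = 64
	 * groups, each controlling 1MB (NSLABSPMB = 256 pages) of memory.
	 */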
	/*
	 * Allocate audit structures, if needed:
	 *
	 *	maxclaudit = (maxslabgrp * 1024 * 1024) / 4096
	 *
	 * This yields mcl_audit_t units, each one representing a page.
	 */
	PE_parse_boot_argn("mbuf_debug", &mbuf_debug, sizeof (mbuf_debug));
	mbuf_debug |= mcache_getflags();
	if (mbuf_debug & MCF_DEBUG) {
		maxclaudit = ((maxslabgrp << MBSHIFT) >> PGSHIFT);
		MALLOC(mclaudit, mcl_audit_t *, maxclaudit * sizeof (*mclaudit),
		    M_TEMP, M_WAITOK | M_ZERO);
		VERIFY(mclaudit != NULL);

		mcl_audit_con_cache = mcache_create("mcl_audit_contents",
		    AUDIT_CONTENTS_SIZE, sizeof (u_int64_t), 0, MCR_SLEEP);
		VERIFY(mcl_audit_con_cache != NULL);
	}
	mclverify = (mbuf_debug & MCF_VERIFY);
	mcltrace = (mbuf_debug & MCF_TRACE);
	mclfindleak = !(mbuf_debug & MCF_NOLEAKLOG);
	mclexpleak = mclfindleak && (mbuf_debug & MCF_EXPLEAKLOG);

	/* Enable mbuf leak logging, with a lock to protect the tables */

	mleak_lock_grp_attr = lck_grp_attr_alloc_init();
	mleak_lock_grp = lck_grp_alloc_init("mleak_lock", mleak_lock_grp_attr);
	mleak_lock_attr = lck_attr_alloc_init();
	lck_mtx_init(mleak_lock, mleak_lock_grp, mleak_lock_attr);

	mleak_activate();

	/* Calculate the number of pages assigned to the cluster pool */
	mcl_pages = (nmbclusters * MCLBYTES) / CLBYTES;
	MALLOC(mcl_paddr, ppnum_t *, mcl_pages * sizeof (ppnum_t),
	    M_TEMP, M_WAITOK);
	VERIFY(mcl_paddr != NULL);

	/* Register with the I/O Bus mapper */
	mcl_paddr_base = IOMapperIOVMAlloc(mcl_pages);
	bzero((char *)mcl_paddr, mcl_pages * sizeof (ppnum_t));

	embutl = (union mbigcluster *)
	    ((void *)((unsigned char *)mbutl + (nmbclusters * MCLBYTES)));
	VERIFY((((char *)embutl - (char *)mbutl) % MBIGCLBYTES) == 0);

	/* Prime up the freelist */
	PE_parse_boot_argn("initmcl", &initmcl, sizeof (initmcl));
	if (initmcl != 0) {
		initmcl >>= NCLPBGSHIFT;	/* become a 4K unit */
		if (initmcl > m_maxlimit(MC_BIGCL))
			initmcl = m_maxlimit(MC_BIGCL);
	}
	if (initmcl < m_minlimit(MC_BIGCL))
		initmcl = m_minlimit(MC_BIGCL);

	lck_mtx_lock(mbuf_mlock);

	/*
	 * For classes with non-zero minimum limits, populate their freelists
	 * so that m_total(class) is at least m_minlimit(class).
	 */
	VERIFY(m_total(MC_BIGCL) == 0 && m_minlimit(MC_BIGCL) != 0);
	freelist_populate(m_class(MC_BIGCL), initmcl, M_WAIT);
	VERIFY(m_total(MC_BIGCL) >= m_minlimit(MC_BIGCL));
	freelist_init(m_class(MC_CL));

	for (m = 0; m < NELEM(mbuf_table); m++) {
		/* Make sure we didn't miss any */
		VERIFY(m_minlimit(m_class(m)) == 0 ||
		    m_total(m_class(m)) >= m_minlimit(m_class(m)));
	}

	lck_mtx_unlock(mbuf_mlock);

	(void) kernel_thread_start((thread_continue_t)mbuf_worker_thread_init,
	    NULL, &thread);
	thread_deallocate(thread);

	ref_cache = mcache_create("mext_ref", sizeof (struct ext_ref),
	    0, 0, MCR_SLEEP);

	/* Create the cache for each class */
	for (m = 0; m < NELEM(mbuf_table); m++) {
		void *allocfunc, *freefunc, *auditfunc, *logfunc;
		u_int32_t flags;

		flags = mbuf_debug;
		if (m_class(m) == MC_MBUF_CL || m_class(m) == MC_MBUF_BIGCL ||
		    m_class(m) == MC_MBUF_16KCL) {
			allocfunc = mbuf_cslab_alloc;
			freefunc = mbuf_cslab_free;
			auditfunc = mbuf_cslab_audit;
			logfunc = mleak_logger;
		} else {
			allocfunc = mbuf_slab_alloc;
			freefunc = mbuf_slab_free;
			auditfunc = mbuf_slab_audit;
			logfunc = mleak_logger;
		}

		/*
		 * Disable per-CPU caches for jumbo classes if there
		 * is no jumbo cluster pool available in the system.
		 * The cache itself is still created (but will never
		 * be populated) since it simplifies the code.
		 */
		if ((m_class(m) == MC_MBUF_16KCL || m_class(m) == MC_16KCL) &&
		    njcl == 0)
			flags |= MCF_NOCPUCACHE;

		if (!mclfindleak)
			flags |= MCF_NOLEAKLOG;

		m_cache(m) = mcache_create_ext(m_cname(m), m_maxsize(m),
		    allocfunc, freefunc, auditfunc, logfunc, mbuf_slab_notify,
		    (void *)(uintptr_t)m, flags, MCR_SLEEP);
	}

	/*
	 * Allocate structure for per-CPU statistics that's aligned
	 * on the CPU cache boundary; this code assumes that we never
	 * uninitialize this framework, since the original address
	 * before alignment is not saved.
	 */
	ncpu = ml_get_max_cpus();
	MALLOC(buf, void *, MBUF_MTYPES_SIZE(ncpu) + CPU_CACHE_LINE_SIZE,
	    M_TEMP, M_WAITOK);
	VERIFY(buf != NULL);

	mbuf_mtypes = (mbuf_mtypes_t *)P2ROUNDUP((intptr_t)buf,
	    CPU_CACHE_LINE_SIZE);
	bzero(mbuf_mtypes, MBUF_MTYPES_SIZE(ncpu));

	/*
	 * Set the max limit on sb_max to be 1/16th of the size of
	 * memory allocated for mbuf clusters.
	 */
	high_sb_max = (nmbclusters << (MCLSHIFT - 4));
	if (high_sb_max < sb_max) {
		/* sb_max is too large for this configuration, scale it down */
		if (high_sb_max > (1 << MBSHIFT)) {
			/* We have at least 16MB of mbuf pool */
			sb_max = high_sb_max;
		} else if ((nmbclusters << MCLSHIFT) > (1 << MBSHIFT)) {
			/*
			 * If we have more than 1MB of mbuf pool, cap the
			 * size of the max sock buf at 1MB
			 */
			sb_max = high_sb_max = (1 << MBSHIFT);
		} else {
			sb_max = high_sb_max;
		}
	}
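
	/*
	 * Worked example (illustrative): with nmbclusters = 32768 the pool
	 * is 64MB, so high_sb_max = 32768 << (11 - 4) = 4MB, i.e. 1/16 of
	 * the pool; sb_max is only scaled down if it exceeds this cap.
	 */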
1593 */
1594 high_sb_max = (nmbclusters << (MCLSHIFT - 4));
1595 if (high_sb_max < sb_max) {
1596 /* sb_max is too large for this configuration, scale it down */
1597 if (high_sb_max > (1 << MBSHIFT)) {
1598 /* We have at least 16MB of mbuf pool */
1599 sb_max = high_sb_max;
1600 } else if ((nmbclusters << MCLSHIFT) > (1 << MBSHIFT)) {
1601 /*
1602 * If we have more than 1MB of mbuf pool, cap the size of
1603 * the max sock buf at 1MB
1604 */
1605 sb_max = high_sb_max = (1 << MBSHIFT);
1606 } else {
1607 sb_max = high_sb_max;
1608 }
1609 }
1610
1611 /* allocate space for mbuf_dump_buf */
1612 MALLOC(mbuf_dump_buf, char *, MBUF_DUMP_BUF_SIZE, M_TEMP, M_WAITOK);
1613 VERIFY(mbuf_dump_buf != NULL);
1614
1615 if (mbuf_debug & MCF_DEBUG) {
1616 printf("%s: MLEN %d, MHLEN %d\n", __func__,
1617 (int)_MLEN, (int)_MHLEN);
1618 }
1619
1620 printf("%s: done [%d MB total pool size, (%d/%d) split]\n", __func__,
1621 (nmbclusters << MCLSHIFT) >> MBSHIFT,
1622 (nclusters << MCLSHIFT) >> MBSHIFT,
1623 (njcl << MCLSHIFT) >> MBSHIFT);
1624}
1625
1626/*
1627 * Obtain a slab of object(s) from the class's freelist.
1628 */
1629static mcache_obj_t *
1630slab_alloc(mbuf_class_t class, int wait)
1631{
1632 mcl_slab_t *sp;
1633 mcache_obj_t *buf;
1634
1635 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1636
1637 VERIFY(class != MC_16KCL || njcl > 0);
1638
1639 /* This should always be NULL for us */
1640 VERIFY(m_cobjlist(class) == NULL);
1641
1642 /*
1643 * Treat composite objects as having longer lifespan by using
1644 * a slab from the reverse direction, in the hope that this will
1645 * reduce the probability of fragmentation for slabs that hold
1646 * more than one buffer chunk (e.g. mbuf slabs). For other
1647 * slabs, this probably doesn't make much of a difference.
1648 */
1649 if ((class == MC_MBUF || class == MC_CL) && (wait & MCR_COMP))
1650 sp = (mcl_slab_t *)TAILQ_LAST(&m_slablist(class), mcl_slhead);
1651 else
1652 sp = (mcl_slab_t *)TAILQ_FIRST(&m_slablist(class));
1653
1654 if (sp == NULL) {
1655 VERIFY(m_infree(class) == 0 && m_slab_cnt(class) == 0);
1656 /* The slab list for this class is empty */
1657 return (NULL);
1658 }
1659
1660 VERIFY(m_infree(class) > 0);
1661 VERIFY(!slab_is_detached(sp));
1662 VERIFY(sp->sl_class == class &&
1663 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
1664 buf = sp->sl_head;
1665 VERIFY(slab_inrange(sp, buf) && sp == slab_get(buf));
1666
1667 if (class == MC_MBUF) {
1668 sp->sl_head = buf->obj_next;
1669 VERIFY(sp->sl_head != NULL || sp->sl_refcnt == (NMBPBG - 1));
1670 } else if (class == MC_CL) {
1671 sp->sl_head = buf->obj_next;
1672 VERIFY(sp->sl_head != NULL || sp->sl_refcnt == (NCLPBG - 1));
1673 } else {
1674 sp->sl_head = NULL;
1675 }
1676 if (sp->sl_head != NULL && !slab_inrange(sp, sp->sl_head)) {
1677 slab_nextptr_panic(sp, sp->sl_head);
1678 /* In case sl_head is in the map but not in the slab */
1679 VERIFY(slab_inrange(sp, sp->sl_head));
1680 /* NOTREACHED */
1681 }
1682
1683 /* Increment slab reference */
1684 sp->sl_refcnt++;
1685
1686 if (mclaudit != NULL) {
1687 mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
1688 mca->mca_uflags = 0;
1689 /* Save contents on mbuf objects only */
1690 if (class == MC_MBUF)
1691 mca->mca_uflags |= MB_SCVALID;
1692 }
1693
1694 if (class == MC_CL) {
1695 mbstat.m_clfree = (--m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
1696 /*
1697 * A 2K cluster slab can have at most NCLPBG references.
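 * For illustration, assuming the usual 2KB clusters inside 4KB big
 * cluster slabs, NCLPBG == 4096 / 2048 == 2, so a fully-allocated
 * 2KB cluster slab holds exactly two references.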
1698 */
1699 VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NCLPBG &&
1700 sp->sl_chunks == NCLPBG &&
1701 sp->sl_len == m_maxsize(MC_BIGCL));
1702 VERIFY(sp->sl_refcnt < NCLPBG || sp->sl_head == NULL);
1703 } else if (class == MC_BIGCL) {
1704 mbstat.m_bigclfree = (--m_infree(MC_BIGCL)) +
1705 m_infree(MC_MBUF_BIGCL);
1706 /*
1707 * A 4K cluster slab can have at most 1 reference.
1708 */
1709 VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
1710 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1711 } else if (class == MC_16KCL) {
1712 mcl_slab_t *nsp;
1713 int k;
1714
1715 --m_infree(MC_16KCL);
1716 VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
1717 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1718 /*
1719 * Increment 2nd-Nth slab reference, where N is NSLABSP16KB.
1720 * A 16KB big cluster takes NSLABSP16KB slabs, each having at
1721 * most 1 reference.
1722 */
1723 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
1724 nsp = nsp->sl_next;
1725 /* Next slab must already be present */
1726 VERIFY(nsp != NULL);
1727 nsp->sl_refcnt++;
1728 VERIFY(!slab_is_detached(nsp));
1729 VERIFY(nsp->sl_class == MC_16KCL &&
1730 nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) &&
1731 nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 &&
1732 nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1733 nsp->sl_head == NULL);
1734 }
1735 } else {
1736 VERIFY(class == MC_MBUF);
1737 --m_infree(MC_MBUF);
1738 /*
1739 * If auditing is turned on, this check is
1740 * deferred until later in mbuf_slab_audit().
1741 */
1742 if (mclaudit == NULL)
1743 _MCHECK((struct mbuf *)buf);
1744 /*
1745 * Since we have incremented the reference count above,
1746 * an mbuf slab (formerly a 4KB cluster slab that was cut
1747 * up into mbufs) must have a reference count between 1
1748 * and NMBPBG at this point.
1749 */
1750 VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NMBPBG &&
1751 sp->sl_chunks == NMBPBG &&
1752 sp->sl_len == m_maxsize(MC_BIGCL));
1753 VERIFY(sp->sl_refcnt < NMBPBG || sp->sl_head == NULL);
1754 }
1755
1756 /* If empty, remove this slab from the class's freelist */
1757 if (sp->sl_head == NULL) {
1758 VERIFY(class != MC_MBUF || sp->sl_refcnt == NMBPBG);
1759 VERIFY(class != MC_CL || sp->sl_refcnt == NCLPBG);
1760 slab_remove(sp, class);
1761 }
1762
1763 return (buf);
1764}
1765
1766/*
1767 * Place a slab of object(s) back into a class's slab list.
1768 */
1769static void
1770slab_free(mbuf_class_t class, mcache_obj_t *buf)
1771{
1772 mcl_slab_t *sp;
1773
1774 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1775
1776 VERIFY(class != MC_16KCL || njcl > 0);
1777 VERIFY(buf->obj_next == NULL);
1778 sp = slab_get(buf);
1779 VERIFY(sp->sl_class == class && slab_inrange(sp, buf) &&
1780 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
1781
1782 /* Decrement slab reference */
1783 sp->sl_refcnt--;
1784
1785 if (class == MC_CL) {
1786 VERIFY(IS_P2ALIGNED(buf, MCLBYTES));
1787 /*
1788 * A slab that has been split for 2KB clusters can have
1789 * at most 1 outstanding reference at this point.
1790 */
1791 VERIFY(sp->sl_refcnt >= 0 && sp->sl_refcnt <= (NCLPBG - 1) &&
1792 sp->sl_chunks == NCLPBG &&
1793 sp->sl_len == m_maxsize(MC_BIGCL));
1794 VERIFY(sp->sl_refcnt < (NCLPBG - 1) ||
1795 (slab_is_detached(sp) && sp->sl_head == NULL));
1796 } else if (class == MC_BIGCL) {
1797 VERIFY(IS_P2ALIGNED(buf, MCLBYTES));
1798 /*
1799 * A 4KB cluster slab can have at most 1 reference
1800 * which must be 0 at this point.
1801 */
1802 VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
1803 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1804 VERIFY(slab_is_detached(sp));
1805 } else if (class == MC_16KCL) {
1806 mcl_slab_t *nsp;
1807 int k;
1808 /*
1809 * A 16KB cluster takes NSLABSP16KB slabs, all of which
1810 * must now have 0 references.
1811 */
1812 VERIFY(IS_P2ALIGNED(buf, MBIGCLBYTES));
1813 VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
1814 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1815 VERIFY(slab_is_detached(sp));
1816 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
1817 nsp = nsp->sl_next;
1818 /* Next slab must already be present */
1819 VERIFY(nsp != NULL);
1820 nsp->sl_refcnt--;
1821 VERIFY(slab_is_detached(nsp));
1822 VERIFY(nsp->sl_class == MC_16KCL &&
1823 (nsp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) &&
1824 nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 &&
1825 nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1826 nsp->sl_head == NULL);
1827 }
1828 } else {
1829 /*
1830 * A slab that has been split for mbufs can have at most
1831 * NMBPBG references. Since we have decremented one reference
1832 * above, it must now be between 0 and NMBPBG-1.
1833 */
1834 VERIFY(class == MC_MBUF);
1835 VERIFY(sp->sl_refcnt >= 0 && sp->sl_refcnt <= (NMBPBG - 1) &&
1836 sp->sl_chunks == NMBPBG &&
1837 sp->sl_len == m_maxsize(MC_BIGCL));
1838 VERIFY(sp->sl_refcnt < (NMBPBG - 1) ||
1839 (slab_is_detached(sp) && sp->sl_head == NULL));
1840 }
1841
1842 /*
1843 * When auditing is enabled, ensure that the buffer still
1844 * contains the free pattern. Otherwise it got corrupted
1845 * while at the CPU cache layer.
1846 */
1847 if (mclaudit != NULL) {
1848 mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
1849 if (mclverify) {
1850 mcache_audit_free_verify(mca, buf, 0, m_maxsize(class));
1851 }
1852 mca->mca_uflags &= ~MB_SCVALID;
1853 }
1854
1855 if (class == MC_CL) {
1856 mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
1857 buf->obj_next = sp->sl_head;
1858 } else if (class == MC_BIGCL) {
1859 mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
1860 m_infree(MC_MBUF_BIGCL);
1861 } else if (class == MC_16KCL) {
1862 ++m_infree(MC_16KCL);
1863 } else {
1864 ++m_infree(MC_MBUF);
1865 buf->obj_next = sp->sl_head;
1866 }
1867 sp->sl_head = buf;
1868
1869 /*
1870 * If a slab has been split into either one which holds 2KB
1871 * clusters, or one which holds mbufs, turn it back into one
1872 * which holds a 4KB cluster.
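 * The conversion is attempted only when the slab's reference count
 * has dropped to zero (i.e. every chunk is back on the freelist),
 * the donating class is above its minimum limit, and MC_BIGCL is
 * still below its maximum; otherwise the slab is reinserted as-is.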
1873 */ 1874 if (class == MC_MBUF && sp->sl_refcnt == 0 && 1875 m_total(class) > m_minlimit(class) && 1876 m_total(MC_BIGCL) < m_maxlimit(MC_BIGCL)) { 1877 int i = NMBPBG; 1878 1879 m_total(MC_BIGCL)++; 1880 mbstat.m_bigclusters = m_total(MC_BIGCL); 1881 m_total(MC_MBUF) -= NMBPBG; 1882 mbstat.m_mbufs = m_total(MC_MBUF); 1883 m_infree(MC_MBUF) -= NMBPBG; 1884 mtype_stat_add(MT_FREE, -((unsigned)NMBPBG)); 1885 1886 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL)); 1887 VERIFY(m_total(MC_MBUF) >= m_minlimit(MC_MBUF)); 1888 1889 while (i--) { 1890 struct mbuf *m = sp->sl_head; 1891 VERIFY(m != NULL); 1892 sp->sl_head = m->m_next; 1893 m->m_next = NULL; 1894 } 1895 VERIFY(sp->sl_head == NULL); 1896 1897 /* Remove the slab from the mbuf class's slab list */ 1898 slab_remove(sp, class); 1899 1900 /* Reinitialize it as a 4KB cluster slab */ 1901 slab_init(sp, MC_BIGCL, sp->sl_flags, sp->sl_base, sp->sl_base, 1902 sp->sl_len, 0, 1); 1903 1904 if (mclverify) { 1905 mcache_set_pattern(MCACHE_FREE_PATTERN, 1906 (caddr_t)sp->sl_head, m_maxsize(MC_BIGCL)); 1907 } 1908 mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) + 1909 m_infree(MC_MBUF_BIGCL); 1910 1911 VERIFY(slab_is_detached(sp)); 1912 /* And finally switch class */ 1913 class = MC_BIGCL; 1914 } else if (class == MC_CL && sp->sl_refcnt == 0 && 1915 m_total(class) > m_minlimit(class) && 1916 m_total(MC_BIGCL) < m_maxlimit(MC_BIGCL)) { 1917 int i = NCLPBG; 1918 1919 m_total(MC_BIGCL)++; 1920 mbstat.m_bigclusters = m_total(MC_BIGCL); 1921 m_total(MC_CL) -= NCLPBG; 1922 mbstat.m_clusters = m_total(MC_CL); 1923 m_infree(MC_CL) -= NCLPBG; 1924 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL)); 1925 VERIFY(m_total(MC_CL) >= m_minlimit(MC_CL)); 1926 1927 while (i--) { 1928 union mcluster *c = sp->sl_head; 1929 VERIFY(c != NULL); 1930 sp->sl_head = c->mcl_next; 1931 c->mcl_next = NULL; 1932 } 1933 VERIFY(sp->sl_head == NULL); 1934 1935 /* Remove the slab from the 2KB cluster class's slab list */ 1936 slab_remove(sp, class); 1937 1938 /* Reinitialize it as a 4KB cluster slab */ 1939 slab_init(sp, MC_BIGCL, sp->sl_flags, sp->sl_base, sp->sl_base, 1940 sp->sl_len, 0, 1); 1941 1942 if (mclverify) { 1943 mcache_set_pattern(MCACHE_FREE_PATTERN, 1944 (caddr_t)sp->sl_head, m_maxsize(MC_BIGCL)); 1945 } 1946 mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) + 1947 m_infree(MC_MBUF_BIGCL); 1948 1949 VERIFY(slab_is_detached(sp)); 1950 /* And finally switch class */ 1951 class = MC_BIGCL; 1952 } 1953 1954 /* Reinsert the slab to the class's slab list */ 1955 if (slab_is_detached(sp)) 1956 slab_insert(sp, class); 1957} 1958 1959/* 1960 * Common allocator for rudimentary objects called by the CPU cache layer 1961 * during an allocation request whenever there is no available element in the 1962 * bucket layer. It returns one or more elements from the appropriate global 1963 * freelist. If the freelist is empty, it will attempt to populate it and 1964 * retry the allocation. 
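 * The return value is the number of elements actually obtained,
 * which may be fewer than requested; the mcache layer copes with
 * partial results.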
1965 */ 1966static unsigned int 1967mbuf_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait) 1968{ 1969 mbuf_class_t class = (mbuf_class_t)arg; 1970 unsigned int need = num; 1971 mcache_obj_t **list = *plist; 1972 1973 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class)); 1974 ASSERT(need > 0); 1975 1976 lck_mtx_lock(mbuf_mlock); 1977 1978 for (;;) { 1979 if ((*list = slab_alloc(class, wait)) != NULL) { 1980 (*list)->obj_next = NULL; 1981 list = *plist = &(*list)->obj_next; 1982 1983 if (--need == 0) { 1984 /* 1985 * If the number of elements in freelist has 1986 * dropped below low watermark, asynchronously 1987 * populate the freelist now rather than doing 1988 * it later when we run out of elements. 1989 */ 1990 if (!mbuf_cached_above(class, wait) && 1991 m_infree(class) < m_total(class) >> 5) { 1992 (void) freelist_populate(class, 1, 1993 M_DONTWAIT); 1994 } 1995 break; 1996 } 1997 } else { 1998 VERIFY(m_infree(class) == 0 || class == MC_CL); 1999 2000 (void) freelist_populate(class, 1, 2001 (wait & MCR_NOSLEEP) ? M_DONTWAIT : M_WAIT); 2002 2003 if (m_infree(class) > 0) 2004 continue; 2005 2006 /* Check if there's anything at the cache layer */ 2007 if (mbuf_cached_above(class, wait)) 2008 break; 2009 2010 /* watchdog checkpoint */ 2011 mbuf_watchdog(); 2012 2013 /* We have nothing and cannot block; give up */ 2014 if (wait & MCR_NOSLEEP) { 2015 if (!(wait & MCR_TRYHARD)) { 2016 m_fail_cnt(class)++; 2017 mbstat.m_drops++; 2018 break; 2019 } 2020 } 2021 2022 /* 2023 * If the freelist is still empty and the caller is 2024 * willing to be blocked, sleep on the wait channel 2025 * until an element is available. Otherwise, if 2026 * MCR_TRYHARD is set, do our best to satisfy the 2027 * request without having to go to sleep. 2028 */ 2029 if (mbuf_worker_ready && 2030 mbuf_sleep(class, need, wait)) 2031 break; 2032 2033 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); 2034 } 2035 } 2036 2037 m_alloc_cnt(class) += num - need; 2038 lck_mtx_unlock(mbuf_mlock); 2039 2040 return (num - need); 2041} 2042 2043/* 2044 * Common de-allocator for rudimentary objects called by the CPU cache 2045 * layer when one or more elements need to be returned to the appropriate 2046 * global freelist. 2047 */ 2048static void 2049mbuf_slab_free(void *arg, mcache_obj_t *list, __unused int purged) 2050{ 2051 mbuf_class_t class = (mbuf_class_t)arg; 2052 mcache_obj_t *nlist; 2053 unsigned int num = 0; 2054 int w; 2055 2056 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class)); 2057 2058 lck_mtx_lock(mbuf_mlock); 2059 2060 for (;;) { 2061 nlist = list->obj_next; 2062 list->obj_next = NULL; 2063 slab_free(class, list); 2064 ++num; 2065 if ((list = nlist) == NULL) 2066 break; 2067 } 2068 m_free_cnt(class) += num; 2069 2070 if ((w = mb_waiters) > 0) 2071 mb_waiters = 0; 2072 2073 lck_mtx_unlock(mbuf_mlock); 2074 2075 if (w != 0) 2076 wakeup(mb_waitchan); 2077} 2078 2079/* 2080 * Common auditor for rudimentary objects called by the CPU cache layer 2081 * during an allocation or free request. For the former, this is called 2082 * after the objects are obtained from either the bucket or slab layer 2083 * and before they are returned to the caller. For the latter, this is 2084 * called immediately during free and before placing the objects into 2085 * the bucket or slab layer. 
2086 */ 2087static void 2088mbuf_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc) 2089{ 2090 mbuf_class_t class = (mbuf_class_t)arg; 2091 mcache_audit_t *mca; 2092 2093 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class)); 2094 2095 while (list != NULL) { 2096 lck_mtx_lock(mbuf_mlock); 2097 mca = mcl_audit_buf2mca(class, list); 2098 2099 /* Do the sanity checks */ 2100 if (class == MC_MBUF) { 2101 mcl_audit_mbuf(mca, list, FALSE, alloc); 2102 ASSERT(mca->mca_uflags & MB_SCVALID); 2103 } else { 2104 mcl_audit_cluster(mca, list, m_maxsize(class), 2105 alloc, TRUE); 2106 ASSERT(!(mca->mca_uflags & MB_SCVALID)); 2107 } 2108 /* Record this transaction */ 2109 if (mcltrace) 2110 mcache_buffer_log(mca, list, m_cache(class), &mb_start); 2111 2112 if (alloc) 2113 mca->mca_uflags |= MB_INUSE; 2114 else 2115 mca->mca_uflags &= ~MB_INUSE; 2116 /* Unpair the object (unconditionally) */ 2117 mca->mca_uptr = NULL; 2118 lck_mtx_unlock(mbuf_mlock); 2119 2120 list = list->obj_next; 2121 } 2122} 2123 2124/* 2125 * Common notify routine for all caches. It is called by mcache when 2126 * one or more objects get freed. We use this indication to trigger 2127 * the wakeup of any sleeping threads so that they can retry their 2128 * allocation requests. 2129 */ 2130static void 2131mbuf_slab_notify(void *arg, u_int32_t reason) 2132{ 2133 mbuf_class_t class = (mbuf_class_t)arg; 2134 int w; 2135 2136 ASSERT(MBUF_CLASS_VALID(class)); 2137 2138 if (reason != MCN_RETRYALLOC) 2139 return; 2140 2141 lck_mtx_lock(mbuf_mlock); 2142 if ((w = mb_waiters) > 0) { 2143 m_notified(class)++; 2144 mb_waiters = 0; 2145 } 2146 lck_mtx_unlock(mbuf_mlock); 2147 2148 if (w != 0) 2149 wakeup(mb_waitchan); 2150} 2151 2152/* 2153 * Obtain object(s) from the composite class's freelist. 2154 */ 2155static unsigned int 2156cslab_alloc(mbuf_class_t class, mcache_obj_t ***plist, unsigned int num) 2157{ 2158 unsigned int need = num; 2159 mcl_slab_t *sp, *clsp, *nsp; 2160 struct mbuf *m; 2161 mcache_obj_t **list = *plist; 2162 void *cl; 2163 2164 VERIFY(need > 0); 2165 VERIFY(class != MC_MBUF_16KCL || njcl > 0); 2166 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); 2167 2168 /* Get what we can from the freelist */ 2169 while ((*list = m_cobjlist(class)) != NULL) { 2170 MRANGE(*list); 2171 2172 m = (struct mbuf *)*list; 2173 sp = slab_get(m); 2174 cl = m->m_ext.ext_buf; 2175 clsp = slab_get(cl); 2176 VERIFY(m->m_flags == M_EXT && cl != NULL); 2177 VERIFY(MEXT_RFA(m) != NULL && MBUF_IS_COMPOSITE(m)); 2178 2179 if (class == MC_MBUF_CL) { 2180 VERIFY(clsp->sl_refcnt >= 1 && 2181 clsp->sl_refcnt <= NCLPBG); 2182 } else { 2183 VERIFY(clsp->sl_refcnt == 1); 2184 } 2185 2186 if (class == MC_MBUF_16KCL) { 2187 int k; 2188 for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) { 2189 nsp = nsp->sl_next; 2190 /* Next slab must already be present */ 2191 VERIFY(nsp != NULL); 2192 VERIFY(nsp->sl_refcnt == 1); 2193 } 2194 } 2195 2196 if ((m_cobjlist(class) = (*list)->obj_next) != NULL && 2197 !MBUF_IN_MAP(m_cobjlist(class))) { 2198 slab_nextptr_panic(sp, m_cobjlist(class)); 2199 /* NOTREACHED */ 2200 } 2201 (*list)->obj_next = NULL; 2202 list = *plist = &(*list)->obj_next; 2203 2204 if (--need == 0) 2205 break; 2206 } 2207 m_infree(class) -= (num - need); 2208 2209 return (num - need); 2210} 2211 2212/* 2213 * Place object(s) back into a composite class's freelist. 
2214 */ 2215static unsigned int 2216cslab_free(mbuf_class_t class, mcache_obj_t *list, int purged) 2217{ 2218 mcache_obj_t *o, *tail; 2219 unsigned int num = 0; 2220 struct mbuf *m, *ms; 2221 mcache_audit_t *mca = NULL; 2222 mcache_obj_t *ref_list = NULL; 2223 mcl_slab_t *clsp, *nsp; 2224 void *cl; 2225 mbuf_class_t cl_class; 2226 2227 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class)); 2228 VERIFY(class != MC_MBUF_16KCL || njcl > 0); 2229 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); 2230 2231 if (class == MC_MBUF_CL) { 2232 cl_class = MC_CL; 2233 } else if (class == MC_MBUF_BIGCL) { 2234 cl_class = MC_BIGCL; 2235 } else { 2236 VERIFY(class == MC_MBUF_16KCL); 2237 cl_class = MC_16KCL; 2238 } 2239 2240 o = tail = list; 2241 2242 while ((m = ms = (struct mbuf *)o) != NULL) { 2243 mcache_obj_t *rfa, *nexto = o->obj_next; 2244 2245 /* Do the mbuf sanity checks */ 2246 if (mclaudit != NULL) { 2247 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m); 2248 if (mclverify) { 2249 mcache_audit_free_verify(mca, m, 0, 2250 m_maxsize(MC_MBUF)); 2251 } 2252 ms = MCA_SAVED_MBUF_PTR(mca); 2253 } 2254 2255 /* Do the cluster sanity checks */ 2256 cl = ms->m_ext.ext_buf; 2257 clsp = slab_get(cl); 2258 if (mclverify) { 2259 size_t size = m_maxsize(cl_class); 2260 mcache_audit_free_verify(mcl_audit_buf2mca(cl_class, 2261 (mcache_obj_t *)cl), cl, 0, size); 2262 } 2263 VERIFY(ms->m_type == MT_FREE); 2264 VERIFY(ms->m_flags == M_EXT); 2265 VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms)); 2266 if (cl_class == MC_CL) { 2267 VERIFY(clsp->sl_refcnt >= 1 && 2268 clsp->sl_refcnt <= NCLPBG); 2269 } else { 2270 VERIFY(clsp->sl_refcnt == 1); 2271 } 2272 if (cl_class == MC_16KCL) { 2273 int k; 2274 for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) { 2275 nsp = nsp->sl_next; 2276 /* Next slab must already be present */ 2277 VERIFY(nsp != NULL); 2278 VERIFY(nsp->sl_refcnt == 1); 2279 } 2280 } 2281 2282 /* 2283 * If we're asked to purge, restore the actual mbuf using 2284 * contents of the shadow structure (if auditing is enabled) 2285 * and clear EXTF_COMPOSITE flag from the mbuf, as we are 2286 * about to free it and the attached cluster into their caches. 
2287 */ 2288 if (purged) { 2289 /* Restore constructed mbuf fields */ 2290 if (mclaudit != NULL) 2291 mcl_audit_restore_mbuf(m, mca, TRUE); 2292 2293 MEXT_REF(m) = 0; 2294 MEXT_FLAGS(m) = 0; 2295 2296 rfa = (mcache_obj_t *)(void *)MEXT_RFA(m); 2297 rfa->obj_next = ref_list; 2298 ref_list = rfa; 2299 MEXT_RFA(m) = NULL; 2300 2301 m->m_type = MT_FREE; 2302 m->m_flags = m->m_len = 0; 2303 m->m_next = m->m_nextpkt = NULL; 2304 2305 /* Save mbuf fields and make auditing happy */ 2306 if (mclaudit != NULL) 2307 mcl_audit_mbuf(mca, o, FALSE, FALSE); 2308 2309 VERIFY(m_total(class) > 0); 2310 m_total(class)--; 2311 2312 /* Free the mbuf */ 2313 o->obj_next = NULL; 2314 slab_free(MC_MBUF, o); 2315 2316 /* And free the cluster */ 2317 ((mcache_obj_t *)cl)->obj_next = NULL; 2318 if (class == MC_MBUF_CL) 2319 slab_free(MC_CL, cl); 2320 else if (class == MC_MBUF_BIGCL) 2321 slab_free(MC_BIGCL, cl); 2322 else 2323 slab_free(MC_16KCL, cl); 2324 } 2325 2326 ++num; 2327 tail = o; 2328 o = nexto; 2329 } 2330 2331 if (!purged) { 2332 tail->obj_next = m_cobjlist(class); 2333 m_cobjlist(class) = list; 2334 m_infree(class) += num; 2335 } else if (ref_list != NULL) { 2336 mcache_free_ext(ref_cache, ref_list); 2337 } 2338 2339 return (num); 2340} 2341 2342/* 2343 * Common allocator for composite objects called by the CPU cache layer 2344 * during an allocation request whenever there is no available element in 2345 * the bucket layer. It returns one or more composite elements from the 2346 * appropriate global freelist. If the freelist is empty, it will attempt 2347 * to obtain the rudimentary objects from their caches and construct them 2348 * into composite mbuf + cluster objects. 2349 */ 2350static unsigned int 2351mbuf_cslab_alloc(void *arg, mcache_obj_t ***plist, unsigned int needed, 2352 int wait) 2353{ 2354 mbuf_class_t class = (mbuf_class_t)arg; 2355 mbuf_class_t cl_class = 0; 2356 unsigned int num = 0, cnum = 0, want = needed; 2357 mcache_obj_t *ref_list = NULL; 2358 mcache_obj_t *mp_list = NULL; 2359 mcache_obj_t *clp_list = NULL; 2360 mcache_obj_t **list; 2361 struct ext_ref *rfa; 2362 struct mbuf *m; 2363 void *cl; 2364 2365 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class)); 2366 ASSERT(needed > 0); 2367 2368 VERIFY(class != MC_MBUF_16KCL || njcl > 0); 2369 2370 /* There should not be any slab for this class */ 2371 VERIFY(m_slab_cnt(class) == 0 && 2372 m_slablist(class).tqh_first == NULL && 2373 m_slablist(class).tqh_last == NULL); 2374 2375 lck_mtx_lock(mbuf_mlock); 2376 2377 /* Try using the freelist first */ 2378 num = cslab_alloc(class, plist, needed); 2379 list = *plist; 2380 if (num == needed) { 2381 m_alloc_cnt(class) += num; 2382 lck_mtx_unlock(mbuf_mlock); 2383 return (needed); 2384 } 2385 2386 lck_mtx_unlock(mbuf_mlock); 2387 2388 /* 2389 * We could not satisfy the request using the freelist alone; 2390 * allocate from the appropriate rudimentary caches and use 2391 * whatever we can get to construct the composite objects. 2392 */ 2393 needed -= num; 2394 2395 /* 2396 * Mark these allocation requests as coming from a composite cache. 2397 * Also, if the caller is willing to be blocked, mark the request 2398 * with MCR_FAILOK such that we don't end up sleeping at the mbuf 2399 * slab layer waiting for the individual object when one or more 2400 * of the already-constructed composite objects are available. 
2401 */
2402 wait |= MCR_COMP;
2403 if (!(wait & MCR_NOSLEEP))
2404 wait |= MCR_FAILOK;
2405
2406 /* allocate mbufs */
2407 needed = mcache_alloc_ext(m_cache(MC_MBUF), &mp_list, needed, wait);
2408 if (needed == 0) {
2409 ASSERT(mp_list == NULL);
2410 goto fail;
2411 }
2412
2413 /* allocate clusters */
2414 if (class == MC_MBUF_CL) {
2415 cl_class = MC_CL;
2416 } else if (class == MC_MBUF_BIGCL) {
2417 cl_class = MC_BIGCL;
2418 } else {
2419 VERIFY(class == MC_MBUF_16KCL);
2420 cl_class = MC_16KCL;
2421 }
2422 needed = mcache_alloc_ext(m_cache(cl_class), &clp_list, needed, wait);
2423 if (needed == 0) {
2424 ASSERT(clp_list == NULL);
2425 goto fail;
2426 }
2427
2428 needed = mcache_alloc_ext(ref_cache, &ref_list, needed, wait);
2429 if (needed == 0) {
2430 ASSERT(ref_list == NULL);
2431 goto fail;
2432 }
2433
2434 /*
2435 * By this time "needed" is MIN(mbuf, cluster, ref). Any
2436 * leftovers will get freed accordingly before we return to the caller.
2437 */
2438 for (cnum = 0; cnum < needed; cnum++) {
2439 struct mbuf *ms;
2440
2441 m = ms = (struct mbuf *)mp_list;
2442 mp_list = mp_list->obj_next;
2443
2444 cl = clp_list;
2445 clp_list = clp_list->obj_next;
2446 ((mcache_obj_t *)cl)->obj_next = NULL;
2447
2448 rfa = (struct ext_ref *)ref_list;
2449 ref_list = ref_list->obj_next;
2450 ((mcache_obj_t *)(void *)rfa)->obj_next = NULL;
2451
2452 /*
2453 * If auditing is enabled, construct the shadow mbuf
2454 * in the audit structure instead of in the actual one.
2455 * mbuf_cslab_audit() will take care of restoring the
2456 * contents after the integrity check.
2457 */
2458 if (mclaudit != NULL) {
2459 mcache_audit_t *mca, *cl_mca;
2460
2461 lck_mtx_lock(mbuf_mlock);
2462 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2463 ms = MCA_SAVED_MBUF_PTR(mca);
2464 cl_mca = mcl_audit_buf2mca(MC_CL, (mcache_obj_t *)cl);
2465
2466 /*
2467 * Pair them up. Note that this is done at the time
2468 * the mbuf+cluster objects are constructed. This
2469 * information should be treated as "best effort"
2470 * debugging hint since more than one mbuf can refer
2471 * to a cluster. In that case, the cluster might not
2472 * be freed along with the mbuf it was paired with.
2473 */
2474 mca->mca_uptr = cl_mca;
2475 cl_mca->mca_uptr = mca;
2476
2477 ASSERT(mca->mca_uflags & MB_SCVALID);
2478 ASSERT(!(cl_mca->mca_uflags & MB_SCVALID));
2479 lck_mtx_unlock(mbuf_mlock);
2480
2481 /* Technically, they are in the freelist */
2482 if (mclverify) {
2483 size_t size;
2484
2485 mcache_set_pattern(MCACHE_FREE_PATTERN, m,
2486 m_maxsize(MC_MBUF));
2487
2488 if (class == MC_MBUF_CL)
2489 size = m_maxsize(MC_CL);
2490 else if (class == MC_MBUF_BIGCL)
2491 size = m_maxsize(MC_BIGCL);
2492 else
2493 size = m_maxsize(MC_16KCL);
2494
2495 mcache_set_pattern(MCACHE_FREE_PATTERN, cl,
2496 size);
2497 }
2498 }
2499
2500 MBUF_INIT(ms, 0, MT_FREE);
2501 if (class == MC_MBUF_16KCL) {
2502 MBUF_16KCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2503 } else if (class == MC_MBUF_BIGCL) {
2504 MBUF_BIGCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2505 } else {
2506 MBUF_CL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2507 }
2508 VERIFY(ms->m_flags == M_EXT);
2509 VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2510
2511 *list = (mcache_obj_t *)m;
2512 (*list)->obj_next = NULL;
2513 list = *plist = &(*list)->obj_next;
2514 }
2515
2516fail:
2517 /*
2518 * Free up what's left of the above.
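 * Any mbufs, clusters or ref structures obtained above but not
 * consumed by the construction loop are returned to their
 * respective caches here.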
2519 */ 2520 if (mp_list != NULL) 2521 mcache_free_ext(m_cache(MC_MBUF), mp_list); 2522 if (clp_list != NULL) 2523 mcache_free_ext(m_cache(cl_class), clp_list); 2524 if (ref_list != NULL) 2525 mcache_free_ext(ref_cache, ref_list); 2526 2527 lck_mtx_lock(mbuf_mlock); 2528 if (num > 0 || cnum > 0) { 2529 m_total(class) += cnum; 2530 VERIFY(m_total(class) <= m_maxlimit(class)); 2531 m_alloc_cnt(class) += num + cnum; 2532 } 2533 if ((num + cnum) < want) 2534 m_fail_cnt(class) += (want - (num + cnum)); 2535 lck_mtx_unlock(mbuf_mlock); 2536 2537 return (num + cnum); 2538} 2539 2540/* 2541 * Common de-allocator for composite objects called by the CPU cache 2542 * layer when one or more elements need to be returned to the appropriate 2543 * global freelist. 2544 */ 2545static void 2546mbuf_cslab_free(void *arg, mcache_obj_t *list, int purged) 2547{ 2548 mbuf_class_t class = (mbuf_class_t)arg; 2549 unsigned int num; 2550 int w; 2551 2552 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class)); 2553 2554 lck_mtx_lock(mbuf_mlock); 2555 2556 num = cslab_free(class, list, purged); 2557 m_free_cnt(class) += num; 2558 2559 if ((w = mb_waiters) > 0) 2560 mb_waiters = 0; 2561 2562 lck_mtx_unlock(mbuf_mlock); 2563 2564 if (w != 0) 2565 wakeup(mb_waitchan); 2566} 2567 2568/* 2569 * Common auditor for composite objects called by the CPU cache layer 2570 * during an allocation or free request. For the former, this is called 2571 * after the objects are obtained from either the bucket or slab layer 2572 * and before they are returned to the caller. For the latter, this is 2573 * called immediately during free and before placing the objects into 2574 * the bucket or slab layer. 2575 */ 2576static void 2577mbuf_cslab_audit(void *arg, mcache_obj_t *list, boolean_t alloc) 2578{ 2579 mbuf_class_t class = (mbuf_class_t)arg; 2580 mcache_audit_t *mca; 2581 struct mbuf *m, *ms; 2582 mcl_slab_t *clsp, *nsp; 2583 size_t size; 2584 void *cl; 2585 2586 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class)); 2587 2588 while ((m = ms = (struct mbuf *)list) != NULL) { 2589 lck_mtx_lock(mbuf_mlock); 2590 /* Do the mbuf sanity checks and record its transaction */ 2591 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m); 2592 mcl_audit_mbuf(mca, m, TRUE, alloc); 2593 if (mcltrace) 2594 mcache_buffer_log(mca, m, m_cache(class), &mb_start); 2595 2596 if (alloc) 2597 mca->mca_uflags |= MB_COMP_INUSE; 2598 else 2599 mca->mca_uflags &= ~MB_COMP_INUSE; 2600 2601 /* 2602 * Use the shadow mbuf in the audit structure if we are 2603 * freeing, since the contents of the actual mbuf has been 2604 * pattern-filled by the above call to mcl_audit_mbuf(). 
2605 */ 2606 if (!alloc && mclverify) 2607 ms = MCA_SAVED_MBUF_PTR(mca); 2608 2609 /* Do the cluster sanity checks and record its transaction */ 2610 cl = ms->m_ext.ext_buf; 2611 clsp = slab_get(cl); 2612 VERIFY(ms->m_flags == M_EXT && cl != NULL); 2613 VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms)); 2614 if (class == MC_MBUF_CL) 2615 VERIFY(clsp->sl_refcnt >= 1 && 2616 clsp->sl_refcnt <= NCLPBG); 2617 else 2618 VERIFY(clsp->sl_refcnt == 1); 2619 2620 if (class == MC_MBUF_16KCL) { 2621 int k; 2622 for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) { 2623 nsp = nsp->sl_next; 2624 /* Next slab must already be present */ 2625 VERIFY(nsp != NULL); 2626 VERIFY(nsp->sl_refcnt == 1); 2627 } 2628 } 2629 2630 mca = mcl_audit_buf2mca(MC_CL, cl); 2631 if (class == MC_MBUF_CL) 2632 size = m_maxsize(MC_CL); 2633 else if (class == MC_MBUF_BIGCL) 2634 size = m_maxsize(MC_BIGCL); 2635 else 2636 size = m_maxsize(MC_16KCL); 2637 mcl_audit_cluster(mca, cl, size, alloc, FALSE); 2638 if (mcltrace) 2639 mcache_buffer_log(mca, cl, m_cache(class), &mb_start); 2640 2641 if (alloc) 2642 mca->mca_uflags |= MB_COMP_INUSE; 2643 else 2644 mca->mca_uflags &= ~MB_COMP_INUSE; 2645 lck_mtx_unlock(mbuf_mlock); 2646 2647 list = list->obj_next; 2648 } 2649} 2650 2651/* 2652 * Allocate some number of mbuf clusters and place on cluster freelist. 2653 */ 2654static int 2655m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize) 2656{ 2657 int i; 2658 vm_size_t size = 0; 2659 int numpages = 0, large_buffer = (bufsize == m_maxsize(MC_16KCL)); 2660 vm_offset_t page = 0; 2661 mcache_audit_t *mca_list = NULL; 2662 mcache_obj_t *con_list = NULL; 2663 mcl_slab_t *sp; 2664 2665 VERIFY(bufsize == m_maxsize(MC_BIGCL) || 2666 bufsize == m_maxsize(MC_16KCL)); 2667 2668 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); 2669 2670 /* 2671 * Multiple threads may attempt to populate the cluster map one 2672 * after another. Since we drop the lock below prior to acquiring 2673 * the physical page(s), our view of the cluster map may no longer 2674 * be accurate, and we could end up over-committing the pages beyond 2675 * the maximum allowed for each class. To prevent it, this entire 2676 * operation (including the page mapping) is serialized. 2677 */ 2678 while (mb_clalloc_busy) { 2679 mb_clalloc_waiters++; 2680 (void) msleep(mb_clalloc_waitchan, mbuf_mlock, 2681 (PZERO-1), "m_clalloc", NULL); 2682 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); 2683 } 2684 2685 /* We are busy now; tell everyone else to go away */ 2686 mb_clalloc_busy = TRUE; 2687 2688 /* 2689 * Honor the caller's wish to block or not block. We have a way 2690 * to grow the pool asynchronously using the mbuf worker thread. 2691 */ 2692 i = m_howmany(num, bufsize); 2693 if (i == 0 || (wait & M_DONTWAIT)) 2694 goto out; 2695 2696 lck_mtx_unlock(mbuf_mlock); 2697 2698 size = round_page(i * bufsize); 2699 page = kmem_mb_alloc(mb_map, size, large_buffer); 2700 2701 /* 2702 * If we did ask for "n" 16KB physically contiguous chunks 2703 * and didn't get them, then please try again without this 2704 * restriction. 
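 * (Contiguity is requested only on the first attempt for the 16KB
 * pool; it is a performance optimization, not a hard requirement.)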
2705 */ 2706 if (large_buffer && page == 0) 2707 page = kmem_mb_alloc(mb_map, size, 0); 2708 2709 if (page == 0) { 2710 if (bufsize == m_maxsize(MC_BIGCL)) { 2711 /* Try for 1 page if failed, only 4KB request */ 2712 size = NBPG; 2713 page = kmem_mb_alloc(mb_map, size, 0); 2714 } 2715 2716 if (page == 0) { 2717 lck_mtx_lock(mbuf_mlock); 2718 goto out; 2719 } 2720 } 2721 2722 VERIFY(IS_P2ALIGNED(page, NBPG)); 2723 numpages = size / NBPG; 2724 2725 /* If auditing is enabled, allocate the audit structures now */ 2726 if (mclaudit != NULL) { 2727 int needed; 2728 2729 /* 2730 * Yes, I realize this is a waste of memory for clusters 2731 * that never get transformed into mbufs, as we may end 2732 * up with NMBPBG-1 unused audit structures per cluster. 2733 * But doing so tremendously simplifies the allocation 2734 * strategy, since at this point we are not holding the 2735 * mbuf lock and the caller is okay to be blocked. 2736 */ 2737 if (bufsize == m_maxsize(MC_BIGCL)) { 2738 needed = numpages * NMBPBG; 2739 2740 i = mcache_alloc_ext(mcl_audit_con_cache, 2741 &con_list, needed, MCR_SLEEP); 2742 2743 VERIFY(con_list != NULL && i == needed); 2744 } else { 2745 needed = numpages / NSLABSP16KB; 2746 } 2747 2748 i = mcache_alloc_ext(mcache_audit_cache, 2749 (mcache_obj_t **)&mca_list, needed, MCR_SLEEP); 2750 2751 VERIFY(mca_list != NULL && i == needed); 2752 } 2753 2754 lck_mtx_lock(mbuf_mlock); 2755 2756 for (i = 0; i < numpages; i++, page += NBPG) { 2757 ppnum_t offset = ((char *)page - (char *)mbutl) / NBPG; 2758 ppnum_t new_page = pmap_find_phys(kernel_pmap, page); 2759 2760 /* 2761 * If there is a mapper the appropriate I/O page is returned; 2762 * zero out the page to discard its past contents to prevent 2763 * exposing leftover kernel memory. 2764 */ 2765 VERIFY(offset < mcl_pages); 2766 if (mcl_paddr_base != 0) { 2767 bzero((void *)(uintptr_t) page, page_size); 2768 new_page = IOMapperInsertPage(mcl_paddr_base, 2769 offset, new_page); 2770 } 2771 mcl_paddr[offset] = new_page; 2772 2773 /* Pattern-fill this fresh page */ 2774 if (mclverify) { 2775 mcache_set_pattern(MCACHE_FREE_PATTERN, 2776 (caddr_t)page, NBPG); 2777 } 2778 if (bufsize == m_maxsize(MC_BIGCL)) { 2779 union mbigcluster *mbc = (union mbigcluster *)page; 2780 2781 /* One for the entire page */ 2782 sp = slab_get(mbc); 2783 if (mclaudit != NULL) { 2784 mcl_audit_init(mbc, &mca_list, &con_list, 2785 AUDIT_CONTENTS_SIZE, NMBPBG); 2786 } 2787 VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0); 2788 slab_init(sp, MC_BIGCL, SLF_MAPPED, 2789 mbc, mbc, bufsize, 0, 1); 2790 2791 /* Insert this slab */ 2792 slab_insert(sp, MC_BIGCL); 2793 2794 /* Update stats now since slab_get() drops the lock */ 2795 mbstat.m_bigclfree = ++m_infree(MC_BIGCL) + 2796 m_infree(MC_MBUF_BIGCL); 2797 mbstat.m_bigclusters = ++m_total(MC_BIGCL); 2798 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL)); 2799 } else if ((i % NSLABSP16KB) == 0) { 2800 union m16kcluster *m16kcl = (union m16kcluster *)page; 2801 mcl_slab_t *nsp; 2802 int k; 2803 2804 VERIFY(njcl > 0); 2805 /* One for the entire 16KB */ 2806 sp = slab_get(m16kcl); 2807 if (mclaudit != NULL) 2808 mcl_audit_init(m16kcl, &mca_list, NULL, 0, 1); 2809 2810 VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0); 2811 slab_init(sp, MC_16KCL, SLF_MAPPED, 2812 m16kcl, m16kcl, bufsize, 0, 1); 2813 2814 /* 2815 * 2nd-Nth page's slab is part of the first one, 2816 * where N is NSLABSP16KB. 
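 * (NSLABSP16KB is the number of 4KB slabs spanned by a single 16KB
 * cluster; with 4KB slabs that is 16384 / 4096 == 4.)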
2817 */ 2818 for (k = 1; k < NSLABSP16KB; k++) { 2819 nsp = slab_get(((union mbigcluster *)page) + k); 2820 VERIFY(nsp->sl_refcnt == 0 && 2821 nsp->sl_flags == 0); 2822 slab_init(nsp, MC_16KCL, 2823 SLF_MAPPED | SLF_PARTIAL, 2824 m16kcl, NULL, 0, 0, 0); 2825 } 2826 2827 /* Insert this slab */ 2828 slab_insert(sp, MC_16KCL); 2829 2830 /* Update stats now since slab_get() drops the lock */ 2831 m_infree(MC_16KCL)++; 2832 m_total(MC_16KCL)++; 2833 VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL)); 2834 } 2835 } 2836 VERIFY(mca_list == NULL && con_list == NULL); 2837 2838 /* We're done; let others enter */ 2839 mb_clalloc_busy = FALSE; 2840 if (mb_clalloc_waiters > 0) { 2841 mb_clalloc_waiters = 0; 2842 wakeup(mb_clalloc_waitchan); 2843 } 2844 2845 if (bufsize == m_maxsize(MC_BIGCL)) 2846 return (numpages); 2847 2848 VERIFY(bufsize == m_maxsize(MC_16KCL)); 2849 return (numpages / NSLABSP16KB); 2850 2851out: 2852 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); 2853 2854 /* We're done; let others enter */ 2855 mb_clalloc_busy = FALSE; 2856 if (mb_clalloc_waiters > 0) { 2857 mb_clalloc_waiters = 0; 2858 wakeup(mb_clalloc_waitchan); 2859 } 2860 2861 /* 2862 * When non-blocking we kick a thread if we have to grow the 2863 * pool or if the number of free clusters is less than requested. 2864 */ 2865 if (bufsize == m_maxsize(MC_BIGCL)) { 2866 if (i > 0) { 2867 /* 2868 * Remember total number of 4KB clusters needed 2869 * at this time. 2870 */ 2871 i += m_total(MC_BIGCL); 2872 if (i > mbuf_expand_big) { 2873 mbuf_expand_big = i; 2874 if (mbuf_worker_ready) 2875 wakeup((caddr_t)&mbuf_worker_run); 2876 } 2877 } 2878 2879 if (m_infree(MC_BIGCL) >= num) 2880 return (1); 2881 } else { 2882 if (i > 0) { 2883 /* 2884 * Remember total number of 16KB clusters needed 2885 * at this time. 2886 */ 2887 i += m_total(MC_16KCL); 2888 if (i > mbuf_expand_16k) { 2889 mbuf_expand_16k = i; 2890 if (mbuf_worker_ready) 2891 wakeup((caddr_t)&mbuf_worker_run); 2892 } 2893 } 2894 2895 if (m_infree(MC_16KCL) >= num) 2896 return (1); 2897 } 2898 return (0); 2899} 2900 2901/* 2902 * Populate the global freelist of the corresponding buffer class. 2903 */ 2904static int 2905freelist_populate(mbuf_class_t class, unsigned int num, int wait) 2906{ 2907 mcache_obj_t *o = NULL; 2908 int i, numpages = 0, count; 2909 2910 VERIFY(class == MC_MBUF || class == MC_CL || class == MC_BIGCL || 2911 class == MC_16KCL); 2912 2913 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); 2914 2915 switch (class) { 2916 case MC_MBUF: 2917 case MC_CL: 2918 case MC_BIGCL: 2919 numpages = (num * m_size(class) + NBPG - 1) / NBPG; 2920 i = m_clalloc(numpages, wait, m_maxsize(MC_BIGCL)); 2921 2922 /* Respect the 4KB clusters minimum limit */ 2923 if (m_total(MC_BIGCL) == m_maxlimit(MC_BIGCL) && 2924 m_infree(MC_BIGCL) <= m_minlimit(MC_BIGCL)) { 2925 if (class != MC_BIGCL || (wait & MCR_COMP)) 2926 return (0); 2927 } 2928 if (class == MC_BIGCL) 2929 return (i != 0); 2930 break; 2931 2932 case MC_16KCL: 2933 return (m_clalloc(num, wait, m_maxsize(class)) != 0); 2934 /* NOTREACHED */ 2935 2936 default: 2937 VERIFY(0); 2938 /* NOTREACHED */ 2939 } 2940 2941 VERIFY(class == MC_MBUF || class == MC_CL); 2942 2943 /* how many objects will we cut the page into? */ 2944 int numobj = (class == MC_MBUF ? 
NMBPBG : NCLPBG);
2945
2946 for (count = 0; count < numpages; count++) {
2947
2948 /* respect totals, minlimit, maxlimit */
2949 if (m_total(MC_BIGCL) <= m_minlimit(MC_BIGCL) ||
2950 m_total(class) >= m_maxlimit(class))
2951 break;
2952
2953 if ((o = slab_alloc(MC_BIGCL, wait)) == NULL)
2954 break;
2955
2956 struct mbuf *m = (struct mbuf *)o;
2957 union mcluster *c = (union mcluster *)o;
2958 mcl_slab_t *sp = slab_get(o);
2959 mcache_audit_t *mca = NULL;
2960
2961 VERIFY(slab_is_detached(sp) &&
2962 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
2963
2964 /*
2965 * Make sure that the cluster is unmolested
2966 * while in freelist
2967 */
2968 if (mclverify) {
2969 mca = mcl_audit_buf2mca(MC_BIGCL, o);
2970 mcache_audit_free_verify(mca, o, 0,
2971 m_maxsize(MC_BIGCL));
2972 }
2973
2974 /* Reinitialize it as an mbuf or 2K slab */
2975 slab_init(sp, class, sp->sl_flags,
2976 sp->sl_base, NULL, sp->sl_len, 0, numobj);
2977
2978 VERIFY(o == (mcache_obj_t *)sp->sl_base);
2979 VERIFY(sp->sl_head == NULL);
2980
2981 VERIFY(m_total(MC_BIGCL) > 0);
2982 m_total(MC_BIGCL)--;
2983 mbstat.m_bigclusters = m_total(MC_BIGCL);
2984
2985 m_total(class) += numobj;
2986 m_infree(class) += numobj;
2987
2988 VERIFY(m_total(MC_BIGCL) >= m_minlimit(MC_BIGCL));
2989 VERIFY(m_total(class) <= m_maxlimit(class));
2990
2991 i = numobj;
2992 if (class == MC_MBUF) {
2993 mbstat.m_mbufs = m_total(MC_MBUF);
2994 mtype_stat_add(MT_FREE, NMBPBG);
2995 while (i--) {
2996 /*
2997 * If auditing is enabled, construct the
2998 * shadow mbuf in the audit structure
2999 * instead of the actual one.
3000 * mbuf_slab_audit() will take care of
3001 * restoring the contents after the
3002 * integrity check.
3003 */
3004 if (mclaudit != NULL) {
3005 struct mbuf *ms;
3006 mca = mcl_audit_buf2mca(MC_MBUF,
3007 (mcache_obj_t *)m);
3008 ms = MCA_SAVED_MBUF_PTR(mca);
3009 ms->m_type = MT_FREE;
3010 } else {
3011 m->m_type = MT_FREE;
3012 }
3013 m->m_next = sp->sl_head;
3014 sp->sl_head = (void *)m++;
3015 }
3016 } else { /* MC_CL */
3017 mbstat.m_clfree =
3018 m_infree(MC_CL) + m_infree(MC_MBUF_CL);
3019 mbstat.m_clusters = m_total(MC_CL);
3020 while (i--) {
3021 c->mcl_next = sp->sl_head;
3022 sp->sl_head = (void *)c++;
3023 }
3024 }
3025
3026 /* Insert into the mbuf or 2k slab list */
3027 slab_insert(sp, class);
3028
3029 if ((i = mb_waiters) > 0)
3030 mb_waiters = 0;
3031 if (i != 0)
3032 wakeup(mb_waitchan);
3033 }
3034 return (count != 0);
3035}
3036
3037/*
3038 * For each class, initialize the freelist to hold m_minlimit() objects.
3039 */
3040static void
3041freelist_init(mbuf_class_t class)
3042{
3043 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3044
3045 VERIFY(class == MC_CL || class == MC_BIGCL);
3046 VERIFY(m_total(class) == 0);
3047 VERIFY(m_minlimit(class) > 0);
3048
3049 while (m_total(class) < m_minlimit(class))
3050 (void) freelist_populate(class, m_minlimit(class), M_WAIT);
3051
3052 VERIFY(m_total(class) >= m_minlimit(class));
3053}
3054
3055/*
3056 * (Inaccurately) check if it might be worth a trip back to the
3057 * mcache layer due to the availability of objects there. We'll
3058 * end up back here if there's nothing up there.
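 * The answer is inherently racy, since buckets may drain or refill
 * between the time we peek and the time the caller acts on the
 * result; a wrong guess simply costs another trip through here.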
3059 */ 3060static boolean_t 3061mbuf_cached_above(mbuf_class_t class, int wait) 3062{ 3063 switch (class) { 3064 case MC_MBUF: 3065 if (wait & MCR_COMP) 3066 return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL)) || 3067 !mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL))); 3068 break; 3069 3070 case MC_CL: 3071 if (wait & MCR_COMP) 3072 return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL))); 3073 break; 3074 3075 case MC_BIGCL: 3076 if (wait & MCR_COMP) 3077 return (!mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL))); 3078 break; 3079 3080 case MC_16KCL: 3081 if (wait & MCR_COMP) 3082 return (!mcache_bkt_isempty(m_cache(MC_MBUF_16KCL))); 3083 break; 3084 3085 case MC_MBUF_CL: 3086 case MC_MBUF_BIGCL: 3087 case MC_MBUF_16KCL: 3088 break; 3089 3090 default: 3091 VERIFY(0); 3092 /* NOTREACHED */ 3093 } 3094 3095 return (!mcache_bkt_isempty(m_cache(class))); 3096} 3097 3098/* 3099 * If possible, convert constructed objects to raw ones. 3100 */ 3101static boolean_t 3102mbuf_steal(mbuf_class_t class, unsigned int num) 3103{ 3104 mcache_obj_t *top = NULL; 3105 mcache_obj_t **list = ⊤ 3106 unsigned int tot = 0; 3107 3108 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); 3109 3110 switch (class) { 3111 case MC_MBUF: 3112 case MC_CL: 3113 case MC_BIGCL: 3114 case MC_16KCL: 3115 return (FALSE); 3116 3117 case MC_MBUF_CL: 3118 case MC_MBUF_BIGCL: 3119 case MC_MBUF_16KCL: 3120 /* Get the required number of constructed objects if possible */ 3121 if (m_infree(class) > m_minlimit(class)) { 3122 tot = cslab_alloc(class, &list, 3123 MIN(num, m_infree(class))); 3124 } 3125 3126 /* And destroy them to get back the raw objects */ 3127 if (top != NULL) 3128 (void) cslab_free(class, top, 1); 3129 break; 3130 3131 default: 3132 VERIFY(0); 3133 /* NOTREACHED */ 3134 } 3135 3136 return (tot == num); 3137} 3138 3139static void 3140m_reclaim(mbuf_class_t class, unsigned int num, boolean_t comp) 3141{ 3142 int m, bmap = 0; 3143 3144 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); 3145 3146 VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL)); 3147 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL)); 3148 VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL)); 3149 3150 /* 3151 * This logic can be made smarter; for now, simply mark 3152 * all other related classes as potential victims. 3153 */ 3154 switch (class) { 3155 case MC_MBUF: 3156 m_wantpurge(MC_CL)++; 3157 m_wantpurge(MC_BIGCL)++; 3158 m_wantpurge(MC_MBUF_CL)++; 3159 m_wantpurge(MC_MBUF_BIGCL)++; 3160 break; 3161 3162 case MC_CL: 3163 m_wantpurge(MC_MBUF)++; 3164 m_wantpurge(MC_BIGCL)++; 3165 m_wantpurge(MC_MBUF_BIGCL)++; 3166 if (!comp) 3167 m_wantpurge(MC_MBUF_CL)++; 3168 break; 3169 3170 case MC_BIGCL: 3171 m_wantpurge(MC_MBUF)++; 3172 m_wantpurge(MC_CL)++; 3173 m_wantpurge(MC_MBUF_CL)++; 3174 if (!comp) 3175 m_wantpurge(MC_MBUF_BIGCL)++; 3176 break; 3177 3178 case MC_16KCL: 3179 if (!comp) 3180 m_wantpurge(MC_MBUF_16KCL)++; 3181 break; 3182 3183 default: 3184 VERIFY(0); 3185 /* NOTREACHED */ 3186 } 3187 3188 /* 3189 * Run through each marked class and check if we really need to 3190 * purge (and therefore temporarily disable) the per-CPU caches 3191 * layer used by the class. If so, remember the classes since 3192 * we are going to drop the lock below prior to purging. 3193 */ 3194 for (m = 0; m < NELEM(mbuf_table); m++) { 3195 if (m_wantpurge(m) > 0) { 3196 m_wantpurge(m) = 0; 3197 /* 3198 * Try hard to steal the required number of objects 3199 * from the freelist of other mbuf classes. 
Only 3200 * purge and disable the per-CPU caches layer when 3201 * we don't have enough; it's the last resort. 3202 */ 3203 if (!mbuf_steal(m, num)) 3204 bmap |= (1 << m); 3205 } 3206 } 3207 3208 lck_mtx_unlock(mbuf_mlock); 3209 3210 if (bmap != 0) { 3211 /* signal the domains to drain */ 3212 net_drain_domains(); 3213 3214 /* Sigh; we have no other choices but to ask mcache to purge */ 3215 for (m = 0; m < NELEM(mbuf_table); m++) { 3216 if ((bmap & (1 << m)) && 3217 mcache_purge_cache(m_cache(m))) { 3218 lck_mtx_lock(mbuf_mlock); 3219 m_purge_cnt(m)++; 3220 mbstat.m_drain++; 3221 lck_mtx_unlock(mbuf_mlock); 3222 } 3223 } 3224 } else { 3225 /* 3226 * Request mcache to reap extra elements from all of its caches; 3227 * note that all reaps are serialized and happen only at a fixed 3228 * interval. 3229 */ 3230 mcache_reap(); 3231 } 3232 lck_mtx_lock(mbuf_mlock); 3233} 3234 3235static inline struct mbuf * 3236m_get_common(int wait, short type, int hdr) 3237{ 3238 struct mbuf *m; 3239 int mcflags = MSLEEPF(wait); 3240 3241 /* Is this due to a non-blocking retry? If so, then try harder */ 3242 if (mcflags & MCR_NOSLEEP) 3243 mcflags |= MCR_TRYHARD; 3244 3245 m = mcache_alloc(m_cache(MC_MBUF), mcflags); 3246 if (m != NULL) { 3247 MBUF_INIT(m, hdr, type); 3248 mtype_stat_inc(type); 3249 mtype_stat_dec(MT_FREE); 3250#if CONFIG_MACF_NET 3251 if (hdr && mac_init_mbuf(m, wait) != 0) { 3252 m_free(m); 3253 return (NULL); 3254 } 3255#endif /* MAC_NET */ 3256 } 3257 return (m); 3258} 3259 3260/* 3261 * Space allocation routines; these are also available as macros 3262 * for critical paths. 3263 */ 3264#define _M_GET(wait, type) m_get_common(wait, type, 0) 3265#define _M_GETHDR(wait, type) m_get_common(wait, type, 1) 3266#define _M_RETRY(wait, type) _M_GET(wait, type) 3267#define _M_RETRYHDR(wait, type) _M_GETHDR(wait, type) 3268#define _MGET(m, how, type) ((m) = _M_GET(how, type)) 3269#define _MGETHDR(m, how, type) ((m) = _M_GETHDR(how, type)) 3270 3271struct mbuf * 3272m_get(int wait, int type) 3273{ 3274 return (_M_GET(wait, type)); 3275} 3276 3277struct mbuf * 3278m_gethdr(int wait, int type) 3279{ 3280 return (_M_GETHDR(wait, type)); 3281} 3282 3283struct mbuf * 3284m_retry(int wait, int type) 3285{ 3286 return (_M_RETRY(wait, type)); 3287} 3288 3289struct mbuf * 3290m_retryhdr(int wait, int type) 3291{ 3292 return (_M_RETRYHDR(wait, type)); 3293} 3294 3295struct mbuf * 3296m_getclr(int wait, int type) 3297{ 3298 struct mbuf *m; 3299 3300 _MGET(m, wait, type); 3301 if (m != NULL) 3302 bzero(MTOD(m, caddr_t), MLEN); 3303 return (m); 3304} 3305 3306struct mbuf * 3307m_free(struct mbuf *m) 3308{ 3309 struct mbuf *n = m->m_next; 3310 3311 if (m->m_type == MT_FREE) 3312 panic("m_free: freeing an already freed mbuf"); 3313 3314 if (m->m_flags & M_PKTHDR) { 3315 /* Check for scratch area overflow */ 3316 m_redzone_verify(m); 3317 /* Free the aux data and tags if there is any */ 3318 m_tag_delete_chain(m, NULL); 3319 } 3320 3321 if (m->m_flags & M_EXT) { 3322 u_int32_t refcnt; 3323 u_int32_t composite; 3324 3325 refcnt = m_decref(m); 3326 composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE); 3327 if (refcnt == 0 && !composite) { 3328 if (m->m_ext.ext_free == NULL) { 3329 mcache_free(m_cache(MC_CL), m->m_ext.ext_buf); 3330 } else if (m->m_ext.ext_free == m_bigfree) { 3331 mcache_free(m_cache(MC_BIGCL), 3332 m->m_ext.ext_buf); 3333 } else if (m->m_ext.ext_free == m_16kfree) { 3334 mcache_free(m_cache(MC_16KCL), 3335 m->m_ext.ext_buf); 3336 } else { 3337 (*(m->m_ext.ext_free))(m->m_ext.ext_buf, 3338 m->m_ext.ext_size, 
m->m_ext.ext_arg); 3339 } 3340 mcache_free(ref_cache, MEXT_RFA(m)); 3341 MEXT_RFA(m) = NULL; 3342 } else if (refcnt == 0 && composite) { 3343 VERIFY(m->m_type != MT_FREE); 3344 3345 mtype_stat_dec(m->m_type); 3346 mtype_stat_inc(MT_FREE); 3347 3348 m->m_type = MT_FREE; 3349 m->m_flags = M_EXT; 3350 m->m_len = 0; 3351 m->m_next = m->m_nextpkt = NULL; 3352 3353 MEXT_FLAGS(m) &= ~EXTF_READONLY; 3354 3355 /* "Free" into the intermediate cache */ 3356 if (m->m_ext.ext_free == NULL) { 3357 mcache_free(m_cache(MC_MBUF_CL), m); 3358 } else if (m->m_ext.ext_free == m_bigfree) { 3359 mcache_free(m_cache(MC_MBUF_BIGCL), m); 3360 } else { 3361 VERIFY(m->m_ext.ext_free == m_16kfree); 3362 mcache_free(m_cache(MC_MBUF_16KCL), m); 3363 } 3364 return (n); 3365 } 3366 } 3367 3368 if (m->m_type != MT_FREE) { 3369 mtype_stat_dec(m->m_type); 3370 mtype_stat_inc(MT_FREE); 3371 } 3372 3373 m->m_type = MT_FREE; 3374 m->m_flags = m->m_len = 0; 3375 m->m_next = m->m_nextpkt = NULL; 3376 3377 mcache_free(m_cache(MC_MBUF), m); 3378 3379 return (n); 3380} 3381 3382__private_extern__ struct mbuf * 3383m_clattach(struct mbuf *m, int type, caddr_t extbuf, 3384 void (*extfree)(caddr_t, u_int, caddr_t), u_int extsize, caddr_t extarg, 3385 int wait) 3386{ 3387 struct ext_ref *rfa = NULL; 3388 3389 if (m == NULL && (m = _M_GETHDR(wait, type)) == NULL) 3390 return (NULL); 3391 3392 if (m->m_flags & M_EXT) { 3393 u_int32_t refcnt; 3394 u_int32_t composite; 3395 3396 refcnt = m_decref(m); 3397 composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE); 3398 if (refcnt == 0 && !composite) { 3399 if (m->m_ext.ext_free == NULL) { 3400 mcache_free(m_cache(MC_CL), m->m_ext.ext_buf); 3401 } else if (m->m_ext.ext_free == m_bigfree) { 3402 mcache_free(m_cache(MC_BIGCL), 3403 m->m_ext.ext_buf); 3404 } else if (m->m_ext.ext_free == m_16kfree) { 3405 mcache_free(m_cache(MC_16KCL), 3406 m->m_ext.ext_buf); 3407 } else { 3408 (*(m->m_ext.ext_free))(m->m_ext.ext_buf, 3409 m->m_ext.ext_size, m->m_ext.ext_arg); 3410 } 3411 /* Re-use the reference structure */ 3412 rfa = MEXT_RFA(m); 3413 } else if (refcnt == 0 && composite) { 3414 VERIFY(m->m_type != MT_FREE); 3415 3416 mtype_stat_dec(m->m_type); 3417 mtype_stat_inc(MT_FREE); 3418 3419 m->m_type = MT_FREE; 3420 m->m_flags = M_EXT; 3421 m->m_len = 0; 3422 m->m_next = m->m_nextpkt = NULL; 3423 3424 MEXT_FLAGS(m) &= ~EXTF_READONLY; 3425 3426 /* "Free" into the intermediate cache */ 3427 if (m->m_ext.ext_free == NULL) { 3428 mcache_free(m_cache(MC_MBUF_CL), m); 3429 } else if (m->m_ext.ext_free == m_bigfree) { 3430 mcache_free(m_cache(MC_MBUF_BIGCL), m); 3431 } else { 3432 VERIFY(m->m_ext.ext_free == m_16kfree); 3433 mcache_free(m_cache(MC_MBUF_16KCL), m); 3434 } 3435 /* 3436 * Allocate a new mbuf, since we didn't divorce 3437 * the composite mbuf + cluster pair above. 3438 */ 3439 if ((m = _M_GETHDR(wait, type)) == NULL) 3440 return (NULL); 3441 } 3442 } 3443 3444 if (rfa == NULL && 3445 (rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) { 3446 m_free(m); 3447 return (NULL); 3448 } 3449 3450 MEXT_INIT(m, extbuf, extsize, extfree, extarg, rfa, 1, 0); 3451 3452 return (m); 3453} 3454 3455/* 3456 * Perform `fast' allocation mbuf clusters from a cache of recently-freed 3457 * clusters. (If the cache is empty, new clusters are allocated en-masse.) 3458 */ 3459struct mbuf * 3460m_getcl(int wait, int type, int flags) 3461{ 3462 struct mbuf *m; 3463 int mcflags = MSLEEPF(wait); 3464 int hdr = (flags & M_PKTHDR); 3465 3466 /* Is this due to a non-blocking retry? 
If so, then try harder */
3467 if (mcflags & MCR_NOSLEEP)
3468 mcflags |= MCR_TRYHARD;
3469
3470 m = mcache_alloc(m_cache(MC_MBUF_CL), mcflags);
3471 if (m != NULL) {
3472 u_int32_t flag;
3473 struct ext_ref *rfa;
3474 void *cl;
3475
3476 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
3477 cl = m->m_ext.ext_buf;
3478 rfa = MEXT_RFA(m);
3479
3480 ASSERT(cl != NULL && rfa != NULL);
3481 VERIFY(MBUF_IS_COMPOSITE(m) && m->m_ext.ext_free == NULL);
3482
3483 flag = MEXT_FLAGS(m);
3484
3485 MBUF_INIT(m, hdr, type);
3486 MBUF_CL_INIT(m, cl, rfa, 1, flag);
3487
3488 mtype_stat_inc(type);
3489 mtype_stat_dec(MT_FREE);
3490#if CONFIG_MACF_NET
3491 if (hdr && mac_init_mbuf(m, wait) != 0) {
3492 m_freem(m);
3493 return (NULL);
3494 }
3495#endif /* MAC_NET */
3496 }
3497 return (m);
3498}
3499
3500/* m_mclget() adds an mbuf cluster to a normal mbuf */
3501struct mbuf *
3502m_mclget(struct mbuf *m, int wait)
3503{
3504 struct ext_ref *rfa;
3505
3506 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3507 return (m);
3508
3509 m->m_ext.ext_buf = m_mclalloc(wait);
3510 if (m->m_ext.ext_buf != NULL) {
3511 MBUF_CL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3512 } else {
3513 mcache_free(ref_cache, rfa);
3514 }
3515 return (m);
3516}
3517
3518/* Allocate an mbuf cluster */
3519caddr_t
3520m_mclalloc(int wait)
3521{
3522 int mcflags = MSLEEPF(wait);
3523
3524 /* Is this due to a non-blocking retry? If so, then try harder */
3525 if (mcflags & MCR_NOSLEEP)
3526 mcflags |= MCR_TRYHARD;
3527
3528 return (mcache_alloc(m_cache(MC_CL), mcflags));
3529}
3530
3531/* Free an mbuf cluster */
3532void
3533m_mclfree(caddr_t p)
3534{
3535 mcache_free(m_cache(MC_CL), p);
3536}
3537
3538/*
3539 * m_mclhasreference() checks if a cluster of an mbuf is referenced by
3540 * another mbuf; see comments in m_incref() regarding EXTF_READONLY.
3541 */
3542int
3543m_mclhasreference(struct mbuf *m)
3544{
3545 if (!(m->m_flags & M_EXT))
3546 return (0);
3547
3548 ASSERT(MEXT_RFA(m) != NULL);
3549
3550 return ((MEXT_FLAGS(m) & EXTF_READONLY) ? 1 : 0);
3551}
3552
3553__private_extern__ caddr_t
3554m_bigalloc(int wait)
3555{
3556 int mcflags = MSLEEPF(wait);
3557
3558 /* Is this due to a non-blocking retry? If so, then try harder */
3559 if (mcflags & MCR_NOSLEEP)
3560 mcflags |= MCR_TRYHARD;
3561
3562 return (mcache_alloc(m_cache(MC_BIGCL), mcflags));
3563}
3564
3565__private_extern__ void
3566m_bigfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
3567{
3568 mcache_free(m_cache(MC_BIGCL), p);
3569}
3570
3571/* m_mbigget() adds a 4KB mbuf cluster to a normal mbuf */
3572__private_extern__ struct mbuf *
3573m_mbigget(struct mbuf *m, int wait)
3574{
3575 struct ext_ref *rfa;
3576
3577 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3578 return (m);
3579
3580 m->m_ext.ext_buf = m_bigalloc(wait);
3581 if (m->m_ext.ext_buf != NULL) {
3582 MBUF_BIGCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3583 } else {
3584 mcache_free(ref_cache, rfa);
3585 }
3586 return (m);
3587}
3588
3589__private_extern__ caddr_t
3590m_16kalloc(int wait)
3591{
3592 int mcflags = MSLEEPF(wait);
3593
3594 /* Is this due to a non-blocking retry?
If so, then try harder */ 3595 if (mcflags & MCR_NOSLEEP) 3596 mcflags |= MCR_TRYHARD; 3597 3598 return (mcache_alloc(m_cache(MC_16KCL), mcflags)); 3599} 3600 3601__private_extern__ void 3602m_16kfree(caddr_t p, __unused u_int size, __unused caddr_t arg) 3603{ 3604 mcache_free(m_cache(MC_16KCL), p); 3605} 3606 3607/* m_m16kget() add a 16KB mbuf cluster to a normal mbuf */ 3608__private_extern__ struct mbuf * 3609m_m16kget(struct mbuf *m, int wait) 3610{ 3611 struct ext_ref *rfa; 3612 3613 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) 3614 return (m); 3615 3616 m->m_ext.ext_buf = m_16kalloc(wait); 3617 if (m->m_ext.ext_buf != NULL) { 3618 MBUF_16KCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0); 3619 } else { 3620 mcache_free(ref_cache, rfa); 3621 } 3622 return (m); 3623} 3624 3625/* 3626 * "Move" mbuf pkthdr from "from" to "to". 3627 * "from" must have M_PKTHDR set, and "to" must be empty. 3628 */ 3629void 3630m_copy_pkthdr(struct mbuf *to, struct mbuf *from) 3631{ 3632 VERIFY(from->m_flags & M_PKTHDR); 3633 3634 /* Check for scratch area overflow */ 3635 m_redzone_verify(from); 3636 3637 if (to->m_flags & M_PKTHDR) { 3638 /* Check for scratch area overflow */ 3639 m_redzone_verify(to); 3640 /* We will be taking over the tags of 'to' */ 3641 m_tag_delete_chain(to, NULL); 3642 } 3643 to->m_pkthdr = from->m_pkthdr; /* especially tags */ 3644 m_classifier_init(from, 0); /* purge classifier info */ 3645 m_tag_init(from, 1); /* purge all tags from src */ 3646 m_scratch_init(from); /* clear src scratch area */ 3647 to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT); 3648 if ((to->m_flags & M_EXT) == 0) 3649 to->m_data = to->m_pktdat; 3650 m_redzone_init(to); /* setup red zone on dst */ 3651} 3652 3653/* 3654 * Duplicate "from"'s mbuf pkthdr in "to". 3655 * "from" must have M_PKTHDR set, and "to" must be empty. 3656 * In particular, this does a deep copy of the packet tags. 
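 *
 * An illustrative sketch (not part of the original source) of the
 * calling convention, as in m_defrag_offset() below; a zero return
 * means the tag copy failed and the new mbuf should be discarded:
 *
 *	struct mbuf *n = m_gethdr(M_DONTWAIT, MT_DATA);
 *
 *	if (n != NULL && m_dup_pkthdr(n, m, M_DONTWAIT) == 0) {
 *		m_free(n);
 *		n = NULL;
 *	}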
3657 */ 3658static int 3659m_dup_pkthdr(struct mbuf *to, struct mbuf *from, int how) 3660{ 3661 VERIFY(from->m_flags & M_PKTHDR); 3662 3663 /* Check for scratch area overflow */ 3664 m_redzone_verify(from); 3665 3666 if (to->m_flags & M_PKTHDR) { 3667 /* Check for scratch area overflow */ 3668 m_redzone_verify(to); 3669 /* We will be taking over the tags of 'to' */ 3670 m_tag_delete_chain(to, NULL); 3671 } 3672 to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT); 3673 if ((to->m_flags & M_EXT) == 0) 3674 to->m_data = to->m_pktdat; 3675 to->m_pkthdr = from->m_pkthdr; 3676 m_redzone_init(to); /* setup red zone on dst */ 3677 m_tag_init(to, 0); /* preserve dst static tags */ 3678 return (m_tag_copy_chain(to, from, how)); 3679} 3680 3681void 3682m_copy_pftag(struct mbuf *to, struct mbuf *from) 3683{ 3684 to->m_pkthdr.pf_mtag = from->m_pkthdr.pf_mtag; 3685#if PF_ECN 3686 to->m_pkthdr.pf_mtag.pftag_hdr = NULL; 3687 to->m_pkthdr.pf_mtag.pftag_flags &= ~(PF_TAG_HDR_INET|PF_TAG_HDR_INET6); 3688#endif /* PF_ECN */ 3689} 3690 3691void 3692m_classifier_init(struct mbuf *m, uint32_t pktf_mask) 3693{ 3694 VERIFY(m->m_flags & M_PKTHDR); 3695 3696 m->m_pkthdr.pkt_proto = 0; 3697 m->m_pkthdr.pkt_flowsrc = 0; 3698 m->m_pkthdr.pkt_flowid = 0; 3699 m->m_pkthdr.pkt_flags &= pktf_mask; /* caller-defined mask */ 3700 /* preserve service class and interface info for loopback packets */ 3701 if (!(m->m_pkthdr.pkt_flags & PKTF_LOOP)) 3702 (void) m_set_service_class(m, MBUF_SC_BE); 3703 if (!(m->m_pkthdr.pkt_flags & PKTF_IFAINFO)) 3704 m->m_pkthdr.pkt_ifainfo = 0; 3705#if MEASURE_BW 3706 m->m_pkthdr.pkt_bwseq = 0; 3707#endif /* MEASURE_BW */ 3708} 3709 3710void 3711m_copy_classifier(struct mbuf *to, struct mbuf *from) 3712{ 3713 VERIFY(to->m_flags & M_PKTHDR); 3714 VERIFY(from->m_flags & M_PKTHDR); 3715 3716 to->m_pkthdr.pkt_proto = from->m_pkthdr.pkt_proto; 3717 to->m_pkthdr.pkt_flowsrc = from->m_pkthdr.pkt_flowsrc; 3718 to->m_pkthdr.pkt_flowid = from->m_pkthdr.pkt_flowid; 3719 to->m_pkthdr.pkt_flags = from->m_pkthdr.pkt_flags; 3720 (void) m_set_service_class(to, from->m_pkthdr.pkt_svc); 3721 to->m_pkthdr.pkt_ifainfo = from->m_pkthdr.pkt_ifainfo; 3722 to->m_pkthdr.ipsec_policy = from->m_pkthdr.ipsec_policy; 3723#if MEASURE_BW 3724 to->m_pkthdr.pkt_bwseq = from->m_pkthdr.pkt_bwseq; 3725#endif /* MEASURE_BW */ 3726} 3727 3728/* 3729 * Return a list of mbuf hdrs that point to clusters. Try for num_needed; 3730 * if wantall is not set, return whatever number were available. Set up the 3731 * first num_with_pkthdrs with mbuf hdrs configured as packet headers; these 3732 * are chained on the m_nextpkt field. Any packets requested beyond this 3733 * are chained onto the last packet header's m_next field. The size of 3734 * the cluster is controlled by the parameter bufsize. 3735 */ 3736__private_extern__ struct mbuf * 3737m_getpackets_internal(unsigned int *num_needed, int num_with_pkthdrs, 3738 int wait, int wantall, size_t bufsize) 3739{ 3740 struct mbuf *m; 3741 struct mbuf **np, *top; 3742 unsigned int pnum, needed = *num_needed; 3743 mcache_obj_t *mp_list = NULL; 3744 int mcflags = MSLEEPF(wait); 3745 u_int32_t flag; 3746 struct ext_ref *rfa; 3747 mcache_t *cp; 3748 void *cl; 3749 3750 ASSERT(bufsize == m_maxsize(MC_CL) || 3751 bufsize == m_maxsize(MC_BIGCL) || 3752 bufsize == m_maxsize(MC_16KCL)); 3753 3754 /* 3755 * Caller must first check for njcl because this 3756 * routine is internal and not exposed/used via KPI. 
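 * (The KPI-visible wrappers below, e.g. m_getpackets() and
 * m_getpacket(), always pass bufsize == m_maxsize(MC_CL), so the
 * 16KB case is reached only by internal callers that have already
 * verified njcl.)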
3757 */ 3758 VERIFY(bufsize != m_maxsize(MC_16KCL) || njcl > 0); 3759 3760 top = NULL; 3761 np = &top; 3762 pnum = 0; 3763 3764 /* 3765 * The caller doesn't want all the requested buffers; only some. 3766 * Try hard to get what we can, but don't block. This effectively 3767 * overrides MCR_SLEEP, since this thread will not go to sleep 3768 * if we can't get all the buffers. 3769 */ 3770 if (!wantall || (mcflags & MCR_NOSLEEP)) 3771 mcflags |= MCR_TRYHARD; 3772 3773 /* Allocate the composite mbuf + cluster elements from the cache */ 3774 if (bufsize == m_maxsize(MC_CL)) 3775 cp = m_cache(MC_MBUF_CL); 3776 else if (bufsize == m_maxsize(MC_BIGCL)) 3777 cp = m_cache(MC_MBUF_BIGCL); 3778 else 3779 cp = m_cache(MC_MBUF_16KCL); 3780 needed = mcache_alloc_ext(cp, &mp_list, needed, mcflags); 3781 3782 for (pnum = 0; pnum < needed; pnum++) { 3783 m = (struct mbuf *)mp_list; 3784 mp_list = mp_list->obj_next; 3785 3786 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT); 3787 cl = m->m_ext.ext_buf; 3788 rfa = MEXT_RFA(m); 3789 3790 ASSERT(cl != NULL && rfa != NULL); 3791 VERIFY(MBUF_IS_COMPOSITE(m)); 3792 3793 flag = MEXT_FLAGS(m); 3794 3795 MBUF_INIT(m, num_with_pkthdrs, MT_DATA); 3796 if (bufsize == m_maxsize(MC_16KCL)) { 3797 MBUF_16KCL_INIT(m, cl, rfa, 1, flag); 3798 } else if (bufsize == m_maxsize(MC_BIGCL)) { 3799 MBUF_BIGCL_INIT(m, cl, rfa, 1, flag); 3800 } else { 3801 MBUF_CL_INIT(m, cl, rfa, 1, flag); 3802 } 3803 3804 if (num_with_pkthdrs > 0) { 3805 --num_with_pkthdrs; 3806 #if CONFIG_MACF_NET 3807 if (mac_mbuf_label_init(m, wait) != 0) { 3808 m_freem(m); 3809 break; 3810 } 3811 #endif /* MAC_NET */ 3812 } 3813 3814 *np = m; 3815 if (num_with_pkthdrs > 0) 3816 np = &m->m_nextpkt; 3817 else 3818 np = &m->m_next; 3819 } 3820 ASSERT(pnum != *num_needed || mp_list == NULL); 3821 if (mp_list != NULL) 3822 mcache_free_ext(cp, mp_list); 3823 3824 if (pnum > 0) { 3825 mtype_stat_add(MT_DATA, pnum); 3826 mtype_stat_sub(MT_FREE, pnum); 3827 } 3828 3829 if (wantall && (pnum != *num_needed)) { 3830 if (top != NULL) 3831 m_freem_list(top); 3832 return (NULL); 3833 } 3834 3835 if (pnum > *num_needed) { 3836 printf("%s: File a radar related to <rdar://10146739>. \ 3837 needed = %u, pnum = %u, num_needed = %u \n", 3838 __func__, needed, pnum, *num_needed); 3839 } 3840 3841 *num_needed = pnum; 3842 return (top); 3843 } 3844 3845 /* 3846 * Return a list of mbufs linked by m_nextpkt. Try for numlist, and if 3847 * wantall is not set, return whatever number were available. The size of 3848 * each mbuf in the list is controlled by the parameter packetlen. Each 3849 * mbuf of the list may have a chain of mbufs linked by m_next. Each mbuf 3850 * in the chain is called a segment. If maxsegments is not NULL and the 3851 * value pointed to is not zero, this specifies the maximum number of segments 3852 * for a chain of mbufs. If maxsegments is NULL or the value pointed to 3853 * is zero, the caller does not have any restriction on the number of segments. 3854 * The actual number of segments of a mbuf chain is returned in the value 3855 * pointed to by maxsegments. 
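 *
 * Illustrative sketch (not part of the original source): requesting up
 * to 8 packets of 3000 bytes each, with no restriction on the number
 * of segments per chain:
 *
 *	unsigned int cnt = 8, segs = 0;
 *	struct mbuf *list;
 *
 *	list = m_allocpacket_internal(&cnt, 3000, &segs, M_DONTWAIT, 0, 0);
 *
 * On return, cnt holds the number of packets actually built and segs
 * the number of segments used per chain.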
3856 */ 3857 __private_extern__ struct mbuf * 3858 m_allocpacket_internal(unsigned int *numlist, size_t packetlen, 3859 unsigned int *maxsegments, int wait, int wantall, size_t wantsize) 3860 { 3861 struct mbuf **np, *top, *first = NULL; 3862 size_t bufsize, r_bufsize; 3863 unsigned int num = 0; 3864 unsigned int nsegs = 0; 3865 unsigned int needed, resid; 3866 int mcflags = MSLEEPF(wait); 3867 mcache_obj_t *mp_list = NULL, *rmp_list = NULL; 3868 mcache_t *cp = NULL, *rcp = NULL; 3869 3870 if (*numlist == 0) 3871 return (NULL); 3872 3873 top = NULL; 3874 np = &top; 3875 3876 if (wantsize == 0) { 3877 if (packetlen <= MINCLSIZE) { 3878 bufsize = packetlen; 3879 } else if (packetlen > m_maxsize(MC_CL)) { 3880 /* Use 4KB if jumbo cluster pool isn't available */ 3881 if (packetlen <= m_maxsize(MC_BIGCL) || njcl == 0) 3882 bufsize = m_maxsize(MC_BIGCL); 3883 else 3884 bufsize = m_maxsize(MC_16KCL); 3885 } else { 3886 bufsize = m_maxsize(MC_CL); 3887 } 3888 } else if (wantsize == m_maxsize(MC_CL) || 3889 wantsize == m_maxsize(MC_BIGCL) || 3890 (wantsize == m_maxsize(MC_16KCL) && njcl > 0)) { 3891 bufsize = wantsize; 3892 } else { 3893 return (NULL); 3894 } 3895 3896 if (bufsize <= MHLEN) { 3897 nsegs = 1; 3898 } else if (bufsize <= MINCLSIZE) { 3899 if (maxsegments != NULL && *maxsegments == 1) { 3900 bufsize = m_maxsize(MC_CL); 3901 nsegs = 1; 3902 } else { 3903 nsegs = 2; 3904 } 3905 } else if (bufsize == m_maxsize(MC_16KCL)) { 3906 VERIFY(njcl > 0); 3907 nsegs = ((packetlen - 1) >> (PGSHIFT + 2)) + 1; 3908 } else if (bufsize == m_maxsize(MC_BIGCL)) { 3909 nsegs = ((packetlen - 1) >> PGSHIFT) + 1; 3910 } else { 3911 nsegs = ((packetlen - 1) >> MCLSHIFT) + 1; 3912 } 3913 if (maxsegments != NULL) { 3914 if (*maxsegments && nsegs > *maxsegments) { 3915 *maxsegments = nsegs; 3916 return (NULL); 3917 } 3918 *maxsegments = nsegs; 3919 } 3920 3921 /* 3922 * The caller doesn't want all the requested buffers; only some. 3923 * Try hard to get what we can, but don't block. This effectively 3924 * overrides MCR_SLEEP, since this thread will not go to sleep 3925 * if we can't get all the buffers. 3926 */ 3927 if (!wantall || (mcflags & MCR_NOSLEEP)) 3928 mcflags |= MCR_TRYHARD; 3929 3930 /* 3931 * Simple case where all elements in the lists/chains are mbufs. 3932 * Unless bufsize is greater than MHLEN, each segment chain is made 3933 * up of exactly 1 mbuf. Otherwise, each segment chain is made up 3934 * of 2 mbufs; the second one is used for the residual data, i.e. 3935 * the remaining data that cannot fit into the first mbuf. 3936 */ 3937 if (bufsize <= MINCLSIZE) { 3938 /* Allocate the elements in one shot from the mbuf cache */ 3939 ASSERT(bufsize <= MHLEN || nsegs == 2); 3940 cp = m_cache(MC_MBUF); 3941 needed = mcache_alloc_ext(cp, &mp_list, 3942 (*numlist) * nsegs, mcflags); 3943 3944 /* 3945 * The number of elements must be even if we are to use an 3946 * mbuf (instead of a cluster) to store the residual data. 3947 * If we couldn't allocate the requested number of mbufs, 3948 * trim the number down (if it's odd) in order to avoid 3949 * creating a partial segment chain. 
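 * (Illustrative arithmetic: with bufsize > MHLEN each chain needs
 * 2 mbufs, so a request for 5 chains asks for 10 mbufs; if the cache
 * returns only 7, the count is trimmed to 6 and 3 complete chains
 * are built.)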
3950 */ 3951 if (bufsize > MHLEN && (needed & 0x1)) 3952 needed--; 3953 3954 while (num < needed) { 3955 struct mbuf *m; 3956 3957 m = (struct mbuf *)mp_list; 3958 mp_list = mp_list->obj_next; 3959 ASSERT(m != NULL); 3960 3961 MBUF_INIT(m, 1, MT_DATA); 3962#if CONFIG_MACF_NET 3963 if (mac_init_mbuf(m, wait) != 0) { 3964 m_free(m); 3965 break; 3966 } 3967#endif /* MAC_NET */ 3968 num++; 3969 if (bufsize > MHLEN) { 3970 /* A second mbuf for this segment chain */ 3971 m->m_next = (struct mbuf *)mp_list; 3972 mp_list = mp_list->obj_next; 3973 ASSERT(m->m_next != NULL); 3974 3975 MBUF_INIT(m->m_next, 0, MT_DATA); 3976 num++; 3977 } 3978 *np = m; 3979 np = &m->m_nextpkt; 3980 } 3981 ASSERT(num != *numlist || mp_list == NULL); 3982 3983 if (num > 0) { 3984 mtype_stat_add(MT_DATA, num); 3985 mtype_stat_sub(MT_FREE, num); 3986 } 3987 num /= nsegs; 3988 3989 /* We've got them all; return to caller */ 3990 if (num == *numlist) 3991 return (top); 3992 3993 goto fail; 3994 } 3995 3996 /* 3997 * Complex cases where elements are made up of one or more composite 3998 * mbufs + cluster, depending on packetlen. Each N-segment chain can 3999 * be illustrated as follows: 4000 * 4001 * [mbuf + cluster 1] [mbuf + cluster 2] ... [mbuf + cluster N] 4002 * 4003 * Every composite mbuf + cluster element comes from the intermediate 4004 * cache (either MC_MBUF_CL or MC_MBUF_BIGCL). For space efficiency, 4005 * the last composite element will come from the MC_MBUF_CL cache, 4006 * unless the residual data is larger than 2KB where we use the 4007 * big cluster composite cache (MC_MBUF_BIGCL) instead. Residual 4008 * data is defined as extra data beyond the first element that cannot 4009 * fit into the previous element, i.e. there is no residual data if 4010 * the chain only has 1 segment. 4011 */ 4012 r_bufsize = bufsize; 4013 resid = packetlen > bufsize ? packetlen % bufsize : 0; 4014 if (resid > 0) { 4015 /* There is residual data; figure out the cluster size */ 4016 if (wantsize == 0 && packetlen > MINCLSIZE) { 4017 /* 4018 * Caller didn't request that all of the segments 4019 * in the chain use the same cluster size; use the 4020 * smaller of the cluster sizes. 4021 */ 4022 if (njcl > 0 && resid > m_maxsize(MC_BIGCL)) 4023 r_bufsize = m_maxsize(MC_16KCL); 4024 else if (resid > m_maxsize(MC_CL)) 4025 r_bufsize = m_maxsize(MC_BIGCL); 4026 else 4027 r_bufsize = m_maxsize(MC_CL); 4028 } else { 4029 /* Use the same cluster size as the other segments */ 4030 resid = 0; 4031 } 4032 } 4033 4034 needed = *numlist; 4035 if (resid > 0) { 4036 /* 4037 * Attempt to allocate composite mbuf + cluster elements for 4038 * the residual data in each chain; record the number of such 4039 * elements that can be allocated so that we know how many 4040 * segment chains we can afford to create. 4041 */ 4042 if (r_bufsize <= m_maxsize(MC_CL)) 4043 rcp = m_cache(MC_MBUF_CL); 4044 else if (r_bufsize <= m_maxsize(MC_BIGCL)) 4045 rcp = m_cache(MC_MBUF_BIGCL); 4046 else 4047 rcp = m_cache(MC_MBUF_16KCL); 4048 needed = mcache_alloc_ext(rcp, &rmp_list, *numlist, mcflags); 4049 4050 if (needed == 0) 4051 goto fail; 4052 4053 /* This is temporarily reduced for calculation */ 4054 ASSERT(nsegs > 1); 4055 nsegs--; 4056 } 4057 4058 /* 4059 * Attempt to allocate the rest of the composite mbuf + cluster 4060 * elements for the number of segment chains that we need. 
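 * (Illustrative arithmetic: to build 4 chains of 3 full-size segments
 * plus 1 residual segment each, nsegs was dropped from 4 to 3 above;
 * 4 * 3 composite elements are requested here, then the residual
 * count of 4 is added back below and nsegs is restored to 4.)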
4061 */ 4062 if (bufsize <= m_maxsize(MC_CL)) 4063 cp = m_cache(MC_MBUF_CL); 4064 else if (bufsize <= m_maxsize(MC_BIGCL)) 4065 cp = m_cache(MC_MBUF_BIGCL); 4066 else 4067 cp = m_cache(MC_MBUF_16KCL); 4068 needed = mcache_alloc_ext(cp, &mp_list, needed * nsegs, mcflags); 4069 4070 /* Round it down to avoid creating a partial segment chain */ 4071 needed = (needed / nsegs) * nsegs; 4072 if (needed == 0) 4073 goto fail; 4074 4075 if (resid > 0) { 4076 /* 4077 * We're about to construct the chain(s); take into account 4078 * the number of segments we have created above to hold the 4079 * residual data for each chain, as well as restore the 4080 * original count of segments per chain. 4081 */ 4082 ASSERT(nsegs > 0); 4083 needed += needed / nsegs; 4084 nsegs++; 4085 } 4086 4087 for (;;) { 4088 struct mbuf *m; 4089 u_int32_t flag; 4090 struct ext_ref *rfa; 4091 void *cl; 4092 int pkthdr; 4093 4094 ++num; 4095 if (nsegs == 1 || (num % nsegs) != 0 || resid == 0) { 4096 m = (struct mbuf *)mp_list; 4097 mp_list = mp_list->obj_next; 4098 } else { 4099 m = (struct mbuf *)rmp_list; 4100 rmp_list = rmp_list->obj_next; 4101 } 4102 ASSERT(m != NULL); 4103 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT); 4104 VERIFY(m->m_ext.ext_free == NULL || 4105 m->m_ext.ext_free == m_bigfree || 4106 m->m_ext.ext_free == m_16kfree); 4107 4108 cl = m->m_ext.ext_buf; 4109 rfa = MEXT_RFA(m); 4110 4111 ASSERT(cl != NULL && rfa != NULL); 4112 VERIFY(MBUF_IS_COMPOSITE(m)); 4113 4114 flag = MEXT_FLAGS(m); 4115 4116 pkthdr = (nsegs == 1 || (num % nsegs) == 1); 4117 if (pkthdr) 4118 first = m; 4119 MBUF_INIT(m, pkthdr, MT_DATA); 4120 if (m->m_ext.ext_free == m_16kfree) { 4121 MBUF_16KCL_INIT(m, cl, rfa, 1, flag); 4122 } else if (m->m_ext.ext_free == m_bigfree) { 4123 MBUF_BIGCL_INIT(m, cl, rfa, 1, flag); 4124 } else { 4125 MBUF_CL_INIT(m, cl, rfa, 1, flag); 4126 } 4127 #if CONFIG_MACF_NET 4128 if (pkthdr && mac_init_mbuf(m, wait) != 0) { 4129 --num; 4130 m_freem(m); 4131 break; 4132 } 4133 #endif /* MAC_NET */ 4134 4135 *np = m; 4136 if ((num % nsegs) == 0) 4137 np = &first->m_nextpkt; 4138 else 4139 np = &m->m_next; 4140 4141 if (num == needed) 4142 break; 4143 } 4144 4145 if (num > 0) { 4146 mtype_stat_add(MT_DATA, num); 4147 mtype_stat_sub(MT_FREE, num); 4148 } 4149 4150 num /= nsegs; 4151 4152 /* We've got them all; return to caller */ 4153 if (num == *numlist) { 4154 ASSERT(mp_list == NULL && rmp_list == NULL); 4155 return (top); 4156 } 4157 4158 fail: 4159 /* Free up what's left of the above */ 4160 if (mp_list != NULL) 4161 mcache_free_ext(cp, mp_list); 4162 if (rmp_list != NULL) 4163 mcache_free_ext(rcp, rmp_list); 4164 if (wantall && top != NULL) { 4165 m_freem(top); 4166 return (NULL); 4167 } 4168 *numlist = num; 4169 return (top); 4170 } 4171 4172 /* 4173 * Best effort to get an mbuf cluster + pkthdr. Used by drivers to allocate 4174 * packets on the receive ring. 4175 */ 4176 __private_extern__ struct mbuf * 4177 m_getpacket_how(int wait) 4178 { 4179 unsigned int num_needed = 1; 4180 4181 return (m_getpackets_internal(&num_needed, 1, wait, 1, 4182 m_maxsize(MC_CL))); 4183 } 4184 4185 /* 4186 * Best effort to get an mbuf cluster + pkthdr. Used by drivers to allocate 4187 * packets on the receive ring. 4188 */ 4189 struct mbuf * 4190 m_getpacket(void) 4191 { 4192 unsigned int num_needed = 1; 4193 4194 return (m_getpackets_internal(&num_needed, 1, M_WAIT, 1, 4195 m_maxsize(MC_CL))); 4196 } 4197 4198 /* 4199 * Return a list of mbuf hdrs that point to clusters. 
Try for num_needed; 4200 * if this can't be met, return whatever number were available. Set up the 4201 * first num_with_pkthdrs with mbuf hdrs configured as packet headers. These 4202 * are chained on the m_nextpkt field. Any packets requested beyond this are 4203 * chained onto the last packet header's m_next field. 4204 */ 4205 struct mbuf * 4206 m_getpackets(int num_needed, int num_with_pkthdrs, int how) 4207 { 4208 unsigned int n = num_needed; 4209 4210 return (m_getpackets_internal(&n, num_with_pkthdrs, how, 0, 4211 m_maxsize(MC_CL))); 4212 } 4213 4214 /* 4215 * Return a list of mbuf hdrs set up as packet hdrs chained together 4216 * on the m_nextpkt field. 4217 */ 4218 struct mbuf * 4219 m_getpackethdrs(int num_needed, int how) 4220 { 4221 struct mbuf *m; 4222 struct mbuf **np, *top; 4223 4224 top = NULL; 4225 np = &top; 4226 4227 while (num_needed--) { 4228 m = _M_RETRYHDR(how, MT_DATA); 4229 if (m == NULL) 4230 break; 4231 4232 *np = m; 4233 np = &m->m_nextpkt; 4234 } 4235 4236 return (top); 4237 } 4238 4239 /* 4240 * Free an mbuf list (m_nextpkt) while following m_next. Returns the count 4241 * of packets freed. Used by the drivers. 4242 */ 4243 int 4244 m_freem_list(struct mbuf *m) 4245 { 4246 struct mbuf *nextpkt; 4247 mcache_obj_t *mp_list = NULL; 4248 mcache_obj_t *mcl_list = NULL; 4249 mcache_obj_t *mbc_list = NULL; 4250 mcache_obj_t *m16k_list = NULL; 4251 mcache_obj_t *m_mcl_list = NULL; 4252 mcache_obj_t *m_mbc_list = NULL; 4253 mcache_obj_t *m_m16k_list = NULL; 4254 mcache_obj_t *ref_list = NULL; 4255 int pktcount = 0; 4256 int mt_free = 0, mt_data = 0, mt_header = 0, mt_soname = 0, mt_tag = 0; 4257 4258 while (m != NULL) { 4259 pktcount++; 4260 4261 nextpkt = m->m_nextpkt; 4262 m->m_nextpkt = NULL; 4263 4264 while (m != NULL) { 4265 struct mbuf *next = m->m_next; 4266 mcache_obj_t *o, *rfa; 4267 u_int32_t refcnt, composite; 4268 4269 if (m->m_type == MT_FREE) 4270 panic("m_free: freeing an already freed mbuf"); 4271 4272 if (m->m_type != MT_FREE) 4273 mt_free++; 4274 4275 if (m->m_flags & M_PKTHDR) { 4276 /* Check for scratch area overflow */ 4277 m_redzone_verify(m); 4278 /* Free the aux data and tags if there are any */ 4279 m_tag_delete_chain(m, NULL); 4280 } 4281 4282 if (!(m->m_flags & M_EXT)) 4283 goto simple_free; 4284 4285 o = (mcache_obj_t *)(void *)m->m_ext.ext_buf; 4286 refcnt = m_decref(m); 4287 composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE); 4288 if (refcnt == 0 && !composite) { 4289 if (m->m_ext.ext_free == NULL) { 4290 o->obj_next = mcl_list; 4291 mcl_list = o; 4292 } else if (m->m_ext.ext_free == m_bigfree) { 4293 o->obj_next = mbc_list; 4294 mbc_list = o; 4295 } else if (m->m_ext.ext_free == m_16kfree) { 4296 o->obj_next = m16k_list; 4297 m16k_list = o; 4298 } else { 4299 (*(m->m_ext.ext_free))((caddr_t)o, 4300 m->m_ext.ext_size, 4301 m->m_ext.ext_arg); 4302 } 4303 rfa = (mcache_obj_t *)(void *)MEXT_RFA(m); 4304 rfa->obj_next = ref_list; 4305 ref_list = rfa; 4306 MEXT_RFA(m) = NULL; 4307 } else if (refcnt == 0 && composite) { 4308 VERIFY(m->m_type != MT_FREE); 4309 /* 4310 * Amortize the costs of atomic operations 4311 * by doing them at the end, if possible. 
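 * The per-type counts are accumulated in the local mt_* counters
 * and folded into the statistics in one batch once the entire
 * list has been walked.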
4312 */ 4313 if (m->m_type == MT_DATA) 4314 mt_data++; 4315 else if (m->m_type == MT_HEADER) 4316 mt_header++; 4317 else if (m->m_type == MT_SONAME) 4318 mt_soname++; 4319 else if (m->m_type == MT_TAG) 4320 mt_tag++; 4321 else 4322 mtype_stat_dec(m->m_type); 4323 4324 m->m_type = MT_FREE; 4325 m->m_flags = M_EXT; 4326 m->m_len = 0; 4327 m->m_next = m->m_nextpkt = NULL; 4328 4329 MEXT_FLAGS(m) &= ~EXTF_READONLY; 4330 4331 /* "Free" into the intermediate cache */ 4332 o = (mcache_obj_t *)m; 4333 if (m->m_ext.ext_free == NULL) { 4334 o->obj_next = m_mcl_list; 4335 m_mcl_list = o; 4336 } else if (m->m_ext.ext_free == m_bigfree) { 4337 o->obj_next = m_mbc_list; 4338 m_mbc_list = o; 4339 } else { 4340 VERIFY(m->m_ext.ext_free == m_16kfree); 4341 o->obj_next = m_m16k_list; 4342 m_m16k_list = o; 4343 } 4344 m = next; 4345 continue; 4346 } 4347simple_free: 4348 /* 4349 * Amortize the costs of atomic operations 4350 * by doing them at the end, if possible. 4351 */ 4352 if (m->m_type == MT_DATA) 4353 mt_data++; 4354 else if (m->m_type == MT_HEADER) 4355 mt_header++; 4356 else if (m->m_type == MT_SONAME) 4357 mt_soname++; 4358 else if (m->m_type == MT_TAG) 4359 mt_tag++; 4360 else if (m->m_type != MT_FREE) 4361 mtype_stat_dec(m->m_type); 4362 4363 m->m_type = MT_FREE; 4364 m->m_flags = m->m_len = 0; 4365 m->m_next = m->m_nextpkt = NULL; 4366 4367 ((mcache_obj_t *)m)->obj_next = mp_list; 4368 mp_list = (mcache_obj_t *)m; 4369 4370 m = next; 4371 } 4372 4373 m = nextpkt; 4374 } 4375 4376 if (mt_free > 0) 4377 mtype_stat_add(MT_FREE, mt_free); 4378 if (mt_data > 0) 4379 mtype_stat_sub(MT_DATA, mt_data); 4380 if (mt_header > 0) 4381 mtype_stat_sub(MT_HEADER, mt_header); 4382 if (mt_soname > 0) 4383 mtype_stat_sub(MT_SONAME, mt_soname); 4384 if (mt_tag > 0) 4385 mtype_stat_sub(MT_TAG, mt_tag); 4386 4387 if (mp_list != NULL) 4388 mcache_free_ext(m_cache(MC_MBUF), mp_list); 4389 if (mcl_list != NULL) 4390 mcache_free_ext(m_cache(MC_CL), mcl_list); 4391 if (mbc_list != NULL) 4392 mcache_free_ext(m_cache(MC_BIGCL), mbc_list); 4393 if (m16k_list != NULL) 4394 mcache_free_ext(m_cache(MC_16KCL), m16k_list); 4395 if (m_mcl_list != NULL) 4396 mcache_free_ext(m_cache(MC_MBUF_CL), m_mcl_list); 4397 if (m_mbc_list != NULL) 4398 mcache_free_ext(m_cache(MC_MBUF_BIGCL), m_mbc_list); 4399 if (m_m16k_list != NULL) 4400 mcache_free_ext(m_cache(MC_MBUF_16KCL), m_m16k_list); 4401 if (ref_list != NULL) 4402 mcache_free_ext(ref_cache, ref_list); 4403 4404 return (pktcount); 4405} 4406 4407void 4408m_freem(struct mbuf *m) 4409{ 4410 while (m != NULL) 4411 m = m_free(m); 4412} 4413 4414/* 4415 * Mbuffer utility routines. 4416 */ 4417 4418/* 4419 * Compute the amount of space available before the current start 4420 * of data in an mbuf. 4421 */ 4422int 4423m_leadingspace(struct mbuf *m) 4424{ 4425 if (m->m_flags & M_EXT) { 4426 if (MCLHASREFERENCE(m)) 4427 return (0); 4428 return (m->m_data - m->m_ext.ext_buf); 4429 } 4430 if (m->m_flags & M_PKTHDR) 4431 return (m->m_data - m->m_pktdat); 4432 return (m->m_data - m->m_dat); 4433} 4434 4435/* 4436 * Compute the amount of space available after the end of data in an mbuf. 4437 */ 4438int 4439m_trailingspace(struct mbuf *m) 4440{ 4441 if (m->m_flags & M_EXT) { 4442 if (MCLHASREFERENCE(m)) 4443 return (0); 4444 return (m->m_ext.ext_buf + m->m_ext.ext_size - 4445 (m->m_data + m->m_len)); 4446 } 4447 return (&m->m_dat[MLEN] - (m->m_data + m->m_len)); 4448} 4449 4450/* 4451 * Lesser-used path for M_PREPEND: allocate new mbuf to prepend to chain, 4452 * copy junk along. 
Does not adjust packet header length. 4453 */ 4454 struct mbuf * 4455 m_prepend(struct mbuf *m, int len, int how) 4456 { 4457 struct mbuf *mn; 4458 4459 _MGET(mn, how, m->m_type); 4460 if (mn == NULL) { 4461 m_freem(m); 4462 return (NULL); 4463 } 4464 if (m->m_flags & M_PKTHDR) { 4465 M_COPY_PKTHDR(mn, m); 4466 m->m_flags &= ~M_PKTHDR; 4467 } 4468 mn->m_next = m; 4469 m = mn; 4470 if (len < MHLEN) 4471 MH_ALIGN(m, len); 4472 m->m_len = len; 4473 return (m); 4474 } 4475 4476 /* 4477 * Replacement for old M_PREPEND macro: allocate new mbuf to prepend to 4478 * chain, copy junk along, and adjust length. 4479 */ 4480 struct mbuf * 4481 m_prepend_2(struct mbuf *m, int len, int how) 4482 { 4483 if (M_LEADINGSPACE(m) >= len) { 4484 m->m_data -= len; 4485 m->m_len += len; 4486 } else { 4487 m = m_prepend(m, len, how); 4488 } 4489 if ((m) && (m->m_flags & M_PKTHDR)) 4490 m->m_pkthdr.len += len; 4491 return (m); 4492 } 4493 4494 /* 4495 * Make a copy of an mbuf chain starting "off0" bytes from the beginning, 4496 * continuing for "len" bytes. If len is M_COPYALL, copy to end of mbuf. 4497 * The wait parameter is a choice of M_WAIT/M_DONTWAIT from caller. 4498 */ 4499 int MCFail; 4500 4501 struct mbuf * 4502 m_copym_mode(struct mbuf *m, int off0, int len, int wait, uint32_t mode) 4503 { 4504 struct mbuf *n, *mhdr = NULL, **np; 4505 int off = off0; 4506 struct mbuf *top; 4507 int copyhdr = 0; 4508 4509 if (off < 0 || len < 0) 4510 panic("m_copym: invalid offset %d or len %d", off, len); 4511 4512 if (off == 0 && (m->m_flags & M_PKTHDR)) { 4513 mhdr = m; 4514 copyhdr = 1; 4515 } 4516 4517 while (off >= m->m_len) { 4518 if (m->m_next == NULL) 4519 panic("m_copym: invalid mbuf chain"); 4520 off -= m->m_len; 4521 m = m->m_next; 4522 } 4523 np = &top; 4524 top = NULL; 4525 4526 while (len > 0) { 4527 if (m == NULL) { 4528 if (len != M_COPYALL) 4529 panic("m_copym: len != M_COPYALL"); 4530 break; 4531 } 4532 4533 n = _M_RETRY(wait, m->m_type); 4534 *np = n; 4535 4536 if (n == NULL) 4537 goto nospace; 4538 4539 if (copyhdr != 0) { 4540 if (mode == M_COPYM_MOVE_HDR) { 4541 M_COPY_PKTHDR(n, mhdr); 4542 } else if (mode == M_COPYM_COPY_HDR) { 4543 if (m_dup_pkthdr(n, mhdr, wait) == 0) 4544 goto nospace; 4545 } 4546 if (len == M_COPYALL) 4547 n->m_pkthdr.len -= off0; 4548 else 4549 n->m_pkthdr.len = len; 4550 copyhdr = 0; 4551 } 4552 if (len == M_COPYALL) { 4553 if (MIN(len, (m->m_len - off)) == len) { 4554 printf("m->m_len %d - off %d = %d, %d\n", 4555 m->m_len, off, m->m_len - off, 4556 MIN(len, (m->m_len - off))); 4557 } 4558 } 4559 n->m_len = MIN(len, (m->m_len - off)); 4560 if (n->m_len == M_COPYALL) { 4561 printf("n->m_len == M_COPYALL, fixing\n"); 4562 n->m_len = MHLEN; 4563 } 4564 if (m->m_flags & M_EXT) { 4565 n->m_ext = m->m_ext; 4566 m_incref(m); 4567 n->m_data = m->m_data + off; 4568 n->m_flags |= M_EXT; 4569 } else { 4570 bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t), 4571 (unsigned)n->m_len); 4572 } 4573 if (len != M_COPYALL) 4574 len -= n->m_len; 4575 off = 0; 4576 m = m->m_next; 4577 np = &n->m_next; 4578 } 4579 4580 if (top == NULL) 4581 MCFail++; 4582 4583 return (top); 4584 nospace: 4585 4586 m_freem(top); 4587 MCFail++; 4588 return (NULL); 4589 } 4590 4591 4592 struct mbuf * 4593 m_copym(struct mbuf *m, int off0, int len, int wait) 4594 { 4595 return (m_copym_mode(m, off0, len, wait, M_COPYM_MOVE_HDR)); 4596 } 4597 4598 /* 4599 * Equivalent to m_copym except that all necessary mbuf hdrs are allocated 4600 * within this routine; also, the last mbuf and offset accessed are passed 4601 * out and can be passed back in to 
avoid having to rescan the entire mbuf 4602 * list (normally hung off of the socket) 4603 */ 4604struct mbuf * 4605m_copym_with_hdrs(struct mbuf *m, int off0, int len0, int wait, 4606 struct mbuf **m_lastm, int *m_off, uint32_t mode) 4607{ 4608 struct mbuf *n, **np = NULL; 4609 int off = off0, len = len0; 4610 struct mbuf *top = NULL; 4611 int mcflags = MSLEEPF(wait); 4612 int copyhdr = 0; 4613 int type = 0; 4614 mcache_obj_t *list = NULL; 4615 int needed = 0; 4616 4617 if (off == 0 && (m->m_flags & M_PKTHDR)) 4618 copyhdr = 1; 4619 4620 if (*m_lastm != NULL) { 4621 m = *m_lastm; 4622 off = *m_off; 4623 } else { 4624 while (off >= m->m_len) { 4625 off -= m->m_len; 4626 m = m->m_next; 4627 } 4628 } 4629 4630 n = m; 4631 while (len > 0) { 4632 needed++; 4633 ASSERT(n != NULL); 4634 len -= MIN(len, (n->m_len - ((needed == 1) ? off : 0))); 4635 n = n->m_next; 4636 } 4637 needed++; 4638 len = len0; 4639 4640 /* 4641 * If the caller doesn't want to be put to sleep, mark it with 4642 * MCR_TRYHARD so that we may reclaim buffers from other places 4643 * before giving up. 4644 */ 4645 if (mcflags & MCR_NOSLEEP) 4646 mcflags |= MCR_TRYHARD; 4647 4648 if (mcache_alloc_ext(m_cache(MC_MBUF), &list, needed, 4649 mcflags) != needed) 4650 goto nospace; 4651 4652 needed = 0; 4653 while (len > 0) { 4654 n = (struct mbuf *)list; 4655 list = list->obj_next; 4656 ASSERT(n != NULL && m != NULL); 4657 4658 type = (top == NULL) ? MT_HEADER : m->m_type; 4659 MBUF_INIT(n, (top == NULL), type); 4660#if CONFIG_MACF_NET 4661 if (top == NULL && mac_mbuf_label_init(n, wait) != 0) { 4662 mtype_stat_inc(MT_HEADER); 4663 mtype_stat_dec(MT_FREE); 4664 m_free(n); 4665 goto nospace; 4666 } 4667#endif /* MAC_NET */ 4668 4669 if (top == NULL) { 4670 top = n; 4671 np = &top->m_next; 4672 continue; 4673 } else { 4674 needed++; 4675 *np = n; 4676 } 4677 4678 if (copyhdr) { 4679 if (mode == M_COPYM_MOVE_HDR) { 4680 M_COPY_PKTHDR(n, m); 4681 } else if (mode == M_COPYM_COPY_HDR) { 4682 if (m_dup_pkthdr(n, m, wait) == 0) 4683 goto nospace; 4684 } 4685 n->m_pkthdr.len = len; 4686 copyhdr = 0; 4687 } 4688 n->m_len = MIN(len, (m->m_len - off)); 4689 4690 if (m->m_flags & M_EXT) { 4691 n->m_ext = m->m_ext; 4692 m_incref(m); 4693 n->m_data = m->m_data + off; 4694 n->m_flags |= M_EXT; 4695 } else { 4696 bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t), 4697 (unsigned)n->m_len); 4698 } 4699 len -= n->m_len; 4700 4701 if (len == 0) { 4702 if ((off + n->m_len) == m->m_len) { 4703 *m_lastm = m->m_next; 4704 *m_off = 0; 4705 } else { 4706 *m_lastm = m; 4707 *m_off = off + n->m_len; 4708 } 4709 break; 4710 } 4711 off = 0; 4712 m = m->m_next; 4713 np = &n->m_next; 4714 } 4715 4716 mtype_stat_inc(MT_HEADER); 4717 mtype_stat_add(type, needed); 4718 mtype_stat_sub(MT_FREE, needed + 1); 4719 4720 ASSERT(list == NULL); 4721 return (top); 4722 4723nospace: 4724 if (list != NULL) 4725 mcache_free_ext(m_cache(MC_MBUF), list); 4726 if (top != NULL) 4727 m_freem(top); 4728 MCFail++; 4729 return (NULL); 4730} 4731 4732/* 4733 * Copy data from an mbuf chain starting "off" bytes from the beginning, 4734 * continuing for "len" bytes, into the indicated buffer. 
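 *
 * Illustrative sketch (not part of the original source): extracting a
 * 20-byte protocol header from a chain into a local buffer:
 *
 *	char hdr[20];
 *
 *	m_copydata(m, 0, sizeof (hdr), hdr);
 *
 * The chain must cover at least off + len bytes, or this panics.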
4735 */ 4736 void 4737 m_copydata(struct mbuf *m, int off, int len, void *vp) 4738 { 4739 unsigned count; 4740 char *cp = vp; 4741 4742 if (off < 0 || len < 0) 4743 panic("m_copydata: invalid offset %d or len %d", off, len); 4744 4745 while (off > 0) { 4746 if (m == NULL) 4747 panic("m_copydata: invalid mbuf chain"); 4748 if (off < m->m_len) 4749 break; 4750 off -= m->m_len; 4751 m = m->m_next; 4752 } 4753 while (len > 0) { 4754 if (m == NULL) 4755 panic("m_copydata: invalid mbuf chain"); 4756 count = MIN(m->m_len - off, len); 4757 bcopy(MTOD(m, caddr_t) + off, cp, count); 4758 len -= count; 4759 cp += count; 4760 off = 0; 4761 m = m->m_next; 4762 } 4763 } 4764 4765 /* 4766 * Concatenate mbuf chain n to m. Both chains must be of the same type 4767 * (e.g. MT_DATA). Any m_pkthdr is not updated. 4768 */ 4769 void 4770 m_cat(struct mbuf *m, struct mbuf *n) 4771 { 4772 while (m->m_next) 4773 m = m->m_next; 4774 while (n) { 4775 if ((m->m_flags & M_EXT) || 4776 m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) { 4777 /* just join the two chains */ 4778 m->m_next = n; 4779 return; 4780 } 4781 /* splat the data from one into the other */ 4782 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len, 4783 (u_int)n->m_len); 4784 m->m_len += n->m_len; 4785 n = m_free(n); 4786 } 4787 } 4788 4789 void 4790 m_adj(struct mbuf *mp, int req_len) 4791 { 4792 int len = req_len; 4793 struct mbuf *m; 4794 int count; 4795 4796 if ((m = mp) == NULL) 4797 return; 4798 if (len >= 0) { 4799 /* 4800 * Trim from head. 4801 */ 4802 while (m != NULL && len > 0) { 4803 if (m->m_len <= len) { 4804 len -= m->m_len; 4805 m->m_len = 0; 4806 m = m->m_next; 4807 } else { 4808 m->m_len -= len; 4809 m->m_data += len; 4810 len = 0; 4811 } 4812 } 4813 m = mp; 4814 if (m->m_flags & M_PKTHDR) 4815 m->m_pkthdr.len -= (req_len - len); 4816 } else { 4817 /* 4818 * Trim from tail. Scan the mbuf chain, 4819 * calculating its length and finding the last mbuf. 4820 * If the adjustment only affects this mbuf, then just 4821 * adjust and return. Otherwise, rescan and truncate 4822 * after the remaining size. 4823 */ 4824 len = -len; 4825 count = 0; 4826 for (;;) { 4827 count += m->m_len; 4828 if (m->m_next == (struct mbuf *)0) 4829 break; 4830 m = m->m_next; 4831 } 4832 if (m->m_len >= len) { 4833 m->m_len -= len; 4834 m = mp; 4835 if (m->m_flags & M_PKTHDR) 4836 m->m_pkthdr.len -= len; 4837 return; 4838 } 4839 count -= len; 4840 if (count < 0) 4841 count = 0; 4842 /* 4843 * Correct length for chain is "count". 4844 * Find the mbuf with last data, adjust its length, 4845 * and toss data from remaining mbufs on chain. 4846 */ 4847 m = mp; 4848 if (m->m_flags & M_PKTHDR) 4849 m->m_pkthdr.len = count; 4850 for (; m; m = m->m_next) { 4851 if (m->m_len >= count) { 4852 m->m_len = count; 4853 break; 4854 } 4855 count -= m->m_len; 4856 } 4857 while ((m = m->m_next)) 4858 m->m_len = 0; 4859 } 4860 } 4861 4862 /* 4863 * Rearrange an mbuf chain so that len bytes are contiguous 4864 * and in the data area of an mbuf (so that mtod and dtom 4865 * will work for a structure of size len). Returns the resulting 4866 * mbuf chain on success, frees it and returns null on failure. 4867 * If there is room, it will add up to max_protohdr-len extra bytes to the 4868 * contiguous region in an attempt to avoid being called next time. 
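 *
 * Illustrative sketch (not part of the original source), in the style
 * of a protocol input routine ("hdrlen" stands in for the caller's
 * header size):
 *
 *	if (m->m_len < hdrlen && (m = m_pullup(m, hdrlen)) == NULL)
 *		return;
 *
 * On failure the original chain has already been freed, so the caller
 * must not touch it again.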
4869 */ 4870int MPFail; 4871 4872struct mbuf * 4873m_pullup(struct mbuf *n, int len) 4874{ 4875 struct mbuf *m; 4876 int count; 4877 int space; 4878 4879 /* 4880 * If first mbuf has no cluster, and has room for len bytes 4881 * without shifting current data, pullup into it, 4882 * otherwise allocate a new mbuf to prepend to the chain. 4883 */ 4884 if ((n->m_flags & M_EXT) == 0 && 4885 n->m_data + len < &n->m_dat[MLEN] && n->m_next) { 4886 if (n->m_len >= len) 4887 return (n); 4888 m = n; 4889 n = n->m_next; 4890 len -= m->m_len; 4891 } else { 4892 if (len > MHLEN) 4893 goto bad; 4894 _MGET(m, M_DONTWAIT, n->m_type); 4895 if (m == 0) 4896 goto bad; 4897 m->m_len = 0; 4898 if (n->m_flags & M_PKTHDR) { 4899 M_COPY_PKTHDR(m, n); 4900 n->m_flags &= ~M_PKTHDR; 4901 } 4902 } 4903 space = &m->m_dat[MLEN] - (m->m_data + m->m_len); 4904 do { 4905 count = MIN(MIN(MAX(len, max_protohdr), space), n->m_len); 4906 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len, 4907 (unsigned)count); 4908 len -= count; 4909 m->m_len += count; 4910 n->m_len -= count; 4911 space -= count; 4912 if (n->m_len) 4913 n->m_data += count; 4914 else 4915 n = m_free(n); 4916 } while (len > 0 && n); 4917 if (len > 0) { 4918 (void) m_free(m); 4919 goto bad; 4920 } 4921 m->m_next = n; 4922 return (m); 4923bad: 4924 m_freem(n); 4925 MPFail++; 4926 return (0); 4927} 4928 4929/* 4930 * Like m_pullup(), except a new mbuf is always allocated, and we allow 4931 * the amount of empty space before the data in the new mbuf to be specified 4932 * (in the event that the caller expects to prepend later). 4933 */ 4934__private_extern__ int MSFail = 0; 4935 4936__private_extern__ struct mbuf * 4937m_copyup(struct mbuf *n, int len, int dstoff) 4938{ 4939 struct mbuf *m; 4940 int count, space; 4941 4942 if (len > (MHLEN - dstoff)) 4943 goto bad; 4944 MGET(m, M_DONTWAIT, n->m_type); 4945 if (m == NULL) 4946 goto bad; 4947 m->m_len = 0; 4948 if (n->m_flags & M_PKTHDR) { 4949 m_copy_pkthdr(m, n); 4950 n->m_flags &= ~M_PKTHDR; 4951 } 4952 m->m_data += dstoff; 4953 space = &m->m_dat[MLEN] - (m->m_data + m->m_len); 4954 do { 4955 count = min(min(max(len, max_protohdr), space), n->m_len); 4956 memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t), 4957 (unsigned)count); 4958 len -= count; 4959 m->m_len += count; 4960 n->m_len -= count; 4961 space -= count; 4962 if (n->m_len) 4963 n->m_data += count; 4964 else 4965 n = m_free(n); 4966 } while (len > 0 && n); 4967 if (len > 0) { 4968 (void) m_free(m); 4969 goto bad; 4970 } 4971 m->m_next = n; 4972 return (m); 4973bad: 4974 m_freem(n); 4975 MSFail++; 4976 return (NULL); 4977} 4978 4979/* 4980 * Partition an mbuf chain in two pieces, returning the tail -- 4981 * all but the first len0 bytes. In case of failure, it returns NULL and 4982 * attempts to restore the chain to its original state. 
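 *
 * Illustrative sketch (not part of the original source): splitting a
 * record after its first 128 bytes:
 *
 *	struct mbuf *tail = m_split(m, 128, M_DONTWAIT);
 *
 * On success "m" retains the first 128 bytes and "tail" holds the
 * remainder; on failure "tail" is NULL and "m" is left usable.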
4983 */ 4984 struct mbuf * 4985 m_split(struct mbuf *m0, int len0, int wait) 4986 { 4987 return (m_split0(m0, len0, wait, 1)); 4988 } 4989 4990 static struct mbuf * 4991 m_split0(struct mbuf *m0, int len0, int wait, int copyhdr) 4992 { 4993 struct mbuf *m, *n; 4994 unsigned len = len0, remain; 4995 4996 for (m = m0; m && len > m->m_len; m = m->m_next) 4997 len -= m->m_len; 4998 if (m == NULL) 4999 return (NULL); 5000 remain = m->m_len - len; 5001 if (copyhdr && (m0->m_flags & M_PKTHDR)) { 5002 _MGETHDR(n, wait, m0->m_type); 5003 if (n == NULL) 5004 return (NULL); 5005 n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif; 5006 n->m_pkthdr.len = m0->m_pkthdr.len - len0; 5007 m0->m_pkthdr.len = len0; 5008 if (m->m_flags & M_EXT) 5009 goto extpacket; 5010 if (remain > MHLEN) { 5011 /* m can't be the lead packet */ 5012 MH_ALIGN(n, 0); 5013 n->m_next = m_split(m, len, wait); 5014 if (n->m_next == NULL) { 5015 (void) m_free(n); 5016 return (NULL); 5017 } else 5018 return (n); 5019 } else 5020 MH_ALIGN(n, remain); 5021 } else if (remain == 0) { 5022 n = m->m_next; 5023 m->m_next = NULL; 5024 return (n); 5025 } else { 5026 _MGET(n, wait, m->m_type); 5027 if (n == NULL) 5028 return (NULL); 5029 M_ALIGN(n, remain); 5030 } 5031 extpacket: 5032 if (m->m_flags & M_EXT) { 5033 n->m_flags |= M_EXT; 5034 n->m_ext = m->m_ext; 5035 m_incref(m); 5036 n->m_data = m->m_data + len; 5037 } else { 5038 bcopy(MTOD(m, caddr_t) + len, MTOD(n, caddr_t), remain); 5039 } 5040 n->m_len = remain; 5041 m->m_len = len; 5042 n->m_next = m->m_next; 5043 m->m_next = NULL; 5044 return (n); 5045 } 5046 5047 /* 5048 * Routine to copy from device local memory into mbufs. 5049 */ 5050 struct mbuf * 5051 m_devget(char *buf, int totlen, int off0, struct ifnet *ifp, 5052 void (*copy)(const void *, void *, size_t)) 5053 { 5054 struct mbuf *m; 5055 struct mbuf *top = NULL, **mp = &top; 5056 int off = off0, len; 5057 char *cp; 5058 char *epkt; 5059 5060 cp = buf; 5061 epkt = cp + totlen; 5062 if (off) { 5063 /* 5064 * If 'off' is non-zero, packet is trailer-encapsulated, 5065 * so we have to skip the type and length fields. 5066 */ 5067 cp += off + 2 * sizeof (u_int16_t); 5068 totlen -= 2 * sizeof (u_int16_t); 5069 } 5070 _MGETHDR(m, M_DONTWAIT, MT_DATA); 5071 if (m == NULL) 5072 return (NULL); 5073 m->m_pkthdr.rcvif = ifp; 5074 m->m_pkthdr.len = totlen; 5075 m->m_len = MHLEN; 5076 5077 while (totlen > 0) { 5078 if (top != NULL) { 5079 _MGET(m, M_DONTWAIT, MT_DATA); 5080 if (m == NULL) { 5081 m_freem(top); 5082 return (NULL); 5083 } 5084 m->m_len = MLEN; 5085 } 5086 len = MIN(totlen, epkt - cp); 5087 if (len >= MINCLSIZE) { 5088 MCLGET(m, M_DONTWAIT); 5089 if (m->m_flags & M_EXT) { 5090 m->m_len = len = MIN(len, m_maxsize(MC_CL)); 5091 } else { 5092 /* give up when it's out of cluster mbufs */ 5093 if (top != NULL) 5094 m_freem(top); 5095 m_freem(m); 5096 return (NULL); 5097 } 5098 } else { 5099 /* 5100 * Place initial small packet/header at end of mbuf. 5101 */ 5102 if (len < m->m_len) { 5103 if (top == NULL && 5104 len + max_linkhdr <= m->m_len) 5105 m->m_data += max_linkhdr; 5106 m->m_len = len; 5107 } else { 5108 len = m->m_len; 5109 } 5110 } 5111 if (copy) 5112 copy(cp, MTOD(m, caddr_t), (unsigned)len); 5113 else 5114 bcopy(cp, MTOD(m, caddr_t), (unsigned)len); 5115 cp += len; 5116 *mp = m; 5117 mp = &m->m_next; 5118 totlen -= len; 5119 if (cp == epkt) 5120 cp = buf; 5121 } 5122 return (top); 5123 } 5124 5125 #ifndef MBUF_GROWTH_NORMAL_THRESH 5126 #define MBUF_GROWTH_NORMAL_THRESH 25 5127 #endif 5128 5129 /* 5130 * Cluster freelist allocation check. 
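 * Given a request for "num" buffers of size "bufsize", decide how many
 * more clusters of that size the pool should grow by; a return value
 * of 0 means the freelist is already adequate or the memory map has
 * been exhausted.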
5131 */ 5132static int 5133m_howmany(int num, size_t bufsize) 5134{ 5135 int i = 0, j = 0; 5136 u_int32_t m_mbclusters, m_clusters, m_bigclusters, m_16kclusters; 5137 u_int32_t m_mbfree, m_clfree, m_bigclfree, m_16kclfree; 5138 u_int32_t sumclusters, freeclusters; 5139 u_int32_t percent_pool, percent_kmem; 5140 u_int32_t mb_growth, mb_growth_thresh; 5141 5142 VERIFY(bufsize == m_maxsize(MC_BIGCL) || 5143 bufsize == m_maxsize(MC_16KCL)); 5144 5145 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); 5146 5147 /* Numbers in 2K cluster units */ 5148 m_mbclusters = m_total(MC_MBUF) >> NMBPCLSHIFT; 5149 m_clusters = m_total(MC_CL); 5150 m_bigclusters = m_total(MC_BIGCL) << NCLPBGSHIFT; 5151 m_16kclusters = m_total(MC_16KCL); 5152 sumclusters = m_mbclusters + m_clusters + m_bigclusters; 5153 5154 m_mbfree = m_infree(MC_MBUF) >> NMBPCLSHIFT; 5155 m_clfree = m_infree(MC_CL); 5156 m_bigclfree = m_infree(MC_BIGCL) << NCLPBGSHIFT; 5157 m_16kclfree = m_infree(MC_16KCL); 5158 freeclusters = m_mbfree + m_clfree + m_bigclfree; 5159 5160 /* Bail if we've maxed out the mbuf memory map */ 5161 if ((bufsize == m_maxsize(MC_BIGCL) && sumclusters >= nclusters) || 5162 (njcl > 0 && bufsize == m_maxsize(MC_16KCL) && 5163 (m_16kclusters << NCLPJCLSHIFT) >= njcl)) { 5164 return (0); 5165 } 5166 5167 if (bufsize == m_maxsize(MC_BIGCL)) { 5168 /* Under minimum */ 5169 if (m_bigclusters < m_minlimit(MC_BIGCL)) 5170 return (m_minlimit(MC_BIGCL) - m_bigclusters); 5171 5172 percent_pool = 5173 ((sumclusters - freeclusters) * 100) / sumclusters; 5174 percent_kmem = (sumclusters * 100) / nclusters; 5175 5176 /* 5177 * If a light/normal user, grow conservatively (75%) 5178 * If a heavy user, grow aggressively (50%) 5179 */ 5180 if (percent_kmem < MBUF_GROWTH_NORMAL_THRESH) 5181 mb_growth = MB_GROWTH_NORMAL; 5182 else 5183 mb_growth = MB_GROWTH_AGGRESSIVE; 5184 5185 if (percent_kmem < 5) { 5186 /* For initial allocations */ 5187 i = num; 5188 } else { 5189 /* Return if >= MBIGCL_LOWAT clusters available */ 5190 if (m_infree(MC_BIGCL) >= MBIGCL_LOWAT && 5191 m_total(MC_BIGCL) >= 5192 MBIGCL_LOWAT + m_minlimit(MC_BIGCL)) 5193 return (0); 5194 5195 /* Ensure at least num clusters are accessible */ 5196 if (num >= m_infree(MC_BIGCL)) 5197 i = num - m_infree(MC_BIGCL); 5198 if (num > m_total(MC_BIGCL) - m_minlimit(MC_BIGCL)) 5199 j = num - (m_total(MC_BIGCL) - 5200 m_minlimit(MC_BIGCL)); 5201 5202 i = MAX(i, j); 5203 5204 /* 5205 * Grow pool if percent_pool > 75 (normal growth) 5206 * or percent_pool > 50 (aggressive growth). 
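 * The threshold below follows from the growth shift: a shift of 2
 * gives 100 - (100 / 4) = 75, while a shift of 1 gives
 * 100 - (100 / 2) = 50.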
5207 */ 5208 mb_growth_thresh = 100 - (100 / (1 << mb_growth)); 5209 if (percent_pool > mb_growth_thresh) 5210 j = ((sumclusters + num) >> mb_growth) - 5211 freeclusters; 5212 i = MAX(i, j); 5213 } 5214 5215 /* Check to ensure we didn't go over limits */ 5216 if (i + m_bigclusters >= m_maxlimit(MC_BIGCL)) 5217 i = m_maxlimit(MC_BIGCL) - m_bigclusters; 5218 if ((i << 1) + sumclusters >= nclusters) 5219 i = (nclusters - sumclusters) >> 1; 5220 VERIFY((m_total(MC_BIGCL) + i) <= m_maxlimit(MC_BIGCL)); 5221 VERIFY(sumclusters + (i << 1) <= nclusters); 5222 5223 } else { /* 16K CL */ 5224 VERIFY(njcl > 0); 5225 /* Under minimum */ 5226 if (m_16kclusters < MIN16KCL) 5227 return (MIN16KCL - m_16kclusters); 5228 if (m_16kclfree >= M16KCL_LOWAT) 5229 return (0); 5230 5231 /* Ensure at least num clusters are available */ 5232 if (num >= m_16kclfree) 5233 i = num - m_16kclfree; 5234 5235 /* Always grow 16KCL pool aggressively */ 5236 if (((m_16kclusters + num) >> 1) > m_16kclfree) 5237 j = ((m_16kclusters + num) >> 1) - m_16kclfree; 5238 i = MAX(i, j); 5239 5240 /* Check to ensure we don't go over limit */ 5241 if (i + m_16kclusters >= m_maxlimit(MC_16KCL)) 5242 i = m_maxlimit(MC_16KCL) - m_16kclusters; 5243 VERIFY((m_total(MC_16KCL) + i) <= m_maxlimit(MC_16KCL)); 5244 } 5245 return (i); 5246} 5247/* 5248 * Return the number of bytes in the mbuf chain, m. 5249 */ 5250unsigned int 5251m_length(struct mbuf *m) 5252{ 5253 struct mbuf *m0; 5254 unsigned int pktlen; 5255 5256 if (m->m_flags & M_PKTHDR) 5257 return (m->m_pkthdr.len); 5258 5259 pktlen = 0; 5260 for (m0 = m; m0 != NULL; m0 = m0->m_next) 5261 pktlen += m0->m_len; 5262 return (pktlen); 5263} 5264 5265/* 5266 * Copy data from a buffer back into the indicated mbuf chain, 5267 * starting "off" bytes from the beginning, extending the mbuf 5268 * chain if necessary. 5269 */ 5270void 5271m_copyback(struct mbuf *m0, int off, int len, const void *cp) 5272{ 5273#if DEBUG 5274 struct mbuf *origm = m0; 5275 int error; 5276#endif /* DEBUG */ 5277 5278 if (m0 == NULL) 5279 return; 5280 5281#if DEBUG 5282 error = 5283#endif /* DEBUG */ 5284 m_copyback0(&m0, off, len, cp, 5285 M_COPYBACK0_COPYBACK | M_COPYBACK0_EXTEND, M_DONTWAIT); 5286 5287#if DEBUG 5288 if (error != 0 || (m0 != NULL && origm != m0)) 5289 panic("m_copyback"); 5290#endif /* DEBUG */ 5291} 5292 5293struct mbuf * 5294m_copyback_cow(struct mbuf *m0, int off, int len, const void *cp, int how) 5295{ 5296 int error; 5297 5298 /* don't support chain expansion */ 5299 VERIFY(off + len <= m_length(m0)); 5300 5301 error = m_copyback0(&m0, off, len, cp, 5302 M_COPYBACK0_COPYBACK | M_COPYBACK0_COW, how); 5303 if (error) { 5304 /* 5305 * no way to recover from partial success. 5306 * just free the chain. 5307 */ 5308 m_freem(m0); 5309 return (NULL); 5310 } 5311 return (m0); 5312} 5313 5314/* 5315 * m_makewritable: ensure the specified range writable. 
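 *
 * Illustrative sketch (not part of the original source): making the
 * first "hlen" bytes (a caller-chosen length) safe to modify in place:
 *
 *	if (m_makewritable(&m, 0, hlen, M_DONTWAIT) != 0)
 *		goto drop;
 *
 * Note that the head of the chain may be replaced, which is why a
 * struct mbuf ** is passed in.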
5316 */ 5317int 5318m_makewritable(struct mbuf **mp, int off, int len, int how) 5319{ 5320 int error; 5321#if DEBUG 5322 struct mbuf *n; 5323 int origlen, reslen; 5324 5325 origlen = m_length(*mp); 5326#endif /* DEBUG */ 5327 5328#if 0 /* M_COPYALL is large enough */ 5329 if (len == M_COPYALL) 5330 len = m_length(*mp) - off; /* XXX */ 5331#endif 5332 5333 error = m_copyback0(mp, off, len, NULL, 5334 M_COPYBACK0_PRESERVE | M_COPYBACK0_COW, how); 5335 5336#if DEBUG 5337 reslen = 0; 5338 for (n = *mp; n; n = n->m_next) 5339 reslen += n->m_len; 5340 if (origlen != reslen) 5341 panic("m_makewritable: length changed"); 5342 if (((*mp)->m_flags & M_PKTHDR) && reslen != (*mp)->m_pkthdr.len) 5343 panic("m_makewritable: inconsist"); 5344#endif /* DEBUG */ 5345 5346 return (error); 5347} 5348 5349static int 5350m_copyback0(struct mbuf **mp0, int off, int len, const void *vp, int flags, 5351 int how) 5352{ 5353 int mlen; 5354 struct mbuf *m, *n; 5355 struct mbuf **mp; 5356 int totlen = 0; 5357 const char *cp = vp; 5358 5359 VERIFY(mp0 != NULL); 5360 VERIFY(*mp0 != NULL); 5361 VERIFY((flags & M_COPYBACK0_PRESERVE) == 0 || cp == NULL); 5362 VERIFY((flags & M_COPYBACK0_COPYBACK) == 0 || cp != NULL); 5363 5364 /* 5365 * we don't bother to update "totlen" in the case of M_COPYBACK0_COW, 5366 * assuming that M_COPYBACK0_EXTEND and M_COPYBACK0_COW are exclusive. 5367 */ 5368 5369 VERIFY((~flags & (M_COPYBACK0_EXTEND|M_COPYBACK0_COW)) != 0); 5370 5371 mp = mp0; 5372 m = *mp; 5373 while (off > (mlen = m->m_len)) { 5374 off -= mlen; 5375 totlen += mlen; 5376 if (m->m_next == NULL) { 5377 int tspace; 5378extend: 5379 if (!(flags & M_COPYBACK0_EXTEND)) 5380 goto out; 5381 5382 /* 5383 * try to make some space at the end of "m". 5384 */ 5385 5386 mlen = m->m_len; 5387 if (off + len >= MINCLSIZE && 5388 !(m->m_flags & M_EXT) && m->m_len == 0) { 5389 MCLGET(m, how); 5390 } 5391 tspace = M_TRAILINGSPACE(m); 5392 if (tspace > 0) { 5393 tspace = MIN(tspace, off + len); 5394 VERIFY(tspace > 0); 5395 bzero(mtod(m, char *) + m->m_len, 5396 MIN(off, tspace)); 5397 m->m_len += tspace; 5398 off += mlen; 5399 totlen -= mlen; 5400 continue; 5401 } 5402 5403 /* 5404 * need to allocate an mbuf. 5405 */ 5406 5407 if (off + len >= MINCLSIZE) { 5408 n = m_getcl(how, m->m_type, 0); 5409 } else { 5410 n = _M_GET(how, m->m_type); 5411 } 5412 if (n == NULL) { 5413 goto out; 5414 } 5415 n->m_len = 0; 5416 n->m_len = MIN(M_TRAILINGSPACE(n), off + len); 5417 bzero(mtod(n, char *), MIN(n->m_len, off)); 5418 m->m_next = n; 5419 } 5420 mp = &m->m_next; 5421 m = m->m_next; 5422 } 5423 while (len > 0) { 5424 mlen = m->m_len - off; 5425 if (mlen != 0 && m_mclhasreference(m)) { 5426 char *datap; 5427 int eatlen; 5428 5429 /* 5430 * this mbuf is read-only. 5431 * allocate a new writable mbuf and try again. 5432 */ 5433 5434#if DIAGNOSTIC 5435 if (!(flags & M_COPYBACK0_COW)) 5436 panic("m_copyback0: read-only"); 5437#endif /* DIAGNOSTIC */ 5438 5439 /* 5440 * if we're going to write into the middle of 5441 * a mbuf, split it first. 5442 */ 5443 if (off > 0 && len < mlen) { 5444 n = m_split0(m, off, how, 0); 5445 if (n == NULL) 5446 goto enobufs; 5447 m->m_next = n; 5448 mp = &m->m_next; 5449 m = n; 5450 off = 0; 5451 continue; 5452 } 5453 5454 /* 5455 * XXX TODO coalesce into the trailingspace of 5456 * the previous mbuf when possible. 5457 */ 5458 5459 /* 5460 * allocate a new mbuf. copy packet header if needed. 
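 * Whether the old bytes survive is driven by M_COPYBACK0_PRESERVE:
 * when set, the read-only data is copied into the replacement mbuf
 * before the original is released.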
5461 */ 5462 n = _M_GET(how, m->m_type); 5463 if (n == NULL) 5464 goto enobufs; 5465 if (off == 0 && (m->m_flags & M_PKTHDR)) { 5466 M_COPY_PKTHDR(n, m); 5467 n->m_len = MHLEN; 5468 } else { 5469 if (len >= MINCLSIZE) 5470 MCLGET(n, M_DONTWAIT); 5471 n->m_len = 5472 (n->m_flags & M_EXT) ? MCLBYTES : MLEN; 5473 } 5474 if (n->m_len > len) 5475 n->m_len = len; 5476 5477 /* 5478 * free the region which has been overwritten. 5479 * copying data from old mbufs if requested. 5480 */ 5481 if (flags & M_COPYBACK0_PRESERVE) 5482 datap = mtod(n, char *); 5483 else 5484 datap = NULL; 5485 eatlen = n->m_len; 5486 VERIFY(off == 0 || eatlen >= mlen); 5487 if (off > 0) { 5488 VERIFY(len >= mlen); 5489 m->m_len = off; 5490 m->m_next = n; 5491 if (datap) { 5492 m_copydata(m, off, mlen, datap); 5493 datap += mlen; 5494 } 5495 eatlen -= mlen; 5496 mp = &m->m_next; 5497 m = m->m_next; 5498 } 5499 while (m != NULL && m_mclhasreference(m) && 5500 n->m_type == m->m_type && eatlen > 0) { 5501 mlen = MIN(eatlen, m->m_len); 5502 if (datap) { 5503 m_copydata(m, 0, mlen, datap); 5504 datap += mlen; 5505 } 5506 m->m_data += mlen; 5507 m->m_len -= mlen; 5508 eatlen -= mlen; 5509 if (m->m_len == 0) 5510 *mp = m = m_free(m); 5511 } 5512 if (eatlen > 0) 5513 n->m_len -= eatlen; 5514 n->m_next = m; 5515 *mp = m = n; 5516 continue; 5517 } 5518 mlen = MIN(mlen, len); 5519 if (flags & M_COPYBACK0_COPYBACK) { 5520 bcopy(cp, mtod(m, caddr_t) + off, (unsigned)mlen); 5521 cp += mlen; 5522 } 5523 len -= mlen; 5524 mlen += off; 5525 off = 0; 5526 totlen += mlen; 5527 if (len == 0) 5528 break; 5529 if (m->m_next == NULL) { 5530 goto extend; 5531 } 5532 mp = &m->m_next; 5533 m = m->m_next; 5534 } 5535 out: 5536 if (((m = *mp0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) { 5537 VERIFY(flags & M_COPYBACK0_EXTEND); 5538 m->m_pkthdr.len = totlen; 5539 } 5540 5541 return (0); 5542 5543 enobufs: 5544 return (ENOBUFS); 5545 } 5546 5547 uint64_t 5548 mcl_to_paddr(char *addr) 5549 { 5550 vm_offset_t base_phys; 5551 5552 if (!MBUF_IN_MAP(addr)) 5553 return (0); 5554 base_phys = mcl_paddr[atop_64(addr - (char *)mbutl)]; 5555 5556 if (base_phys == 0) 5557 return (0); 5558 return ((uint64_t)(ptoa_64(base_phys) | ((uint64_t)addr & PAGE_MASK))); 5559 } 5560 5561 /* 5562 * Dup the mbuf chain passed in. The whole thing. No cute additional cruft. 5563 * And really copy the thing. That way, we don't "precompute" checksums 5564 * for unsuspecting consumers. Assumption: m->m_nextpkt == 0. Trick: for 5565 * small packets, don't dup into a cluster. That way received packets 5566 * don't take up too much room in the sockbuf (cf. sbspace()). 5567 */ 5568 int MDFail; 5569 5570 struct mbuf * 5571 m_dup(struct mbuf *m, int how) 5572 { 5573 struct mbuf *n, **np; 5574 struct mbuf *top; 5575 int copyhdr = 0; 5576 5577 np = &top; 5578 top = NULL; 5579 if (m->m_flags & M_PKTHDR) 5580 copyhdr = 1; 5581 5582 /* 5583 * Quick check: if we have one mbuf and its data fits in an 5584 * mbuf with packet header, just copy and go. 5585 */ 5586 if (m->m_next == NULL) { 5587 /* Then just move the data into an mbuf and be done... 
*/ 5588 if (copyhdr) { 5589 if (m->m_pkthdr.len <= MHLEN && m->m_len <= MHLEN) { 5590 if ((n = _M_GETHDR(how, m->m_type)) == NULL) 5591 return (NULL); 5592 n->m_len = m->m_len; 5593 m_dup_pkthdr(n, m, how); 5594 bcopy(m->m_data, n->m_data, m->m_len); 5595 return (n); 5596 } 5597 } else if (m->m_len <= MLEN) { 5598 if ((n = _M_GET(how, m->m_type)) == NULL) 5599 return (NULL); 5600 bcopy(m->m_data, n->m_data, m->m_len); 5601 n->m_len = m->m_len; 5602 return (n); 5603 } 5604 } 5605 while (m != NULL) { 5606 #if BLUE_DEBUG 5607 kprintf("<%x: %x, %x, %x\n", m, m->m_flags, m->m_len, 5608 m->m_data); 5609 #endif 5610 if (copyhdr) 5611 n = _M_GETHDR(how, m->m_type); 5612 else 5613 n = _M_GET(how, m->m_type); 5614 if (n == NULL) 5615 goto nospace; 5616 if (m->m_flags & M_EXT) { 5617 if (m->m_len <= m_maxsize(MC_CL)) 5618 MCLGET(n, how); 5619 else if (m->m_len <= m_maxsize(MC_BIGCL)) 5620 n = m_mbigget(n, how); 5621 else if (m->m_len <= m_maxsize(MC_16KCL) && njcl > 0) 5622 n = m_m16kget(n, how); 5623 if (!(n->m_flags & M_EXT)) { 5624 (void) m_free(n); 5625 goto nospace; 5626 } 5627 } 5628 *np = n; 5629 if (copyhdr) { 5630 /* Don't use M_COPY_PKTHDR: preserve m_data */ 5631 m_dup_pkthdr(n, m, how); 5632 copyhdr = 0; 5633 if (!(n->m_flags & M_EXT)) 5634 n->m_data = n->m_pktdat; 5635 } 5636 n->m_len = m->m_len; 5637 /* 5638 * Get the dup on the same bdry as the original 5639 * Assume that the two mbufs have the same offset to data area 5640 * (up to word boundaries) 5641 */ 5642 bcopy(MTOD(m, caddr_t), MTOD(n, caddr_t), (unsigned)n->m_len); 5643 m = m->m_next; 5644 np = &n->m_next; 5645 #if BLUE_DEBUG 5646 kprintf(">%x: %x, %x, %x\n", n, n->m_flags, n->m_len, 5647 n->m_data); 5648 #endif 5649 } 5650 5651 if (top == NULL) 5652 MDFail++; 5653 return (top); 5654 5655 nospace: 5656 m_freem(top); 5657 MDFail++; 5658 return (NULL); 5659 } 5660 5661 #define MBUF_MULTIPAGES(m) \ 5662 (((m)->m_flags & M_EXT) && \ 5663 ((IS_P2ALIGNED((m)->m_data, NBPG) && (m)->m_len > NBPG) || \ 5664 (!IS_P2ALIGNED((m)->m_data, NBPG) && \ 5665 P2ROUNDUP((m)->m_data, NBPG) < ((uintptr_t)(m)->m_data + (m)->m_len)))) 5666 5667 static struct mbuf * 5668 m_expand(struct mbuf *m, struct mbuf **last) 5669 { 5670 struct mbuf *top = NULL; 5671 struct mbuf **nm = &top; 5672 uintptr_t data0, data; 5673 unsigned int len0, len; 5674 5675 VERIFY(MBUF_MULTIPAGES(m)); 5676 VERIFY(m->m_next == NULL); 5677 data0 = (uintptr_t)m->m_data; 5678 len0 = m->m_len; 5679 *last = top; 5680 5681 for (;;) { 5682 struct mbuf *n; 5683 5684 data = data0; 5685 if (IS_P2ALIGNED(data, NBPG) && len0 > NBPG) 5686 len = NBPG; 5687 else if (!IS_P2ALIGNED(data, NBPG) && 5688 P2ROUNDUP(data, NBPG) < (data + len0)) 5689 len = P2ROUNDUP(data, NBPG) - data; 5690 else 5691 len = len0; 5692 5693 VERIFY(len > 0); 5694 VERIFY(m->m_flags & M_EXT); 5695 m->m_data = (void *)data; 5696 m->m_len = len; 5697 5698 *nm = *last = m; 5699 nm = &m->m_next; 5700 m->m_next = NULL; 5701 5702 data0 += len; 5703 len0 -= len; 5704 if (len0 == 0) 5705 break; 5706 5707 n = _M_RETRY(M_DONTWAIT, MT_DATA); 5708 if (n == NULL) { 5709 m_freem(top); 5710 top = *last = NULL; 5711 break; 5712 } 5713 5714 n->m_ext = m->m_ext; 5715 m_incref(m); 5716 n->m_flags |= M_EXT; 5717 m = n; 5718 } 5719 return (top); 5720 } 5721 5722 struct mbuf * 5723 m_normalize(struct mbuf *m) 5724 { 5725 struct mbuf *top = NULL; 5726 struct mbuf **nm = &top; 5727 boolean_t expanded = FALSE; 5728 5729 while (m != NULL) { 5730 struct mbuf *n; 5731 5732 n = m->m_next; 5733 m->m_next = NULL; 5734 5735 /* Does the data cross one or more page boundaries? 
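 * If so, m_expand() splits it into multiple mbufs that share the
 * same cluster, each covering at most one page.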
struct mbuf *
m_normalize(struct mbuf *m)
{
	struct mbuf *top = NULL;
	struct mbuf **nm = &top;
	boolean_t expanded = FALSE;

	while (m != NULL) {
		struct mbuf *n;

		n = m->m_next;
		m->m_next = NULL;

		/* Does the data cross one or more page boundaries? */
		if (MBUF_MULTIPAGES(m)) {
			struct mbuf *last;
			if ((m = m_expand(m, &last)) == NULL) {
				m_freem(n);
				m_freem(top);
				top = NULL;
				break;
			}
			*nm = m;
			nm = &last->m_next;
			expanded = TRUE;
		} else {
			*nm = m;
			nm = &m->m_next;
		}
		m = n;
	}
	if (expanded)
		atomic_add_32(&mb_normalized, 1);
	return (top);
}

/*
 * Append the specified data to the indicated mbuf chain.
 * Extend the mbuf chain if the new data does not fit in
 * existing space.
 *
 * Return 1 if able to complete the job; otherwise 0.
 */
int
m_append(struct mbuf *m0, int len, caddr_t cp)
{
	struct mbuf *m, *n;
	int remainder, space;

	for (m = m0; m->m_next != NULL; m = m->m_next)
		;
	remainder = len;
	space = M_TRAILINGSPACE(m);
	if (space > 0) {
		/*
		 * Copy into available space.
		 */
		if (space > remainder)
			space = remainder;
		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
		m->m_len += space;
		cp += space, remainder -= space;
	}
	while (remainder > 0) {
		/*
		 * Allocate a new mbuf; could check space
		 * and allocate a cluster instead.
		 */
		n = m_get(M_WAITOK, m->m_type);
		if (n == NULL)
			break;
		n->m_len = min(MLEN, remainder);
		bcopy(cp, mtod(n, caddr_t), n->m_len);
		cp += n->m_len;
		remainder -= n->m_len;
		m->m_next = n;
		m = n;
	}
	if (m0->m_flags & M_PKTHDR)
		m0->m_pkthdr.len += len - remainder;
	return (remainder == 0);
}

struct mbuf *
m_last(struct mbuf *m)
{
	while (m->m_next != NULL)
		m = m->m_next;
	return (m);
}

unsigned int
m_fixhdr(struct mbuf *m0)
{
	u_int len;

	VERIFY(m0->m_flags & M_PKTHDR);

	len = m_length2(m0, NULL);
	m0->m_pkthdr.len = len;
	return (len);
}

unsigned int
m_length2(struct mbuf *m0, struct mbuf **last)
{
	struct mbuf *m;
	u_int len;

	len = 0;
	for (m = m0; m != NULL; m = m->m_next) {
		len += m->m_len;
		if (m->m_next == NULL)
			break;
	}
	if (last != NULL)
		*last = m;
	return (len);
}
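/*
 * Illustrative sketch (not part of the original source): a typical
 * m_append() call pattern.  The helper name example_append() is
 * hypothetical; the block is compiled out.  On success the packet
 * header length has already been updated by m_append() itself.
 */
#if 0 /* example only */
static struct mbuf *
example_append(caddr_t buf, int len)
{
	struct mbuf *m = m_gethdr(M_WAITOK, MT_DATA);

	if (m == NULL)
		return (NULL);
	if (m_append(m, len, buf) == 0) {	/* 0 means partial append */
		m_freem(m);
		return (NULL);
	}
	VERIFY(m->m_pkthdr.len == len);		/* kept in sync by m_append */
	return (m);
}
#endif /* example only */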
/*
 * Defragment an mbuf chain, returning the shortest possible chain of mbufs
 * and clusters.  If allocation fails and this cannot be completed, NULL will
 * be returned, but the passed in chain will be unchanged.  Upon success,
 * the original chain will be freed, and the new chain will be returned.
 *
 * If a non-packet header is passed in, the original mbuf (chain?) will
 * be returned unharmed.
 *
 * If offset is specified, the first mbuf in the chain will have a leading
 * space of the amount stated by the "off" parameter.
 *
 * This routine requires that the m_pkthdr.header field of the original
 * mbuf chain is cleared by the caller.
 */
struct mbuf *
m_defrag_offset(struct mbuf *m0, u_int32_t off, int how)
{
	struct mbuf *m_new = NULL, *m_final = NULL;
	int progress = 0, length, pktlen;

	if (!(m0->m_flags & M_PKTHDR))
		return (m0);

	VERIFY(off < MHLEN);
	m_fixhdr(m0); /* Needed sanity check */

	pktlen = m0->m_pkthdr.len + off;
	if (pktlen > MHLEN)
		m_final = m_getcl(how, MT_DATA, M_PKTHDR);
	else
		m_final = m_gethdr(how, MT_DATA);

	if (m_final == NULL)
		goto nospace;

	if (off > 0) {
		pktlen -= off;
		m_final->m_data += off;
	}

	/*
	 * Caller must have handled the contents pointed to by this
	 * pointer before coming here, as otherwise it will point to
	 * the original mbuf which will get freed upon success.
	 */
	VERIFY(m0->m_pkthdr.pkt_hdr == NULL);

	if (m_dup_pkthdr(m_final, m0, how) == 0)
		goto nospace;

	m_new = m_final;

	while (progress < pktlen) {
		length = pktlen - progress;
		if (length > MCLBYTES)
			length = MCLBYTES;
		length -= ((m_new == m_final) ? off : 0);

		if (m_new == NULL) {
			if (length > MLEN)
				m_new = m_getcl(how, MT_DATA, 0);
			else
				m_new = m_get(how, MT_DATA);
			if (m_new == NULL)
				goto nospace;
		}

		m_copydata(m0, progress, length, mtod(m_new, caddr_t));
		progress += length;
		m_new->m_len = length;
		if (m_new != m_final)
			m_cat(m_final, m_new);
		m_new = NULL;
	}
	m_freem(m0);
	m0 = m_final;
	return (m0);
nospace:
	if (m_final)
		m_freem(m_final);
	return (NULL);
}

struct mbuf *
m_defrag(struct mbuf *m0, int how)
{
	return (m_defrag_offset(m0, 0, how));
}

void
m_mchtype(struct mbuf *m, int t)
{
	mtype_stat_inc(t);
	mtype_stat_dec(m->m_type);
	(m)->m_type = t;
}

void *
m_mtod(struct mbuf *m)
{
	return (MTOD(m, void *));
}

struct mbuf *
m_dtom(void *x)
{
	return ((struct mbuf *)((uintptr_t)(x) & ~(MSIZE-1)));
}

void
m_mcheck(struct mbuf *m)
{
	_MCHECK(m);
}

/*
 * Return a pointer to mbuf/offset of location in mbuf chain.
 */
struct mbuf *
m_getptr(struct mbuf *m, int loc, int *off)
{

	while (loc >= 0) {
		/* Normal end of search. */
		if (m->m_len > loc) {
			*off = loc;
			return (m);
		} else {
			loc -= m->m_len;
			if (m->m_next == NULL) {
				if (loc == 0) {
					/* Point at the end of valid data. */
					*off = m->m_len;
					return (m);
				}
				return (NULL);
			}
			m = m->m_next;
		}
	}
	return (NULL);
}
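/*
 * Illustrative sketch (not part of the original source): using m_getptr()
 * above to locate the mbuf and intra-mbuf offset holding absolute offset
 * 'loc' of a chain, then reading one byte there.  The helper name
 * example_byte_at() is hypothetical; the block is compiled out.
 */
#if 0 /* example only */
static int
example_byte_at(struct mbuf *chain, int loc)
{
	int off;
	struct mbuf *m = m_getptr(chain, loc, &off);

	/* m_getptr() may point one past the last byte of valid data */
	if (m == NULL || off >= m->m_len)
		return (-1);
	return (*(mtod(m, u_char *) + off));
}
#endif /* example only */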
/*
 * Inform the corresponding mcache(s) that there's a waiter below.
 */
static void
mbuf_waiter_inc(mbuf_class_t class, boolean_t comp)
{
	mcache_waiter_inc(m_cache(class));
	if (comp) {
		if (class == MC_CL) {
			mcache_waiter_inc(m_cache(MC_MBUF_CL));
		} else if (class == MC_BIGCL) {
			mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
		} else if (class == MC_16KCL) {
			mcache_waiter_inc(m_cache(MC_MBUF_16KCL));
		} else {
			mcache_waiter_inc(m_cache(MC_MBUF_CL));
			mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
		}
	}
}

/*
 * Inform the corresponding mcache(s) that there's no more waiter below.
 */
static void
mbuf_waiter_dec(mbuf_class_t class, boolean_t comp)
{
	mcache_waiter_dec(m_cache(class));
	if (comp) {
		if (class == MC_CL) {
			mcache_waiter_dec(m_cache(MC_MBUF_CL));
		} else if (class == MC_BIGCL) {
			mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
		} else if (class == MC_16KCL) {
			mcache_waiter_dec(m_cache(MC_MBUF_16KCL));
		} else {
			mcache_waiter_dec(m_cache(MC_MBUF_CL));
			mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
		}
	}
}

/*
 * Called during slab (blocking and non-blocking) allocation.  If there
 * is at least one waiter, and the time since the first waiter was blocked
 * is greater than the watchdog timeout, panic the system.
 */
static void
mbuf_watchdog(void)
{
	struct timeval now;
	unsigned int since;

	if (mb_waiters == 0 || !mb_watchdog)
		return;

	microuptime(&now);
	since = now.tv_sec - mb_wdtstart.tv_sec;
	if (since >= MB_WDT_MAXTIME) {
		panic_plain("%s: %d waiters stuck for %u secs\n%s", __func__,
		    mb_waiters, since, mbuf_dump());
		/* NOTREACHED */
	}
}
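/*
 * Illustrative sketch (not part of the original source): the waiter
 * counters above must be balanced around any blocking wait, which is
 * exactly what mbuf_sleep() below does.  Minimal shape of that contract:
 */
#if 0 /* example only */
	mbuf_waiter_inc(class, (wait & MCR_COMP));	/* advertise waiter */
	(void) msleep(mb_waitchan, mbuf_mlock, (PZERO-1),
	    m_cname(class), NULL);			/* block for supply */
	mbuf_waiter_dec(class, (wait & MCR_COMP));	/* retract waiter */
#endif /* example only */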
/*
 * Called during blocking allocation.  Returns TRUE if one or more objects
 * are available at the per-CPU caches layer and that allocation should be
 * retried at that level.
 */
static boolean_t
mbuf_sleep(mbuf_class_t class, unsigned int num, int wait)
{
	boolean_t mcache_retry = FALSE;

	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	/* Check if there's anything at the cache layer */
	if (mbuf_cached_above(class, wait)) {
		mcache_retry = TRUE;
		goto done;
	}

	/* Nothing?  Then try hard to get it from somewhere */
	m_reclaim(class, num, (wait & MCR_COMP));

	/* We tried hard and got something? */
	if (m_infree(class) > 0) {
		mbstat.m_wait++;
		goto done;
	} else if (mbuf_cached_above(class, wait)) {
		mbstat.m_wait++;
		mcache_retry = TRUE;
		goto done;
	} else if (wait & MCR_TRYHARD) {
		mcache_retry = TRUE;
		goto done;
	}

	/*
	 * There's really nothing for us right now; inform the
	 * cache(s) that there is a waiter below and go to sleep.
	 */
	mbuf_waiter_inc(class, (wait & MCR_COMP));

	VERIFY(!(wait & MCR_NOSLEEP));

	/*
	 * If this is the first waiter, arm the watchdog timer.  Otherwise
	 * check if we need to panic the system due to watchdog timeout.
	 */
	if (mb_waiters == 0)
		microuptime(&mb_wdtstart);
	else
		mbuf_watchdog();

	mb_waiters++;
	(void) msleep(mb_waitchan, mbuf_mlock, (PZERO-1), m_cname(class), NULL);

	/* We are now up; stop getting notified until next round */
	mbuf_waiter_dec(class, (wait & MCR_COMP));

	/* We waited and got something */
	if (m_infree(class) > 0) {
		mbstat.m_wait++;
		goto done;
	} else if (mbuf_cached_above(class, wait)) {
		mbstat.m_wait++;
		mcache_retry = TRUE;
	}
done:
	return (mcache_retry);
}

static void
mbuf_worker_thread(void)
{
	int mbuf_expand;

	while (1) {
		lck_mtx_lock(mbuf_mlock);

		mbuf_expand = 0;
		if (mbuf_expand_mcl) {
			int n;

			/* Adjust to current number of clusters in use */
			n = mbuf_expand_mcl -
			    (m_total(MC_CL) - m_infree(MC_CL));
			if ((n + m_total(MC_CL)) > m_maxlimit(MC_CL))
				n = m_maxlimit(MC_CL) - m_total(MC_CL);
			mbuf_expand_mcl = 0;

			if (n > 0 && freelist_populate(MC_CL, n, M_WAIT) > 0)
				mbuf_expand++;
		}
		if (mbuf_expand_big) {
			int n;

			/* Adjust to current number of 4 KB clusters in use */
			n = mbuf_expand_big -
			    (m_total(MC_BIGCL) - m_infree(MC_BIGCL));
			if ((n + m_total(MC_BIGCL)) > m_maxlimit(MC_BIGCL))
				n = m_maxlimit(MC_BIGCL) - m_total(MC_BIGCL);
			mbuf_expand_big = 0;

			if (n > 0 && freelist_populate(MC_BIGCL, n, M_WAIT) > 0)
				mbuf_expand++;
		}
		if (mbuf_expand_16k) {
			int n;

			/* Adjust to current number of 16 KB clusters in use */
			n = mbuf_expand_16k -
			    (m_total(MC_16KCL) - m_infree(MC_16KCL));
			if ((n + m_total(MC_16KCL)) > m_maxlimit(MC_16KCL))
				n = m_maxlimit(MC_16KCL) - m_total(MC_16KCL);
			mbuf_expand_16k = 0;

			if (n > 0)
				(void) freelist_populate(MC_16KCL, n, M_WAIT);
		}

		/*
		 * Because we can run out of memory before filling the mbuf
		 * map, we should not allocate more clusters than there are
		 * mbufs -- otherwise we could have a large number of useless
		 * clusters allocated.
		 */
		if (mbuf_expand) {
			while (m_total(MC_MBUF) <
			    (m_total(MC_BIGCL) + m_total(MC_CL))) {
				if (freelist_populate(MC_MBUF, 1, M_WAIT) == 0)
					break;
			}
		}

		lck_mtx_unlock(mbuf_mlock);

		assert_wait(&mbuf_worker_run, THREAD_UNINT);
		(void) thread_block((thread_continue_t)mbuf_worker_thread);
	}
}

static void
mbuf_worker_thread_init(void)
{
	mbuf_worker_ready++;
	mbuf_worker_thread();
}
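/*
 * Illustrative sketch (not part of the original source): how allocation
 * paths elsewhere in this file hand work to the worker thread above --
 * record the deficit, then wake the thread.  The variable num_needed is
 * hypothetical; the exact call sites differ, this is just the shape of
 * the handoff.
 */
#if 0 /* example only */
	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
	mbuf_expand_mcl += num_needed;		/* deficit for 2KB clusters */
	if (mbuf_worker_ready)
		wakeup((caddr_t)&mbuf_worker_run);	/* kick the worker */
#endif /* example only */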
static mcl_slab_t *
slab_get(void *buf)
{
	mcl_slabg_t *slg;
	unsigned int ix, k;

	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	VERIFY(MBUF_IN_MAP(buf));
	ix = ((char *)buf - (char *)mbutl) >> MBSHIFT;
	VERIFY(ix < maxslabgrp);

	if ((slg = slabstbl[ix]) == NULL) {
		/*
		 * In the current implementation, we never shrink the memory
		 * pool (hence the cluster map); if we attempt to reallocate
		 * a cluster group when it's already allocated, panic since
		 * this is a sign of memory corruption (slabstbl[ix] got
		 * nullified).  This also means that there shouldn't be any
		 * hole in the kernel sub-map for the mbuf pool.
		 */
		++slabgrp;
		VERIFY(ix < slabgrp);
		/*
		 * Slabs expansion can only be done single threaded; when
		 * we get here, it must be as a result of m_clalloc() which
		 * is serialized and therefore mb_clalloc_busy must be set.
		 */
		VERIFY(mb_clalloc_busy);
		lck_mtx_unlock(mbuf_mlock);

		/* This is a new buffer; create the slabs group for it */
		MALLOC(slg, mcl_slabg_t *, sizeof (*slg), M_TEMP,
		    M_WAITOK | M_ZERO);
		VERIFY(slg != NULL);

		lck_mtx_lock(mbuf_mlock);
		/*
		 * No other thread could have gone into m_clalloc() after
		 * we dropped the lock above, so verify that it's true.
		 */
		VERIFY(mb_clalloc_busy);

		slabstbl[ix] = slg;

		/* Chain each slab in the group to its forward neighbor */
		for (k = 1; k < NSLABSPMB; k++)
			slg->slg_slab[k - 1].sl_next = &slg->slg_slab[k];
		VERIFY(slg->slg_slab[NSLABSPMB - 1].sl_next == NULL);

		/* And chain the last slab in the previous group to this */
		if (ix > 0) {
			VERIFY(slabstbl[ix - 1]->
			    slg_slab[NSLABSPMB - 1].sl_next == NULL);
			slabstbl[ix - 1]->slg_slab[NSLABSPMB - 1].sl_next =
			    &slg->slg_slab[0];
		}
	}

	ix = MTOBG(buf) % NSLABSPMB;
	VERIFY(ix < NSLABSPMB);

	return (&slg->slg_slab[ix]);
}
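/*
 * Illustrative sketch (not part of the original source): the two-level
 * lookup performed by slab_get() above -- a group index derived from the
 * buffer's offset into the mbuf map, then a slab index within the group.
 * The helper name example_slab_lookup() is hypothetical; unlike the real
 * routine, it performs no NULL/hole checks and is compiled out.
 */
#if 0 /* example only */
static mcl_slab_t *
example_slab_lookup(void *buf)
{
	unsigned int grp = ((char *)buf - (char *)mbutl) >> MBSHIFT;
	unsigned int idx = MTOBG(buf) % NSLABSPMB;

	return (&slabstbl[grp]->slg_slab[idx]);
}
#endif /* example only */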
static void
slab_init(mcl_slab_t *sp, mbuf_class_t class, u_int32_t flags,
    void *base, void *head, unsigned int len, int refcnt, int chunks)
{
	sp->sl_class = class;
	sp->sl_flags = flags;
	sp->sl_base = base;
	sp->sl_head = head;
	sp->sl_len = len;
	sp->sl_refcnt = refcnt;
	sp->sl_chunks = chunks;
	slab_detach(sp);
}

static void
slab_insert(mcl_slab_t *sp, mbuf_class_t class)
{
	VERIFY(slab_is_detached(sp));
	m_slab_cnt(class)++;
	TAILQ_INSERT_TAIL(&m_slablist(class), sp, sl_link);
	sp->sl_flags &= ~SLF_DETACHED;
	if (class == MC_16KCL) {
		int k;
		for (k = 1; k < NSLABSP16KB; k++) {
			sp = sp->sl_next;
			/* Next slab must already be present */
			VERIFY(sp != NULL);
			VERIFY(slab_is_detached(sp));
			sp->sl_flags &= ~SLF_DETACHED;
		}
	}
}

static void
slab_remove(mcl_slab_t *sp, mbuf_class_t class)
{
	VERIFY(!slab_is_detached(sp));
	VERIFY(m_slab_cnt(class) > 0);
	m_slab_cnt(class)--;
	TAILQ_REMOVE(&m_slablist(class), sp, sl_link);
	slab_detach(sp);
	if (class == MC_16KCL) {
		int k;
		for (k = 1; k < NSLABSP16KB; k++) {
			sp = sp->sl_next;
			/* Next slab must already be present */
			VERIFY(sp != NULL);
			VERIFY(!slab_is_detached(sp));
			slab_detach(sp);
		}
	}
}

static boolean_t
slab_inrange(mcl_slab_t *sp, void *buf)
{
	return ((uintptr_t)buf >= (uintptr_t)sp->sl_base &&
	    (uintptr_t)buf < ((uintptr_t)sp->sl_base + sp->sl_len));
}

#undef panic

static void
slab_nextptr_panic(mcl_slab_t *sp, void *addr)
{
	int i;
	unsigned int chunk_len = sp->sl_len / sp->sl_chunks;
	uintptr_t buf = (uintptr_t)sp->sl_base;

	for (i = 0; i < sp->sl_chunks; i++, buf += chunk_len) {
		void *next = ((mcache_obj_t *)buf)->obj_next;
		if (next != addr)
			continue;
		if (!mclverify) {
			if (next != NULL && !MBUF_IN_MAP(next)) {
				mcache_t *cp = m_cache(sp->sl_class);
				panic("%s: %s buffer %p in slab %p modified "
				    "after free at offset 0: %p out of range "
				    "[%p-%p)\n", __func__, cp->mc_name,
				    (void *)buf, sp, next, mbutl, embutl);
				/* NOTREACHED */
			}
		} else {
			mcache_audit_t *mca = mcl_audit_buf2mca(sp->sl_class,
			    (mcache_obj_t *)buf);
			mcl_audit_verify_nextptr(next, mca);
		}
	}
}

static void
slab_detach(mcl_slab_t *sp)
{
	sp->sl_link.tqe_next = (mcl_slab_t *)-1;
	sp->sl_link.tqe_prev = (mcl_slab_t **)-1;
	sp->sl_flags |= SLF_DETACHED;
}

static boolean_t
slab_is_detached(mcl_slab_t *sp)
{
	return ((intptr_t)sp->sl_link.tqe_next == -1 &&
	    (intptr_t)sp->sl_link.tqe_prev == -1 &&
	    (sp->sl_flags & SLF_DETACHED));
}

static void
mcl_audit_init(void *buf, mcache_audit_t **mca_list,
    mcache_obj_t **con_list, size_t con_size, unsigned int num)
{
	mcache_audit_t *mca, *mca_tail;
	mcache_obj_t *con = NULL;
	boolean_t save_contents = (con_list != NULL);
	unsigned int i, ix;

	ASSERT(num <= NMBPBG);
	ASSERT(con_list == NULL || con_size != 0);

	ix = MTOBG(buf);
	VERIFY(ix < maxclaudit);

	/* Make sure we haven't been here before */
	for (i = 0; i < NMBPBG; i++)
		VERIFY(mclaudit[ix].cl_audit[i] == NULL);

	mca = mca_tail = *mca_list;
	if (save_contents)
		con = *con_list;

	for (i = 0; i < num; i++) {
		mcache_audit_t *next;

		next = mca->mca_next;
		bzero(mca, sizeof (*mca));
		mca->mca_next = next;
		mclaudit[ix].cl_audit[i] = mca;

		/* Attach the contents buffer if requested */
		if (save_contents) {
			mcl_saved_contents_t *msc =
			    (mcl_saved_contents_t *)(void *)con;

			VERIFY(msc != NULL);
			VERIFY(IS_P2ALIGNED(msc, sizeof (u_int64_t)));
			VERIFY(con_size == sizeof (*msc));
			mca->mca_contents_size = con_size;
			mca->mca_contents = msc;
			con = con->obj_next;
			bzero(mca->mca_contents, mca->mca_contents_size);
		}

		mca_tail = mca;
		mca = mca->mca_next;
	}

	if (save_contents)
		*con_list = con;

	*mca_list = mca_tail->mca_next;
	mca_tail->mca_next = NULL;
}
/*
 * Given an address of a buffer (mbuf/2KB/4KB/16KB), return
 * the corresponding audit structure for that buffer.
 */
static mcache_audit_t *
mcl_audit_buf2mca(mbuf_class_t class, mcache_obj_t *o)
{
	mcache_audit_t *mca = NULL;
	int ix = MTOBG(o);

	VERIFY(ix < maxclaudit);
	VERIFY(IS_P2ALIGNED(o, MIN(m_maxsize(class), NBPG)));

	switch (class) {
	case MC_MBUF:
		/*
		 * For the mbuf case, find the index of the page
		 * used by the mbuf and use that index to locate the
		 * base address of the page.  Then find out the
		 * mbuf index relative to the page base and use
		 * it to locate the audit structure.
		 */
		VERIFY(MCLIDX(BGTOM(ix), o) < (int)NMBPBG);
		mca = mclaudit[ix].cl_audit[MCLIDX(BGTOM(ix), o)];
		break;

	case MC_CL:
		/*
		 * Same thing as above, but for 2KB clusters in a page.
		 */
		VERIFY(CLBGIDX(BGTOM(ix), o) < (int)NCLPBG);
		mca = mclaudit[ix].cl_audit[CLBGIDX(BGTOM(ix), o)];
		break;

	case MC_BIGCL:
	case MC_16KCL:
		/*
		 * Same as above, but only return the first element.
		 */
		mca = mclaudit[ix].cl_audit[0];
		break;

	default:
		VERIFY(0);
		/* NOTREACHED */
	}

	return (mca);
}

static void
mcl_audit_mbuf(mcache_audit_t *mca, void *addr, boolean_t composite,
    boolean_t alloc)
{
	struct mbuf *m = addr;
	mcache_obj_t *next = ((mcache_obj_t *)m)->obj_next;

	VERIFY(mca->mca_contents != NULL &&
	    mca->mca_contents_size == AUDIT_CONTENTS_SIZE);

	if (mclverify)
		mcl_audit_verify_nextptr(next, mca);

	if (!alloc) {
		/* Save constructed mbuf fields */
		mcl_audit_save_mbuf(m, mca);
		if (mclverify) {
			mcache_set_pattern(MCACHE_FREE_PATTERN, m,
			    m_maxsize(MC_MBUF));
		}
		((mcache_obj_t *)m)->obj_next = next;
		return;
	}

	/* Check if the buffer has been corrupted while in freelist */
	if (mclverify) {
		mcache_audit_free_verify_set(mca, addr, 0, m_maxsize(MC_MBUF));
	}
	/* Restore constructed mbuf fields */
	mcl_audit_restore_mbuf(m, mca, composite);
}
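/*
 * Illustrative sketch (not part of the original source): the free-pattern
 * protocol used by mcl_audit_mbuf() above.  On free, the object is filled
 * with MCACHE_FREE_PATTERN; on the next alloc the pattern is verified and
 * cleared, so any write to a freed object is caught.
 */
#if 0 /* example only */
	/* free path: poison the object */
	mcache_set_pattern(MCACHE_FREE_PATTERN, m, m_maxsize(MC_MBUF));

	/* alloc path: assert the poison is intact, then un-poison */
	mcache_audit_free_verify_set(mca, m, 0, m_maxsize(MC_MBUF));
#endif /* example only */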
static void
mcl_audit_restore_mbuf(struct mbuf *m, mcache_audit_t *mca, boolean_t composite)
{
	struct mbuf *ms = MCA_SAVED_MBUF_PTR(mca);

	if (composite) {
		struct mbuf *next = m->m_next;
		VERIFY(ms->m_flags == M_EXT && MEXT_RFA(ms) != NULL &&
		    MBUF_IS_COMPOSITE(ms));
		VERIFY(mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
		/*
		 * We could have hand-picked the mbuf fields and restored
		 * them individually, but that would be a maintenance
		 * headache.  Instead, restore everything that was saved;
		 * the mbuf layer will recheck and reinitialize anyway.
		 */
		bcopy(ms, m, MCA_SAVED_MBUF_SIZE);
		m->m_next = next;
	} else {
		/*
		 * For a regular mbuf (no cluster attached) there's nothing
		 * to restore other than the type field, which is expected
		 * to be MT_FREE.
		 */
		m->m_type = ms->m_type;
	}
	_MCHECK(m);
}

static void
mcl_audit_save_mbuf(struct mbuf *m, mcache_audit_t *mca)
{
	VERIFY(mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
	_MCHECK(m);
	bcopy(m, MCA_SAVED_MBUF_PTR(mca), MCA_SAVED_MBUF_SIZE);
}

static void
mcl_audit_cluster(mcache_audit_t *mca, void *addr, size_t size, boolean_t alloc,
    boolean_t save_next)
{
	mcache_obj_t *next = ((mcache_obj_t *)addr)->obj_next;

	if (!alloc) {
		if (mclverify) {
			mcache_set_pattern(MCACHE_FREE_PATTERN, addr, size);
		}
		if (save_next) {
			mcl_audit_verify_nextptr(next, mca);
			((mcache_obj_t *)addr)->obj_next = next;
		}
	} else if (mclverify) {
		/* Check if the buffer has been corrupted while in freelist */
		mcl_audit_verify_nextptr(next, mca);
		mcache_audit_free_verify_set(mca, addr, 0, size);
	}
}

static void
mcl_audit_scratch(mcache_audit_t *mca)
{
	void *stack[MCACHE_STACK_DEPTH + 1];
	mcl_scratch_audit_t *msa;
	struct timeval now;

	VERIFY(mca->mca_contents != NULL);
	msa = MCA_SAVED_SCRATCH_PTR(mca);

	msa->msa_pthread = msa->msa_thread;
	msa->msa_thread = current_thread();
	bcopy(msa->msa_stack, msa->msa_pstack, sizeof (msa->msa_pstack));
	msa->msa_pdepth = msa->msa_depth;
	bzero(stack, sizeof (stack));
	msa->msa_depth = OSBacktrace(stack, MCACHE_STACK_DEPTH + 1) - 1;
	bcopy(&stack[1], msa->msa_stack, sizeof (mca->mca_pstack));

	msa->msa_ptstamp = msa->msa_tstamp;
	microuptime(&now);
	/* tstamp is in ms relative to base_ts */
	msa->msa_tstamp = ((now.tv_usec - mb_start.tv_usec) / 1000);
	if ((now.tv_sec - mb_start.tv_sec) > 0)
		msa->msa_tstamp += ((now.tv_sec - mb_start.tv_sec) * 1000);
}

static void
mcl_audit_mcheck_panic(struct mbuf *m)
{
	mcache_audit_t *mca;

	MRANGE(m);
	mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);

	panic("mcl_audit: freed mbuf %p with type 0x%x (instead of 0x%x)\n%s\n",
	    m, (u_int16_t)m->m_type, MT_FREE, mcache_dump_mca(mca));
	/* NOTREACHED */
}

static void
mcl_audit_verify_nextptr(void *next, mcache_audit_t *mca)
{
	if (next != NULL && !MBUF_IN_MAP(next) &&
	    (next != (void *)MCACHE_FREE_PATTERN || !mclverify)) {
		panic("mcl_audit: buffer %p modified after free at offset 0: "
		    "%p out of range [%p-%p)\n%s\n",
		    mca->mca_addr, next, mbutl, embutl, mcache_dump_mca(mca));
		/* NOTREACHED */
	}
}
/* This function turns on mbuf leak detection */
static void
mleak_activate(void)
{
	mleak_table.mleak_sample_factor = MLEAK_SAMPLE_FACTOR;
	PE_parse_boot_argn("mleak_sample_factor",
	    &mleak_table.mleak_sample_factor,
	    sizeof (mleak_table.mleak_sample_factor));

	if (mleak_table.mleak_sample_factor == 0)
		mclfindleak = 0;

	if (mclfindleak == 0)
		return;

	vm_size_t alloc_size =
	    mleak_alloc_buckets * sizeof (struct mallocation);
	vm_size_t trace_size = mleak_trace_buckets * sizeof (struct mtrace);

	MALLOC(mleak_allocations, struct mallocation *, alloc_size,
	    M_TEMP, M_WAITOK | M_ZERO);
	VERIFY(mleak_allocations != NULL);

	MALLOC(mleak_traces, struct mtrace *, trace_size,
	    M_TEMP, M_WAITOK | M_ZERO);
	VERIFY(mleak_traces != NULL);

	MALLOC(mleak_stat, mleak_stat_t *, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES),
	    M_TEMP, M_WAITOK | M_ZERO);
	VERIFY(mleak_stat != NULL);
	mleak_stat->ml_cnt = MLEAK_NUM_TRACES;
#ifdef __LP64__
	mleak_stat->ml_isaddr64 = 1;
#endif /* __LP64__ */
}

static void
mleak_logger(u_int32_t num, mcache_obj_t *addr, boolean_t alloc)
{
	int temp;

	if (mclfindleak == 0)
		return;

	if (!alloc)
		return (mleak_free(addr));

	temp = atomic_add_32_ov(&mleak_table.mleak_capture, 1);

	if ((temp % mleak_table.mleak_sample_factor) == 0 && addr != NULL) {
		uintptr_t bt[MLEAK_STACK_DEPTH];
		int logged = fastbacktrace(bt, MLEAK_STACK_DEPTH);
		mleak_log(bt, addr, logged, num);
	}
}
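/*
 * Illustrative sketch (not part of the original source): how a sampled
 * allocation is slotted into the two leak tables by mleak_log() below.
 * Both tables are fixed-size and hash-indexed; collisions are simply
 * counted and the new entry wins (or is dropped), since sampling only
 * needs a statistical picture, not an exact one.
 */
#if 0 /* example only */
	allocation = &mleak_allocations[hashaddr((uintptr_t)addr,
	    mleak_alloc_buckets)];			/* keyed by address */
	trace = &mleak_traces[hashbacktrace(bt, depth,
	    mleak_trace_buckets)];			/* keyed by stack */
#endif /* example only */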
/*
 * This function records the allocation in the mleak_allocations table
 * and the backtrace in the mleak_traces table.  If the allocation slot
 * is already in use, the old allocation is replaced with the new one;
 * if the trace slot is in use, we return (or increment the refcount if
 * it is the same trace).
 */
static boolean_t
mleak_log(uintptr_t *bt, mcache_obj_t *addr, uint32_t depth, int num)
{
	struct mallocation *allocation;
	struct mtrace *trace;
	uint32_t trace_index;

	/* Quit if someone else is modifying the tables */
	if (!lck_mtx_try_lock_spin(mleak_lock)) {
		mleak_table.total_conflicts++;
		return (FALSE);
	}

	allocation = &mleak_allocations[hashaddr((uintptr_t)addr,
	    mleak_alloc_buckets)];
	trace_index = hashbacktrace(bt, depth, mleak_trace_buckets);
	trace = &mleak_traces[trace_index];

	VERIFY(allocation <= &mleak_allocations[mleak_alloc_buckets - 1]);
	VERIFY(trace <= &mleak_traces[mleak_trace_buckets - 1]);

	allocation->hitcount++;
	trace->hitcount++;

	/*
	 * If the allocation bucket we want is occupied
	 * and the occupier has the same trace, just bail.
	 */
	if (allocation->element != NULL &&
	    trace_index == allocation->trace_index) {
		mleak_table.alloc_collisions++;
		lck_mtx_unlock(mleak_lock);
		return (TRUE);
	}

	/*
	 * Store the backtrace in the traces array;
	 * size of zero = trace bucket is free.
	 */
	if (trace->allocs > 0 &&
	    bcmp(trace->addr, bt, (depth * sizeof (uintptr_t))) != 0) {
		/* Different, unique trace, but the same hash!  Bail out. */
		trace->collisions++;
		mleak_table.trace_collisions++;
		lck_mtx_unlock(mleak_lock);
		return (TRUE);
	} else if (trace->allocs > 0) {
		/* Same trace, already added, so increment refcount */
		trace->allocs++;
	} else {
		/* Found an unused trace bucket, so record the trace here */
		if (trace->depth != 0) {
			/* this slot previously used but not currently in use */
			mleak_table.trace_overwrites++;
		}
		mleak_table.trace_recorded++;
		trace->allocs = 1;
		memcpy(trace->addr, bt, (depth * sizeof (uintptr_t)));
		trace->depth = depth;
		trace->collisions = 0;
	}

	/* Step 2: Store the allocation record in the allocations array */
	if (allocation->element != NULL) {
		/*
		 * Replace an existing allocation.  No need to preserve
		 * it because only a subset of the allocations are being
		 * recorded anyway.
		 */
		mleak_table.alloc_collisions++;
	} else if (allocation->trace_index != 0) {
		mleak_table.alloc_overwrites++;
	}
	allocation->element = addr;
	allocation->trace_index = trace_index;
	allocation->count = num;
	mleak_table.alloc_recorded++;
	mleak_table.outstanding_allocs++;

	lck_mtx_unlock(mleak_lock);
	return (TRUE);
}

static void
mleak_free(mcache_obj_t *addr)
{
	while (addr != NULL) {
		struct mallocation *allocation = &mleak_allocations
		    [hashaddr((uintptr_t)addr, mleak_alloc_buckets)];

		if (allocation->element == addr &&
		    allocation->trace_index < mleak_trace_buckets) {
			lck_mtx_lock_spin(mleak_lock);
			if (allocation->element == addr &&
			    allocation->trace_index < mleak_trace_buckets) {
				struct mtrace *trace;
				trace = &mleak_traces[allocation->trace_index];
				/* allocs = 0 means trace bucket is unused */
				if (trace->allocs > 0)
					trace->allocs--;
				if (trace->allocs == 0)
					trace->depth = 0;
				/* NULL element means alloc bucket is unused */
				allocation->element = NULL;
				mleak_table.outstanding_allocs--;
			}
			lck_mtx_unlock(mleak_lock);
		}
		addr = addr->obj_next;
	}
}

static void
mleak_sort_traces()
{
	int i, j, k;
	struct mtrace *swap;

	for (i = 0; i < MLEAK_NUM_TRACES; i++)
		mleak_top_trace[i] = NULL;

	for (i = 0, j = 0; j < MLEAK_NUM_TRACES &&
	    i < mleak_trace_buckets; i++) {
		if (mleak_traces[i].allocs <= 0)
			continue;

		mleak_top_trace[j] = &mleak_traces[i];
		for (k = j; k > 0; k--) {
			if (mleak_top_trace[k]->allocs <=
			    mleak_top_trace[k-1]->allocs)
				break;

			swap = mleak_top_trace[k-1];
			mleak_top_trace[k-1] = mleak_top_trace[k];
			mleak_top_trace[k] = swap;
		}
		j++;
	}

	j--;
	for (; i < mleak_trace_buckets; i++) {
		if (mleak_traces[i].allocs <= mleak_top_trace[j]->allocs)
			continue;

		mleak_top_trace[j] = &mleak_traces[i];

		for (k = j; k > 0; k--) {
			if (mleak_top_trace[k]->allocs <=
			    mleak_top_trace[k-1]->allocs)
				break;

			swap = mleak_top_trace[k-1];
			mleak_top_trace[k-1] = mleak_top_trace[k];
			mleak_top_trace[k] = swap;
		}
	}
}

static void
mleak_update_stats()
{
	mleak_trace_stat_t *mltr;
	int i;

	VERIFY(mleak_stat != NULL);
#ifdef __LP64__
	VERIFY(mleak_stat->ml_isaddr64);
#else
	VERIFY(!mleak_stat->ml_isaddr64);
#endif /* !__LP64__ */
	VERIFY(mleak_stat->ml_cnt == MLEAK_NUM_TRACES);

	mleak_sort_traces();

	mltr = &mleak_stat->ml_trace[0];
	bzero(mltr, sizeof (*mltr) * MLEAK_NUM_TRACES);
	for (i = 0; i < MLEAK_NUM_TRACES; i++) {
		int j;

		if (mleak_top_trace[i] == NULL ||
		    mleak_top_trace[i]->allocs == 0)
			continue;

		mltr->mltr_collisions = mleak_top_trace[i]->collisions;
		mltr->mltr_hitcount = mleak_top_trace[i]->hitcount;
		mltr->mltr_allocs = mleak_top_trace[i]->allocs;
		mltr->mltr_depth = mleak_top_trace[i]->depth;

		VERIFY(mltr->mltr_depth <= MLEAK_STACK_DEPTH);
		for (j = 0; j < mltr->mltr_depth; j++)
			mltr->mltr_addr[j] = mleak_top_trace[i]->addr[j];

		mltr++;
	}
}
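/*
 * Illustrative sketch (not part of the original source): the top-N
 * selection in mleak_sort_traces() above is a partial insertion sort --
 * fill mleak_top_trace[] with the first N busy buckets, then for each
 * remaining bucket that beats the current minimum (slot j), replace it
 * and bubble it up into place.
 */
#if 0 /* example only */
	mleak_top_trace[j] = &mleak_traces[i];	/* displace current minimum */
	for (k = j; k > 0 && mleak_top_trace[k]->allocs >
	    mleak_top_trace[k-1]->allocs; k--) {
		swap = mleak_top_trace[k-1];	/* bubble larger entry up */
		mleak_top_trace[k-1] = mleak_top_trace[k];
		mleak_top_trace[k] = swap;
	}
#endif /* example only */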
MT_HEADER, "packet headers" }, 6881 { MT_SOCKET, "socket structures" }, 6882 { MT_PCB, "protocol control blocks" }, 6883 { MT_RTABLE, "routing table entries" }, 6884 { MT_HTABLE, "IMP host table entries" }, 6885 { MT_ATABLE, "address resolution tables" }, 6886 { MT_FTABLE, "fragment reassembly queue headers" }, 6887 { MT_SONAME, "socket names and addresses" }, 6888 { MT_SOOPTS, "socket options" }, 6889 { MT_RIGHTS, "access rights" }, 6890 { MT_IFADDR, "interface addresses" }, 6891 { MT_TAG, "packet tags" }, 6892 { 0, NULL } 6893}; 6894 6895#define MBUF_DUMP_BUF_CHK() { \ 6896 clen -= k; \ 6897 if (clen < 1) \ 6898 goto done; \ 6899 c += k; \ 6900} 6901 6902static char * 6903mbuf_dump(void) 6904{ 6905 unsigned long totmem = 0, totfree = 0, totmbufs, totused, totpct; 6906 u_int32_t m_mbufs = 0, m_clfree = 0, m_bigclfree = 0; 6907 u_int32_t m_mbufclfree = 0, m_mbufbigclfree = 0; 6908 u_int32_t m_16kclusters = 0, m_16kclfree = 0, m_mbuf16kclfree = 0; 6909 int nmbtypes = sizeof (mbstat.m_mtypes) / sizeof (short); 6910 uint8_t seen[256]; 6911 struct mbtypes *mp; 6912 mb_class_stat_t *sp; 6913 mleak_trace_stat_t *mltr; 6914 char *c = mbuf_dump_buf; 6915 int i, k, clen = MBUF_DUMP_BUF_SIZE; 6916 6917 mbuf_dump_buf[0] = '\0'; 6918 6919 /* synchronize all statistics in the mbuf table */ 6920 mbuf_stat_sync(); 6921 mbuf_mtypes_sync(TRUE); 6922 6923 sp = &mb_stat->mbs_class[0]; 6924 for (i = 0; i < mb_stat->mbs_cnt; i++, sp++) { 6925 u_int32_t mem; 6926 6927 if (m_class(i) == MC_MBUF) { 6928 m_mbufs = sp->mbcl_active; 6929 } else if (m_class(i) == MC_CL) { 6930 m_clfree = sp->mbcl_total - sp->mbcl_active; 6931 } else if (m_class(i) == MC_BIGCL) { 6932 m_bigclfree = sp->mbcl_total - sp->mbcl_active; 6933 } else if (njcl > 0 && m_class(i) == MC_16KCL) { 6934 m_16kclfree = sp->mbcl_total - sp->mbcl_active; 6935 m_16kclusters = sp->mbcl_total; 6936 } else if (m_class(i) == MC_MBUF_CL) { 6937 m_mbufclfree = sp->mbcl_total - sp->mbcl_active; 6938 } else if (m_class(i) == MC_MBUF_BIGCL) { 6939 m_mbufbigclfree = sp->mbcl_total - sp->mbcl_active; 6940 } else if (njcl > 0 && m_class(i) == MC_MBUF_16KCL) { 6941 m_mbuf16kclfree = sp->mbcl_total - sp->mbcl_active; 6942 } 6943 6944 mem = sp->mbcl_ctotal * sp->mbcl_size; 6945 totmem += mem; 6946 totfree += (sp->mbcl_mc_cached + sp->mbcl_infree) * 6947 sp->mbcl_size; 6948 6949 } 6950 6951 /* adjust free counts to include composite caches */ 6952 m_clfree += m_mbufclfree; 6953 m_bigclfree += m_mbufbigclfree; 6954 m_16kclfree += m_mbuf16kclfree; 6955 6956 totmbufs = 0; 6957 for (mp = mbtypes; mp->mt_name != NULL; mp++) 6958 totmbufs += mbstat.m_mtypes[mp->mt_type]; 6959 if (totmbufs > m_mbufs) 6960 totmbufs = m_mbufs; 6961 k = snprintf(c, clen, "%lu/%u mbufs in use:\n", totmbufs, m_mbufs); 6962 MBUF_DUMP_BUF_CHK(); 6963 6964 bzero(&seen, sizeof (seen)); 6965 for (mp = mbtypes; mp->mt_name != NULL; mp++) { 6966 if (mbstat.m_mtypes[mp->mt_type] != 0) { 6967 seen[mp->mt_type] = 1; 6968 k = snprintf(c, clen, "\t%u mbufs allocated to %s\n", 6969 mbstat.m_mtypes[mp->mt_type], mp->mt_name); 6970 MBUF_DUMP_BUF_CHK(); 6971 } 6972 } 6973 seen[MT_FREE] = 1; 6974 for (i = 0; i < nmbtypes; i++) 6975 if (!seen[i] && mbstat.m_mtypes[i] != 0) { 6976 k = snprintf(c, clen, "\t%u mbufs allocated to " 6977 "<mbuf type %d>\n", mbstat.m_mtypes[i], i); 6978 MBUF_DUMP_BUF_CHK(); 6979 } 6980 if ((m_mbufs - totmbufs) > 0) { 6981 k = snprintf(c, clen, "\t%lu mbufs allocated to caches\n", 6982 m_mbufs - totmbufs); 6983 MBUF_DUMP_BUF_CHK(); 6984 } 6985 k = snprintf(c, clen, "%u/%u mbuf 2KB 
clusters in use\n" 6986 "%u/%u mbuf 4KB clusters in use\n", 6987 (unsigned int)(mbstat.m_clusters - m_clfree), 6988 (unsigned int)mbstat.m_clusters, 6989 (unsigned int)(mbstat.m_bigclusters - m_bigclfree), 6990 (unsigned int)mbstat.m_bigclusters); 6991 MBUF_DUMP_BUF_CHK(); 6992 6993 if (njcl > 0) { 6994 k = snprintf(c, clen, "%u/%u mbuf %uKB clusters in use\n", 6995 m_16kclusters - m_16kclfree, m_16kclusters, 6996 njclbytes / 1024); 6997 MBUF_DUMP_BUF_CHK(); 6998 } 6999 totused = totmem - totfree; 7000 if (totmem == 0) { 7001 totpct = 0; 7002 } else if (totused < (ULONG_MAX / 100)) { 7003 totpct = (totused * 100) / totmem; 7004 } else { 7005 u_long totmem1 = totmem / 100; 7006 u_long totused1 = totused / 100; 7007 totpct = (totused1 * 100) / totmem1; 7008 } 7009 k = snprintf(c, clen, "%lu KB allocated to network (approx. %lu%% " 7010 "in use)\n", totmem / 1024, totpct); 7011 MBUF_DUMP_BUF_CHK(); 7012 7013 /* mbuf leak detection statistics */ 7014 mleak_update_stats(); 7015 7016 k = snprintf(c, clen, "\nmbuf leak detection table:\n"); 7017 MBUF_DUMP_BUF_CHK(); 7018 k = snprintf(c, clen, "\ttotal captured: %u (one per %u)\n", 7019 mleak_table.mleak_capture / mleak_table.mleak_sample_factor, 7020 mleak_table.mleak_sample_factor); 7021 MBUF_DUMP_BUF_CHK(); 7022 k = snprintf(c, clen, "\ttotal allocs outstanding: %llu\n", 7023 mleak_table.outstanding_allocs); 7024 MBUF_DUMP_BUF_CHK(); 7025 k = snprintf(c, clen, "\tnew hash recorded: %llu allocs, %llu traces\n", 7026 mleak_table.alloc_recorded, mleak_table.trace_recorded); 7027 MBUF_DUMP_BUF_CHK(); 7028 k = snprintf(c, clen, "\thash collisions: %llu allocs, %llu traces\n", 7029 mleak_table.alloc_collisions, mleak_table.trace_collisions); 7030 MBUF_DUMP_BUF_CHK(); 7031 k = snprintf(c, clen, "\toverwrites: %llu allocs, %llu traces\n", 7032 mleak_table.alloc_overwrites, mleak_table.trace_overwrites); 7033 MBUF_DUMP_BUF_CHK(); 7034 k = snprintf(c, clen, "\tlock conflicts: %llu\n\n", 7035 mleak_table.total_conflicts); 7036 MBUF_DUMP_BUF_CHK(); 7037 7038 k = snprintf(c, clen, "top %d outstanding traces:\n", 7039 mleak_stat->ml_cnt); 7040 MBUF_DUMP_BUF_CHK(); 7041 for (i = 0; i < mleak_stat->ml_cnt; i++) { 7042 mltr = &mleak_stat->ml_trace[i]; 7043 k = snprintf(c, clen, "[%d] %llu outstanding alloc(s), " 7044 "%llu hit(s), %llu collision(s)\n", (i + 1), 7045 mltr->mltr_allocs, mltr->mltr_hitcount, 7046 mltr->mltr_collisions); 7047 MBUF_DUMP_BUF_CHK(); 7048 } 7049 7050 if (mleak_stat->ml_isaddr64) 7051 k = snprintf(c, clen, MB_LEAK_HDR_64); 7052 else 7053 k = snprintf(c, clen, MB_LEAK_HDR_32); 7054 MBUF_DUMP_BUF_CHK(); 7055 7056 for (i = 0; i < MLEAK_STACK_DEPTH; i++) { 7057 int j; 7058 k = snprintf(c, clen, "%2d: ", (i + 1)); 7059 MBUF_DUMP_BUF_CHK(); 7060 for (j = 0; j < mleak_stat->ml_cnt; j++) { 7061 mltr = &mleak_stat->ml_trace[j]; 7062 if (i < mltr->mltr_depth) { 7063 if (mleak_stat->ml_isaddr64) { 7064 k = snprintf(c, clen, "0x%0llx ", 7065 mltr->mltr_addr[i]); 7066 } else { 7067 k = snprintf(c, clen, 7068 "0x%08x ", 7069 (u_int32_t)mltr->mltr_addr[i]); 7070 } 7071 } else { 7072 if (mleak_stat->ml_isaddr64) 7073 k = snprintf(c, clen, 7074 MB_LEAK_SPACING_64); 7075 else 7076 k = snprintf(c, clen, 7077 MB_LEAK_SPACING_32); 7078 } 7079 MBUF_DUMP_BUF_CHK(); 7080 } 7081 k = snprintf(c, clen, "\n"); 7082 MBUF_DUMP_BUF_CHK(); 7083 } 7084done: 7085 return (mbuf_dump_buf); 7086} 7087 7088#undef MBUF_DUMP_BUF_CHK 7089 7090/* 7091 * Convert between a regular and a packet header mbuf. 
/*
 * Convert between a regular and a packet header mbuf.  Caller is responsible
 * for setting or clearing M_PKTHDR; this routine does the rest of the work.
 */
int
m_reinit(struct mbuf *m, int hdr)
{
	int ret = 0;

	if (hdr) {
		VERIFY(!(m->m_flags & M_PKTHDR));
		if (!(m->m_flags & M_EXT) &&
		    (m->m_data != m->m_dat || m->m_len > 0)) {
			/*
			 * If there's no external cluster attached and the
			 * mbuf appears to contain user data, we cannot
			 * safely convert this to a packet header mbuf,
			 * as the packet header structure might overlap
			 * with the data.
			 */
			printf("%s: cannot set M_PKTHDR on altered mbuf %p, "
			    "m_data %p (expected %p), m_len %d (expected 0)\n",
			    __func__, m, m->m_data, m->m_dat, m->m_len);
			ret = EBUSY;
		} else {
			VERIFY((m->m_flags & M_EXT) || m->m_data == m->m_dat);
			m->m_flags |= M_PKTHDR;
			MBUF_INIT_PKTHDR(m);
		}
	} else {
		/* Check for scratch area overflow */
		m_redzone_verify(m);
		/* Free the aux data and tags if there are any */
		m_tag_delete_chain(m, NULL);
		m->m_flags &= ~M_PKTHDR;
	}

	return (ret);
}

void
m_scratch_init(struct mbuf *m)
{
	VERIFY(m->m_flags & M_PKTHDR);

	bzero(&m->m_pkthdr.pkt_mpriv, sizeof (m->m_pkthdr.pkt_mpriv));
}

u_int32_t
m_scratch_get(struct mbuf *m, u_int8_t **p)
{
	VERIFY(m->m_flags & M_PKTHDR);

	if (mcltrace) {
		mcache_audit_t *mca;

		lck_mtx_lock(mbuf_mlock);
		mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
		if (mca->mca_uflags & MB_SCVALID)
			mcl_audit_scratch(mca);
		lck_mtx_unlock(mbuf_mlock);
	}

	*p = (u_int8_t *)&m->m_pkthdr.pkt_mpriv;
	return (sizeof (m->m_pkthdr.pkt_mpriv));
}

static void
m_redzone_init(struct mbuf *m)
{
	VERIFY(m->m_flags & M_PKTHDR);
	/*
	 * Each mbuf has a unique red zone pattern, which is an XOR
	 * of the red zone cookie and the address of the mbuf.
	 */
	m->m_pkthdr.redzone = ((u_int32_t)(uintptr_t)m) ^ mb_redzone_cookie;
}

static void
m_redzone_verify(struct mbuf *m)
{
	u_int32_t mb_redzone;

	VERIFY(m->m_flags & M_PKTHDR);

	mb_redzone = ((u_int32_t)(uintptr_t)m) ^ mb_redzone_cookie;
	if (m->m_pkthdr.redzone != mb_redzone) {
		panic("mbuf %p redzone violation with value 0x%x "
		    "(instead of 0x%x, using cookie 0x%x)\n",
		    m, m->m_pkthdr.redzone, mb_redzone, mb_redzone_cookie);
		/* NOTREACHED */
	}
}

SYSCTL_DECL(_kern_ipc);
SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mbstat_sysctl, "S,mbstat", "");
SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_stat,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mb_stat_sysctl, "S,mb_stat", "");
SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_top_trace,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mleak_top_trace_sysctl, "S,mb_top_trace", "");
SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_table,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mleak_table_sysctl, "S,mleak_table", "");
SYSCTL_INT(_kern_ipc, OID_AUTO, mleak_sample_factor,
    CTLFLAG_RW | CTLFLAG_LOCKED, &mleak_table.mleak_sample_factor, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, mb_normalized,
    CTLFLAG_RD | CTLFLAG_LOCKED, &mb_normalized, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, mb_watchdog,
    CTLFLAG_RW | CTLFLAG_LOCKED, &mb_watchdog, 0, "");
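/*
 * Illustrative sketch (not part of the original source): the sysctl knobs
 * above are reachable from userland, e.g. reading the mb_watchdog toggle
 * via sysctlbyname(3).  This is userland code, shown only for context and
 * compiled out here.
 */
#if 0 /* example only */
#include <sys/sysctl.h>

	int watchdog = 0;
	size_t len = sizeof (watchdog);

	if (sysctlbyname("kern.ipc.mb_watchdog", &watchdog, &len,
	    NULL, 0) == 0)
		printf("mb_watchdog=%d\n", watchdog);
#endif /* example only */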