prof.h revision 261071
/******************************************************************************/
#ifdef JEMALLOC_H_TYPES

typedef struct prof_bt_s prof_bt_t;
typedef struct prof_cnt_s prof_cnt_t;
typedef struct prof_thr_cnt_s prof_thr_cnt_t;
typedef struct prof_ctx_s prof_ctx_t;
typedef struct prof_tdata_s prof_tdata_t;

/* Option defaults. */
#define	PROF_PREFIX_DEFAULT	"jeprof"
#define	LG_PROF_SAMPLE_DEFAULT	19
#define	LG_PROF_INTERVAL_DEFAULT	-1

/*
 * Hard limit on stack backtrace depth.  The version of prof_backtrace() that
 * is based on __builtin_return_address() necessarily has a hard-coded number
 * of backtrace frame handlers, and should be kept in sync with this setting.
 */
#define	PROF_BT_MAX		128

/* Maximum number of backtraces to store in each per thread LRU cache. */
#define	PROF_TCMAX		1024

/* Initial hash table size. */
#define	PROF_CKH_MINITEMS	64

/* Size of memory buffer to use when writing dump files. */
#define	PROF_DUMP_BUFSIZE	65536

/* Size of stack-allocated buffer used by prof_printf(). */
#define	PROF_PRINTF_BUFSIZE	128

/*
 * Number of mutexes shared among all ctx's.  No space is allocated for these
 * unless profiling is enabled, so it's okay to over-provision.
 */
#define	PROF_NCTX_LOCKS		1024

/*
 * prof_tdata pointers close to NULL are used to encode state information that
 * is used for cleaning up during thread shutdown.
 */
#define	PROF_TDATA_STATE_REINCARNATED	((prof_tdata_t *)(uintptr_t)1)
#define	PROF_TDATA_STATE_PURGATORY	((prof_tdata_t *)(uintptr_t)2)
#define	PROF_TDATA_STATE_MAX		PROF_TDATA_STATE_PURGATORY

#endif /* JEMALLOC_H_TYPES */
/******************************************************************************/
#ifdef JEMALLOC_H_STRUCTS

struct prof_bt_s {
	/* Backtrace, stored as len program counters. */
	void		**vec;
	unsigned	len;
};

#ifdef JEMALLOC_PROF_LIBGCC
/* Data structure passed to libgcc _Unwind_Backtrace() callback functions. */
typedef struct {
	prof_bt_t	*bt;
	unsigned	nignore;
	unsigned	max;
} prof_unwind_data_t;
#endif

struct prof_cnt_s {
	/*
	 * Profiling counters.  An allocation/deallocation pair can operate on
	 * different prof_thr_cnt_t objects that are linked into the same
	 * prof_ctx_t cnts_ql, so it is possible for the cur* counters to go
	 * negative.  In principle it is possible for the *bytes counters to
	 * overflow/underflow, but a general solution would require something
	 * like 128-bit counters; this implementation doesn't bother to solve
	 * that problem.
	 */
	int64_t		curobjs;
	int64_t		curbytes;
	uint64_t	accumobjs;
	uint64_t	accumbytes;
};

struct prof_thr_cnt_s {
	/* Linkage into prof_ctx_t's cnts_ql. */
	ql_elm(prof_thr_cnt_t)	cnts_link;

	/* Linkage into thread's LRU. */
	ql_elm(prof_thr_cnt_t)	lru_link;

	/*
	 * Associated context.  If a thread frees an object that it did not
	 * allocate, it is possible that the context is not cached in the
	 * thread's hash table, in which case it must be able to look up the
	 * context, insert a new prof_thr_cnt_t into the thread's hash table,
	 * and link it into the prof_ctx_t's cnts_ql.
	 */
	prof_ctx_t		*ctx;

	/*
	 * Threads use memory barriers to update the counters.  Since there is
	 * only ever one writer, the only challenge is for the reader to get a
	 * consistent read of the counters.
	 *
	 * The writer uses this series of operations:
	 *
	 * 1) Increment epoch to an odd number.
	 * 2) Update counters.
	 * 3) Increment epoch to an even number.
	 *
	 * The reader must assure 1) that the epoch is even while it reads the
	 * counters, and 2) that the epoch doesn't change between the time it
	 * starts and finishes reading the counters.
	 */
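	/*
	 * Minimal sketch of a conforming reader (e.g. the dump code),
	 * assuming mb_write() is usable as a full memory barrier, as it is
	 * elsewhere in jemalloc:
	 *
	 *	do {
	 *		epoch0 = cnt->epoch;
	 *		mb_write();
	 *		tmp = cnt->cnts;
	 *		mb_write();
	 *	} while ((epoch0 & 1U) != 0 || cnt->epoch != epoch0);
	 *
	 * i.e. retry until the epoch read before and after copying the
	 * counters is the same even value, which guarantees that the copy did
	 * not overlap a writer's update.
	 */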
	unsigned		epoch;

	/* Profiling counters. */
	prof_cnt_t		cnts;
};

struct prof_ctx_s {
	/* Associated backtrace. */
	prof_bt_t		*bt;

	/* Protects nlimbo, cnt_merged, and cnts_ql. */
	malloc_mutex_t		*lock;

	/*
	 * Number of threads that currently cause this ctx to be in a state of
	 * limbo due to one of:
	 *   - Initializing per thread counters associated with this ctx.
	 *   - Preparing to destroy this ctx.
	 *   - Dumping a heap profile that includes this ctx.
	 * nlimbo must be 1 (single destroyer) in order to safely destroy the
	 * ctx.
	 */
	unsigned		nlimbo;

	/* Temporary storage for summation during dump. */
	prof_cnt_t		cnt_summed;

	/* When threads exit, they merge their stats into cnt_merged. */
	prof_cnt_t		cnt_merged;

	/*
	 * List of profile counters, one for each thread that has allocated in
	 * this context.
	 */
	ql_head(prof_thr_cnt_t)	cnts_ql;

	/* Linkage for list of contexts to be dumped. */
	ql_elm(prof_ctx_t)	dump_link;
};
typedef ql_head(prof_ctx_t) prof_ctx_list_t;

struct prof_tdata_s {
	/*
	 * Hash of (prof_bt_t *)-->(prof_thr_cnt_t *).  Each thread keeps a
	 * cache of backtraces, with associated thread-specific prof_thr_cnt_t
	 * objects.  Other threads may read the prof_thr_cnt_t contents, but no
	 * others will ever write them.
	 *
	 * Upon thread exit, the thread must merge all the prof_thr_cnt_t
	 * counter data into the associated prof_ctx_t objects, and unlink/free
	 * the prof_thr_cnt_t objects.
	 */
	ckh_t			bt2cnt;

	/* LRU for contents of bt2cnt. */
	ql_head(prof_thr_cnt_t)	lru_ql;

	/* Backtrace vector, used for calls to prof_backtrace(). */
	void			**vec;

	/* Sampling state. */
	uint64_t		prng_state;
	uint64_t		threshold;
	uint64_t		accum;

	/* State used to avoid dumping while operating on prof internals. */
	bool			enq;
	bool			enq_idump;
	bool			enq_gdump;
};

#endif /* JEMALLOC_H_STRUCTS */
/******************************************************************************/
#ifdef JEMALLOC_H_EXTERNS

extern bool	opt_prof;
/*
 * Even if opt_prof is true, sampling can be temporarily disabled by setting
 * opt_prof_active to false.  No locking is used when updating opt_prof_active,
 * so there are no guarantees regarding how long it will take for all threads
 * to notice state changes.
 */
extern bool	opt_prof_active;
extern size_t	opt_lg_prof_sample;   /* Mean bytes between samples. */
extern ssize_t	opt_lg_prof_interval; /* lg(prof_interval). */
extern bool	opt_prof_gdump;       /* High-water memory dumping. */
extern bool	opt_prof_final;       /* Final profile dumping. */
extern bool	opt_prof_leak;        /* Dump leak summary at exit. */
extern bool	opt_prof_accum;       /* Report cumulative bytes. */
extern char	opt_prof_prefix[
    /* Minimize memory bloat for non-prof builds. */
#ifdef JEMALLOC_PROF
    PATH_MAX +
#endif
    1];

/*
 * Profile dump interval, measured in bytes allocated.  Each arena triggers a
 * profile dump when it reaches this threshold.  The effect is that the
 * interval between profile dumps averages prof_interval, though the actual
 * interval between dumps will tend to be sporadic, and the interval will be a
 * maximum of approximately (prof_interval * narenas).
 */
extern uint64_t	prof_interval;
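
/*
 * For example (illustrative numbers only): with lg_prof_interval = 30, i.e.
 * prof_interval = 1 GiB, and 4 arenas all receiving allocations, dumps are
 * triggered on average once per GiB allocated process-wide, but because each
 * arena tracks its own allocation volume independently, consecutive dumps can
 * be separated by up to roughly 4 GiB of process-wide allocation.
 */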

/*
 * If true, promote small sampled objects to large objects, since small run
 * headers do not have embedded profile context pointers.
 */
extern bool	prof_promote;

void	bt_init(prof_bt_t *bt, void **vec);
void	prof_backtrace(prof_bt_t *bt, unsigned nignore);
prof_thr_cnt_t	*prof_lookup(prof_bt_t *bt);
#ifdef JEMALLOC_JET
size_t	prof_bt_count(void);
typedef int (prof_dump_open_t)(bool, const char *);
extern prof_dump_open_t *prof_dump_open;
#endif
void	prof_idump(void);
bool	prof_mdump(const char *filename);
void	prof_gdump(void);
prof_tdata_t	*prof_tdata_init(void);
void	prof_tdata_cleanup(void *arg);
void	prof_boot0(void);
void	prof_boot1(void);
bool	prof_boot2(void);
void	prof_prefork(void);
void	prof_postfork_parent(void);
void	prof_postfork_child(void);

#endif /* JEMALLOC_H_EXTERNS */
/******************************************************************************/
#ifdef JEMALLOC_H_INLINES

#define	PROF_ALLOC_PREP(nignore, size, ret) do {			\
	prof_tdata_t *prof_tdata;					\
	prof_bt_t bt;							\
									\
	assert(size == s2u(size));					\
									\
	prof_tdata = prof_tdata_get(true);				\
	if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) {	\
		if (prof_tdata != NULL)					\
			ret = (prof_thr_cnt_t *)(uintptr_t)1U;		\
		else							\
			ret = NULL;					\
		break;							\
	}								\
									\
	if (opt_prof_active == false) {					\
		/* Sampling is currently inactive, so avoid sampling. */\
		ret = (prof_thr_cnt_t *)(uintptr_t)1U;			\
	} else if (opt_lg_prof_sample == 0) {				\
		/* Don't bother with sampling logic, since sampling   */\
		/* interval is 1.                                     */\
		bt_init(&bt, prof_tdata->vec);				\
		prof_backtrace(&bt, nignore);				\
		ret = prof_lookup(&bt);					\
	} else {							\
		if (prof_tdata->threshold == 0) {			\
			/* Initialize.  Seed the prng differently for */\
			/* each thread.                               */\
			prof_tdata->prng_state =			\
			    (uint64_t)(uintptr_t)&size;			\
			prof_sample_threshold_update(prof_tdata);	\
		}							\
									\
		/* Determine whether to capture a backtrace based on  */\
		/* whether size is enough for prof_accum to reach     */\
		/* prof_tdata->threshold.  However, delay updating    */\
		/* these variables until prof_{m,re}alloc(), because  */\
		/* we don't know for sure that the allocation will    */\
		/* succeed.                                           */\
		/*                                                    */\
		/* Use subtraction rather than addition to avoid      */\
		/* potential integer overflow.                        */\
		if (size >= prof_tdata->threshold -			\
		    prof_tdata->accum) {				\
			bt_init(&bt, prof_tdata->vec);			\
			prof_backtrace(&bt, nignore);			\
			ret = prof_lookup(&bt);				\
		} else							\
			ret = (prof_thr_cnt_t *)(uintptr_t)1U;		\
	}								\
} while (0)
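
/*
 * Usage sketch for PROF_ALLOC_PREP() (a hypothetical caller; the real call
 * sites live in the allocation paths and differ in detail):
 *
 *	prof_thr_cnt_t *cnt;
 *	size_t usize = s2u(size);
 *
 *	PROF_ALLOC_PREP(1, usize, cnt);
 *	if (cnt == NULL)
 *		return (NULL);		// prof_tdata_init() failed (OOM).
 *	ptr = imalloc(usize);
 *	if (ptr != NULL)
 *		prof_malloc(ptr, usize, cnt);
 *
 * cnt == (prof_thr_cnt_t *)(uintptr_t)1U means "allocate, but do not sample";
 * any larger value is a real counter obtained via prof_lookup().
 *
 * Regarding the subtraction above: given the invariant accum < threshold,
 * (size >= threshold - accum) is equivalent to (accum + size >= threshold)
 * but cannot wrap around.  E.g. threshold = 524288, accum = 500000,
 * size = 30000: 30000 >= 24288, so a backtrace is captured.
 */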

#ifndef JEMALLOC_ENABLE_INLINE
malloc_tsd_protos(JEMALLOC_ATTR(unused), prof_tdata, prof_tdata_t *)

prof_tdata_t	*prof_tdata_get(bool create);
void	prof_sample_threshold_update(prof_tdata_t *prof_tdata);
prof_ctx_t	*prof_ctx_get(const void *ptr);
void	prof_ctx_set(const void *ptr, size_t usize, prof_ctx_t *ctx);
bool	prof_sample_accum_update(size_t size);
void	prof_malloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt);
void	prof_realloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt,
    size_t old_usize, prof_ctx_t *old_ctx);
void	prof_free(const void *ptr, size_t size);
#endif

#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_PROF_C_))
/* Thread-specific backtrace cache, used to reduce bt2ctx contention. */
malloc_tsd_externs(prof_tdata, prof_tdata_t *)
malloc_tsd_funcs(JEMALLOC_INLINE, prof_tdata, prof_tdata_t *, NULL,
    prof_tdata_cleanup)

JEMALLOC_INLINE prof_tdata_t *
prof_tdata_get(bool create)
{
	prof_tdata_t *prof_tdata;

	cassert(config_prof);

	prof_tdata = *prof_tdata_tsd_get();
	if (create && prof_tdata == NULL)
		prof_tdata = prof_tdata_init();

	return (prof_tdata);
}

JEMALLOC_INLINE void
prof_sample_threshold_update(prof_tdata_t *prof_tdata)
{
	/*
	 * The body of this function is compiled out unless heap profiling is
	 * enabled, so that it is possible to compile jemalloc with floating
	 * point support completely disabled.  Avoiding floating point code is
	 * important on memory-constrained systems, but it also enables a
	 * workaround for versions of glibc that don't properly save/restore
	 * floating point registers during dynamic lazy symbol loading (which
	 * internally calls into whatever malloc implementation happens to be
	 * integrated into the application).  Note that some compilers (e.g.
	 * gcc 4.8) may use floating point registers for fast memory moves, so
	 * jemalloc must be compiled with such optimizations disabled (e.g.
	 * -mno-sse) in order for the workaround to be complete.
	 */
#ifdef JEMALLOC_PROF
	uint64_t r;
	double u;

	cassert(config_prof);

	/*
	 * Compute sample threshold as a geometrically distributed random
	 * variable with mean (2^opt_lg_prof_sample).
	 *
	 *                         __        __
	 *                         |  log(u)  |                     1
	 * prof_tdata->threshold = | -------- |, where p = -------------------
	 *                         | log(1-p) |             opt_lg_prof_sample
	 *                                                 2
	 *
	 * For more information on the math, see:
	 *
	 *   Non-Uniform Random Variate Generation
	 *   Luc Devroye
	 *   Springer-Verlag, New York, 1986
	 *   pp 500
	 *   (http://luc.devroye.org/rnbookindex.html)
	 */
	prng64(r, 53, prof_tdata->prng_state,
	    UINT64_C(6364136223846793005), UINT64_C(1442695040888963407));
	u = (double)r * (1.0/9007199254740992.0L);
	prof_tdata->threshold = (uint64_t)(log(u) /
	    log(1.0 - (1.0 / (double)((uint64_t)1U << opt_lg_prof_sample))))
	    + (uint64_t)1U;
#endif
}
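
/*
 * Worked example of the threshold computation above: with the default
 * opt_lg_prof_sample = 19, p = 2^-19, so thresholds are geometrically
 * distributed with mean ~2^19 = 524288 bytes between samples.  A uniform draw
 * of u = 0.5 yields
 *
 *	threshold = ceil(log(0.5) / log(1 - 2^-19))
 *	          = ceil(-0.693147 / -0.00000190735)
 *	          ~= 363409 bytes,
 *
 * and draws of u closer to 0 yield proportionally larger thresholds.
 */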

JEMALLOC_INLINE prof_ctx_t *
prof_ctx_get(const void *ptr)
{
	prof_ctx_t *ret;
	arena_chunk_t *chunk;

	cassert(config_prof);
	assert(ptr != NULL);

	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
	if (chunk != ptr) {
		/* Region. */
		ret = arena_prof_ctx_get(ptr);
	} else
		ret = huge_prof_ctx_get(ptr);

	return (ret);
}

JEMALLOC_INLINE void
prof_ctx_set(const void *ptr, size_t usize, prof_ctx_t *ctx)
{
	arena_chunk_t *chunk;

	cassert(config_prof);
	assert(ptr != NULL);

	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
	if (chunk != ptr) {
		/* Region. */
		arena_prof_ctx_set(ptr, usize, ctx);
	} else
		huge_prof_ctx_set(ptr, ctx);
}

JEMALLOC_INLINE bool
prof_sample_accum_update(size_t size)
{
	prof_tdata_t *prof_tdata;

	cassert(config_prof);
	/* Sampling logic is unnecessary if the interval is 1. */
	assert(opt_lg_prof_sample != 0);

	prof_tdata = prof_tdata_get(false);
	if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX)
		return (true);

	/* Take care to avoid integer overflow. */
	if (size >= prof_tdata->threshold - prof_tdata->accum) {
		prof_tdata->accum -= (prof_tdata->threshold - size);
		/* Compute new sample threshold. */
		prof_sample_threshold_update(prof_tdata);
		while (prof_tdata->accum >= prof_tdata->threshold) {
			prof_tdata->accum -= prof_tdata->threshold;
			prof_sample_threshold_update(prof_tdata);
		}
		return (false);
	} else {
		prof_tdata->accum += size;
		return (true);
	}
}

JEMALLOC_INLINE void
prof_malloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt)
{

	cassert(config_prof);
	assert(ptr != NULL);
	assert(usize == isalloc(ptr, true));

	if (opt_lg_prof_sample != 0) {
		if (prof_sample_accum_update(usize)) {
			/*
			 * Don't sample.  For malloc()-like allocation, it is
			 * always possible to tell in advance how large an
			 * object's usable size will be, so there should never
			 * be a difference between the usize passed to
			 * PROF_ALLOC_PREP() and prof_malloc().
			 */
			assert((uintptr_t)cnt == (uintptr_t)1U);
		}
	}

	if ((uintptr_t)cnt > (uintptr_t)1U) {
		prof_ctx_set(ptr, usize, cnt->ctx);

		cnt->epoch++;
		/*********/
		mb_write();
		/*********/
		cnt->cnts.curobjs++;
		cnt->cnts.curbytes += usize;
		if (opt_prof_accum) {
			cnt->cnts.accumobjs++;
			cnt->cnts.accumbytes += usize;
		}
		/*********/
		mb_write();
		/*********/
		cnt->epoch++;
		/*********/
		mb_write();
		/*********/
	} else
		prof_ctx_set(ptr, usize, (prof_ctx_t *)(uintptr_t)1U);
}

JEMALLOC_INLINE void
prof_realloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt,
    size_t old_usize, prof_ctx_t *old_ctx)
{
	prof_thr_cnt_t *told_cnt;

	cassert(config_prof);
	assert(ptr != NULL || (uintptr_t)cnt <= (uintptr_t)1U);

	if (ptr != NULL) {
		assert(usize == isalloc(ptr, true));
		if (opt_lg_prof_sample != 0) {
			if (prof_sample_accum_update(usize)) {
				/*
				 * Don't sample.  The usize passed to
				 * PROF_ALLOC_PREP() was larger than what
				 * actually got allocated, so a backtrace was
				 * captured for this allocation, even though
				 * its actual usize was insufficient to cross
				 * the sample threshold.
				 */
				cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
			}
		}
	}

	if ((uintptr_t)old_ctx > (uintptr_t)1U) {
		told_cnt = prof_lookup(old_ctx->bt);
		if (told_cnt == NULL) {
			/*
			 * It's too late to propagate OOM for this realloc(),
			 * so operate directly on old_cnt->ctx->cnt_merged.
			 */
			malloc_mutex_lock(old_ctx->lock);
			old_ctx->cnt_merged.curobjs--;
			old_ctx->cnt_merged.curbytes -= old_usize;
			malloc_mutex_unlock(old_ctx->lock);
			told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
		}
	} else
		told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;

	if ((uintptr_t)told_cnt > (uintptr_t)1U)
		told_cnt->epoch++;
	if ((uintptr_t)cnt > (uintptr_t)1U) {
		prof_ctx_set(ptr, usize, cnt->ctx);
		cnt->epoch++;
	} else if (ptr != NULL)
		prof_ctx_set(ptr, usize, (prof_ctx_t *)(uintptr_t)1U);
	/*********/
	mb_write();
	/*********/
	if ((uintptr_t)told_cnt > (uintptr_t)1U) {
		told_cnt->cnts.curobjs--;
		told_cnt->cnts.curbytes -= old_usize;
	}
	if ((uintptr_t)cnt > (uintptr_t)1U) {
		cnt->cnts.curobjs++;
		cnt->cnts.curbytes += usize;
		if (opt_prof_accum) {
			cnt->cnts.accumobjs++;
			cnt->cnts.accumbytes += usize;
		}
	}
	/*********/
	mb_write();
	/*********/
	if ((uintptr_t)told_cnt > (uintptr_t)1U)
		told_cnt->epoch++;
	if ((uintptr_t)cnt > (uintptr_t)1U)
		cnt->epoch++;
	/*********/
	mb_write(); /* Not strictly necessary. */
}

JEMALLOC_INLINE void
prof_free(const void *ptr, size_t size)
{
	prof_ctx_t *ctx = prof_ctx_get(ptr);

	cassert(config_prof);

	if ((uintptr_t)ctx > (uintptr_t)1) {
		prof_thr_cnt_t *tcnt;
		assert(size == isalloc(ptr, true));
		tcnt = prof_lookup(ctx->bt);

		if (tcnt != NULL) {
			tcnt->epoch++;
			/*********/
			mb_write();
			/*********/
			tcnt->cnts.curobjs--;
			tcnt->cnts.curbytes -= size;
			/*********/
			mb_write();
			/*********/
			tcnt->epoch++;
			/*********/
			mb_write();
			/*********/
		} else {
			/*
			 * OOM during free() cannot be propagated, so operate
			 * directly on cnt->ctx->cnt_merged.
			 */
			malloc_mutex_lock(ctx->lock);
			ctx->cnt_merged.curobjs--;
			ctx->cnt_merged.curbytes -= size;
			malloc_mutex_unlock(ctx->lock);
		}
	}
}
#endif

#endif /* JEMALLOC_H_INLINES */
/******************************************************************************/