lockstat.c revision 284297
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26#pragma ident "%Z%%M% %I% %E% SMI" 27 28#include <stdio.h> 29#include <stddef.h> 30#include <stdlib.h> 31#include <stdarg.h> 32#include <string.h> 33#include <strings.h> 34#include <ctype.h> 35#include <fcntl.h> 36#include <unistd.h> 37#include <errno.h> 38#include <limits.h> 39#include <sys/types.h> 40#include <sys/modctl.h> 41#include <sys/stat.h> 42#include <sys/wait.h> 43#include <dtrace.h> 44#include <sys/lockstat.h> 45#include <alloca.h> 46#include <signal.h> 47#include <assert.h> 48 49#ifdef illumos 50#define GETOPT_EOF EOF 51#else 52#include <sys/time.h> 53#include <sys/resource.h> 54 55#define mergesort(a, b, c, d) lsmergesort(a, b, c, d) 56#define GETOPT_EOF (-1) 57 58typedef uintptr_t pc_t; 59#endif 60 61#define LOCKSTAT_OPTSTR "x:bths:n:d:i:l:f:e:ckwWgCHEATID:RpPo:V" 62 63#define LS_MAX_STACK_DEPTH 50 64#define LS_MAX_EVENTS 64 65 66typedef struct lsrec { 67 struct lsrec *ls_next; /* next in hash chain */ 68 uintptr_t ls_lock; /* lock address */ 69 uintptr_t ls_caller; /* caller address */ 70 uint32_t ls_count; /* cumulative event count */ 71 uint32_t ls_event; /* type of event */ 72 uintptr_t ls_refcnt; /* cumulative reference count */ 73 uint64_t ls_time; /* cumulative event duration */ 74 uint32_t ls_hist[64]; /* log2(duration) histogram */ 75 uintptr_t ls_stack[LS_MAX_STACK_DEPTH]; 76} lsrec_t; 77 78typedef struct lsdata { 79 struct lsrec *lsd_next; /* next available */ 80 int lsd_count; /* number of records */ 81} lsdata_t; 82 83/* 84 * Definitions for the types of experiments which can be run. They are 85 * listed in increasing order of memory cost and processing time cost. 86 * The numerical value of each type is the number of bytes needed per record. 87 */ 88#define LS_BASIC offsetof(lsrec_t, ls_time) 89#define LS_TIME offsetof(lsrec_t, ls_hist[0]) 90#define LS_HIST offsetof(lsrec_t, ls_stack[0]) 91#define LS_STACK(depth) offsetof(lsrec_t, ls_stack[depth]) 92 93static void report_stats(FILE *, lsrec_t **, size_t, uint64_t, uint64_t); 94static void report_trace(FILE *, lsrec_t **); 95 96extern int symtab_init(void); 97extern char *addr_to_sym(uintptr_t, uintptr_t *, size_t *); 98extern uintptr_t sym_to_addr(char *name); 99extern size_t sym_size(char *name); 100extern char *strtok_r(char *, const char *, char **); 101 102#define DEFAULT_NRECS 10000 103#define DEFAULT_HZ 97 104#define MAX_HZ 1000 105#define MIN_AGGSIZE (16 * 1024) 106#define MAX_AGGSIZE (32 * 1024 * 1024) 107 108static int g_stkdepth; 109static int g_topn = INT_MAX; 110static hrtime_t g_elapsed; 111static int g_rates = 0; 112static int g_pflag = 0; 113static int g_Pflag = 0; 114static int g_wflag = 0; 115static int g_Wflag = 0; 116static int g_cflag = 0; 117static int g_kflag = 0; 118static int g_gflag = 0; 119static int g_Vflag = 0; 120static int g_tracing = 0; 121static size_t g_recsize; 122static size_t g_nrecs; 123static int g_nrecs_used; 124static uchar_t g_enabled[LS_MAX_EVENTS]; 125static hrtime_t g_min_duration[LS_MAX_EVENTS]; 126static dtrace_hdl_t *g_dtp; 127static char *g_predicate; 128static char *g_ipredicate; 129static char *g_prog; 130static int g_proglen; 131static int g_dropped; 132 133typedef struct ls_event_info { 134 char ev_type; 135 char ev_lhdr[20]; 136 char ev_desc[80]; 137 char ev_units[10]; 138 char ev_name[DTRACE_NAMELEN]; 139 char *ev_predicate; 140 char *ev_acquire; 141} ls_event_info_t; 142 143static ls_event_info_t g_event_info[LS_MAX_EVENTS] = { 144 { 'C', "Lock", "Adaptive mutex spin", "nsec", 145 "lockstat:::adaptive-spin" }, 146 { 'C', "Lock", "Adaptive mutex block", "nsec", 147 "lockstat:::adaptive-block" }, 148 { 'C', "Lock", "Spin lock spin", "nsec", 149 "lockstat:::spin-spin" }, 150 { 'C', "Lock", "Thread lock spin", "nsec", 151 "lockstat:::thread-spin" }, 152 { 'C', "Lock", "R/W writer blocked by writer", "nsec", 153 "lockstat:::rw-block", "arg2 == 0 && arg3 == 1" }, 154 { 'C', "Lock", "R/W writer blocked by readers", "nsec", 155 "lockstat:::rw-block", "arg2 == 0 && arg3 == 0 && arg4" }, 156 { 'C', "Lock", "R/W reader blocked by writer", "nsec", 157 "lockstat:::rw-block", "arg2 != 0 && arg3 == 1" }, 158 { 'C', "Lock", "R/W reader blocked by write wanted", "nsec", 159 "lockstat:::rw-block", "arg2 != 0 && arg3 == 0 && arg4" }, 160 { 'C', "Lock", "R/W writer spin on writer", "nsec", 161 "lockstat:::rw-spin", "arg2 == 0 && arg3 == 1" }, 162 { 'C', "Lock", "R/W writer spin on readers", "nsec", 163 "lockstat:::rw-spin", "arg2 == 0 && arg3 == 0 && arg4" }, 164 { 'C', "Lock", "R/W reader spin on writer", "nsec", 165 "lockstat:::rw-spin", "arg2 != 0 && arg3 == 1" }, 166 { 'C', "Lock", "R/W reader spin on write wanted", "nsec", 167 "lockstat:::rw-spin", "arg2 != 0 && arg3 == 0 && arg4" }, 168 { 'C', "Lock", "SX exclusive block", "nsec", 169 "lockstat:::sx-block", "arg2 == 0" }, 170 { 'C', "Lock", "SX shared block", "nsec", 171 "lockstat:::sx-block", "arg2 != 0" }, 172 { 'C', "Lock", "SX exclusive spin", "nsec", 173 "lockstat:::sx-spin", "arg2 == 0" }, 174 { 'C', "Lock", "SX shared spin", "nsec", 175 "lockstat:::sx-spin", "arg2 != 0" }, 176 { 'C', "Lock", "Unknown event (type 16)", "units" }, 177 { 'C', "Lock", "Unknown event (type 17)", "units" }, 178 { 'C', "Lock", "Unknown event (type 18)", "units" }, 179 { 'C', "Lock", "Unknown event (type 19)", "units" }, 180 { 'C', "Lock", "Unknown event (type 20)", "units" }, 181 { 'C', "Lock", "Unknown event (type 21)", "units" }, 182 { 'C', "Lock", "Unknown event (type 22)", "units" }, 183 { 'C', "Lock", "Unknown event (type 23)", "units" }, 184 { 'C', "Lock", "Unknown event (type 24)", "units" }, 185 { 'C', "Lock", "Unknown event (type 25)", "units" }, 186 { 'C', "Lock", "Unknown event (type 26)", "units" }, 187 { 'C', "Lock", "Unknown event (type 27)", "units" }, 188 { 'C', "Lock", "Unknown event (type 28)", "units" }, 189 { 'C', "Lock", "Unknown event (type 29)", "units" }, 190 { 'C', "Lock", "Unknown event (type 30)", "units" }, 191 { 'C', "Lock", "Unknown event (type 31)", "units" }, 192 { 'H', "Lock", "Adaptive mutex hold", "nsec", 193 "lockstat:::adaptive-release", NULL, 194 "lockstat:::adaptive-acquire" }, 195 { 'H', "Lock", "Spin lock hold", "nsec", 196 "lockstat:::spin-release", NULL, 197 "lockstat:::spin-acquire" }, 198 { 'H', "Lock", "R/W writer hold", "nsec", 199 "lockstat::rw_wunlock:rw-release", NULL, 200 "lockstat::rw_wlock:rw-acquire" }, 201 { 'H', "Lock", "R/W reader hold", "nsec", 202 "lockstat::rw_runlock:rw-release", NULL, 203 "lockstat::rw_rlock:rw-acquire" }, 204 { 'H', "Lock", "SX shared hold", "nsec", 205 "lockstat::sx_sunlock:sx-release", NULL, 206 "lockstat::sx_slock:sx-acquire" }, 207 { 'H', "Lock", "SX exclusive hold", "nsec", 208 "lockstat::sx_xunlock:sx-release", NULL, 209 "lockstat::sx_xlock:sx-acquire" }, 210 { 'H', "Lock", "Unknown event (type 38)", "units" }, 211 { 'H', "Lock", "Unknown event (type 39)", "units" }, 212 { 'H', "Lock", "Unknown event (type 40)", "units" }, 213 { 'H', "Lock", "Unknown event (type 41)", "units" }, 214 { 'H', "Lock", "Unknown event (type 42)", "units" }, 215 { 'H', "Lock", "Unknown event (type 43)", "units" }, 216 { 'H', "Lock", "Unknown event (type 44)", "units" }, 217 { 'H', "Lock", "Unknown event (type 45)", "units" }, 218 { 'H', "Lock", "Unknown event (type 46)", "units" }, 219 { 'H', "Lock", "Unknown event (type 47)", "units" }, 220 { 'H', "Lock", "Unknown event (type 48)", "units" }, 221 { 'H', "Lock", "Unknown event (type 49)", "units" }, 222 { 'H', "Lock", "Unknown event (type 50)", "units" }, 223 { 'H', "Lock", "Unknown event (type 51)", "units" }, 224 { 'H', "Lock", "Unknown event (type 52)", "units" }, 225 { 'H', "Lock", "Unknown event (type 53)", "units" }, 226 { 'H', "Lock", "Unknown event (type 54)", "units" }, 227 { 'H', "Lock", "Unknown event (type 55)", "units" }, 228#ifdef illumos 229 { 'I', "CPU+PIL", "Profiling interrupt", "nsec", 230#else 231 { 'I', "CPU+Pri_Class", "Profiling interrupt", "nsec", 232#endif 233 "profile:::profile-97", NULL }, 234 { 'I', "Lock", "Unknown event (type 57)", "units" }, 235 { 'I', "Lock", "Unknown event (type 58)", "units" }, 236 { 'I', "Lock", "Unknown event (type 59)", "units" }, 237 { 'E', "Lock", "Recursive lock entry detected", "(N/A)", 238 "lockstat:::rw-release", NULL, "lockstat:::rw-acquire" }, 239 { 'E', "Lock", "Lockstat enter failure", "(N/A)" }, 240 { 'E', "Lock", "Lockstat exit failure", "nsec" }, 241 { 'E', "Lock", "Lockstat record failure", "(N/A)" }, 242}; 243 244#ifndef illumos 245static char *g_pri_class[] = { 246 "", 247 "Intr", 248 "RealT", 249 "TShar", 250 "Idle" 251}; 252#endif 253 254static void 255fail(int do_perror, const char *message, ...) 256{ 257 va_list args; 258 int save_errno = errno; 259 260 va_start(args, message); 261 (void) fprintf(stderr, "lockstat: "); 262 (void) vfprintf(stderr, message, args); 263 va_end(args); 264 if (do_perror) 265 (void) fprintf(stderr, ": %s", strerror(save_errno)); 266 (void) fprintf(stderr, "\n"); 267 exit(2); 268} 269 270static void 271dfail(const char *message, ...) 272{ 273 va_list args; 274 275 va_start(args, message); 276 (void) fprintf(stderr, "lockstat: "); 277 (void) vfprintf(stderr, message, args); 278 va_end(args); 279 (void) fprintf(stderr, ": %s\n", 280 dtrace_errmsg(g_dtp, dtrace_errno(g_dtp))); 281 282 exit(2); 283} 284 285static void 286show_events(char event_type, char *desc) 287{ 288 int i, first = -1, last; 289 290 for (i = 0; i < LS_MAX_EVENTS; i++) { 291 ls_event_info_t *evp = &g_event_info[i]; 292 if (evp->ev_type != event_type || 293 strncmp(evp->ev_desc, "Unknown event", 13) == 0) 294 continue; 295 if (first == -1) 296 first = i; 297 last = i; 298 } 299 300 (void) fprintf(stderr, 301 "\n%s events (lockstat -%c or lockstat -e %d-%d):\n\n", 302 desc, event_type, first, last); 303 304 for (i = first; i <= last; i++) 305 (void) fprintf(stderr, 306 "%4d = %s\n", i, g_event_info[i].ev_desc); 307} 308 309static void 310usage(void) 311{ 312 (void) fprintf(stderr, 313 "Usage: lockstat [options] command [args]\n" 314 "\nGeneral options:\n\n" 315 " -V print the corresponding D program\n" 316 "\nEvent selection options:\n\n" 317 " -C watch contention events [on by default]\n" 318 " -E watch error events [off by default]\n" 319 " -H watch hold events [off by default]\n" 320 " -I watch interrupt events [off by default]\n" 321 " -A watch all lock events [equivalent to -CH]\n" 322 " -e event_list only watch the specified events (shown below);\n" 323 " <event_list> is a comma-separated list of\n" 324 " events or ranges of events, e.g. 1,4-7,35\n" 325 " -i rate interrupt rate for -I [default: %d Hz]\n" 326 "\nData gathering options:\n\n" 327 " -b basic statistics (lock, caller, event count)\n" 328 " -t timing for all events [default]\n" 329 " -h histograms for event times\n" 330 " -s depth stack traces <depth> deep\n" 331 " -x opt[=val] enable or modify DTrace options\n" 332 "\nData filtering options:\n\n" 333 " -n nrecords maximum number of data records [default: %d]\n" 334 " -l lock[,size] only watch <lock>, which can be specified as a\n" 335 " symbolic name or hex address; <size> defaults\n" 336 " to the ELF symbol size if available, 1 if not\n" 337 " -f func[,size] only watch events generated by <func>\n" 338 " -d duration only watch events longer than <duration>\n" 339 " -T trace (rather than sample) events\n" 340 "\nData reporting options:\n\n" 341 " -c coalesce lock data for arrays like pse_mutex[]\n" 342 " -k coalesce PCs within functions\n" 343 " -g show total events generated by function\n" 344 " -w wherever: don't distinguish events by caller\n" 345 " -W whichever: don't distinguish events by lock\n" 346 " -R display rates rather than counts\n" 347 " -p parsable output format (awk(1)-friendly)\n" 348 " -P sort lock data by (count * avg_time) product\n" 349 " -D n only display top <n> events of each type\n" 350 " -o filename send output to <filename>\n", 351 DEFAULT_HZ, DEFAULT_NRECS); 352 353 show_events('C', "Contention"); 354 show_events('H', "Hold-time"); 355 show_events('I', "Interrupt"); 356 show_events('E', "Error"); 357 (void) fprintf(stderr, "\n"); 358 359 exit(1); 360} 361 362static int 363lockcmp(lsrec_t *a, lsrec_t *b) 364{ 365 int i; 366 367 if (a->ls_event < b->ls_event) 368 return (-1); 369 if (a->ls_event > b->ls_event) 370 return (1); 371 372 for (i = g_stkdepth - 1; i >= 0; i--) { 373 if (a->ls_stack[i] < b->ls_stack[i]) 374 return (-1); 375 if (a->ls_stack[i] > b->ls_stack[i]) 376 return (1); 377 } 378 379 if (a->ls_caller < b->ls_caller) 380 return (-1); 381 if (a->ls_caller > b->ls_caller) 382 return (1); 383 384 if (a->ls_lock < b->ls_lock) 385 return (-1); 386 if (a->ls_lock > b->ls_lock) 387 return (1); 388 389 return (0); 390} 391 392static int 393countcmp(lsrec_t *a, lsrec_t *b) 394{ 395 if (a->ls_event < b->ls_event) 396 return (-1); 397 if (a->ls_event > b->ls_event) 398 return (1); 399 400 return (b->ls_count - a->ls_count); 401} 402 403static int 404timecmp(lsrec_t *a, lsrec_t *b) 405{ 406 if (a->ls_event < b->ls_event) 407 return (-1); 408 if (a->ls_event > b->ls_event) 409 return (1); 410 411 if (a->ls_time < b->ls_time) 412 return (1); 413 if (a->ls_time > b->ls_time) 414 return (-1); 415 416 return (0); 417} 418 419static int 420lockcmp_anywhere(lsrec_t *a, lsrec_t *b) 421{ 422 if (a->ls_event < b->ls_event) 423 return (-1); 424 if (a->ls_event > b->ls_event) 425 return (1); 426 427 if (a->ls_lock < b->ls_lock) 428 return (-1); 429 if (a->ls_lock > b->ls_lock) 430 return (1); 431 432 return (0); 433} 434 435static int 436lock_and_count_cmp_anywhere(lsrec_t *a, lsrec_t *b) 437{ 438 if (a->ls_event < b->ls_event) 439 return (-1); 440 if (a->ls_event > b->ls_event) 441 return (1); 442 443 if (a->ls_lock < b->ls_lock) 444 return (-1); 445 if (a->ls_lock > b->ls_lock) 446 return (1); 447 448 return (b->ls_count - a->ls_count); 449} 450 451static int 452sitecmp_anylock(lsrec_t *a, lsrec_t *b) 453{ 454 int i; 455 456 if (a->ls_event < b->ls_event) 457 return (-1); 458 if (a->ls_event > b->ls_event) 459 return (1); 460 461 for (i = g_stkdepth - 1; i >= 0; i--) { 462 if (a->ls_stack[i] < b->ls_stack[i]) 463 return (-1); 464 if (a->ls_stack[i] > b->ls_stack[i]) 465 return (1); 466 } 467 468 if (a->ls_caller < b->ls_caller) 469 return (-1); 470 if (a->ls_caller > b->ls_caller) 471 return (1); 472 473 return (0); 474} 475 476static int 477site_and_count_cmp_anylock(lsrec_t *a, lsrec_t *b) 478{ 479 int i; 480 481 if (a->ls_event < b->ls_event) 482 return (-1); 483 if (a->ls_event > b->ls_event) 484 return (1); 485 486 for (i = g_stkdepth - 1; i >= 0; i--) { 487 if (a->ls_stack[i] < b->ls_stack[i]) 488 return (-1); 489 if (a->ls_stack[i] > b->ls_stack[i]) 490 return (1); 491 } 492 493 if (a->ls_caller < b->ls_caller) 494 return (-1); 495 if (a->ls_caller > b->ls_caller) 496 return (1); 497 498 return (b->ls_count - a->ls_count); 499} 500 501static void 502lsmergesort(int (*cmp)(lsrec_t *, lsrec_t *), lsrec_t **a, lsrec_t **b, int n) 503{ 504 int m = n / 2; 505 int i, j; 506 507 if (m > 1) 508 lsmergesort(cmp, a, b, m); 509 if (n - m > 1) 510 lsmergesort(cmp, a + m, b + m, n - m); 511 for (i = m; i > 0; i--) 512 b[i - 1] = a[i - 1]; 513 for (j = m - 1; j < n - 1; j++) 514 b[n + m - j - 2] = a[j + 1]; 515 while (i < j) 516 *a++ = cmp(b[i], b[j]) < 0 ? b[i++] : b[j--]; 517 *a = b[i]; 518} 519 520static void 521coalesce(int (*cmp)(lsrec_t *, lsrec_t *), lsrec_t **lock, int n) 522{ 523 int i, j; 524 lsrec_t *target, *current; 525 526 target = lock[0]; 527 528 for (i = 1; i < n; i++) { 529 current = lock[i]; 530 if (cmp(current, target) != 0) { 531 target = current; 532 continue; 533 } 534 current->ls_event = LS_MAX_EVENTS; 535 target->ls_count += current->ls_count; 536 target->ls_refcnt += current->ls_refcnt; 537 if (g_recsize < LS_TIME) 538 continue; 539 target->ls_time += current->ls_time; 540 if (g_recsize < LS_HIST) 541 continue; 542 for (j = 0; j < 64; j++) 543 target->ls_hist[j] += current->ls_hist[j]; 544 } 545} 546 547static void 548coalesce_symbol(uintptr_t *addrp) 549{ 550 uintptr_t symoff; 551 size_t symsize; 552 553 if (addr_to_sym(*addrp, &symoff, &symsize) != NULL && symoff < symsize) 554 *addrp -= symoff; 555} 556 557static void 558predicate_add(char **pred, char *what, char *cmp, uintptr_t value) 559{ 560 char *new; 561 int len, newlen; 562 563 if (what == NULL) 564 return; 565 566 if (*pred == NULL) { 567 *pred = malloc(1); 568 *pred[0] = '\0'; 569 } 570 571 len = strlen(*pred); 572 newlen = len + strlen(what) + 32 + strlen("( && )"); 573 new = malloc(newlen); 574 575 if (*pred[0] != '\0') { 576 if (cmp != NULL) { 577 (void) sprintf(new, "(%s) && (%s %s 0x%p)", 578 *pred, what, cmp, (void *)value); 579 } else { 580 (void) sprintf(new, "(%s) && (%s)", *pred, what); 581 } 582 } else { 583 if (cmp != NULL) { 584 (void) sprintf(new, "%s %s 0x%p", 585 what, cmp, (void *)value); 586 } else { 587 (void) sprintf(new, "%s", what); 588 } 589 } 590 591 free(*pred); 592 *pred = new; 593} 594 595static void 596predicate_destroy(char **pred) 597{ 598 free(*pred); 599 *pred = NULL; 600} 601 602static void 603filter_add(char **filt, char *what, uintptr_t base, uintptr_t size) 604{ 605 char buf[256], *c = buf, *new; 606 int len, newlen; 607 608 if (*filt == NULL) { 609 *filt = malloc(1); 610 *filt[0] = '\0'; 611 } 612 613#ifdef illumos 614 (void) sprintf(c, "%s(%s >= 0x%p && %s < 0x%p)", *filt[0] != '\0' ? 615 " || " : "", what, (void *)base, what, (void *)(base + size)); 616#else 617 (void) sprintf(c, "%s(%s >= %p && %s < %p)", *filt[0] != '\0' ? 618 " || " : "", what, (void *)base, what, (void *)(base + size)); 619#endif 620 621 newlen = (len = strlen(*filt) + 1) + strlen(c); 622 new = malloc(newlen); 623 bcopy(*filt, new, len); 624 (void) strcat(new, c); 625 free(*filt); 626 *filt = new; 627} 628 629static void 630filter_destroy(char **filt) 631{ 632 free(*filt); 633 *filt = NULL; 634} 635 636static void 637dprog_add(const char *fmt, ...) 638{ 639 va_list args; 640 int size, offs; 641 char c; 642 643 va_start(args, fmt); 644 size = vsnprintf(&c, 1, fmt, args) + 1; 645 va_end(args); 646 647 if (g_proglen == 0) { 648 offs = 0; 649 } else { 650 offs = g_proglen - 1; 651 } 652 653 g_proglen = offs + size; 654 655 if ((g_prog = realloc(g_prog, g_proglen)) == NULL) 656 fail(1, "failed to reallocate program text"); 657 658 va_start(args, fmt); 659 (void) vsnprintf(&g_prog[offs], size, fmt, args); 660 va_end(args); 661} 662 663/* 664 * This function may read like an open sewer, but keep in mind that programs 665 * that generate other programs are rarely pretty. If one has the unenviable 666 * task of maintaining or -- worse -- extending this code, use the -V option 667 * to examine the D program as generated by this function. 668 */ 669static void 670dprog_addevent(int event) 671{ 672 ls_event_info_t *info = &g_event_info[event]; 673 char *pred = NULL; 674 char stack[20]; 675 const char *arg0, *caller; 676 char *arg1 = "arg1"; 677 char buf[80]; 678 hrtime_t dur; 679 int depth; 680 681 if (info->ev_name[0] == '\0') 682 return; 683 684 if (info->ev_type == 'I') { 685 /* 686 * For interrupt events, arg0 (normally the lock pointer) is 687 * the CPU address plus the current pil, and arg1 (normally 688 * the number of nanoseconds) is the number of nanoseconds 689 * late -- and it's stored in arg2. 690 */ 691#ifdef illumos 692 arg0 = "(uintptr_t)curthread->t_cpu + \n" 693 "\t curthread->t_cpu->cpu_profile_pil"; 694#else 695 arg0 = "(uintptr_t)(curthread->td_oncpu << 16) + \n" 696 "\t 0x01000000 + curthread->td_pri_class"; 697#endif 698 caller = "(uintptr_t)arg0"; 699 arg1 = "arg2"; 700 } else { 701 arg0 = "(uintptr_t)arg0"; 702 caller = "caller"; 703 } 704 705 if (g_recsize > LS_HIST) { 706 for (depth = 0; g_recsize > LS_STACK(depth); depth++) 707 continue; 708 709 if (g_tracing) { 710 (void) sprintf(stack, "\tstack(%d);\n", depth); 711 } else { 712 (void) sprintf(stack, ", stack(%d)", depth); 713 } 714 } else { 715 (void) sprintf(stack, ""); 716 } 717 718 if (info->ev_acquire != NULL) { 719 /* 720 * If this is a hold event, we need to generate an additional 721 * clause for the acquire; the clause for the release will be 722 * generated with the aggregating statement, below. 723 */ 724 dprog_add("%s\n", info->ev_acquire); 725 predicate_add(&pred, info->ev_predicate, NULL, 0); 726 predicate_add(&pred, g_predicate, NULL, 0); 727 if (pred != NULL) 728 dprog_add("/%s/\n", pred); 729 730 dprog_add("{\n"); 731 (void) sprintf(buf, "self->ev%d[(uintptr_t)arg0]", event); 732 733 if (info->ev_type == 'H') { 734 dprog_add("\t%s = timestamp;\n", buf); 735 } else { 736 /* 737 * If this isn't a hold event, it's the recursive 738 * error event. For this, we simply bump the 739 * thread-local, per-lock count. 740 */ 741 dprog_add("\t%s++;\n", buf); 742 } 743 744 dprog_add("}\n\n"); 745 predicate_destroy(&pred); 746 pred = NULL; 747 748 if (info->ev_type == 'E') { 749 /* 750 * If this is the recursive lock error event, we need 751 * to generate an additional clause to decrement the 752 * thread-local, per-lock count. This assures that we 753 * only execute the aggregating clause if we have 754 * recursive entry. 755 */ 756 dprog_add("%s\n", info->ev_name); 757 dprog_add("/%s/\n{\n\t%s--;\n}\n\n", buf, buf); 758 } 759 760 predicate_add(&pred, buf, NULL, 0); 761 762 if (info->ev_type == 'H') { 763 (void) sprintf(buf, "timestamp -\n\t " 764 "self->ev%d[(uintptr_t)arg0]", event); 765 } 766 767 arg1 = buf; 768 } else { 769 predicate_add(&pred, info->ev_predicate, NULL, 0); 770 if (info->ev_type != 'I') 771 predicate_add(&pred, g_predicate, NULL, 0); 772 else 773 predicate_add(&pred, g_ipredicate, NULL, 0); 774 } 775 776 if ((dur = g_min_duration[event]) != 0) 777 predicate_add(&pred, arg1, ">=", dur); 778 779 dprog_add("%s\n", info->ev_name); 780 781 if (pred != NULL) 782 dprog_add("/%s/\n", pred); 783 predicate_destroy(&pred); 784 785 dprog_add("{\n"); 786 787 if (g_tracing) { 788 dprog_add("\ttrace(%dULL);\n", event); 789 dprog_add("\ttrace(%s);\n", arg0); 790 dprog_add("\ttrace(%s);\n", caller); 791 dprog_add(stack); 792 } else { 793 /* 794 * The ordering here is important: when we process the 795 * aggregate, we count on the fact that @avg appears before 796 * @hist in program order to assure that @avg is assigned the 797 * first aggregation variable ID and @hist assigned the 798 * second; see the comment in process_aggregate() for details. 799 */ 800 dprog_add("\t@avg[%dULL, %s, %s%s] = avg(%s);\n", 801 event, arg0, caller, stack, arg1); 802 803 if (g_recsize >= LS_HIST) { 804 dprog_add("\t@hist[%dULL, %s, %s%s] = quantize" 805 "(%s);\n", event, arg0, caller, stack, arg1); 806 } 807 } 808 809 if (info->ev_acquire != NULL) 810 dprog_add("\tself->ev%d[arg0] = 0;\n", event); 811 812 dprog_add("}\n\n"); 813} 814 815static void 816dprog_compile() 817{ 818 dtrace_prog_t *prog; 819 dtrace_proginfo_t info; 820 821 if (g_Vflag) { 822 (void) fprintf(stderr, "lockstat: vvvv D program vvvv\n"); 823 (void) fputs(g_prog, stderr); 824 (void) fprintf(stderr, "lockstat: ^^^^ D program ^^^^\n"); 825 } 826 827 if ((prog = dtrace_program_strcompile(g_dtp, g_prog, 828 DTRACE_PROBESPEC_NAME, 0, 0, NULL)) == NULL) 829 dfail("failed to compile program"); 830 831 if (dtrace_program_exec(g_dtp, prog, &info) == -1) 832 dfail("failed to enable probes"); 833 834 if (dtrace_go(g_dtp) != 0) 835 dfail("couldn't start tracing"); 836} 837 838static void 839#ifdef illumos 840status_fire(void) 841#else 842status_fire(int i) 843#endif 844{} 845 846static void 847status_init(void) 848{ 849 dtrace_optval_t val, status, agg; 850 struct sigaction act; 851 struct itimerspec ts; 852 struct sigevent ev; 853 timer_t tid; 854 855 if (dtrace_getopt(g_dtp, "statusrate", &status) == -1) 856 dfail("failed to get 'statusrate'"); 857 858 if (dtrace_getopt(g_dtp, "aggrate", &agg) == -1) 859 dfail("failed to get 'statusrate'"); 860 861 /* 862 * We would want to awaken at a rate that is the GCD of the statusrate 863 * and the aggrate -- but that seems a bit absurd. Instead, we'll 864 * simply awaken at a rate that is the more frequent of the two, which 865 * assures that we're never later than the interval implied by the 866 * more frequent rate. 867 */ 868 val = status < agg ? status : agg; 869 870 (void) sigemptyset(&act.sa_mask); 871 act.sa_flags = 0; 872 act.sa_handler = status_fire; 873 (void) sigaction(SIGUSR1, &act, NULL); 874 875 ev.sigev_notify = SIGEV_SIGNAL; 876 ev.sigev_signo = SIGUSR1; 877 878 if (timer_create(CLOCK_REALTIME, &ev, &tid) == -1) 879 dfail("cannot create CLOCK_REALTIME timer"); 880 881 ts.it_value.tv_sec = val / NANOSEC; 882 ts.it_value.tv_nsec = val % NANOSEC; 883 ts.it_interval = ts.it_value; 884 885 if (timer_settime(tid, TIMER_RELTIME, &ts, NULL) == -1) 886 dfail("cannot set time on CLOCK_REALTIME timer"); 887} 888 889static void 890status_check(void) 891{ 892 if (!g_tracing && dtrace_aggregate_snap(g_dtp) != 0) 893 dfail("failed to snap aggregate"); 894 895 if (dtrace_status(g_dtp) == -1) 896 dfail("dtrace_status()"); 897} 898 899static void 900lsrec_fill(lsrec_t *lsrec, const dtrace_recdesc_t *rec, int nrecs, caddr_t data) 901{ 902 bzero(lsrec, g_recsize); 903 lsrec->ls_count = 1; 904 905 if ((g_recsize > LS_HIST && nrecs < 4) || (nrecs < 3)) 906 fail(0, "truncated DTrace record"); 907 908 if (rec->dtrd_size != sizeof (uint64_t)) 909 fail(0, "bad event size in first record"); 910 911 /* LINTED - alignment */ 912 lsrec->ls_event = (uint32_t)*((uint64_t *)(data + rec->dtrd_offset)); 913 rec++; 914 915 if (rec->dtrd_size != sizeof (uintptr_t)) 916 fail(0, "bad lock address size in second record"); 917 918 /* LINTED - alignment */ 919 lsrec->ls_lock = *((uintptr_t *)(data + rec->dtrd_offset)); 920 rec++; 921 922 if (rec->dtrd_size != sizeof (uintptr_t)) 923 fail(0, "bad caller size in third record"); 924 925 /* LINTED - alignment */ 926 lsrec->ls_caller = *((uintptr_t *)(data + rec->dtrd_offset)); 927 rec++; 928 929 if (g_recsize > LS_HIST) { 930 int frames, i; 931 pc_t *stack; 932 933 frames = rec->dtrd_size / sizeof (pc_t); 934 /* LINTED - alignment */ 935 stack = (pc_t *)(data + rec->dtrd_offset); 936 937 for (i = 1; i < frames; i++) 938 lsrec->ls_stack[i - 1] = stack[i]; 939 } 940} 941 942/*ARGSUSED*/ 943static int 944count_aggregate(const dtrace_aggdata_t *agg, void *arg) 945{ 946 *((size_t *)arg) += 1; 947 948 return (DTRACE_AGGWALK_NEXT); 949} 950 951static int 952process_aggregate(const dtrace_aggdata_t *agg, void *arg) 953{ 954 const dtrace_aggdesc_t *aggdesc = agg->dtada_desc; 955 caddr_t data = agg->dtada_data; 956 lsdata_t *lsdata = arg; 957 lsrec_t *lsrec = lsdata->lsd_next; 958 const dtrace_recdesc_t *rec; 959 uint64_t *avg, *quantized; 960 int i, j; 961 962 assert(lsdata->lsd_count < g_nrecs); 963 964 /* 965 * Aggregation variable IDs are guaranteed to be generated in program 966 * order, and they are guaranteed to start from DTRACE_AGGVARIDNONE 967 * plus one. As "avg" appears before "hist" in program order, we know 968 * that "avg" will be allocated the first aggregation variable ID, and 969 * "hist" will be allocated the second aggregation variable ID -- and 970 * we therefore use the aggregation variable ID to differentiate the 971 * cases. 972 */ 973 if (aggdesc->dtagd_varid > DTRACE_AGGVARIDNONE + 1) { 974 /* 975 * If this is the histogram entry. We'll copy the quantized 976 * data into lc_hist, and jump over the rest. 977 */ 978 rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1]; 979 980 if (aggdesc->dtagd_varid != DTRACE_AGGVARIDNONE + 2) 981 fail(0, "bad variable ID in aggregation record"); 982 983 if (rec->dtrd_size != 984 DTRACE_QUANTIZE_NBUCKETS * sizeof (uint64_t)) 985 fail(0, "bad quantize size in aggregation record"); 986 987 /* LINTED - alignment */ 988 quantized = (uint64_t *)(data + rec->dtrd_offset); 989 990 for (i = DTRACE_QUANTIZE_ZEROBUCKET, j = 0; 991 i < DTRACE_QUANTIZE_NBUCKETS; i++, j++) 992 lsrec->ls_hist[j] = quantized[i]; 993 994 goto out; 995 } 996 997 lsrec_fill(lsrec, &aggdesc->dtagd_rec[1], 998 aggdesc->dtagd_nrecs - 1, data); 999 1000 rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1]; 1001 1002 if (rec->dtrd_size != 2 * sizeof (uint64_t)) 1003 fail(0, "bad avg size in aggregation record"); 1004 1005 /* LINTED - alignment */ 1006 avg = (uint64_t *)(data + rec->dtrd_offset); 1007 lsrec->ls_count = (uint32_t)avg[0]; 1008 lsrec->ls_time = (uintptr_t)avg[1]; 1009 1010 if (g_recsize >= LS_HIST) 1011 return (DTRACE_AGGWALK_NEXT); 1012 1013out: 1014 lsdata->lsd_next = (lsrec_t *)((uintptr_t)lsrec + g_recsize); 1015 lsdata->lsd_count++; 1016 1017 return (DTRACE_AGGWALK_NEXT); 1018} 1019 1020static int 1021process_trace(const dtrace_probedata_t *pdata, void *arg) 1022{ 1023 lsdata_t *lsdata = arg; 1024 lsrec_t *lsrec = lsdata->lsd_next; 1025 dtrace_eprobedesc_t *edesc = pdata->dtpda_edesc; 1026 caddr_t data = pdata->dtpda_data; 1027 1028 if (lsdata->lsd_count >= g_nrecs) 1029 return (DTRACE_CONSUME_NEXT); 1030 1031 lsrec_fill(lsrec, edesc->dtepd_rec, edesc->dtepd_nrecs, data); 1032 1033 lsdata->lsd_next = (lsrec_t *)((uintptr_t)lsrec + g_recsize); 1034 lsdata->lsd_count++; 1035 1036 return (DTRACE_CONSUME_NEXT); 1037} 1038 1039static int 1040process_data(FILE *out, char *data) 1041{ 1042 lsdata_t lsdata; 1043 1044 /* LINTED - alignment */ 1045 lsdata.lsd_next = (lsrec_t *)data; 1046 lsdata.lsd_count = 0; 1047 1048 if (g_tracing) { 1049 if (dtrace_consume(g_dtp, out, 1050 process_trace, NULL, &lsdata) != 0) 1051 dfail("failed to consume buffer"); 1052 1053 return (lsdata.lsd_count); 1054 } 1055 1056 if (dtrace_aggregate_walk_keyvarsorted(g_dtp, 1057 process_aggregate, &lsdata) != 0) 1058 dfail("failed to walk aggregate"); 1059 1060 return (lsdata.lsd_count); 1061} 1062 1063/*ARGSUSED*/ 1064static int 1065drophandler(const dtrace_dropdata_t *data, void *arg) 1066{ 1067 g_dropped++; 1068 (void) fprintf(stderr, "lockstat: warning: %s", data->dtdda_msg); 1069 return (DTRACE_HANDLE_OK); 1070} 1071 1072int 1073main(int argc, char **argv) 1074{ 1075 char *data_buf; 1076 lsrec_t *lsp, **current, **first, **sort_buf, **merge_buf; 1077 FILE *out = stdout; 1078 int c; 1079 pid_t child; 1080 int status; 1081 int i, j; 1082 hrtime_t duration; 1083 char *addrp, *offp, *sizep, *evp, *lastp, *p; 1084 uintptr_t addr; 1085 size_t size, off; 1086 int events_specified = 0; 1087 int exec_errno = 0; 1088 uint32_t event; 1089 char *filt = NULL, *ifilt = NULL; 1090 static uint64_t ev_count[LS_MAX_EVENTS + 1]; 1091 static uint64_t ev_time[LS_MAX_EVENTS + 1]; 1092 dtrace_optval_t aggsize; 1093 char aggstr[10]; 1094 long ncpus; 1095 int dynvar = 0; 1096 int err; 1097 1098 if ((g_dtp = dtrace_open(DTRACE_VERSION, 0, &err)) == NULL) { 1099 fail(0, "cannot open dtrace library: %s", 1100 dtrace_errmsg(NULL, err)); 1101 } 1102 1103 if (dtrace_handle_drop(g_dtp, &drophandler, NULL) == -1) 1104 dfail("couldn't establish drop handler"); 1105 1106 if (symtab_init() == -1) 1107 fail(1, "can't load kernel symbols"); 1108 1109 g_nrecs = DEFAULT_NRECS; 1110 1111 while ((c = getopt(argc, argv, LOCKSTAT_OPTSTR)) != GETOPT_EOF) { 1112 switch (c) { 1113 case 'b': 1114 g_recsize = LS_BASIC; 1115 break; 1116 1117 case 't': 1118 g_recsize = LS_TIME; 1119 break; 1120 1121 case 'h': 1122 g_recsize = LS_HIST; 1123 break; 1124 1125 case 's': 1126 if (!isdigit(optarg[0])) 1127 usage(); 1128 g_stkdepth = atoi(optarg); 1129 if (g_stkdepth > LS_MAX_STACK_DEPTH) 1130 fail(0, "max stack depth is %d", 1131 LS_MAX_STACK_DEPTH); 1132 g_recsize = LS_STACK(g_stkdepth); 1133 break; 1134 1135 case 'n': 1136 if (!isdigit(optarg[0])) 1137 usage(); 1138 g_nrecs = atoi(optarg); 1139 break; 1140 1141 case 'd': 1142 if (!isdigit(optarg[0])) 1143 usage(); 1144 duration = atoll(optarg); 1145 1146 /* 1147 * XXX -- durations really should be per event 1148 * since the units are different, but it's hard 1149 * to express this nicely in the interface. 1150 * Not clear yet what the cleanest solution is. 1151 */ 1152 for (i = 0; i < LS_MAX_EVENTS; i++) 1153 if (g_event_info[i].ev_type != 'E') 1154 g_min_duration[i] = duration; 1155 1156 break; 1157 1158 case 'i': 1159 if (!isdigit(optarg[0])) 1160 usage(); 1161 i = atoi(optarg); 1162 if (i <= 0) 1163 usage(); 1164 if (i > MAX_HZ) 1165 fail(0, "max interrupt rate is %d Hz", MAX_HZ); 1166 1167 for (j = 0; j < LS_MAX_EVENTS; j++) 1168 if (strcmp(g_event_info[j].ev_desc, 1169 "Profiling interrupt") == 0) 1170 break; 1171 1172 (void) sprintf(g_event_info[j].ev_name, 1173 "profile:::profile-%d", i); 1174 break; 1175 1176 case 'l': 1177 case 'f': 1178 addrp = strtok(optarg, ","); 1179 sizep = strtok(NULL, ","); 1180 addrp = strtok(optarg, ",+"); 1181 offp = strtok(NULL, ","); 1182 1183 size = sizep ? strtoul(sizep, NULL, 0) : 1; 1184 off = offp ? strtoul(offp, NULL, 0) : 0; 1185 1186 if (addrp[0] == '0') { 1187 addr = strtoul(addrp, NULL, 16) + off; 1188 } else { 1189 addr = sym_to_addr(addrp) + off; 1190 if (sizep == NULL) 1191 size = sym_size(addrp) - off; 1192 if (addr - off == 0) 1193 fail(0, "symbol '%s' not found", addrp); 1194 if (size == 0) 1195 size = 1; 1196 } 1197 1198 1199 if (c == 'l') { 1200 filter_add(&filt, "arg0", addr, size); 1201 } else { 1202 filter_add(&filt, "caller", addr, size); 1203 filter_add(&ifilt, "arg0", addr, size); 1204 } 1205 break; 1206 1207 case 'e': 1208 evp = strtok_r(optarg, ",", &lastp); 1209 while (evp) { 1210 int ev1, ev2; 1211 char *evp2; 1212 1213 (void) strtok(evp, "-"); 1214 evp2 = strtok(NULL, "-"); 1215 ev1 = atoi(evp); 1216 ev2 = evp2 ? atoi(evp2) : ev1; 1217 if ((uint_t)ev1 >= LS_MAX_EVENTS || 1218 (uint_t)ev2 >= LS_MAX_EVENTS || ev1 > ev2) 1219 fail(0, "-e events out of range"); 1220 for (i = ev1; i <= ev2; i++) 1221 g_enabled[i] = 1; 1222 evp = strtok_r(NULL, ",", &lastp); 1223 } 1224 events_specified = 1; 1225 break; 1226 1227 case 'c': 1228 g_cflag = 1; 1229 break; 1230 1231 case 'k': 1232 g_kflag = 1; 1233 break; 1234 1235 case 'w': 1236 g_wflag = 1; 1237 break; 1238 1239 case 'W': 1240 g_Wflag = 1; 1241 break; 1242 1243 case 'g': 1244 g_gflag = 1; 1245 break; 1246 1247 case 'C': 1248 case 'E': 1249 case 'H': 1250 case 'I': 1251 for (i = 0; i < LS_MAX_EVENTS; i++) 1252 if (g_event_info[i].ev_type == c) 1253 g_enabled[i] = 1; 1254 events_specified = 1; 1255 break; 1256 1257 case 'A': 1258 for (i = 0; i < LS_MAX_EVENTS; i++) 1259 if (strchr("CH", g_event_info[i].ev_type)) 1260 g_enabled[i] = 1; 1261 events_specified = 1; 1262 break; 1263 1264 case 'T': 1265 g_tracing = 1; 1266 break; 1267 1268 case 'D': 1269 if (!isdigit(optarg[0])) 1270 usage(); 1271 g_topn = atoi(optarg); 1272 break; 1273 1274 case 'R': 1275 g_rates = 1; 1276 break; 1277 1278 case 'p': 1279 g_pflag = 1; 1280 break; 1281 1282 case 'P': 1283 g_Pflag = 1; 1284 break; 1285 1286 case 'o': 1287 if ((out = fopen(optarg, "w")) == NULL) 1288 fail(1, "error opening file"); 1289 break; 1290 1291 case 'V': 1292 g_Vflag = 1; 1293 break; 1294 1295 default: 1296 if (strchr(LOCKSTAT_OPTSTR, c) == NULL) 1297 usage(); 1298 } 1299 } 1300 1301 if (filt != NULL) { 1302 predicate_add(&g_predicate, filt, NULL, 0); 1303 filter_destroy(&filt); 1304 } 1305 1306 if (ifilt != NULL) { 1307 predicate_add(&g_ipredicate, ifilt, NULL, 0); 1308 filter_destroy(&ifilt); 1309 } 1310 1311 if (g_recsize == 0) { 1312 if (g_gflag) { 1313 g_stkdepth = LS_MAX_STACK_DEPTH; 1314 g_recsize = LS_STACK(g_stkdepth); 1315 } else { 1316 g_recsize = LS_TIME; 1317 } 1318 } 1319 1320 if (g_gflag && g_recsize <= LS_STACK(0)) 1321 fail(0, "'-g' requires at least '-s 1' data gathering"); 1322 1323 /* 1324 * Make sure the alignment is reasonable 1325 */ 1326 g_recsize = -(-g_recsize & -sizeof (uint64_t)); 1327 1328 for (i = 0; i < LS_MAX_EVENTS; i++) { 1329 /* 1330 * If no events were specified, enable -C. 1331 */ 1332 if (!events_specified && g_event_info[i].ev_type == 'C') 1333 g_enabled[i] = 1; 1334 } 1335 1336 for (i = 0; i < LS_MAX_EVENTS; i++) { 1337 if (!g_enabled[i]) 1338 continue; 1339 1340 if (g_event_info[i].ev_acquire != NULL) { 1341 /* 1342 * If we've enabled a hold event, we must explicitly 1343 * allocate dynamic variable space. 1344 */ 1345 dynvar = 1; 1346 } 1347 1348 dprog_addevent(i); 1349 } 1350 1351 /* 1352 * Make sure there are remaining arguments to specify a child command 1353 * to execute. 1354 */ 1355 if (argc <= optind) 1356 usage(); 1357 1358 if ((ncpus = sysconf(_SC_NPROCESSORS_ONLN)) == -1) 1359 dfail("couldn't determine number of online CPUs"); 1360 1361 /* 1362 * By default, we set our data buffer size to be the number of records 1363 * multiplied by the size of the record, doubled to account for some 1364 * DTrace slop and divided by the number of CPUs. We silently clamp 1365 * the aggregation size at both a minimum and a maximum to prevent 1366 * absurdly low or high values. 1367 */ 1368 if ((aggsize = (g_nrecs * g_recsize * 2) / ncpus) < MIN_AGGSIZE) 1369 aggsize = MIN_AGGSIZE; 1370 1371 if (aggsize > MAX_AGGSIZE) 1372 aggsize = MAX_AGGSIZE; 1373 1374 (void) sprintf(aggstr, "%lld", (long long)aggsize); 1375 1376 if (!g_tracing) { 1377 if (dtrace_setopt(g_dtp, "bufsize", "4k") == -1) 1378 dfail("failed to set 'bufsize'"); 1379 1380 if (dtrace_setopt(g_dtp, "aggsize", aggstr) == -1) 1381 dfail("failed to set 'aggsize'"); 1382 1383 if (dynvar) { 1384 /* 1385 * If we're using dynamic variables, we set our 1386 * dynamic variable size to be one megabyte per CPU, 1387 * with a hard-limit of 32 megabytes. This may still 1388 * be too small in some cases, but it can be tuned 1389 * manually via -x if need be. 1390 */ 1391 (void) sprintf(aggstr, "%ldm", ncpus < 32 ? ncpus : 32); 1392 1393 if (dtrace_setopt(g_dtp, "dynvarsize", aggstr) == -1) 1394 dfail("failed to set 'dynvarsize'"); 1395 } 1396 } else { 1397 if (dtrace_setopt(g_dtp, "bufsize", aggstr) == -1) 1398 dfail("failed to set 'bufsize'"); 1399 } 1400 1401 if (dtrace_setopt(g_dtp, "statusrate", "10sec") == -1) 1402 dfail("failed to set 'statusrate'"); 1403 1404 optind = 1; 1405 while ((c = getopt(argc, argv, LOCKSTAT_OPTSTR)) != GETOPT_EOF) { 1406 switch (c) { 1407 case 'x': 1408 if ((p = strchr(optarg, '=')) != NULL) 1409 *p++ = '\0'; 1410 1411 if (dtrace_setopt(g_dtp, optarg, p) != 0) 1412 dfail("failed to set -x %s", optarg); 1413 break; 1414 } 1415 } 1416 1417 argc -= optind; 1418 argv += optind; 1419 1420 dprog_compile(); 1421 status_init(); 1422 1423 g_elapsed = -gethrtime(); 1424 1425 /* 1426 * Spawn the specified command and wait for it to complete. 1427 */ 1428 child = fork(); 1429 if (child == -1) 1430 fail(1, "cannot fork"); 1431 if (child == 0) { 1432 (void) dtrace_close(g_dtp); 1433 (void) execvp(argv[0], &argv[0]); 1434 exec_errno = errno; 1435 exit(127); 1436 } 1437 1438#ifdef illumos 1439 while (waitpid(child, &status, WEXITED) != child) 1440#else 1441 while (waitpid(child, &status, 0) != child) 1442#endif 1443 status_check(); 1444 1445 g_elapsed += gethrtime(); 1446 1447 if (WIFEXITED(status)) { 1448 if (WEXITSTATUS(status) != 0) { 1449 if (exec_errno != 0) { 1450 errno = exec_errno; 1451 fail(1, "could not execute %s", argv[0]); 1452 } 1453 (void) fprintf(stderr, 1454 "lockstat: warning: %s exited with code %d\n", 1455 argv[0], WEXITSTATUS(status)); 1456 } 1457 } else { 1458 (void) fprintf(stderr, 1459 "lockstat: warning: %s died on signal %d\n", 1460 argv[0], WTERMSIG(status)); 1461 } 1462 1463 if (dtrace_stop(g_dtp) == -1) 1464 dfail("failed to stop dtrace"); 1465 1466 /* 1467 * Before we read out the results, we need to allocate our buffer. 1468 * If we're tracing, then we'll just use the precalculated size. If 1469 * we're not, then we'll take a snapshot of the aggregate, and walk 1470 * it to count the number of records. 1471 */ 1472 if (!g_tracing) { 1473 if (dtrace_aggregate_snap(g_dtp) != 0) 1474 dfail("failed to snap aggregate"); 1475 1476 g_nrecs = 0; 1477 1478 if (dtrace_aggregate_walk(g_dtp, 1479 count_aggregate, &g_nrecs) != 0) 1480 dfail("failed to walk aggregate"); 1481 } 1482 1483#ifdef illumos 1484 if ((data_buf = memalign(sizeof (uint64_t), 1485 (g_nrecs + 1) * g_recsize)) == NULL) 1486#else 1487 if (posix_memalign((void **)&data_buf, sizeof (uint64_t), 1488 (g_nrecs + 1) * g_recsize) ) 1489#endif 1490 fail(1, "Memory allocation failed"); 1491 1492 /* 1493 * Read out the DTrace data. 1494 */ 1495 g_nrecs_used = process_data(out, data_buf); 1496 1497 if (g_nrecs_used > g_nrecs || g_dropped) 1498 (void) fprintf(stderr, "lockstat: warning: " 1499 "ran out of data records (use -n for more)\n"); 1500 1501 /* LINTED - alignment */ 1502 for (i = 0, lsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++, 1503 /* LINTED - alignment */ 1504 lsp = (lsrec_t *)((char *)lsp + g_recsize)) { 1505 ev_count[lsp->ls_event] += lsp->ls_count; 1506 ev_time[lsp->ls_event] += lsp->ls_time; 1507 } 1508 1509 /* 1510 * If -g was specified, convert stacks into individual records. 1511 */ 1512 if (g_gflag) { 1513 lsrec_t *newlsp, *oldlsp; 1514 1515#ifdef illumos 1516 newlsp = memalign(sizeof (uint64_t), 1517 g_nrecs_used * LS_TIME * (g_stkdepth + 1)); 1518#else 1519 posix_memalign((void **)&newlsp, sizeof (uint64_t), 1520 g_nrecs_used * LS_TIME * (g_stkdepth + 1)); 1521#endif 1522 if (newlsp == NULL) 1523 fail(1, "Cannot allocate space for -g processing"); 1524 lsp = newlsp; 1525 /* LINTED - alignment */ 1526 for (i = 0, oldlsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++, 1527 /* LINTED - alignment */ 1528 oldlsp = (lsrec_t *)((char *)oldlsp + g_recsize)) { 1529 int fr; 1530 int caller_in_stack = 0; 1531 1532 if (oldlsp->ls_count == 0) 1533 continue; 1534 1535 for (fr = 0; fr < g_stkdepth; fr++) { 1536 if (oldlsp->ls_stack[fr] == 0) 1537 break; 1538 if (oldlsp->ls_stack[fr] == oldlsp->ls_caller) 1539 caller_in_stack = 1; 1540 bcopy(oldlsp, lsp, LS_TIME); 1541 lsp->ls_caller = oldlsp->ls_stack[fr]; 1542 /* LINTED - alignment */ 1543 lsp = (lsrec_t *)((char *)lsp + LS_TIME); 1544 } 1545 if (!caller_in_stack) { 1546 bcopy(oldlsp, lsp, LS_TIME); 1547 /* LINTED - alignment */ 1548 lsp = (lsrec_t *)((char *)lsp + LS_TIME); 1549 } 1550 } 1551 g_nrecs = g_nrecs_used = 1552 ((uintptr_t)lsp - (uintptr_t)newlsp) / LS_TIME; 1553 g_recsize = LS_TIME; 1554 g_stkdepth = 0; 1555 free(data_buf); 1556 data_buf = (char *)newlsp; 1557 } 1558 1559 if ((sort_buf = calloc(2 * (g_nrecs + 1), 1560 sizeof (void *))) == NULL) 1561 fail(1, "Sort buffer allocation failed"); 1562 merge_buf = sort_buf + (g_nrecs + 1); 1563 1564 /* 1565 * Build the sort buffer, discarding zero-count records along the way. 1566 */ 1567 /* LINTED - alignment */ 1568 for (i = 0, lsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++, 1569 /* LINTED - alignment */ 1570 lsp = (lsrec_t *)((char *)lsp + g_recsize)) { 1571 if (lsp->ls_count == 0) 1572 lsp->ls_event = LS_MAX_EVENTS; 1573 sort_buf[i] = lsp; 1574 } 1575 1576 if (g_nrecs_used == 0) 1577 exit(0); 1578 1579 /* 1580 * Add a sentinel after the last record 1581 */ 1582 sort_buf[i] = lsp; 1583 lsp->ls_event = LS_MAX_EVENTS; 1584 1585 if (g_tracing) { 1586 report_trace(out, sort_buf); 1587 return (0); 1588 } 1589 1590 /* 1591 * Application of -g may have resulted in multiple records 1592 * with the same signature; coalesce them. 1593 */ 1594 if (g_gflag) { 1595 mergesort(lockcmp, sort_buf, merge_buf, g_nrecs_used); 1596 coalesce(lockcmp, sort_buf, g_nrecs_used); 1597 } 1598 1599 /* 1600 * Coalesce locks within the same symbol if -c option specified. 1601 * Coalesce PCs within the same function if -k option specified. 1602 */ 1603 if (g_cflag || g_kflag) { 1604 for (i = 0; i < g_nrecs_used; i++) { 1605 int fr; 1606 lsp = sort_buf[i]; 1607 if (g_cflag) 1608 coalesce_symbol(&lsp->ls_lock); 1609 if (g_kflag) { 1610 for (fr = 0; fr < g_stkdepth; fr++) 1611 coalesce_symbol(&lsp->ls_stack[fr]); 1612 coalesce_symbol(&lsp->ls_caller); 1613 } 1614 } 1615 mergesort(lockcmp, sort_buf, merge_buf, g_nrecs_used); 1616 coalesce(lockcmp, sort_buf, g_nrecs_used); 1617 } 1618 1619 /* 1620 * Coalesce callers if -w option specified 1621 */ 1622 if (g_wflag) { 1623 mergesort(lock_and_count_cmp_anywhere, 1624 sort_buf, merge_buf, g_nrecs_used); 1625 coalesce(lockcmp_anywhere, sort_buf, g_nrecs_used); 1626 } 1627 1628 /* 1629 * Coalesce locks if -W option specified 1630 */ 1631 if (g_Wflag) { 1632 mergesort(site_and_count_cmp_anylock, 1633 sort_buf, merge_buf, g_nrecs_used); 1634 coalesce(sitecmp_anylock, sort_buf, g_nrecs_used); 1635 } 1636 1637 /* 1638 * Sort data by contention count (ls_count) or total time (ls_time), 1639 * depending on g_Pflag. Override g_Pflag if time wasn't measured. 1640 */ 1641 if (g_recsize < LS_TIME) 1642 g_Pflag = 0; 1643 1644 if (g_Pflag) 1645 mergesort(timecmp, sort_buf, merge_buf, g_nrecs_used); 1646 else 1647 mergesort(countcmp, sort_buf, merge_buf, g_nrecs_used); 1648 1649 /* 1650 * Display data by event type 1651 */ 1652 first = &sort_buf[0]; 1653 while ((event = (*first)->ls_event) < LS_MAX_EVENTS) { 1654 current = first; 1655 while ((lsp = *current)->ls_event == event) 1656 current++; 1657 report_stats(out, first, current - first, ev_count[event], 1658 ev_time[event]); 1659 first = current; 1660 } 1661 1662 return (0); 1663} 1664 1665static char * 1666format_symbol(char *buf, uintptr_t addr, int show_size) 1667{ 1668 uintptr_t symoff; 1669 char *symname; 1670 size_t symsize; 1671 1672 symname = addr_to_sym(addr, &symoff, &symsize); 1673 1674 if (show_size && symoff == 0) 1675 (void) sprintf(buf, "%s[%ld]", symname, (long)symsize); 1676 else if (symoff == 0) 1677 (void) sprintf(buf, "%s", symname); 1678 else if (symoff < 16 && bcmp(symname, "cpu[", 4) == 0) /* CPU+PIL */ 1679#ifdef illumos 1680 (void) sprintf(buf, "%s+%ld", symname, (long)symoff); 1681#else 1682 (void) sprintf(buf, "%s+%s", symname, g_pri_class[(int)symoff]); 1683#endif 1684 else if (symoff <= symsize || (symoff < 256 && addr != symoff)) 1685 (void) sprintf(buf, "%s+0x%llx", symname, 1686 (unsigned long long)symoff); 1687 else 1688 (void) sprintf(buf, "0x%llx", (unsigned long long)addr); 1689 return (buf); 1690} 1691 1692static void 1693report_stats(FILE *out, lsrec_t **sort_buf, size_t nrecs, uint64_t total_count, 1694 uint64_t total_time) 1695{ 1696 uint32_t event = sort_buf[0]->ls_event; 1697 lsrec_t *lsp; 1698 double ptotal = 0.0; 1699 double percent; 1700 int i, j, fr; 1701 int displayed; 1702 int first_bin, last_bin, max_bin_count, total_bin_count; 1703 int rectype; 1704 char buf[256]; 1705 char lhdr[80], chdr[80]; 1706 1707 rectype = g_recsize; 1708 1709 if (g_topn == 0) { 1710 (void) fprintf(out, "%20llu %s\n", 1711 g_rates == 0 ? total_count : 1712 ((unsigned long long)total_count * NANOSEC) / g_elapsed, 1713 g_event_info[event].ev_desc); 1714 return; 1715 } 1716 1717 (void) sprintf(lhdr, "%s%s", 1718 g_Wflag ? "Hottest " : "", g_event_info[event].ev_lhdr); 1719 (void) sprintf(chdr, "%s%s", 1720 g_wflag ? "Hottest " : "", "Caller"); 1721 1722 if (!g_pflag) 1723 (void) fprintf(out, 1724 "\n%s: %.0f events in %.3f seconds (%.0f events/sec)\n\n", 1725 g_event_info[event].ev_desc, (double)total_count, 1726 (double)g_elapsed / NANOSEC, 1727 (double)total_count * NANOSEC / g_elapsed); 1728 1729 if (!g_pflag && rectype < LS_HIST) { 1730 (void) sprintf(buf, "%s", g_event_info[event].ev_units); 1731 (void) fprintf(out, "%5s %4s %4s %4s %8s %-22s %-24s\n", 1732 g_rates ? "ops/s" : "Count", 1733 g_gflag ? "genr" : "indv", 1734 "cuml", "rcnt", rectype >= LS_TIME ? buf : "", lhdr, chdr); 1735 (void) fprintf(out, "---------------------------------" 1736 "----------------------------------------------\n"); 1737 } 1738 1739 displayed = 0; 1740 for (i = 0; i < nrecs; i++) { 1741 lsp = sort_buf[i]; 1742 1743 if (displayed++ >= g_topn) 1744 break; 1745 1746 if (g_pflag) { 1747 int j; 1748 1749 (void) fprintf(out, "%u %u", 1750 lsp->ls_event, lsp->ls_count); 1751 (void) fprintf(out, " %s", 1752 format_symbol(buf, lsp->ls_lock, g_cflag)); 1753 (void) fprintf(out, " %s", 1754 format_symbol(buf, lsp->ls_caller, 0)); 1755 (void) fprintf(out, " %f", 1756 (double)lsp->ls_refcnt / lsp->ls_count); 1757 if (rectype >= LS_TIME) 1758 (void) fprintf(out, " %llu", 1759 (unsigned long long)lsp->ls_time); 1760 if (rectype >= LS_HIST) { 1761 for (j = 0; j < 64; j++) 1762 (void) fprintf(out, " %u", 1763 lsp->ls_hist[j]); 1764 } 1765 for (j = 0; j < LS_MAX_STACK_DEPTH; j++) { 1766 if (rectype <= LS_STACK(j) || 1767 lsp->ls_stack[j] == 0) 1768 break; 1769 (void) fprintf(out, " %s", 1770 format_symbol(buf, lsp->ls_stack[j], 0)); 1771 } 1772 (void) fprintf(out, "\n"); 1773 continue; 1774 } 1775 1776 if (rectype >= LS_HIST) { 1777 (void) fprintf(out, "---------------------------------" 1778 "----------------------------------------------\n"); 1779 (void) sprintf(buf, "%s", 1780 g_event_info[event].ev_units); 1781 (void) fprintf(out, "%5s %4s %4s %4s %8s %-22s %-24s\n", 1782 g_rates ? "ops/s" : "Count", 1783 g_gflag ? "genr" : "indv", 1784 "cuml", "rcnt", buf, lhdr, chdr); 1785 } 1786 1787 if (g_Pflag && total_time != 0) 1788 percent = (lsp->ls_time * 100.00) / total_time; 1789 else 1790 percent = (lsp->ls_count * 100.00) / total_count; 1791 1792 ptotal += percent; 1793 1794 if (rectype >= LS_TIME) 1795 (void) sprintf(buf, "%llu", 1796 (unsigned long long)(lsp->ls_time / lsp->ls_count)); 1797 else 1798 buf[0] = '\0'; 1799 1800 (void) fprintf(out, "%5llu ", 1801 g_rates == 0 ? lsp->ls_count : 1802 ((uint64_t)lsp->ls_count * NANOSEC) / g_elapsed); 1803 1804 (void) fprintf(out, "%3.0f%% ", percent); 1805 1806 if (g_gflag) 1807 (void) fprintf(out, "---- "); 1808 else 1809 (void) fprintf(out, "%3.0f%% ", ptotal); 1810 1811 (void) fprintf(out, "%4.2f %8s ", 1812 (double)lsp->ls_refcnt / lsp->ls_count, buf); 1813 1814 (void) fprintf(out, "%-22s ", 1815 format_symbol(buf, lsp->ls_lock, g_cflag)); 1816 1817 (void) fprintf(out, "%-24s\n", 1818 format_symbol(buf, lsp->ls_caller, 0)); 1819 1820 if (rectype < LS_HIST) 1821 continue; 1822 1823 (void) fprintf(out, "\n"); 1824 (void) fprintf(out, "%10s %31s %-9s %-24s\n", 1825 g_event_info[event].ev_units, 1826 "------ Time Distribution ------", 1827 g_rates ? "ops/s" : "count", 1828 rectype > LS_STACK(0) ? "Stack" : ""); 1829 1830 first_bin = 0; 1831 while (lsp->ls_hist[first_bin] == 0) 1832 first_bin++; 1833 1834 last_bin = 63; 1835 while (lsp->ls_hist[last_bin] == 0) 1836 last_bin--; 1837 1838 max_bin_count = 0; 1839 total_bin_count = 0; 1840 for (j = first_bin; j <= last_bin; j++) { 1841 total_bin_count += lsp->ls_hist[j]; 1842 if (lsp->ls_hist[j] > max_bin_count) 1843 max_bin_count = lsp->ls_hist[j]; 1844 } 1845 1846 /* 1847 * If we went a few frames below the caller, ignore them 1848 */ 1849 for (fr = 3; fr > 0; fr--) 1850 if (lsp->ls_stack[fr] == lsp->ls_caller) 1851 break; 1852 1853 for (j = first_bin; j <= last_bin; j++) { 1854 uint_t depth = (lsp->ls_hist[j] * 30) / total_bin_count; 1855 (void) fprintf(out, "%10llu |%s%s %-9u ", 1856 1ULL << j, 1857 "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + 30 - depth, 1858 " " + depth, 1859 g_rates == 0 ? lsp->ls_hist[j] : 1860 (uint_t)(((uint64_t)lsp->ls_hist[j] * NANOSEC) / 1861 g_elapsed)); 1862 if (rectype <= LS_STACK(fr) || lsp->ls_stack[fr] == 0) { 1863 (void) fprintf(out, "\n"); 1864 continue; 1865 } 1866 (void) fprintf(out, "%-24s\n", 1867 format_symbol(buf, lsp->ls_stack[fr], 0)); 1868 fr++; 1869 } 1870 while (rectype > LS_STACK(fr) && lsp->ls_stack[fr] != 0) { 1871 (void) fprintf(out, "%15s %-36s %-24s\n", "", "", 1872 format_symbol(buf, lsp->ls_stack[fr], 0)); 1873 fr++; 1874 } 1875 } 1876 1877 if (!g_pflag) 1878 (void) fprintf(out, "---------------------------------" 1879 "----------------------------------------------\n"); 1880 1881 (void) fflush(out); 1882} 1883 1884static void 1885report_trace(FILE *out, lsrec_t **sort_buf) 1886{ 1887 lsrec_t *lsp; 1888 int i, fr; 1889 int rectype; 1890 char buf[256], buf2[256]; 1891 1892 rectype = g_recsize; 1893 1894 if (!g_pflag) { 1895 (void) fprintf(out, "%5s %7s %11s %-24s %-24s\n", 1896 "Event", "Time", "Owner", "Lock", "Caller"); 1897 (void) fprintf(out, "---------------------------------" 1898 "----------------------------------------------\n"); 1899 } 1900 1901 for (i = 0; i < g_nrecs_used; i++) { 1902 1903 lsp = sort_buf[i]; 1904 1905 if (lsp->ls_event >= LS_MAX_EVENTS || lsp->ls_count == 0) 1906 continue; 1907 1908 (void) fprintf(out, "%2d %10llu %11p %-24s %-24s\n", 1909 lsp->ls_event, (unsigned long long)lsp->ls_time, 1910 (void *)lsp->ls_next, 1911 format_symbol(buf, lsp->ls_lock, 0), 1912 format_symbol(buf2, lsp->ls_caller, 0)); 1913 1914 if (rectype <= LS_STACK(0)) 1915 continue; 1916 1917 /* 1918 * If we went a few frames below the caller, ignore them 1919 */ 1920 for (fr = 3; fr > 0; fr--) 1921 if (lsp->ls_stack[fr] == lsp->ls_caller) 1922 break; 1923 1924 while (rectype > LS_STACK(fr) && lsp->ls_stack[fr] != 0) { 1925 (void) fprintf(out, "%53s %-24s\n", "", 1926 format_symbol(buf, lsp->ls_stack[fr], 0)); 1927 fr++; 1928 } 1929 (void) fprintf(out, "\n"); 1930 } 1931 1932 (void) fflush(out); 1933} 1934