lockstat.c revision 192867
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26#pragma ident "%Z%%M% %I% %E% SMI" 27 28#include <stdio.h> 29#include <stddef.h> 30#include <stdlib.h> 31#include <stdarg.h> 32#include <string.h> 33#include <strings.h> 34#include <ctype.h> 35#include <fcntl.h> 36#include <unistd.h> 37#include <errno.h> 38#include <limits.h> 39#include <sys/types.h> 40#include <sys/modctl.h> 41#include <sys/stat.h> 42#include <sys/wait.h> 43#include <dtrace.h> 44#include <sys/lockstat.h> 45#include <alloca.h> 46#include <signal.h> 47#include <assert.h> 48 49#if !defined(sun) 50/* needed for FreeBSD */ 51#include <sys/time.h> 52#include <sys/resource.h> 53typedef uintptr_t pc_t; 54 55#define mergesort(a, b, c, d) lsmergesort(a, b, c, d) 56#endif 57 58#define LOCKSTAT_OPTSTR "x:bths:n:d:i:l:f:e:ckwWgCHEATID:RpPo:V" 59 60#define LS_MAX_STACK_DEPTH 50 61#define LS_MAX_EVENTS 64 62 63typedef struct lsrec { 64 struct lsrec *ls_next; /* next in hash chain */ 65 uintptr_t ls_lock; /* lock address */ 66 uintptr_t ls_caller; /* caller address */ 67 uint32_t ls_count; /* cumulative event count */ 68 uint32_t ls_event; /* type of event */ 69 uintptr_t ls_refcnt; /* cumulative reference count */ 70 uint64_t ls_time; /* cumulative event duration */ 71 uint32_t ls_hist[64]; /* log2(duration) histogram */ 72 uintptr_t ls_stack[LS_MAX_STACK_DEPTH]; 73} lsrec_t; 74 75typedef struct lsdata { 76 struct lsrec *lsd_next; /* next available */ 77 int lsd_count; /* number of records */ 78} lsdata_t; 79 80/* 81 * Definitions for the types of experiments which can be run. They are 82 * listed in increasing order of memory cost and processing time cost. 83 * The numerical value of each type is the number of bytes needed per record. 84 */ 85#define LS_BASIC offsetof(lsrec_t, ls_time) 86#define LS_TIME offsetof(lsrec_t, ls_hist[0]) 87#define LS_HIST offsetof(lsrec_t, ls_stack[0]) 88#define LS_STACK(depth) offsetof(lsrec_t, ls_stack[depth]) 89 90static void report_stats(FILE *, lsrec_t **, size_t, uint64_t, uint64_t); 91static void report_trace(FILE *, lsrec_t **); 92 93extern int symtab_init(void); 94extern char *addr_to_sym(uintptr_t, uintptr_t *, size_t *); 95extern uintptr_t sym_to_addr(char *name); 96extern size_t sym_size(char *name); 97extern char *strtok_r(char *, const char *, char **); 98 99#define DEFAULT_NRECS 10000 100#define DEFAULT_HZ 97 101#define MAX_HZ 1000 102#define MIN_AGGSIZE (16 * 1024) 103#define MAX_AGGSIZE (32 * 1024 * 1024) 104 105static int g_stkdepth; 106static int g_topn = INT_MAX; 107static hrtime_t g_elapsed; 108static int g_rates = 0; 109static int g_pflag = 0; 110static int g_Pflag = 0; 111static int g_wflag = 0; 112static int g_Wflag = 0; 113static int g_cflag = 0; 114static int g_kflag = 0; 115static int g_gflag = 0; 116static int g_Vflag = 0; 117static int g_tracing = 0; 118static size_t g_recsize; 119static size_t g_nrecs; 120static int g_nrecs_used; 121static uchar_t g_enabled[LS_MAX_EVENTS]; 122static hrtime_t g_min_duration[LS_MAX_EVENTS]; 123static dtrace_hdl_t *g_dtp; 124static char *g_predicate; 125static char *g_ipredicate; 126static char *g_prog; 127static int g_proglen; 128static int g_dropped; 129 130typedef struct ls_event_info { 131 char ev_type; 132 char ev_lhdr[20]; 133 char ev_desc[80]; 134 char ev_units[10]; 135 char ev_name[DTRACE_NAMELEN]; 136 char *ev_predicate; 137 char *ev_acquire; 138} ls_event_info_t; 139 140static ls_event_info_t g_event_info[LS_MAX_EVENTS] = { 141 { 'C', "Lock", "Adaptive mutex spin", "nsec", 142 "lockstat:::adaptive-spin" }, 143 { 'C', "Lock", "Adaptive mutex block", "nsec", 144 "lockstat:::adaptive-block" }, 145 { 'C', "Lock", "Spin lock spin", "nsec", 146 "lockstat:::spin-spin" }, 147 { 'C', "Lock", "Thread lock spin", "nsec", 148 "lockstat:::thread-spin" }, 149 { 'C', "Lock", "R/W writer blocked by writer", "nsec", 150 "lockstat:::rw-block", "arg2 == 0 && arg3 == 1" }, 151 { 'C', "Lock", "R/W writer blocked by readers", "nsec", 152 "lockstat:::rw-block", "arg2 == 0 && arg3 == 0 && arg4" }, 153 { 'C', "Lock", "R/W reader blocked by writer", "nsec", 154 "lockstat:::rw-block", "arg2 != 0 && arg3 == 1" }, 155 { 'C', "Lock", "R/W reader blocked by write wanted", "nsec", 156 "lockstat:::rw-block", "arg2 != 0 && arg3 == 0 && arg4" }, 157 { 'C', "Lock", "Unknown event (type 8)", "units" }, 158 { 'C', "Lock", "Unknown event (type 9)", "units" }, 159 { 'C', "Lock", "Unknown event (type 10)", "units" }, 160 { 'C', "Lock", "Unknown event (type 11)", "units" }, 161 { 'C', "Lock", "Unknown event (type 12)", "units" }, 162 { 'C', "Lock", "Unknown event (type 13)", "units" }, 163 { 'C', "Lock", "Unknown event (type 14)", "units" }, 164 { 'C', "Lock", "Unknown event (type 15)", "units" }, 165 { 'C', "Lock", "Unknown event (type 16)", "units" }, 166 { 'C', "Lock", "Unknown event (type 17)", "units" }, 167 { 'C', "Lock", "Unknown event (type 18)", "units" }, 168 { 'C', "Lock", "Unknown event (type 19)", "units" }, 169 { 'C', "Lock", "Unknown event (type 20)", "units" }, 170 { 'C', "Lock", "Unknown event (type 21)", "units" }, 171 { 'C', "Lock", "Unknown event (type 22)", "units" }, 172 { 'C', "Lock", "Unknown event (type 23)", "units" }, 173 { 'C', "Lock", "Unknown event (type 24)", "units" }, 174 { 'C', "Lock", "Unknown event (type 25)", "units" }, 175 { 'C', "Lock", "Unknown event (type 26)", "units" }, 176 { 'C', "Lock", "Unknown event (type 27)", "units" }, 177 { 'C', "Lock", "Unknown event (type 28)", "units" }, 178 { 'C', "Lock", "Unknown event (type 29)", "units" }, 179 { 'C', "Lock", "Unknown event (type 30)", "units" }, 180 { 'C', "Lock", "Unknown event (type 31)", "units" }, 181 { 'H', "Lock", "Adaptive mutex hold", "nsec", 182 "lockstat:::adaptive-release", NULL, 183 "lockstat:::adaptive-acquire" }, 184 { 'H', "Lock", "Spin lock hold", "nsec", 185 "lockstat:::spin-release", NULL, 186 "lockstat:::spin-acquire" }, 187 { 'H', "Lock", "R/W writer hold", "nsec", 188 "lockstat:::rw-release", "arg1 == 0", 189 "lockstat:::rw-acquire" }, 190 { 'H', "Lock", "R/W reader hold", "nsec", 191 "lockstat:::rw-release", "arg1 != 0", 192 "lockstat:::rw-acquire" }, 193 { 'H', "Lock", "Unknown event (type 36)", "units" }, 194 { 'H', "Lock", "Unknown event (type 37)", "units" }, 195 { 'H', "Lock", "Unknown event (type 38)", "units" }, 196 { 'H', "Lock", "Unknown event (type 39)", "units" }, 197 { 'H', "Lock", "Unknown event (type 40)", "units" }, 198 { 'H', "Lock", "Unknown event (type 41)", "units" }, 199 { 'H', "Lock", "Unknown event (type 42)", "units" }, 200 { 'H', "Lock", "Unknown event (type 43)", "units" }, 201 { 'H', "Lock", "Unknown event (type 44)", "units" }, 202 { 'H', "Lock", "Unknown event (type 45)", "units" }, 203 { 'H', "Lock", "Unknown event (type 46)", "units" }, 204 { 'H', "Lock", "Unknown event (type 47)", "units" }, 205 { 'H', "Lock", "Unknown event (type 48)", "units" }, 206 { 'H', "Lock", "Unknown event (type 49)", "units" }, 207 { 'H', "Lock", "Unknown event (type 50)", "units" }, 208 { 'H', "Lock", "Unknown event (type 51)", "units" }, 209 { 'H', "Lock", "Unknown event (type 52)", "units" }, 210 { 'H', "Lock", "Unknown event (type 53)", "units" }, 211 { 'H', "Lock", "Unknown event (type 54)", "units" }, 212 { 'H', "Lock", "Unknown event (type 55)", "units" }, 213#if defined(sun) 214 { 'I', "CPU+PIL", "Profiling interrupt", "nsec", 215#else 216 /* FreeBSD */ 217 { 'I', "CPU+Pri_Class", "Profiling interrupt", "nsec", 218#endif 219 "profile:::profile-97", NULL }, 220 { 'I', "Lock", "Unknown event (type 57)", "units" }, 221 { 'I', "Lock", "Unknown event (type 58)", "units" }, 222 { 'I', "Lock", "Unknown event (type 59)", "units" }, 223 { 'E', "Lock", "Recursive lock entry detected", "(N/A)", 224 "lockstat:::rw-release", NULL, "lockstat:::rw-acquire" }, 225 { 'E', "Lock", "Lockstat enter failure", "(N/A)" }, 226 { 'E', "Lock", "Lockstat exit failure", "nsec" }, 227 { 'E', "Lock", "Lockstat record failure", "(N/A)" }, 228}; 229 230#if !defined(sun) 231static char *g_pri_class[] = { 232 "", 233 "Intr", 234 "RealT", 235 "TShar", 236 "Idle" 237}; 238#endif 239 240static void 241fail(int do_perror, const char *message, ...) 242{ 243 va_list args; 244 int save_errno = errno; 245 246 va_start(args, message); 247 (void) fprintf(stderr, "lockstat: "); 248 (void) vfprintf(stderr, message, args); 249 va_end(args); 250 if (do_perror) 251 (void) fprintf(stderr, ": %s", strerror(save_errno)); 252 (void) fprintf(stderr, "\n"); 253 exit(2); 254} 255 256static void 257dfail(const char *message, ...) 258{ 259 va_list args; 260 261 va_start(args, message); 262 (void) fprintf(stderr, "lockstat: "); 263 (void) vfprintf(stderr, message, args); 264 va_end(args); 265 (void) fprintf(stderr, ": %s\n", 266 dtrace_errmsg(g_dtp, dtrace_errno(g_dtp))); 267 268 exit(2); 269} 270 271static void 272show_events(char event_type, char *desc) 273{ 274 int i, first = -1, last; 275 276 for (i = 0; i < LS_MAX_EVENTS; i++) { 277 ls_event_info_t *evp = &g_event_info[i]; 278 if (evp->ev_type != event_type || 279 strncmp(evp->ev_desc, "Unknown event", 13) == 0) 280 continue; 281 if (first == -1) 282 first = i; 283 last = i; 284 } 285 286 (void) fprintf(stderr, 287 "\n%s events (lockstat -%c or lockstat -e %d-%d):\n\n", 288 desc, event_type, first, last); 289 290 for (i = first; i <= last; i++) 291 (void) fprintf(stderr, 292 "%4d = %s\n", i, g_event_info[i].ev_desc); 293} 294 295static void 296usage(void) 297{ 298 (void) fprintf(stderr, 299 "Usage: lockstat [options] command [args]\n" 300 "\nEvent selection options:\n\n" 301 " -C watch contention events [on by default]\n" 302 " -E watch error events [off by default]\n" 303 " -H watch hold events [off by default]\n" 304 " -I watch interrupt events [off by default]\n" 305 " -A watch all lock events [equivalent to -CH]\n" 306 " -e event_list only watch the specified events (shown below);\n" 307 " <event_list> is a comma-separated list of\n" 308 " events or ranges of events, e.g. 1,4-7,35\n" 309 " -i rate interrupt rate for -I [default: %d Hz]\n" 310 "\nData gathering options:\n\n" 311 " -b basic statistics (lock, caller, event count)\n" 312 " -t timing for all events [default]\n" 313 " -h histograms for event times\n" 314 " -s depth stack traces <depth> deep\n" 315 " -x opt[=val] enable or modify DTrace options\n" 316 "\nData filtering options:\n\n" 317 " -n nrecords maximum number of data records [default: %d]\n" 318 " -l lock[,size] only watch <lock>, which can be specified as a\n" 319 " symbolic name or hex address; <size> defaults\n" 320 " to the ELF symbol size if available, 1 if not\n" 321 " -f func[,size] only watch events generated by <func>\n" 322 " -d duration only watch events longer than <duration>\n" 323 " -T trace (rather than sample) events\n" 324 "\nData reporting options:\n\n" 325 " -c coalesce lock data for arrays like pse_mutex[]\n" 326 " -k coalesce PCs within functions\n" 327 " -g show total events generated by function\n" 328 " -w wherever: don't distinguish events by caller\n" 329 " -W whichever: don't distinguish events by lock\n" 330 " -R display rates rather than counts\n" 331 " -p parsable output format (awk(1)-friendly)\n" 332 " -P sort lock data by (count * avg_time) product\n" 333 " -D n only display top <n> events of each type\n" 334 " -o filename send output to <filename>\n", 335 DEFAULT_HZ, DEFAULT_NRECS); 336 337 show_events('C', "Contention"); 338 show_events('H', "Hold-time"); 339 show_events('I', "Interrupt"); 340 show_events('E', "Error"); 341 (void) fprintf(stderr, "\n"); 342 343 exit(1); 344} 345 346static int 347lockcmp(lsrec_t *a, lsrec_t *b) 348{ 349 int i; 350 351 if (a->ls_event < b->ls_event) 352 return (-1); 353 if (a->ls_event > b->ls_event) 354 return (1); 355 356 for (i = g_stkdepth - 1; i >= 0; i--) { 357 if (a->ls_stack[i] < b->ls_stack[i]) 358 return (-1); 359 if (a->ls_stack[i] > b->ls_stack[i]) 360 return (1); 361 } 362 363 if (a->ls_caller < b->ls_caller) 364 return (-1); 365 if (a->ls_caller > b->ls_caller) 366 return (1); 367 368 if (a->ls_lock < b->ls_lock) 369 return (-1); 370 if (a->ls_lock > b->ls_lock) 371 return (1); 372 373 return (0); 374} 375 376static int 377countcmp(lsrec_t *a, lsrec_t *b) 378{ 379 if (a->ls_event < b->ls_event) 380 return (-1); 381 if (a->ls_event > b->ls_event) 382 return (1); 383 384 return (b->ls_count - a->ls_count); 385} 386 387static int 388timecmp(lsrec_t *a, lsrec_t *b) 389{ 390 if (a->ls_event < b->ls_event) 391 return (-1); 392 if (a->ls_event > b->ls_event) 393 return (1); 394 395 if (a->ls_time < b->ls_time) 396 return (1); 397 if (a->ls_time > b->ls_time) 398 return (-1); 399 400 return (0); 401} 402 403static int 404lockcmp_anywhere(lsrec_t *a, lsrec_t *b) 405{ 406 if (a->ls_event < b->ls_event) 407 return (-1); 408 if (a->ls_event > b->ls_event) 409 return (1); 410 411 if (a->ls_lock < b->ls_lock) 412 return (-1); 413 if (a->ls_lock > b->ls_lock) 414 return (1); 415 416 return (0); 417} 418 419static int 420lock_and_count_cmp_anywhere(lsrec_t *a, lsrec_t *b) 421{ 422 if (a->ls_event < b->ls_event) 423 return (-1); 424 if (a->ls_event > b->ls_event) 425 return (1); 426 427 if (a->ls_lock < b->ls_lock) 428 return (-1); 429 if (a->ls_lock > b->ls_lock) 430 return (1); 431 432 return (b->ls_count - a->ls_count); 433} 434 435static int 436sitecmp_anylock(lsrec_t *a, lsrec_t *b) 437{ 438 int i; 439 440 if (a->ls_event < b->ls_event) 441 return (-1); 442 if (a->ls_event > b->ls_event) 443 return (1); 444 445 for (i = g_stkdepth - 1; i >= 0; i--) { 446 if (a->ls_stack[i] < b->ls_stack[i]) 447 return (-1); 448 if (a->ls_stack[i] > b->ls_stack[i]) 449 return (1); 450 } 451 452 if (a->ls_caller < b->ls_caller) 453 return (-1); 454 if (a->ls_caller > b->ls_caller) 455 return (1); 456 457 return (0); 458} 459 460static int 461site_and_count_cmp_anylock(lsrec_t *a, lsrec_t *b) 462{ 463 int i; 464 465 if (a->ls_event < b->ls_event) 466 return (-1); 467 if (a->ls_event > b->ls_event) 468 return (1); 469 470 for (i = g_stkdepth - 1; i >= 0; i--) { 471 if (a->ls_stack[i] < b->ls_stack[i]) 472 return (-1); 473 if (a->ls_stack[i] > b->ls_stack[i]) 474 return (1); 475 } 476 477 if (a->ls_caller < b->ls_caller) 478 return (-1); 479 if (a->ls_caller > b->ls_caller) 480 return (1); 481 482 return (b->ls_count - a->ls_count); 483} 484 485static void 486lsmergesort(int (*cmp)(lsrec_t *, lsrec_t *), lsrec_t **a, lsrec_t **b, int n) 487{ 488 int m = n / 2; 489 int i, j; 490 491 if (m > 1) 492 lsmergesort(cmp, a, b, m); 493 if (n - m > 1) 494 lsmergesort(cmp, a + m, b + m, n - m); 495 for (i = m; i > 0; i--) 496 b[i - 1] = a[i - 1]; 497 for (j = m - 1; j < n - 1; j++) 498 b[n + m - j - 2] = a[j + 1]; 499 while (i < j) 500 *a++ = cmp(b[i], b[j]) < 0 ? b[i++] : b[j--]; 501 *a = b[i]; 502} 503 504static void 505coalesce(int (*cmp)(lsrec_t *, lsrec_t *), lsrec_t **lock, int n) 506{ 507 int i, j; 508 lsrec_t *target, *current; 509 510 target = lock[0]; 511 512 for (i = 1; i < n; i++) { 513 current = lock[i]; 514 if (cmp(current, target) != 0) { 515 target = current; 516 continue; 517 } 518 current->ls_event = LS_MAX_EVENTS; 519 target->ls_count += current->ls_count; 520 target->ls_refcnt += current->ls_refcnt; 521 if (g_recsize < LS_TIME) 522 continue; 523 target->ls_time += current->ls_time; 524 if (g_recsize < LS_HIST) 525 continue; 526 for (j = 0; j < 64; j++) 527 target->ls_hist[j] += current->ls_hist[j]; 528 } 529} 530 531static void 532coalesce_symbol(uintptr_t *addrp) 533{ 534 uintptr_t symoff; 535 size_t symsize; 536 537 if (addr_to_sym(*addrp, &symoff, &symsize) != NULL && symoff < symsize) 538 *addrp -= symoff; 539} 540 541static void 542predicate_add(char **pred, char *what, char *cmp, uintptr_t value) 543{ 544 char *new; 545 int len, newlen; 546 547 if (what == NULL) 548 return; 549 550 if (*pred == NULL) { 551 *pred = malloc(1); 552 *pred[0] = '\0'; 553 } 554 555 len = strlen(*pred); 556 newlen = len + strlen(what) + 32 + strlen("( && )"); 557 new = malloc(newlen); 558 559 if (*pred[0] != '\0') { 560 if (cmp != NULL) { 561 (void) sprintf(new, "(%s) && (%s %s 0x%p)", 562 *pred, what, cmp, (void *)value); 563 } else { 564 (void) sprintf(new, "(%s) && (%s)", *pred, what); 565 } 566 } else { 567 if (cmp != NULL) { 568 (void) sprintf(new, "%s %s 0x%p", 569 what, cmp, (void *)value); 570 } else { 571 (void) sprintf(new, "%s", what); 572 } 573 } 574 575 free(*pred); 576 *pred = new; 577} 578 579static void 580predicate_destroy(char **pred) 581{ 582 free(*pred); 583 *pred = NULL; 584} 585 586static void 587filter_add(char **filt, char *what, uintptr_t base, uintptr_t size) 588{ 589 char buf[256], *c = buf, *new; 590 int len, newlen; 591 592 if (*filt == NULL) { 593 *filt = malloc(1); 594 *filt[0] = '\0'; 595 } 596 597#if defined(sun) 598 (void) sprintf(c, "%s(%s >= 0x%p && %s < 0x%p)", *filt[0] != '\0' ? 599 " || " : "", what, (void *)base, what, (void *)(base + size)); 600#else 601 (void) sprintf(c, "%s(%s >= %p && %s < %p)", *filt[0] != '\0' ? 602 " || " : "", what, (void *)base, what, (void *)(base + size)); 603#endif 604 605 newlen = (len = strlen(*filt) + 1) + strlen(c); 606 new = malloc(newlen); 607 bcopy(*filt, new, len); 608 (void) strcat(new, c); 609 free(*filt); 610 *filt = new; 611} 612 613static void 614filter_destroy(char **filt) 615{ 616 free(*filt); 617 *filt = NULL; 618} 619 620static void 621dprog_add(const char *fmt, ...) 622{ 623 va_list args; 624 int size, offs; 625 char c; 626 627 va_start(args, fmt); 628 size = vsnprintf(&c, 1, fmt, args) + 1; 629 va_end(args); 630 631 if (g_proglen == 0) { 632 offs = 0; 633 } else { 634 offs = g_proglen - 1; 635 } 636 637 g_proglen = offs + size; 638 639 if ((g_prog = realloc(g_prog, g_proglen)) == NULL) 640 fail(1, "failed to reallocate program text"); 641 642 va_start(args, fmt); 643 (void) vsnprintf(&g_prog[offs], size, fmt, args); 644 va_end(args); 645} 646 647/* 648 * This function may read like an open sewer, but keep in mind that programs 649 * that generate other programs are rarely pretty. If one has the unenviable 650 * task of maintaining or -- worse -- extending this code, use the -V option 651 * to examine the D program as generated by this function. 652 */ 653static void 654dprog_addevent(int event) 655{ 656 ls_event_info_t *info = &g_event_info[event]; 657 char *pred = NULL; 658 char stack[20]; 659 const char *arg0, *caller; 660 char *arg1 = "arg1"; 661 char buf[80]; 662 hrtime_t dur; 663 int depth; 664 665 if (info->ev_name[0] == '\0') 666 return; 667 668 if (info->ev_type == 'I') { 669 /* 670 * For interrupt events, arg0 (normally the lock pointer) is 671 * the CPU address plus the current pil, and arg1 (normally 672 * the number of nanoseconds) is the number of nanoseconds 673 * late -- and it's stored in arg2. 674 */ 675#if defined(sun) 676 arg0 = "(uintptr_t)curthread->t_cpu + \n" 677 "\t curthread->t_cpu->cpu_profile_pil"; 678#else 679 arg0 = "(uintptr_t)(curthread->td_oncpu << 16) + \n" 680 "\t 0x01000000 + curthread->td_pri_class"; 681#endif 682 caller = "(uintptr_t)arg0"; 683 arg1 = "arg2"; 684 } else { 685 arg0 = "(uintptr_t)arg0"; 686 caller = "caller"; 687 } 688 689 if (g_recsize > LS_HIST) { 690 for (depth = 0; g_recsize > LS_STACK(depth); depth++) 691 continue; 692 693 if (g_tracing) { 694 (void) sprintf(stack, "\tstack(%d);\n", depth); 695 } else { 696 (void) sprintf(stack, ", stack(%d)", depth); 697 } 698 } else { 699 (void) sprintf(stack, ""); 700 } 701 702 if (info->ev_acquire != NULL) { 703 /* 704 * If this is a hold event, we need to generate an additional 705 * clause for the acquire; the clause for the release will be 706 * generated with the aggregating statement, below. 707 */ 708 dprog_add("%s\n", info->ev_acquire); 709 predicate_add(&pred, info->ev_predicate, NULL, 0); 710 predicate_add(&pred, g_predicate, NULL, 0); 711 if (pred != NULL) 712 dprog_add("/%s/\n", pred); 713 714 dprog_add("{\n"); 715 (void) sprintf(buf, "self->ev%d[(uintptr_t)arg0]", event); 716 717 if (info->ev_type == 'H') { 718 dprog_add("\t%s = timestamp;\n", buf); 719 } else { 720 /* 721 * If this isn't a hold event, it's the recursive 722 * error event. For this, we simply bump the 723 * thread-local, per-lock count. 724 */ 725 dprog_add("\t%s++;\n", buf); 726 } 727 728 dprog_add("}\n\n"); 729 predicate_destroy(&pred); 730 pred = NULL; 731 732 if (info->ev_type == 'E') { 733 /* 734 * If this is the recursive lock error event, we need 735 * to generate an additional clause to decrement the 736 * thread-local, per-lock count. This assures that we 737 * only execute the aggregating clause if we have 738 * recursive entry. 739 */ 740 dprog_add("%s\n", info->ev_name); 741 dprog_add("/%s/\n{\n\t%s--;\n}\n\n", buf, buf); 742 } 743 744 predicate_add(&pred, buf, NULL, 0); 745 746 if (info->ev_type == 'H') { 747 (void) sprintf(buf, "timestamp -\n\t " 748 "self->ev%d[(uintptr_t)arg0]", event); 749 } 750 751 arg1 = buf; 752 } else { 753 predicate_add(&pred, info->ev_predicate, NULL, 0); 754 if (info->ev_type != 'I') 755 predicate_add(&pred, g_predicate, NULL, 0); 756 else 757 predicate_add(&pred, g_ipredicate, NULL, 0); 758 } 759 760 if ((dur = g_min_duration[event]) != 0) 761 predicate_add(&pred, arg1, ">=", dur); 762 763 dprog_add("%s\n", info->ev_name); 764 765 if (pred != NULL) 766 dprog_add("/%s/\n", pred); 767 predicate_destroy(&pred); 768 769 dprog_add("{\n"); 770 771 if (g_tracing) { 772 dprog_add("\ttrace(%dULL);\n", event); 773 dprog_add("\ttrace(%s);\n", arg0); 774 dprog_add("\ttrace(%s);\n", caller); 775 dprog_add(stack); 776 } else { 777 /* 778 * The ordering here is important: when we process the 779 * aggregate, we count on the fact that @avg appears before 780 * @hist in program order to assure that @avg is assigned the 781 * first aggregation variable ID and @hist assigned the 782 * second; see the comment in process_aggregate() for details. 783 */ 784 dprog_add("\t@avg[%dULL, %s, %s%s] = avg(%s);\n", 785 event, arg0, caller, stack, arg1); 786 787 if (g_recsize >= LS_HIST) { 788 dprog_add("\t@hist[%dULL, %s, %s%s] = quantize" 789 "(%s);\n", event, arg0, caller, stack, arg1); 790 } 791 } 792 793 if (info->ev_acquire != NULL) 794 dprog_add("\tself->ev%d[arg0] = 0;\n", event); 795 796 dprog_add("}\n\n"); 797} 798 799static void 800dprog_compile() 801{ 802 dtrace_prog_t *prog; 803 dtrace_proginfo_t info; 804 805 if (g_Vflag) { 806 (void) fprintf(stderr, "lockstat: vvvv D program vvvv\n"); 807 (void) fputs(g_prog, stderr); 808 (void) fprintf(stderr, "lockstat: ^^^^ D program ^^^^\n"); 809 } 810 811 if ((prog = dtrace_program_strcompile(g_dtp, g_prog, 812 DTRACE_PROBESPEC_NAME, 0, 0, NULL)) == NULL) 813 dfail("failed to compile program"); 814 815 if (dtrace_program_exec(g_dtp, prog, &info) == -1) 816 dfail("failed to enable probes"); 817 818 if (dtrace_go(g_dtp) != 0) 819 dfail("couldn't start tracing"); 820} 821 822static void 823#if defined(sun) 824status_fire(void) 825#else 826status_fire(int i) 827#endif 828{} 829 830static void 831status_init(void) 832{ 833 dtrace_optval_t val, status, agg; 834 struct sigaction act; 835 struct itimerspec ts; 836 struct sigevent ev; 837 timer_t tid; 838 839 if (dtrace_getopt(g_dtp, "statusrate", &status) == -1) 840 dfail("failed to get 'statusrate'"); 841 842 if (dtrace_getopt(g_dtp, "aggrate", &agg) == -1) 843 dfail("failed to get 'statusrate'"); 844 845 /* 846 * We would want to awaken at a rate that is the GCD of the statusrate 847 * and the aggrate -- but that seems a bit absurd. Instead, we'll 848 * simply awaken at a rate that is the more frequent of the two, which 849 * assures that we're never later than the interval implied by the 850 * more frequent rate. 851 */ 852 val = status < agg ? status : agg; 853 854 (void) sigemptyset(&act.sa_mask); 855 act.sa_flags = 0; 856 act.sa_handler = status_fire; 857 (void) sigaction(SIGUSR1, &act, NULL); 858 859 ev.sigev_notify = SIGEV_SIGNAL; 860 ev.sigev_signo = SIGUSR1; 861 862 if (timer_create(CLOCK_REALTIME, &ev, &tid) == -1) 863 dfail("cannot create CLOCK_REALTIME timer"); 864 865 ts.it_value.tv_sec = val / NANOSEC; 866 ts.it_value.tv_nsec = val % NANOSEC; 867 ts.it_interval = ts.it_value; 868 869 if (timer_settime(tid, TIMER_RELTIME, &ts, NULL) == -1) 870 dfail("cannot set time on CLOCK_REALTIME timer"); 871} 872 873static void 874status_check(void) 875{ 876 if (!g_tracing && dtrace_aggregate_snap(g_dtp) != 0) 877 dfail("failed to snap aggregate"); 878 879 if (dtrace_status(g_dtp) == -1) 880 dfail("dtrace_status()"); 881} 882 883static void 884lsrec_fill(lsrec_t *lsrec, const dtrace_recdesc_t *rec, int nrecs, caddr_t data) 885{ 886 bzero(lsrec, g_recsize); 887 lsrec->ls_count = 1; 888 889 if ((g_recsize > LS_HIST && nrecs < 4) || (nrecs < 3)) 890 fail(0, "truncated DTrace record"); 891 892 if (rec->dtrd_size != sizeof (uint64_t)) 893 fail(0, "bad event size in first record"); 894 895 /* LINTED - alignment */ 896 lsrec->ls_event = (uint32_t)*((uint64_t *)(data + rec->dtrd_offset)); 897 rec++; 898 899 if (rec->dtrd_size != sizeof (uintptr_t)) 900 fail(0, "bad lock address size in second record"); 901 902 /* LINTED - alignment */ 903 lsrec->ls_lock = *((uintptr_t *)(data + rec->dtrd_offset)); 904 rec++; 905 906 if (rec->dtrd_size != sizeof (uintptr_t)) 907 fail(0, "bad caller size in third record"); 908 909 /* LINTED - alignment */ 910 lsrec->ls_caller = *((uintptr_t *)(data + rec->dtrd_offset)); 911 rec++; 912 913 if (g_recsize > LS_HIST) { 914 int frames, i; 915 pc_t *stack; 916 917 frames = rec->dtrd_size / sizeof (pc_t); 918 /* LINTED - alignment */ 919 stack = (pc_t *)(data + rec->dtrd_offset); 920 921 for (i = 1; i < frames; i++) 922 lsrec->ls_stack[i - 1] = stack[i]; 923 } 924} 925 926/*ARGSUSED*/ 927static int 928count_aggregate(const dtrace_aggdata_t *agg, void *arg) 929{ 930 *((size_t *)arg) += 1; 931 932 return (DTRACE_AGGWALK_NEXT); 933} 934 935static int 936process_aggregate(const dtrace_aggdata_t *agg, void *arg) 937{ 938 const dtrace_aggdesc_t *aggdesc = agg->dtada_desc; 939 caddr_t data = agg->dtada_data; 940 lsdata_t *lsdata = arg; 941 lsrec_t *lsrec = lsdata->lsd_next; 942 const dtrace_recdesc_t *rec; 943 uint64_t *avg, *quantized; 944 int i, j; 945 946 assert(lsdata->lsd_count < g_nrecs); 947 948 /* 949 * Aggregation variable IDs are guaranteed to be generated in program 950 * order, and they are guaranteed to start from DTRACE_AGGVARIDNONE 951 * plus one. As "avg" appears before "hist" in program order, we know 952 * that "avg" will be allocated the first aggregation variable ID, and 953 * "hist" will be allocated the second aggregation variable ID -- and 954 * we therefore use the aggregation variable ID to differentiate the 955 * cases. 956 */ 957 if (aggdesc->dtagd_varid > DTRACE_AGGVARIDNONE + 1) { 958 /* 959 * If this is the histogram entry. We'll copy the quantized 960 * data into lc_hist, and jump over the rest. 961 */ 962 rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1]; 963 964 if (aggdesc->dtagd_varid != DTRACE_AGGVARIDNONE + 2) 965 fail(0, "bad variable ID in aggregation record"); 966 967 if (rec->dtrd_size != 968 DTRACE_QUANTIZE_NBUCKETS * sizeof (uint64_t)) 969 fail(0, "bad quantize size in aggregation record"); 970 971 /* LINTED - alignment */ 972 quantized = (uint64_t *)(data + rec->dtrd_offset); 973 974 for (i = DTRACE_QUANTIZE_ZEROBUCKET, j = 0; 975 i < DTRACE_QUANTIZE_NBUCKETS; i++, j++) 976 lsrec->ls_hist[j] = quantized[i]; 977 978 goto out; 979 } 980 981 lsrec_fill(lsrec, &aggdesc->dtagd_rec[1], 982 aggdesc->dtagd_nrecs - 1, data); 983 984 rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1]; 985 986 if (rec->dtrd_size != 2 * sizeof (uint64_t)) 987 fail(0, "bad avg size in aggregation record"); 988 989 /* LINTED - alignment */ 990 avg = (uint64_t *)(data + rec->dtrd_offset); 991 lsrec->ls_count = (uint32_t)avg[0]; 992 lsrec->ls_time = (uintptr_t)avg[1]; 993 994 if (g_recsize >= LS_HIST) 995 return (DTRACE_AGGWALK_NEXT); 996 997out: 998 lsdata->lsd_next = (lsrec_t *)((uintptr_t)lsrec + g_recsize); 999 lsdata->lsd_count++; 1000 1001 return (DTRACE_AGGWALK_NEXT); 1002} 1003 1004static int 1005process_trace(const dtrace_probedata_t *pdata, void *arg) 1006{ 1007 lsdata_t *lsdata = arg; 1008 lsrec_t *lsrec = lsdata->lsd_next; 1009 dtrace_eprobedesc_t *edesc = pdata->dtpda_edesc; 1010 caddr_t data = pdata->dtpda_data; 1011 1012 if (lsdata->lsd_count >= g_nrecs) 1013 return (DTRACE_CONSUME_NEXT); 1014 1015 lsrec_fill(lsrec, edesc->dtepd_rec, edesc->dtepd_nrecs, data); 1016 1017 lsdata->lsd_next = (lsrec_t *)((uintptr_t)lsrec + g_recsize); 1018 lsdata->lsd_count++; 1019 1020 return (DTRACE_CONSUME_NEXT); 1021} 1022 1023static int 1024process_data(FILE *out, char *data) 1025{ 1026 lsdata_t lsdata; 1027 1028 /* LINTED - alignment */ 1029 lsdata.lsd_next = (lsrec_t *)data; 1030 lsdata.lsd_count = 0; 1031 1032 if (g_tracing) { 1033 if (dtrace_consume(g_dtp, out, 1034 process_trace, NULL, &lsdata) != 0) 1035 dfail("failed to consume buffer"); 1036 1037 return (lsdata.lsd_count); 1038 } 1039 1040 if (dtrace_aggregate_walk_keyvarsorted(g_dtp, 1041 process_aggregate, &lsdata) != 0) 1042 dfail("failed to walk aggregate"); 1043 1044 return (lsdata.lsd_count); 1045} 1046 1047/*ARGSUSED*/ 1048static int 1049drophandler(const dtrace_dropdata_t *data, void *arg) 1050{ 1051 g_dropped++; 1052 (void) fprintf(stderr, "lockstat: warning: %s", data->dtdda_msg); 1053 return (DTRACE_HANDLE_OK); 1054} 1055 1056int 1057main(int argc, char **argv) 1058{ 1059 char *data_buf; 1060 lsrec_t *lsp, **current, **first, **sort_buf, **merge_buf; 1061 FILE *out = stdout; 1062 char c; 1063 pid_t child; 1064 int status; 1065 int i, j; 1066 hrtime_t duration; 1067 char *addrp, *offp, *sizep, *evp, *lastp, *p; 1068 uintptr_t addr; 1069 size_t size, off; 1070 int events_specified = 0; 1071 int exec_errno = 0; 1072 uint32_t event; 1073 char *filt = NULL, *ifilt = NULL; 1074 static uint64_t ev_count[LS_MAX_EVENTS + 1]; 1075 static uint64_t ev_time[LS_MAX_EVENTS + 1]; 1076 dtrace_optval_t aggsize; 1077 char aggstr[10]; 1078 long ncpus; 1079 int dynvar = 0; 1080 int err; 1081 1082 if ((g_dtp = dtrace_open(DTRACE_VERSION, 0, &err)) == NULL) { 1083 fail(0, "cannot open dtrace library: %s", 1084 dtrace_errmsg(NULL, err)); 1085 } 1086 1087 if (dtrace_handle_drop(g_dtp, &drophandler, NULL) == -1) 1088 dfail("couldn't establish drop handler"); 1089 1090 if (symtab_init() == -1) 1091 fail(1, "can't load kernel symbols"); 1092 1093 g_nrecs = DEFAULT_NRECS; 1094 1095 while ((c = getopt(argc, argv, LOCKSTAT_OPTSTR)) != EOF) { 1096 switch (c) { 1097 case 'b': 1098 g_recsize = LS_BASIC; 1099 break; 1100 1101 case 't': 1102 g_recsize = LS_TIME; 1103 break; 1104 1105 case 'h': 1106 g_recsize = LS_HIST; 1107 break; 1108 1109 case 's': 1110 if (!isdigit(optarg[0])) 1111 usage(); 1112 g_stkdepth = atoi(optarg); 1113 if (g_stkdepth > LS_MAX_STACK_DEPTH) 1114 fail(0, "max stack depth is %d", 1115 LS_MAX_STACK_DEPTH); 1116 g_recsize = LS_STACK(g_stkdepth); 1117 break; 1118 1119 case 'n': 1120 if (!isdigit(optarg[0])) 1121 usage(); 1122 g_nrecs = atoi(optarg); 1123 break; 1124 1125 case 'd': 1126 if (!isdigit(optarg[0])) 1127 usage(); 1128 duration = atoll(optarg); 1129 1130 /* 1131 * XXX -- durations really should be per event 1132 * since the units are different, but it's hard 1133 * to express this nicely in the interface. 1134 * Not clear yet what the cleanest solution is. 1135 */ 1136 for (i = 0; i < LS_MAX_EVENTS; i++) 1137 if (g_event_info[i].ev_type != 'E') 1138 g_min_duration[i] = duration; 1139 1140 break; 1141 1142 case 'i': 1143 if (!isdigit(optarg[0])) 1144 usage(); 1145 i = atoi(optarg); 1146 if (i <= 0) 1147 usage(); 1148 if (i > MAX_HZ) 1149 fail(0, "max interrupt rate is %d Hz", MAX_HZ); 1150 1151 for (j = 0; j < LS_MAX_EVENTS; j++) 1152 if (strcmp(g_event_info[j].ev_desc, 1153 "Profiling interrupt") == 0) 1154 break; 1155 1156 (void) sprintf(g_event_info[j].ev_name, 1157 "profile:::profile-%d", i); 1158 break; 1159 1160 case 'l': 1161 case 'f': 1162 addrp = strtok(optarg, ","); 1163 sizep = strtok(NULL, ","); 1164 addrp = strtok(optarg, ",+"); 1165 offp = strtok(NULL, ","); 1166 1167 size = sizep ? strtoul(sizep, NULL, 0) : 1; 1168 off = offp ? strtoul(offp, NULL, 0) : 0; 1169 1170 if (addrp[0] == '0') { 1171 addr = strtoul(addrp, NULL, 16) + off; 1172 } else { 1173 addr = sym_to_addr(addrp) + off; 1174 if (sizep == NULL) 1175 size = sym_size(addrp) - off; 1176 if (addr - off == 0) 1177 fail(0, "symbol '%s' not found", addrp); 1178 if (size == 0) 1179 size = 1; 1180 } 1181 1182 1183 if (c == 'l') { 1184 filter_add(&filt, "arg0", addr, size); 1185 } else { 1186 filter_add(&filt, "caller", addr, size); 1187 filter_add(&ifilt, "arg0", addr, size); 1188 } 1189 break; 1190 1191 case 'e': 1192 evp = strtok_r(optarg, ",", &lastp); 1193 while (evp) { 1194 int ev1, ev2; 1195 char *evp2; 1196 1197 (void) strtok(evp, "-"); 1198 evp2 = strtok(NULL, "-"); 1199 ev1 = atoi(evp); 1200 ev2 = evp2 ? atoi(evp2) : ev1; 1201 if ((uint_t)ev1 >= LS_MAX_EVENTS || 1202 (uint_t)ev2 >= LS_MAX_EVENTS || ev1 > ev2) 1203 fail(0, "-e events out of range"); 1204 for (i = ev1; i <= ev2; i++) 1205 g_enabled[i] = 1; 1206 evp = strtok_r(NULL, ",", &lastp); 1207 } 1208 events_specified = 1; 1209 break; 1210 1211 case 'c': 1212 g_cflag = 1; 1213 break; 1214 1215 case 'k': 1216 g_kflag = 1; 1217 break; 1218 1219 case 'w': 1220 g_wflag = 1; 1221 break; 1222 1223 case 'W': 1224 g_Wflag = 1; 1225 break; 1226 1227 case 'g': 1228 g_gflag = 1; 1229 break; 1230 1231 case 'C': 1232 case 'E': 1233 case 'H': 1234 case 'I': 1235 for (i = 0; i < LS_MAX_EVENTS; i++) 1236 if (g_event_info[i].ev_type == c) 1237 g_enabled[i] = 1; 1238 events_specified = 1; 1239 break; 1240 1241 case 'A': 1242 for (i = 0; i < LS_MAX_EVENTS; i++) 1243 if (strchr("CH", g_event_info[i].ev_type)) 1244 g_enabled[i] = 1; 1245 events_specified = 1; 1246 break; 1247 1248 case 'T': 1249 g_tracing = 1; 1250 break; 1251 1252 case 'D': 1253 if (!isdigit(optarg[0])) 1254 usage(); 1255 g_topn = atoi(optarg); 1256 break; 1257 1258 case 'R': 1259 g_rates = 1; 1260 break; 1261 1262 case 'p': 1263 g_pflag = 1; 1264 break; 1265 1266 case 'P': 1267 g_Pflag = 1; 1268 break; 1269 1270 case 'o': 1271 if ((out = fopen(optarg, "w")) == NULL) 1272 fail(1, "error opening file"); 1273 break; 1274 1275 case 'V': 1276 g_Vflag = 1; 1277 break; 1278 1279 default: 1280 if (strchr(LOCKSTAT_OPTSTR, c) == NULL) 1281 usage(); 1282 } 1283 } 1284 1285 if (filt != NULL) { 1286 predicate_add(&g_predicate, filt, NULL, 0); 1287 filter_destroy(&filt); 1288 } 1289 1290 if (ifilt != NULL) { 1291 predicate_add(&g_ipredicate, ifilt, NULL, 0); 1292 filter_destroy(&ifilt); 1293 } 1294 1295 if (g_recsize == 0) { 1296 if (g_gflag) { 1297 g_stkdepth = LS_MAX_STACK_DEPTH; 1298 g_recsize = LS_STACK(g_stkdepth); 1299 } else { 1300 g_recsize = LS_TIME; 1301 } 1302 } 1303 1304 if (g_gflag && g_recsize <= LS_STACK(0)) 1305 fail(0, "'-g' requires at least '-s 1' data gathering"); 1306 1307 /* 1308 * Make sure the alignment is reasonable 1309 */ 1310 g_recsize = -(-g_recsize & -sizeof (uint64_t)); 1311 1312 for (i = 0; i < LS_MAX_EVENTS; i++) { 1313 /* 1314 * If no events were specified, enable -C. 1315 */ 1316 if (!events_specified && g_event_info[i].ev_type == 'C') 1317 g_enabled[i] = 1; 1318 } 1319 1320 for (i = 0; i < LS_MAX_EVENTS; i++) { 1321 if (!g_enabled[i]) 1322 continue; 1323 1324 if (g_event_info[i].ev_acquire != NULL) { 1325 /* 1326 * If we've enabled a hold event, we must explicitly 1327 * allocate dynamic variable space. 1328 */ 1329 dynvar = 1; 1330 } 1331 1332 dprog_addevent(i); 1333 } 1334 1335 /* 1336 * Make sure there are remaining arguments to specify a child command 1337 * to execute. 1338 */ 1339 if (argc <= optind) 1340 usage(); 1341 1342 if ((ncpus = sysconf(_SC_NPROCESSORS_ONLN)) == -1) 1343 dfail("couldn't determine number of online CPUs"); 1344 1345 /* 1346 * By default, we set our data buffer size to be the number of records 1347 * multiplied by the size of the record, doubled to account for some 1348 * DTrace slop and divided by the number of CPUs. We silently clamp 1349 * the aggregation size at both a minimum and a maximum to prevent 1350 * absurdly low or high values. 1351 */ 1352 if ((aggsize = (g_nrecs * g_recsize * 2) / ncpus) < MIN_AGGSIZE) 1353 aggsize = MIN_AGGSIZE; 1354 1355 if (aggsize > MAX_AGGSIZE) 1356 aggsize = MAX_AGGSIZE; 1357 1358 (void) sprintf(aggstr, "%lld", (long long)aggsize); 1359 1360 if (!g_tracing) { 1361 if (dtrace_setopt(g_dtp, "bufsize", "4k") == -1) 1362 dfail("failed to set 'bufsize'"); 1363 1364 if (dtrace_setopt(g_dtp, "aggsize", aggstr) == -1) 1365 dfail("failed to set 'aggsize'"); 1366 1367 if (dynvar) { 1368 /* 1369 * If we're using dynamic variables, we set our 1370 * dynamic variable size to be one megabyte per CPU, 1371 * with a hard-limit of 32 megabytes. This may still 1372 * be too small in some cases, but it can be tuned 1373 * manually via -x if need be. 1374 */ 1375 (void) sprintf(aggstr, "%ldm", ncpus < 32 ? ncpus : 32); 1376 1377 if (dtrace_setopt(g_dtp, "dynvarsize", aggstr) == -1) 1378 dfail("failed to set 'dynvarsize'"); 1379 } 1380 } else { 1381 if (dtrace_setopt(g_dtp, "bufsize", aggstr) == -1) 1382 dfail("failed to set 'bufsize'"); 1383 } 1384 1385 if (dtrace_setopt(g_dtp, "statusrate", "10sec") == -1) 1386 dfail("failed to set 'statusrate'"); 1387 1388 optind = 1; 1389 while ((c = getopt(argc, argv, LOCKSTAT_OPTSTR)) != EOF) { 1390 switch (c) { 1391 case 'x': 1392 if ((p = strchr(optarg, '=')) != NULL) 1393 *p++ = '\0'; 1394 1395 if (dtrace_setopt(g_dtp, optarg, p) != 0) 1396 dfail("failed to set -x %s", optarg); 1397 break; 1398 } 1399 } 1400 1401 argc -= optind; 1402 argv += optind; 1403 1404 dprog_compile(); 1405 status_init(); 1406 1407 g_elapsed = -gethrtime(); 1408 1409 /* 1410 * Spawn the specified command and wait for it to complete. 1411 */ 1412 child = fork(); 1413 if (child == -1) 1414 fail(1, "cannot fork"); 1415 if (child == 0) { 1416 (void) dtrace_close(g_dtp); 1417 (void) execvp(argv[0], &argv[0]); 1418 exec_errno = errno; 1419 exit(127); 1420 } 1421 1422#if defined(sun) 1423 while (waitpid(child, &status, WEXITED) != child) 1424#else 1425 while (waitpid(child, &status, 0) != child) 1426#endif 1427 status_check(); 1428 1429 g_elapsed += gethrtime(); 1430 1431 if (WIFEXITED(status)) { 1432 if (WEXITSTATUS(status) != 0) { 1433 if (exec_errno != 0) { 1434 errno = exec_errno; 1435 fail(1, "could not execute %s", argv[0]); 1436 } 1437 (void) fprintf(stderr, 1438 "lockstat: warning: %s exited with code %d\n", 1439 argv[0], WEXITSTATUS(status)); 1440 } 1441 } else { 1442 (void) fprintf(stderr, 1443 "lockstat: warning: %s died on signal %d\n", 1444 argv[0], WTERMSIG(status)); 1445 } 1446 1447 if (dtrace_stop(g_dtp) == -1) 1448 dfail("failed to stop dtrace"); 1449 1450 /* 1451 * Before we read out the results, we need to allocate our buffer. 1452 * If we're tracing, then we'll just use the precalculated size. If 1453 * we're not, then we'll take a snapshot of the aggregate, and walk 1454 * it to count the number of records. 1455 */ 1456 if (!g_tracing) { 1457 if (dtrace_aggregate_snap(g_dtp) != 0) 1458 dfail("failed to snap aggregate"); 1459 1460 g_nrecs = 0; 1461 1462 if (dtrace_aggregate_walk(g_dtp, 1463 count_aggregate, &g_nrecs) != 0) 1464 dfail("failed to walk aggregate"); 1465 } 1466 1467#if defined(sun) 1468 if ((data_buf = memalign(sizeof (uint64_t), 1469 (g_nrecs + 1) * g_recsize)) == NULL) 1470#else 1471 if (posix_memalign((void **)&data_buf, sizeof (uint64_t), 1472 (g_nrecs + 1) * g_recsize) ) 1473#endif 1474 fail(1, "Memory allocation failed"); 1475 1476 /* 1477 * Read out the DTrace data. 1478 */ 1479 g_nrecs_used = process_data(out, data_buf); 1480 1481 if (g_nrecs_used > g_nrecs || g_dropped) 1482 (void) fprintf(stderr, "lockstat: warning: " 1483 "ran out of data records (use -n for more)\n"); 1484 1485 /* LINTED - alignment */ 1486 for (i = 0, lsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++, 1487 /* LINTED - alignment */ 1488 lsp = (lsrec_t *)((char *)lsp + g_recsize)) { 1489 ev_count[lsp->ls_event] += lsp->ls_count; 1490 ev_time[lsp->ls_event] += lsp->ls_time; 1491 } 1492 1493 /* 1494 * If -g was specified, convert stacks into individual records. 1495 */ 1496 if (g_gflag) { 1497 lsrec_t *newlsp, *oldlsp; 1498 1499#if defined(sun) 1500 newlsp = memalign(sizeof (uint64_t), 1501 g_nrecs_used * LS_TIME * (g_stkdepth + 1)); 1502#else 1503 posix_memalign((void **)&newlsp, sizeof (uint64_t), 1504 g_nrecs_used * LS_TIME * (g_stkdepth + 1)); 1505#endif 1506 if (newlsp == NULL) 1507 fail(1, "Cannot allocate space for -g processing"); 1508 lsp = newlsp; 1509 /* LINTED - alignment */ 1510 for (i = 0, oldlsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++, 1511 /* LINTED - alignment */ 1512 oldlsp = (lsrec_t *)((char *)oldlsp + g_recsize)) { 1513 int fr; 1514 int caller_in_stack = 0; 1515 1516 if (oldlsp->ls_count == 0) 1517 continue; 1518 1519 for (fr = 0; fr < g_stkdepth; fr++) { 1520 if (oldlsp->ls_stack[fr] == 0) 1521 break; 1522 if (oldlsp->ls_stack[fr] == oldlsp->ls_caller) 1523 caller_in_stack = 1; 1524 bcopy(oldlsp, lsp, LS_TIME); 1525 lsp->ls_caller = oldlsp->ls_stack[fr]; 1526 /* LINTED - alignment */ 1527 lsp = (lsrec_t *)((char *)lsp + LS_TIME); 1528 } 1529 if (!caller_in_stack) { 1530 bcopy(oldlsp, lsp, LS_TIME); 1531 /* LINTED - alignment */ 1532 lsp = (lsrec_t *)((char *)lsp + LS_TIME); 1533 } 1534 } 1535 g_nrecs = g_nrecs_used = 1536 ((uintptr_t)lsp - (uintptr_t)newlsp) / LS_TIME; 1537 g_recsize = LS_TIME; 1538 g_stkdepth = 0; 1539 free(data_buf); 1540 data_buf = (char *)newlsp; 1541 } 1542 1543 if ((sort_buf = calloc(2 * (g_nrecs + 1), 1544 sizeof (void *))) == NULL) 1545 fail(1, "Sort buffer allocation failed"); 1546 merge_buf = sort_buf + (g_nrecs + 1); 1547 1548 /* 1549 * Build the sort buffer, discarding zero-count records along the way. 1550 */ 1551 /* LINTED - alignment */ 1552 for (i = 0, lsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++, 1553 /* LINTED - alignment */ 1554 lsp = (lsrec_t *)((char *)lsp + g_recsize)) { 1555 if (lsp->ls_count == 0) 1556 lsp->ls_event = LS_MAX_EVENTS; 1557 sort_buf[i] = lsp; 1558 } 1559 1560 if (g_nrecs_used == 0) 1561 exit(0); 1562 1563 /* 1564 * Add a sentinel after the last record 1565 */ 1566 sort_buf[i] = lsp; 1567 lsp->ls_event = LS_MAX_EVENTS; 1568 1569 if (g_tracing) { 1570 report_trace(out, sort_buf); 1571 return (0); 1572 } 1573 1574 /* 1575 * Application of -g may have resulted in multiple records 1576 * with the same signature; coalesce them. 1577 */ 1578 if (g_gflag) { 1579 mergesort(lockcmp, sort_buf, merge_buf, g_nrecs_used); 1580 coalesce(lockcmp, sort_buf, g_nrecs_used); 1581 } 1582 1583 /* 1584 * Coalesce locks within the same symbol if -c option specified. 1585 * Coalesce PCs within the same function if -k option specified. 1586 */ 1587 if (g_cflag || g_kflag) { 1588 for (i = 0; i < g_nrecs_used; i++) { 1589 int fr; 1590 lsp = sort_buf[i]; 1591 if (g_cflag) 1592 coalesce_symbol(&lsp->ls_lock); 1593 if (g_kflag) { 1594 for (fr = 0; fr < g_stkdepth; fr++) 1595 coalesce_symbol(&lsp->ls_stack[fr]); 1596 coalesce_symbol(&lsp->ls_caller); 1597 } 1598 } 1599 mergesort(lockcmp, sort_buf, merge_buf, g_nrecs_used); 1600 coalesce(lockcmp, sort_buf, g_nrecs_used); 1601 } 1602 1603 /* 1604 * Coalesce callers if -w option specified 1605 */ 1606 if (g_wflag) { 1607 mergesort(lock_and_count_cmp_anywhere, 1608 sort_buf, merge_buf, g_nrecs_used); 1609 coalesce(lockcmp_anywhere, sort_buf, g_nrecs_used); 1610 } 1611 1612 /* 1613 * Coalesce locks if -W option specified 1614 */ 1615 if (g_Wflag) { 1616 mergesort(site_and_count_cmp_anylock, 1617 sort_buf, merge_buf, g_nrecs_used); 1618 coalesce(sitecmp_anylock, sort_buf, g_nrecs_used); 1619 } 1620 1621 /* 1622 * Sort data by contention count (ls_count) or total time (ls_time), 1623 * depending on g_Pflag. Override g_Pflag if time wasn't measured. 1624 */ 1625 if (g_recsize < LS_TIME) 1626 g_Pflag = 0; 1627 1628 if (g_Pflag) 1629 mergesort(timecmp, sort_buf, merge_buf, g_nrecs_used); 1630 else 1631 mergesort(countcmp, sort_buf, merge_buf, g_nrecs_used); 1632 1633 /* 1634 * Display data by event type 1635 */ 1636 first = &sort_buf[0]; 1637 while ((event = (*first)->ls_event) < LS_MAX_EVENTS) { 1638 current = first; 1639 while ((lsp = *current)->ls_event == event) 1640 current++; 1641 report_stats(out, first, current - first, ev_count[event], 1642 ev_time[event]); 1643 first = current; 1644 } 1645 1646 return (0); 1647} 1648 1649static char * 1650format_symbol(char *buf, uintptr_t addr, int show_size) 1651{ 1652 uintptr_t symoff; 1653 char *symname; 1654 size_t symsize; 1655 1656 symname = addr_to_sym(addr, &symoff, &symsize); 1657 1658 if (show_size && symoff == 0) 1659 (void) sprintf(buf, "%s[%ld]", symname, (long)symsize); 1660 else if (symoff == 0) 1661 (void) sprintf(buf, "%s", symname); 1662 else if (symoff < 16 && bcmp(symname, "cpu[", 4) == 0) /* CPU+PIL */ 1663#if defined(sun) 1664 (void) sprintf(buf, "%s+%ld", symname, (long)symoff); 1665#else 1666 (void) sprintf(buf, "%s+%s", symname, g_pri_class[(int)symoff]); 1667#endif 1668 else if (symoff <= symsize || (symoff < 256 && addr != symoff)) 1669 (void) sprintf(buf, "%s+0x%llx", symname, 1670 (unsigned long long)symoff); 1671 else 1672 (void) sprintf(buf, "0x%llx", (unsigned long long)addr); 1673 return (buf); 1674} 1675 1676static void 1677report_stats(FILE *out, lsrec_t **sort_buf, size_t nrecs, uint64_t total_count, 1678 uint64_t total_time) 1679{ 1680 uint32_t event = sort_buf[0]->ls_event; 1681 lsrec_t *lsp; 1682 double ptotal = 0.0; 1683 double percent; 1684 int i, j, fr; 1685 int displayed; 1686 int first_bin, last_bin, max_bin_count, total_bin_count; 1687 int rectype; 1688 char buf[256]; 1689 char lhdr[80], chdr[80]; 1690 1691 rectype = g_recsize; 1692 1693 if (g_topn == 0) { 1694 (void) fprintf(out, "%20llu %s\n", 1695 g_rates == 0 ? total_count : 1696 ((unsigned long long)total_count * NANOSEC) / g_elapsed, 1697 g_event_info[event].ev_desc); 1698 return; 1699 } 1700 1701 (void) sprintf(lhdr, "%s%s", 1702 g_Wflag ? "Hottest " : "", g_event_info[event].ev_lhdr); 1703 (void) sprintf(chdr, "%s%s", 1704 g_wflag ? "Hottest " : "", "Caller"); 1705 1706 if (!g_pflag) 1707 (void) fprintf(out, 1708 "\n%s: %.0f events in %.3f seconds (%.0f events/sec)\n\n", 1709 g_event_info[event].ev_desc, (double)total_count, 1710 (double)g_elapsed / NANOSEC, 1711 (double)total_count * NANOSEC / g_elapsed); 1712 1713 if (!g_pflag && rectype < LS_HIST) { 1714 (void) sprintf(buf, "%s", g_event_info[event].ev_units); 1715 (void) fprintf(out, "%5s %4s %4s %4s %8s %-22s %-24s\n", 1716 g_rates ? "ops/s" : "Count", 1717 g_gflag ? "genr" : "indv", 1718 "cuml", "rcnt", rectype >= LS_TIME ? buf : "", lhdr, chdr); 1719 (void) fprintf(out, "---------------------------------" 1720 "----------------------------------------------\n"); 1721 } 1722 1723 displayed = 0; 1724 for (i = 0; i < nrecs; i++) { 1725 lsp = sort_buf[i]; 1726 1727 if (displayed++ >= g_topn) 1728 break; 1729 1730 if (g_pflag) { 1731 int j; 1732 1733 (void) fprintf(out, "%u %u", 1734 lsp->ls_event, lsp->ls_count); 1735 (void) fprintf(out, " %s", 1736 format_symbol(buf, lsp->ls_lock, g_cflag)); 1737 (void) fprintf(out, " %s", 1738 format_symbol(buf, lsp->ls_caller, 0)); 1739 (void) fprintf(out, " %f", 1740 (double)lsp->ls_refcnt / lsp->ls_count); 1741 if (rectype >= LS_TIME) 1742 (void) fprintf(out, " %llu", 1743 (unsigned long long)lsp->ls_time); 1744 if (rectype >= LS_HIST) { 1745 for (j = 0; j < 64; j++) 1746 (void) fprintf(out, " %u", 1747 lsp->ls_hist[j]); 1748 } 1749 for (j = 0; j < LS_MAX_STACK_DEPTH; j++) { 1750 if (rectype <= LS_STACK(j) || 1751 lsp->ls_stack[j] == 0) 1752 break; 1753 (void) fprintf(out, " %s", 1754 format_symbol(buf, lsp->ls_stack[j], 0)); 1755 } 1756 (void) fprintf(out, "\n"); 1757 continue; 1758 } 1759 1760 if (rectype >= LS_HIST) { 1761 (void) fprintf(out, "---------------------------------" 1762 "----------------------------------------------\n"); 1763 (void) sprintf(buf, "%s", 1764 g_event_info[event].ev_units); 1765 (void) fprintf(out, "%5s %4s %4s %4s %8s %-22s %-24s\n", 1766 g_rates ? "ops/s" : "Count", 1767 g_gflag ? "genr" : "indv", 1768 "cuml", "rcnt", buf, lhdr, chdr); 1769 } 1770 1771 if (g_Pflag && total_time != 0) 1772 percent = (lsp->ls_time * 100.00) / total_time; 1773 else 1774 percent = (lsp->ls_count * 100.00) / total_count; 1775 1776 ptotal += percent; 1777 1778 if (rectype >= LS_TIME) 1779 (void) sprintf(buf, "%llu", 1780 (unsigned long long)(lsp->ls_time / lsp->ls_count)); 1781 else 1782 buf[0] = '\0'; 1783 1784 (void) fprintf(out, "%5llu ", 1785 g_rates == 0 ? lsp->ls_count : 1786 ((uint64_t)lsp->ls_count * NANOSEC) / g_elapsed); 1787 1788 (void) fprintf(out, "%3.0f%% ", percent); 1789 1790 if (g_gflag) 1791 (void) fprintf(out, "---- "); 1792 else 1793 (void) fprintf(out, "%3.0f%% ", ptotal); 1794 1795 (void) fprintf(out, "%4.2f %8s ", 1796 (double)lsp->ls_refcnt / lsp->ls_count, buf); 1797 1798 (void) fprintf(out, "%-22s ", 1799 format_symbol(buf, lsp->ls_lock, g_cflag)); 1800 1801 (void) fprintf(out, "%-24s\n", 1802 format_symbol(buf, lsp->ls_caller, 0)); 1803 1804 if (rectype < LS_HIST) 1805 continue; 1806 1807 (void) fprintf(out, "\n"); 1808 (void) fprintf(out, "%10s %31s %-9s %-24s\n", 1809 g_event_info[event].ev_units, 1810 "------ Time Distribution ------", 1811 g_rates ? "ops/s" : "count", 1812 rectype > LS_STACK(0) ? "Stack" : ""); 1813 1814 first_bin = 0; 1815 while (lsp->ls_hist[first_bin] == 0) 1816 first_bin++; 1817 1818 last_bin = 63; 1819 while (lsp->ls_hist[last_bin] == 0) 1820 last_bin--; 1821 1822 max_bin_count = 0; 1823 total_bin_count = 0; 1824 for (j = first_bin; j <= last_bin; j++) { 1825 total_bin_count += lsp->ls_hist[j]; 1826 if (lsp->ls_hist[j] > max_bin_count) 1827 max_bin_count = lsp->ls_hist[j]; 1828 } 1829 1830 /* 1831 * If we went a few frames below the caller, ignore them 1832 */ 1833 for (fr = 3; fr > 0; fr--) 1834 if (lsp->ls_stack[fr] == lsp->ls_caller) 1835 break; 1836 1837 for (j = first_bin; j <= last_bin; j++) { 1838 uint_t depth = (lsp->ls_hist[j] * 30) / total_bin_count; 1839 (void) fprintf(out, "%10llu |%s%s %-9u ", 1840 1ULL << j, 1841 "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + 30 - depth, 1842 " " + depth, 1843 g_rates == 0 ? lsp->ls_hist[j] : 1844 (uint_t)(((uint64_t)lsp->ls_hist[j] * NANOSEC) / 1845 g_elapsed)); 1846 if (rectype <= LS_STACK(fr) || lsp->ls_stack[fr] == 0) { 1847 (void) fprintf(out, "\n"); 1848 continue; 1849 } 1850 (void) fprintf(out, "%-24s\n", 1851 format_symbol(buf, lsp->ls_stack[fr], 0)); 1852 fr++; 1853 } 1854 while (rectype > LS_STACK(fr) && lsp->ls_stack[fr] != 0) { 1855 (void) fprintf(out, "%15s %-36s %-24s\n", "", "", 1856 format_symbol(buf, lsp->ls_stack[fr], 0)); 1857 fr++; 1858 } 1859 } 1860 1861 if (!g_pflag) 1862 (void) fprintf(out, "---------------------------------" 1863 "----------------------------------------------\n"); 1864 1865 (void) fflush(out); 1866} 1867 1868static void 1869report_trace(FILE *out, lsrec_t **sort_buf) 1870{ 1871 lsrec_t *lsp; 1872 int i, fr; 1873 int rectype; 1874 char buf[256], buf2[256]; 1875 1876 rectype = g_recsize; 1877 1878 if (!g_pflag) { 1879 (void) fprintf(out, "%5s %7s %11s %-24s %-24s\n", 1880 "Event", "Time", "Owner", "Lock", "Caller"); 1881 (void) fprintf(out, "---------------------------------" 1882 "----------------------------------------------\n"); 1883 } 1884 1885 for (i = 0; i < g_nrecs_used; i++) { 1886 1887 lsp = sort_buf[i]; 1888 1889 if (lsp->ls_event >= LS_MAX_EVENTS || lsp->ls_count == 0) 1890 continue; 1891 1892 (void) fprintf(out, "%2d %10llu %11p %-24s %-24s\n", 1893 lsp->ls_event, (unsigned long long)lsp->ls_time, 1894 (void *)lsp->ls_next, 1895 format_symbol(buf, lsp->ls_lock, 0), 1896 format_symbol(buf2, lsp->ls_caller, 0)); 1897 1898 if (rectype <= LS_STACK(0)) 1899 continue; 1900 1901 /* 1902 * If we went a few frames below the caller, ignore them 1903 */ 1904 for (fr = 3; fr > 0; fr--) 1905 if (lsp->ls_stack[fr] == lsp->ls_caller) 1906 break; 1907 1908 while (rectype > LS_STACK(fr) && lsp->ls_stack[fr] != 0) { 1909 (void) fprintf(out, "%53s %-24s\n", "", 1910 format_symbol(buf, lsp->ls_stack[fr], 0)); 1911 fr++; 1912 } 1913 (void) fprintf(out, "\n"); 1914 } 1915 1916 (void) fflush(out); 1917} 1918