lockstat.c revision 283508
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#pragma ident	"%Z%%M%	%I%	%E% SMI"
27
28#include <stdio.h>
29#include <stddef.h>
30#include <stdlib.h>
31#include <stdarg.h>
32#include <string.h>
33#include <strings.h>
34#include <ctype.h>
35#include <fcntl.h>
36#include <unistd.h>
37#include <errno.h>
38#include <limits.h>
39#include <sys/types.h>
40#include <sys/modctl.h>
41#include <sys/stat.h>
42#include <sys/wait.h>
43#include <dtrace.h>
44#include <sys/lockstat.h>
45#include <alloca.h>
46#include <signal.h>
47#include <assert.h>
48
49#ifdef illumos
50#define	GETOPT_EOF	EOF
51#else
52#include <sys/time.h>
53#include <sys/resource.h>
54
55#define	mergesort(a, b, c, d)	lsmergesort(a, b, c, d)
56#define	GETOPT_EOF		(-1)
57
58typedef	uintptr_t	pc_t;
59#endif
60
61#define	LOCKSTAT_OPTSTR	"x:bths:n:d:i:l:f:e:ckwWgCHEATID:RpPo:V"
62
63#define	LS_MAX_STACK_DEPTH	50
64#define	LS_MAX_EVENTS		64
65
66typedef struct lsrec {
67	struct lsrec	*ls_next;	/* next in hash chain */
68	uintptr_t	ls_lock;	/* lock address */
69	uintptr_t	ls_caller;	/* caller address */
70	uint32_t	ls_count;	/* cumulative event count */
71	uint32_t	ls_event;	/* type of event */
72	uintptr_t	ls_refcnt;	/* cumulative reference count */
73	uint64_t	ls_time;	/* cumulative event duration */
74	uint32_t	ls_hist[64];	/* log2(duration) histogram */
75	uintptr_t	ls_stack[LS_MAX_STACK_DEPTH];
76} lsrec_t;
77
78typedef struct lsdata {
79	struct lsrec	*lsd_next;	/* next available */
80	int		lsd_count;	/* number of records */
81} lsdata_t;
82
83/*
84 * Definitions for the types of experiments which can be run.  They are
85 * listed in increasing order of memory cost and processing time cost.
86 * The numerical value of each type is the number of bytes needed per record.
87 */
88#define	LS_BASIC	offsetof(lsrec_t, ls_time)
89#define	LS_TIME		offsetof(lsrec_t, ls_hist[0])
90#define	LS_HIST		offsetof(lsrec_t, ls_stack[0])
91#define	LS_STACK(depth)	offsetof(lsrec_t, ls_stack[depth])
92
93static void report_stats(FILE *, lsrec_t **, size_t, uint64_t, uint64_t);
94static void report_trace(FILE *, lsrec_t **);
95
96extern int symtab_init(void);
97extern char *addr_to_sym(uintptr_t, uintptr_t *, size_t *);
98extern uintptr_t sym_to_addr(char *name);
99extern size_t sym_size(char *name);
100extern char *strtok_r(char *, const char *, char **);
101
102#define	DEFAULT_NRECS	10000
103#define	DEFAULT_HZ	97
104#define	MAX_HZ		1000
105#define	MIN_AGGSIZE	(16 * 1024)
106#define	MAX_AGGSIZE	(32 * 1024 * 1024)
107
108static int g_stkdepth;
109static int g_topn = INT_MAX;
110static hrtime_t g_elapsed;
111static int g_rates = 0;
112static int g_pflag = 0;
113static int g_Pflag = 0;
114static int g_wflag = 0;
115static int g_Wflag = 0;
116static int g_cflag = 0;
117static int g_kflag = 0;
118static int g_gflag = 0;
119static int g_Vflag = 0;
120static int g_tracing = 0;
121static size_t g_recsize;
122static size_t g_nrecs;
123static int g_nrecs_used;
124static uchar_t g_enabled[LS_MAX_EVENTS];
125static hrtime_t g_min_duration[LS_MAX_EVENTS];
126static dtrace_hdl_t *g_dtp;
127static char *g_predicate;
128static char *g_ipredicate;
129static char *g_prog;
130static int g_proglen;
131static int g_dropped;
132
133typedef struct ls_event_info {
134	char	ev_type;
135	char	ev_lhdr[20];
136	char	ev_desc[80];
137	char	ev_units[10];
138	char	ev_name[DTRACE_NAMELEN];
139	char	*ev_predicate;
140	char	*ev_acquire;
141} ls_event_info_t;
142
143static ls_event_info_t g_event_info[LS_MAX_EVENTS] = {
144	{ 'C',	"Lock",	"Adaptive mutex spin",			"nsec",
145	    "lockstat:::adaptive-spin" },
146	{ 'C',	"Lock",	"Adaptive mutex block",			"nsec",
147	    "lockstat:::adaptive-block" },
148	{ 'C',	"Lock",	"Spin lock spin",			"nsec",
149	    "lockstat:::spin-spin" },
150	{ 'C',	"Lock",	"Thread lock spin",			"nsec",
151	    "lockstat:::thread-spin" },
152	{ 'C',	"Lock",	"R/W writer blocked by writer",		"nsec",
153	    "lockstat:::rw-block", "arg2 == 0 && arg3 == 1" },
154	{ 'C',	"Lock",	"R/W writer blocked by readers",	"nsec",
155	    "lockstat:::rw-block", "arg2 == 0 && arg3 == 0 && arg4" },
156	{ 'C',	"Lock",	"R/W reader blocked by writer",		"nsec",
157	    "lockstat:::rw-block", "arg2 != 0 && arg3 == 1" },
158	{ 'C',	"Lock",	"R/W reader blocked by write wanted",	"nsec",
159	    "lockstat:::rw-block", "arg2 != 0 && arg3 == 0 && arg4" },
160	{ 'C',	"Lock",	"Unknown event (type 8)",		"units"	},
161	{ 'C',	"Lock",	"Unknown event (type 9)",		"units"	},
162	{ 'C',	"Lock",	"Unknown event (type 10)",		"units"	},
163	{ 'C',	"Lock",	"Unknown event (type 11)",		"units"	},
164	{ 'C',	"Lock",	"Unknown event (type 12)",		"units"	},
165	{ 'C',	"Lock",	"Unknown event (type 13)",		"units"	},
166	{ 'C',	"Lock",	"Unknown event (type 14)",		"units"	},
167	{ 'C',	"Lock",	"Unknown event (type 15)",		"units"	},
168	{ 'C',	"Lock",	"Unknown event (type 16)",		"units"	},
169	{ 'C',	"Lock",	"Unknown event (type 17)",		"units"	},
170	{ 'C',	"Lock",	"Unknown event (type 18)",		"units"	},
171	{ 'C',	"Lock",	"Unknown event (type 19)",		"units"	},
172	{ 'C',	"Lock",	"Unknown event (type 20)",		"units"	},
173	{ 'C',	"Lock",	"Unknown event (type 21)",		"units"	},
174	{ 'C',	"Lock",	"Unknown event (type 22)",		"units"	},
175	{ 'C',	"Lock",	"Unknown event (type 23)",		"units"	},
176	{ 'C',	"Lock",	"Unknown event (type 24)",		"units"	},
177	{ 'C',	"Lock",	"Unknown event (type 25)",		"units"	},
178	{ 'C',	"Lock",	"Unknown event (type 26)",		"units"	},
179	{ 'C',	"Lock",	"Unknown event (type 27)",		"units"	},
180	{ 'C',	"Lock",	"Unknown event (type 28)",		"units"	},
181	{ 'C',	"Lock",	"Unknown event (type 29)",		"units"	},
182	{ 'C',	"Lock",	"Unknown event (type 30)",		"units"	},
183	{ 'C',	"Lock",	"Unknown event (type 31)",		"units"	},
184	{ 'H',	"Lock",	"Adaptive mutex hold",			"nsec",
185	    "lockstat:::adaptive-release", NULL,
186	    "lockstat:::adaptive-acquire" },
187	{ 'H',	"Lock",	"Spin lock hold",			"nsec",
188	    "lockstat:::spin-release", NULL,
189	    "lockstat:::spin-acquire" },
190	{ 'H',	"Lock",	"R/W writer hold",			"nsec",
191	    "lockstat:::rw-release", "arg1 == 0",
192	    "lockstat:::rw-acquire" },
193	{ 'H',	"Lock",	"R/W reader hold",			"nsec",
194	    "lockstat:::rw-release", "arg1 != 0",
195	    "lockstat:::rw-acquire" },
196	{ 'H',	"Lock",	"Unknown event (type 36)",		"units"	},
197	{ 'H',	"Lock",	"Unknown event (type 37)",		"units"	},
198	{ 'H',	"Lock",	"Unknown event (type 38)",		"units"	},
199	{ 'H',	"Lock",	"Unknown event (type 39)",		"units"	},
200	{ 'H',	"Lock",	"Unknown event (type 40)",		"units"	},
201	{ 'H',	"Lock",	"Unknown event (type 41)",		"units"	},
202	{ 'H',	"Lock",	"Unknown event (type 42)",		"units"	},
203	{ 'H',	"Lock",	"Unknown event (type 43)",		"units"	},
204	{ 'H',	"Lock",	"Unknown event (type 44)",		"units"	},
205	{ 'H',	"Lock",	"Unknown event (type 45)",		"units"	},
206	{ 'H',	"Lock",	"Unknown event (type 46)",		"units"	},
207	{ 'H',	"Lock",	"Unknown event (type 47)",		"units"	},
208	{ 'H',	"Lock",	"Unknown event (type 48)",		"units"	},
209	{ 'H',	"Lock",	"Unknown event (type 49)",		"units"	},
210	{ 'H',	"Lock",	"Unknown event (type 50)",		"units"	},
211	{ 'H',	"Lock",	"Unknown event (type 51)",		"units"	},
212	{ 'H',	"Lock",	"Unknown event (type 52)",		"units"	},
213	{ 'H',	"Lock",	"Unknown event (type 53)",		"units"	},
214	{ 'H',	"Lock",	"Unknown event (type 54)",		"units"	},
215	{ 'H',	"Lock",	"Unknown event (type 55)",		"units"	},
216#ifdef illumos
217	{ 'I',	"CPU+PIL", "Profiling interrupt",		"nsec",
218#else
219	{ 'I',	"CPU+Pri_Class", "Profiling interrupt",		"nsec",
220#endif
221	    "profile:::profile-97", NULL },
222	{ 'I',	"Lock",	"Unknown event (type 57)",		"units"	},
223	{ 'I',	"Lock",	"Unknown event (type 58)",		"units"	},
224	{ 'I',	"Lock",	"Unknown event (type 59)",		"units"	},
225	{ 'E',	"Lock",	"Recursive lock entry detected",	"(N/A)",
226	    "lockstat:::rw-release", NULL, "lockstat:::rw-acquire" },
227	{ 'E',	"Lock",	"Lockstat enter failure",		"(N/A)"	},
228	{ 'E',	"Lock",	"Lockstat exit failure",		"nsec"	},
229	{ 'E',	"Lock",	"Lockstat record failure",		"(N/A)"	},
230};
231
232#ifndef illumos
233static char *g_pri_class[] = {
234	"",
235	"Intr",
236	"RealT",
237	"TShar",
238	"Idle"
239};
240#endif
241
242static void
243fail(int do_perror, const char *message, ...)
244{
245	va_list args;
246	int save_errno = errno;
247
248	va_start(args, message);
249	(void) fprintf(stderr, "lockstat: ");
250	(void) vfprintf(stderr, message, args);
251	va_end(args);
252	if (do_perror)
253		(void) fprintf(stderr, ": %s", strerror(save_errno));
254	(void) fprintf(stderr, "\n");
255	exit(2);
256}
257
258static void
259dfail(const char *message, ...)
260{
261	va_list args;
262
263	va_start(args, message);
264	(void) fprintf(stderr, "lockstat: ");
265	(void) vfprintf(stderr, message, args);
266	va_end(args);
267	(void) fprintf(stderr, ": %s\n",
268	    dtrace_errmsg(g_dtp, dtrace_errno(g_dtp)));
269
270	exit(2);
271}
272
273static void
274show_events(char event_type, char *desc)
275{
276	int i, first = -1, last;
277
278	for (i = 0; i < LS_MAX_EVENTS; i++) {
279		ls_event_info_t *evp = &g_event_info[i];
280		if (evp->ev_type != event_type ||
281		    strncmp(evp->ev_desc, "Unknown event", 13) == 0)
282			continue;
283		if (first == -1)
284			first = i;
285		last = i;
286	}
287
288	(void) fprintf(stderr,
289	    "\n%s events (lockstat -%c or lockstat -e %d-%d):\n\n",
290	    desc, event_type, first, last);
291
292	for (i = first; i <= last; i++)
293		(void) fprintf(stderr,
294		    "%4d = %s\n", i, g_event_info[i].ev_desc);
295}
296
297static void
298usage(void)
299{
300	(void) fprintf(stderr,
301	    "Usage: lockstat [options] command [args]\n"
302	    "\nGeneral options:\n\n"
303	    "  -V              print the corresponding D program\n"
304	    "\nEvent selection options:\n\n"
305	    "  -C              watch contention events [on by default]\n"
306	    "  -E              watch error events [off by default]\n"
307	    "  -H              watch hold events [off by default]\n"
308	    "  -I              watch interrupt events [off by default]\n"
309	    "  -A              watch all lock events [equivalent to -CH]\n"
310	    "  -e event_list   only watch the specified events (shown below);\n"
311	    "                  <event_list> is a comma-separated list of\n"
312	    "                  events or ranges of events, e.g. 1,4-7,35\n"
313	    "  -i rate         interrupt rate for -I [default: %d Hz]\n"
314	    "\nData gathering options:\n\n"
315	    "  -b              basic statistics (lock, caller, event count)\n"
316	    "  -t              timing for all events [default]\n"
317	    "  -h              histograms for event times\n"
318	    "  -s depth        stack traces <depth> deep\n"
319	    "  -x opt[=val]    enable or modify DTrace options\n"
320	    "\nData filtering options:\n\n"
321	    "  -n nrecords     maximum number of data records [default: %d]\n"
322	    "  -l lock[,size]  only watch <lock>, which can be specified as a\n"
323	    "                  symbolic name or hex address; <size> defaults\n"
324	    "                  to the ELF symbol size if available, 1 if not\n"
325	    "  -f func[,size]  only watch events generated by <func>\n"
326	    "  -d duration     only watch events longer than <duration>\n"
327	    "  -T              trace (rather than sample) events\n"
328	    "\nData reporting options:\n\n"
329	    "  -c              coalesce lock data for arrays like pse_mutex[]\n"
330	    "  -k              coalesce PCs within functions\n"
331	    "  -g              show total events generated by function\n"
332	    "  -w              wherever: don't distinguish events by caller\n"
333	    "  -W              whichever: don't distinguish events by lock\n"
334	    "  -R              display rates rather than counts\n"
335	    "  -p              parsable output format (awk(1)-friendly)\n"
336	    "  -P              sort lock data by (count * avg_time) product\n"
337	    "  -D n            only display top <n> events of each type\n"
338	    "  -o filename     send output to <filename>\n",
339	    DEFAULT_HZ, DEFAULT_NRECS);
340
341	show_events('C', "Contention");
342	show_events('H', "Hold-time");
343	show_events('I', "Interrupt");
344	show_events('E', "Error");
345	(void) fprintf(stderr, "\n");
346
347	exit(1);
348}
349
350static int
351lockcmp(lsrec_t *a, lsrec_t *b)
352{
353	int i;
354
355	if (a->ls_event < b->ls_event)
356		return (-1);
357	if (a->ls_event > b->ls_event)
358		return (1);
359
360	for (i = g_stkdepth - 1; i >= 0; i--) {
361		if (a->ls_stack[i] < b->ls_stack[i])
362			return (-1);
363		if (a->ls_stack[i] > b->ls_stack[i])
364			return (1);
365	}
366
367	if (a->ls_caller < b->ls_caller)
368		return (-1);
369	if (a->ls_caller > b->ls_caller)
370		return (1);
371
372	if (a->ls_lock < b->ls_lock)
373		return (-1);
374	if (a->ls_lock > b->ls_lock)
375		return (1);
376
377	return (0);
378}
379
380static int
381countcmp(lsrec_t *a, lsrec_t *b)
382{
383	if (a->ls_event < b->ls_event)
384		return (-1);
385	if (a->ls_event > b->ls_event)
386		return (1);
387
388	return (b->ls_count - a->ls_count);
389}
390
391static int
392timecmp(lsrec_t *a, lsrec_t *b)
393{
394	if (a->ls_event < b->ls_event)
395		return (-1);
396	if (a->ls_event > b->ls_event)
397		return (1);
398
399	if (a->ls_time < b->ls_time)
400		return (1);
401	if (a->ls_time > b->ls_time)
402		return (-1);
403
404	return (0);
405}
406
407static int
408lockcmp_anywhere(lsrec_t *a, lsrec_t *b)
409{
410	if (a->ls_event < b->ls_event)
411		return (-1);
412	if (a->ls_event > b->ls_event)
413		return (1);
414
415	if (a->ls_lock < b->ls_lock)
416		return (-1);
417	if (a->ls_lock > b->ls_lock)
418		return (1);
419
420	return (0);
421}
422
423static int
424lock_and_count_cmp_anywhere(lsrec_t *a, lsrec_t *b)
425{
426	if (a->ls_event < b->ls_event)
427		return (-1);
428	if (a->ls_event > b->ls_event)
429		return (1);
430
431	if (a->ls_lock < b->ls_lock)
432		return (-1);
433	if (a->ls_lock > b->ls_lock)
434		return (1);
435
436	return (b->ls_count - a->ls_count);
437}
438
439static int
440sitecmp_anylock(lsrec_t *a, lsrec_t *b)
441{
442	int i;
443
444	if (a->ls_event < b->ls_event)
445		return (-1);
446	if (a->ls_event > b->ls_event)
447		return (1);
448
449	for (i = g_stkdepth - 1; i >= 0; i--) {
450		if (a->ls_stack[i] < b->ls_stack[i])
451			return (-1);
452		if (a->ls_stack[i] > b->ls_stack[i])
453			return (1);
454	}
455
456	if (a->ls_caller < b->ls_caller)
457		return (-1);
458	if (a->ls_caller > b->ls_caller)
459		return (1);
460
461	return (0);
462}
463
464static int
465site_and_count_cmp_anylock(lsrec_t *a, lsrec_t *b)
466{
467	int i;
468
469	if (a->ls_event < b->ls_event)
470		return (-1);
471	if (a->ls_event > b->ls_event)
472		return (1);
473
474	for (i = g_stkdepth - 1; i >= 0; i--) {
475		if (a->ls_stack[i] < b->ls_stack[i])
476			return (-1);
477		if (a->ls_stack[i] > b->ls_stack[i])
478			return (1);
479	}
480
481	if (a->ls_caller < b->ls_caller)
482		return (-1);
483	if (a->ls_caller > b->ls_caller)
484		return (1);
485
486	return (b->ls_count - a->ls_count);
487}
488
489static void
490lsmergesort(int (*cmp)(lsrec_t *, lsrec_t *), lsrec_t **a, lsrec_t **b, int n)
491{
492	int m = n / 2;
493	int i, j;
494
495	if (m > 1)
496		lsmergesort(cmp, a, b, m);
497	if (n - m > 1)
498		lsmergesort(cmp, a + m, b + m, n - m);
499	for (i = m; i > 0; i--)
500		b[i - 1] = a[i - 1];
501	for (j = m - 1; j < n - 1; j++)
502		b[n + m - j - 2] = a[j + 1];
503	while (i < j)
504		*a++ = cmp(b[i], b[j]) < 0 ? b[i++] : b[j--];
505	*a = b[i];
506}
507
508static void
509coalesce(int (*cmp)(lsrec_t *, lsrec_t *), lsrec_t **lock, int n)
510{
511	int i, j;
512	lsrec_t *target, *current;
513
514	target = lock[0];
515
516	for (i = 1; i < n; i++) {
517		current = lock[i];
518		if (cmp(current, target) != 0) {
519			target = current;
520			continue;
521		}
522		current->ls_event = LS_MAX_EVENTS;
523		target->ls_count += current->ls_count;
524		target->ls_refcnt += current->ls_refcnt;
525		if (g_recsize < LS_TIME)
526			continue;
527		target->ls_time += current->ls_time;
528		if (g_recsize < LS_HIST)
529			continue;
530		for (j = 0; j < 64; j++)
531			target->ls_hist[j] += current->ls_hist[j];
532	}
533}
534
535static void
536coalesce_symbol(uintptr_t *addrp)
537{
538	uintptr_t symoff;
539	size_t symsize;
540
541	if (addr_to_sym(*addrp, &symoff, &symsize) != NULL && symoff < symsize)
542		*addrp -= symoff;
543}
544
545static void
546predicate_add(char **pred, char *what, char *cmp, uintptr_t value)
547{
548	char *new;
549	int len, newlen;
550
551	if (what == NULL)
552		return;
553
554	if (*pred == NULL) {
555		*pred = malloc(1);
556		*pred[0] = '\0';
557	}
558
559	len = strlen(*pred);
560	newlen = len + strlen(what) + 32 + strlen("( && )");
561	new = malloc(newlen);
562
563	if (*pred[0] != '\0') {
564		if (cmp != NULL) {
565			(void) sprintf(new, "(%s) && (%s %s 0x%p)",
566			    *pred, what, cmp, (void *)value);
567		} else {
568			(void) sprintf(new, "(%s) && (%s)", *pred, what);
569		}
570	} else {
571		if (cmp != NULL) {
572			(void) sprintf(new, "%s %s 0x%p",
573			    what, cmp, (void *)value);
574		} else {
575			(void) sprintf(new, "%s", what);
576		}
577	}
578
579	free(*pred);
580	*pred = new;
581}
582
583static void
584predicate_destroy(char **pred)
585{
586	free(*pred);
587	*pred = NULL;
588}
589
590static void
591filter_add(char **filt, char *what, uintptr_t base, uintptr_t size)
592{
593	char buf[256], *c = buf, *new;
594	int len, newlen;
595
596	if (*filt == NULL) {
597		*filt = malloc(1);
598		*filt[0] = '\0';
599	}
600
601#ifdef illumos
602	(void) sprintf(c, "%s(%s >= 0x%p && %s < 0x%p)", *filt[0] != '\0' ?
603	    " || " : "", what, (void *)base, what, (void *)(base + size));
604#else
605	(void) sprintf(c, "%s(%s >= %p && %s < %p)", *filt[0] != '\0' ?
606	    " || " : "", what, (void *)base, what, (void *)(base + size));
607#endif
608
609	newlen = (len = strlen(*filt) + 1) + strlen(c);
610	new = malloc(newlen);
611	bcopy(*filt, new, len);
612	(void) strcat(new, c);
613	free(*filt);
614	*filt = new;
615}
616
617static void
618filter_destroy(char **filt)
619{
620	free(*filt);
621	*filt = NULL;
622}
623
624static void
625dprog_add(const char *fmt, ...)
626{
627	va_list args;
628	int size, offs;
629	char c;
630
631	va_start(args, fmt);
632	size = vsnprintf(&c, 1, fmt, args) + 1;
633	va_end(args);
634
635	if (g_proglen == 0) {
636		offs = 0;
637	} else {
638		offs = g_proglen - 1;
639	}
640
641	g_proglen = offs + size;
642
643	if ((g_prog = realloc(g_prog, g_proglen)) == NULL)
644		fail(1, "failed to reallocate program text");
645
646	va_start(args, fmt);
647	(void) vsnprintf(&g_prog[offs], size, fmt, args);
648	va_end(args);
649}
650
651/*
652 * This function may read like an open sewer, but keep in mind that programs
653 * that generate other programs are rarely pretty.  If one has the unenviable
654 * task of maintaining or -- worse -- extending this code, use the -V option
655 * to examine the D program as generated by this function.
656 */
657static void
658dprog_addevent(int event)
659{
660	ls_event_info_t *info = &g_event_info[event];
661	char *pred = NULL;
662	char stack[20];
663	const char *arg0, *caller;
664	char *arg1 = "arg1";
665	char buf[80];
666	hrtime_t dur;
667	int depth;
668
669	if (info->ev_name[0] == '\0')
670		return;
671
672	if (info->ev_type == 'I') {
673		/*
674		 * For interrupt events, arg0 (normally the lock pointer) is
675		 * the CPU address plus the current pil, and arg1 (normally
676		 * the number of nanoseconds) is the number of nanoseconds
677		 * late -- and it's stored in arg2.
678		 */
679#ifdef illumos
680		arg0 = "(uintptr_t)curthread->t_cpu + \n"
681		    "\t    curthread->t_cpu->cpu_profile_pil";
682#else
683		arg0 = "(uintptr_t)(curthread->td_oncpu << 16) + \n"
684		    "\t    0x01000000 + curthread->td_pri_class";
685#endif
686		caller = "(uintptr_t)arg0";
687		arg1 = "arg2";
688	} else {
689		arg0 = "(uintptr_t)arg0";
690		caller = "caller";
691	}
692
693	if (g_recsize > LS_HIST) {
694		for (depth = 0; g_recsize > LS_STACK(depth); depth++)
695			continue;
696
697		if (g_tracing) {
698			(void) sprintf(stack, "\tstack(%d);\n", depth);
699		} else {
700			(void) sprintf(stack, ", stack(%d)", depth);
701		}
702	} else {
703		(void) sprintf(stack, "");
704	}
705
706	if (info->ev_acquire != NULL) {
707		/*
708		 * If this is a hold event, we need to generate an additional
709		 * clause for the acquire; the clause for the release will be
710		 * generated with the aggregating statement, below.
711		 */
712		dprog_add("%s\n", info->ev_acquire);
713		predicate_add(&pred, info->ev_predicate, NULL, 0);
714		predicate_add(&pred, g_predicate, NULL, 0);
715		if (pred != NULL)
716			dprog_add("/%s/\n", pred);
717
718		dprog_add("{\n");
719		(void) sprintf(buf, "self->ev%d[(uintptr_t)arg0]", event);
720
721		if (info->ev_type == 'H') {
722			dprog_add("\t%s = timestamp;\n", buf);
723		} else {
724			/*
725			 * If this isn't a hold event, it's the recursive
726			 * error event.  For this, we simply bump the
727			 * thread-local, per-lock count.
728			 */
729			dprog_add("\t%s++;\n", buf);
730		}
731
732		dprog_add("}\n\n");
733		predicate_destroy(&pred);
734		pred = NULL;
735
736		if (info->ev_type == 'E') {
737			/*
738			 * If this is the recursive lock error event, we need
739			 * to generate an additional clause to decrement the
740			 * thread-local, per-lock count.  This assures that we
741			 * only execute the aggregating clause if we have
742			 * recursive entry.
743			 */
744			dprog_add("%s\n", info->ev_name);
745			dprog_add("/%s/\n{\n\t%s--;\n}\n\n", buf, buf);
746		}
747
748		predicate_add(&pred, buf, NULL, 0);
749
750		if (info->ev_type == 'H') {
751			(void) sprintf(buf, "timestamp -\n\t    "
752			    "self->ev%d[(uintptr_t)arg0]", event);
753		}
754
755		arg1 = buf;
756	} else {
757		predicate_add(&pred, info->ev_predicate, NULL, 0);
758		if (info->ev_type != 'I')
759			predicate_add(&pred, g_predicate, NULL, 0);
760		else
761			predicate_add(&pred, g_ipredicate, NULL, 0);
762	}
763
764	if ((dur = g_min_duration[event]) != 0)
765		predicate_add(&pred, arg1, ">=", dur);
766
767	dprog_add("%s\n", info->ev_name);
768
769	if (pred != NULL)
770		dprog_add("/%s/\n", pred);
771	predicate_destroy(&pred);
772
773	dprog_add("{\n");
774
775	if (g_tracing) {
776		dprog_add("\ttrace(%dULL);\n", event);
777		dprog_add("\ttrace(%s);\n", arg0);
778		dprog_add("\ttrace(%s);\n", caller);
779		dprog_add(stack);
780	} else {
781		/*
782		 * The ordering here is important:  when we process the
783		 * aggregate, we count on the fact that @avg appears before
784		 * @hist in program order to assure that @avg is assigned the
785		 * first aggregation variable ID and @hist assigned the
786		 * second; see the comment in process_aggregate() for details.
787		 */
788		dprog_add("\t@avg[%dULL, %s, %s%s] = avg(%s);\n",
789		    event, arg0, caller, stack, arg1);
790
791		if (g_recsize >= LS_HIST) {
792			dprog_add("\t@hist[%dULL, %s, %s%s] = quantize"
793			    "(%s);\n", event, arg0, caller, stack, arg1);
794		}
795	}
796
797	if (info->ev_acquire != NULL)
798		dprog_add("\tself->ev%d[arg0] = 0;\n", event);
799
800	dprog_add("}\n\n");
801}
802
803static void
804dprog_compile()
805{
806	dtrace_prog_t *prog;
807	dtrace_proginfo_t info;
808
809	if (g_Vflag) {
810		(void) fprintf(stderr, "lockstat: vvvv D program vvvv\n");
811		(void) fputs(g_prog, stderr);
812		(void) fprintf(stderr, "lockstat: ^^^^ D program ^^^^\n");
813	}
814
815	if ((prog = dtrace_program_strcompile(g_dtp, g_prog,
816	    DTRACE_PROBESPEC_NAME, 0, 0, NULL)) == NULL)
817		dfail("failed to compile program");
818
819	if (dtrace_program_exec(g_dtp, prog, &info) == -1)
820		dfail("failed to enable probes");
821
822	if (dtrace_go(g_dtp) != 0)
823		dfail("couldn't start tracing");
824}
825
826static void
827#ifdef illumos
828status_fire(void)
829#else
830status_fire(int i)
831#endif
832{}
833
834static void
835status_init(void)
836{
837	dtrace_optval_t val, status, agg;
838	struct sigaction act;
839	struct itimerspec ts;
840	struct sigevent ev;
841	timer_t tid;
842
843	if (dtrace_getopt(g_dtp, "statusrate", &status) == -1)
844		dfail("failed to get 'statusrate'");
845
846	if (dtrace_getopt(g_dtp, "aggrate", &agg) == -1)
847		dfail("failed to get 'statusrate'");
848
849	/*
850	 * We would want to awaken at a rate that is the GCD of the statusrate
851	 * and the aggrate -- but that seems a bit absurd.  Instead, we'll
852	 * simply awaken at a rate that is the more frequent of the two, which
853	 * assures that we're never later than the interval implied by the
854	 * more frequent rate.
855	 */
856	val = status < agg ? status : agg;
857
858	(void) sigemptyset(&act.sa_mask);
859	act.sa_flags = 0;
860	act.sa_handler = status_fire;
861	(void) sigaction(SIGUSR1, &act, NULL);
862
863	ev.sigev_notify = SIGEV_SIGNAL;
864	ev.sigev_signo = SIGUSR1;
865
866	if (timer_create(CLOCK_REALTIME, &ev, &tid) == -1)
867		dfail("cannot create CLOCK_REALTIME timer");
868
869	ts.it_value.tv_sec = val / NANOSEC;
870	ts.it_value.tv_nsec = val % NANOSEC;
871	ts.it_interval = ts.it_value;
872
873	if (timer_settime(tid, TIMER_RELTIME, &ts, NULL) == -1)
874		dfail("cannot set time on CLOCK_REALTIME timer");
875}
876
877static void
878status_check(void)
879{
880	if (!g_tracing && dtrace_aggregate_snap(g_dtp) != 0)
881		dfail("failed to snap aggregate");
882
883	if (dtrace_status(g_dtp) == -1)
884		dfail("dtrace_status()");
885}
886
887static void
888lsrec_fill(lsrec_t *lsrec, const dtrace_recdesc_t *rec, int nrecs, caddr_t data)
889{
890	bzero(lsrec, g_recsize);
891	lsrec->ls_count = 1;
892
893	if ((g_recsize > LS_HIST && nrecs < 4) || (nrecs < 3))
894		fail(0, "truncated DTrace record");
895
896	if (rec->dtrd_size != sizeof (uint64_t))
897		fail(0, "bad event size in first record");
898
899	/* LINTED - alignment */
900	lsrec->ls_event = (uint32_t)*((uint64_t *)(data + rec->dtrd_offset));
901	rec++;
902
903	if (rec->dtrd_size != sizeof (uintptr_t))
904		fail(0, "bad lock address size in second record");
905
906	/* LINTED - alignment */
907	lsrec->ls_lock = *((uintptr_t *)(data + rec->dtrd_offset));
908	rec++;
909
910	if (rec->dtrd_size != sizeof (uintptr_t))
911		fail(0, "bad caller size in third record");
912
913	/* LINTED - alignment */
914	lsrec->ls_caller = *((uintptr_t *)(data + rec->dtrd_offset));
915	rec++;
916
917	if (g_recsize > LS_HIST) {
918		int frames, i;
919		pc_t *stack;
920
921		frames = rec->dtrd_size / sizeof (pc_t);
922		/* LINTED - alignment */
923		stack = (pc_t *)(data + rec->dtrd_offset);
924
925		for (i = 1; i < frames; i++)
926			lsrec->ls_stack[i - 1] = stack[i];
927	}
928}
929
930/*ARGSUSED*/
931static int
932count_aggregate(const dtrace_aggdata_t *agg, void *arg)
933{
934	*((size_t *)arg) += 1;
935
936	return (DTRACE_AGGWALK_NEXT);
937}
938
939static int
940process_aggregate(const dtrace_aggdata_t *agg, void *arg)
941{
942	const dtrace_aggdesc_t *aggdesc = agg->dtada_desc;
943	caddr_t data = agg->dtada_data;
944	lsdata_t *lsdata = arg;
945	lsrec_t *lsrec = lsdata->lsd_next;
946	const dtrace_recdesc_t *rec;
947	uint64_t *avg, *quantized;
948	int i, j;
949
950	assert(lsdata->lsd_count < g_nrecs);
951
952	/*
953	 * Aggregation variable IDs are guaranteed to be generated in program
954	 * order, and they are guaranteed to start from DTRACE_AGGVARIDNONE
955	 * plus one.  As "avg" appears before "hist" in program order, we know
956	 * that "avg" will be allocated the first aggregation variable ID, and
957	 * "hist" will be allocated the second aggregation variable ID -- and
958	 * we therefore use the aggregation variable ID to differentiate the
959	 * cases.
960	 */
961	if (aggdesc->dtagd_varid > DTRACE_AGGVARIDNONE + 1) {
962		/*
963		 * If this is the histogram entry.  We'll copy the quantized
964		 * data into lc_hist, and jump over the rest.
965		 */
966		rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1];
967
968		if (aggdesc->dtagd_varid != DTRACE_AGGVARIDNONE + 2)
969			fail(0, "bad variable ID in aggregation record");
970
971		if (rec->dtrd_size !=
972		    DTRACE_QUANTIZE_NBUCKETS * sizeof (uint64_t))
973			fail(0, "bad quantize size in aggregation record");
974
975		/* LINTED - alignment */
976		quantized = (uint64_t *)(data + rec->dtrd_offset);
977
978		for (i = DTRACE_QUANTIZE_ZEROBUCKET, j = 0;
979		    i < DTRACE_QUANTIZE_NBUCKETS; i++, j++)
980			lsrec->ls_hist[j] = quantized[i];
981
982		goto out;
983	}
984
985	lsrec_fill(lsrec, &aggdesc->dtagd_rec[1],
986	    aggdesc->dtagd_nrecs - 1, data);
987
988	rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1];
989
990	if (rec->dtrd_size != 2 * sizeof (uint64_t))
991		fail(0, "bad avg size in aggregation record");
992
993	/* LINTED - alignment */
994	avg = (uint64_t *)(data + rec->dtrd_offset);
995	lsrec->ls_count = (uint32_t)avg[0];
996	lsrec->ls_time = (uintptr_t)avg[1];
997
998	if (g_recsize >= LS_HIST)
999		return (DTRACE_AGGWALK_NEXT);
1000
1001out:
1002	lsdata->lsd_next = (lsrec_t *)((uintptr_t)lsrec + g_recsize);
1003	lsdata->lsd_count++;
1004
1005	return (DTRACE_AGGWALK_NEXT);
1006}
1007
1008static int
1009process_trace(const dtrace_probedata_t *pdata, void *arg)
1010{
1011	lsdata_t *lsdata = arg;
1012	lsrec_t *lsrec = lsdata->lsd_next;
1013	dtrace_eprobedesc_t *edesc = pdata->dtpda_edesc;
1014	caddr_t data = pdata->dtpda_data;
1015
1016	if (lsdata->lsd_count >= g_nrecs)
1017		return (DTRACE_CONSUME_NEXT);
1018
1019	lsrec_fill(lsrec, edesc->dtepd_rec, edesc->dtepd_nrecs, data);
1020
1021	lsdata->lsd_next = (lsrec_t *)((uintptr_t)lsrec + g_recsize);
1022	lsdata->lsd_count++;
1023
1024	return (DTRACE_CONSUME_NEXT);
1025}
1026
1027static int
1028process_data(FILE *out, char *data)
1029{
1030	lsdata_t lsdata;
1031
1032	/* LINTED - alignment */
1033	lsdata.lsd_next = (lsrec_t *)data;
1034	lsdata.lsd_count = 0;
1035
1036	if (g_tracing) {
1037		if (dtrace_consume(g_dtp, out,
1038		    process_trace, NULL, &lsdata) != 0)
1039			dfail("failed to consume buffer");
1040
1041		return (lsdata.lsd_count);
1042	}
1043
1044	if (dtrace_aggregate_walk_keyvarsorted(g_dtp,
1045	    process_aggregate, &lsdata) != 0)
1046		dfail("failed to walk aggregate");
1047
1048	return (lsdata.lsd_count);
1049}
1050
1051/*ARGSUSED*/
1052static int
1053drophandler(const dtrace_dropdata_t *data, void *arg)
1054{
1055	g_dropped++;
1056	(void) fprintf(stderr, "lockstat: warning: %s", data->dtdda_msg);
1057	return (DTRACE_HANDLE_OK);
1058}
1059
1060int
1061main(int argc, char **argv)
1062{
1063	char *data_buf;
1064	lsrec_t *lsp, **current, **first, **sort_buf, **merge_buf;
1065	FILE *out = stdout;
1066	int c;
1067	pid_t child;
1068	int status;
1069	int i, j;
1070	hrtime_t duration;
1071	char *addrp, *offp, *sizep, *evp, *lastp, *p;
1072	uintptr_t addr;
1073	size_t size, off;
1074	int events_specified = 0;
1075	int exec_errno = 0;
1076	uint32_t event;
1077	char *filt = NULL, *ifilt = NULL;
1078	static uint64_t ev_count[LS_MAX_EVENTS + 1];
1079	static uint64_t ev_time[LS_MAX_EVENTS + 1];
1080	dtrace_optval_t aggsize;
1081	char aggstr[10];
1082	long ncpus;
1083	int dynvar = 0;
1084	int err;
1085
1086	if ((g_dtp = dtrace_open(DTRACE_VERSION, 0, &err)) == NULL) {
1087		fail(0, "cannot open dtrace library: %s",
1088		    dtrace_errmsg(NULL, err));
1089	}
1090
1091	if (dtrace_handle_drop(g_dtp, &drophandler, NULL) == -1)
1092		dfail("couldn't establish drop handler");
1093
1094	if (symtab_init() == -1)
1095		fail(1, "can't load kernel symbols");
1096
1097	g_nrecs = DEFAULT_NRECS;
1098
1099	while ((c = getopt(argc, argv, LOCKSTAT_OPTSTR)) != GETOPT_EOF) {
1100		switch (c) {
1101		case 'b':
1102			g_recsize = LS_BASIC;
1103			break;
1104
1105		case 't':
1106			g_recsize = LS_TIME;
1107			break;
1108
1109		case 'h':
1110			g_recsize = LS_HIST;
1111			break;
1112
1113		case 's':
1114			if (!isdigit(optarg[0]))
1115				usage();
1116			g_stkdepth = atoi(optarg);
1117			if (g_stkdepth > LS_MAX_STACK_DEPTH)
1118				fail(0, "max stack depth is %d",
1119				    LS_MAX_STACK_DEPTH);
1120			g_recsize = LS_STACK(g_stkdepth);
1121			break;
1122
1123		case 'n':
1124			if (!isdigit(optarg[0]))
1125				usage();
1126			g_nrecs = atoi(optarg);
1127			break;
1128
1129		case 'd':
1130			if (!isdigit(optarg[0]))
1131				usage();
1132			duration = atoll(optarg);
1133
1134			/*
1135			 * XXX -- durations really should be per event
1136			 * since the units are different, but it's hard
1137			 * to express this nicely in the interface.
1138			 * Not clear yet what the cleanest solution is.
1139			 */
1140			for (i = 0; i < LS_MAX_EVENTS; i++)
1141				if (g_event_info[i].ev_type != 'E')
1142					g_min_duration[i] = duration;
1143
1144			break;
1145
1146		case 'i':
1147			if (!isdigit(optarg[0]))
1148				usage();
1149			i = atoi(optarg);
1150			if (i <= 0)
1151				usage();
1152			if (i > MAX_HZ)
1153				fail(0, "max interrupt rate is %d Hz", MAX_HZ);
1154
1155			for (j = 0; j < LS_MAX_EVENTS; j++)
1156				if (strcmp(g_event_info[j].ev_desc,
1157				    "Profiling interrupt") == 0)
1158					break;
1159
1160			(void) sprintf(g_event_info[j].ev_name,
1161			    "profile:::profile-%d", i);
1162			break;
1163
1164		case 'l':
1165		case 'f':
1166			addrp = strtok(optarg, ",");
1167			sizep = strtok(NULL, ",");
1168			addrp = strtok(optarg, ",+");
1169			offp = strtok(NULL, ",");
1170
1171			size = sizep ? strtoul(sizep, NULL, 0) : 1;
1172			off = offp ? strtoul(offp, NULL, 0) : 0;
1173
1174			if (addrp[0] == '0') {
1175				addr = strtoul(addrp, NULL, 16) + off;
1176			} else {
1177				addr = sym_to_addr(addrp) + off;
1178				if (sizep == NULL)
1179					size = sym_size(addrp) - off;
1180				if (addr - off == 0)
1181					fail(0, "symbol '%s' not found", addrp);
1182				if (size == 0)
1183					size = 1;
1184			}
1185
1186
1187			if (c == 'l') {
1188				filter_add(&filt, "arg0", addr, size);
1189			} else {
1190				filter_add(&filt, "caller", addr, size);
1191				filter_add(&ifilt, "arg0", addr, size);
1192			}
1193			break;
1194
1195		case 'e':
1196			evp = strtok_r(optarg, ",", &lastp);
1197			while (evp) {
1198				int ev1, ev2;
1199				char *evp2;
1200
1201				(void) strtok(evp, "-");
1202				evp2 = strtok(NULL, "-");
1203				ev1 = atoi(evp);
1204				ev2 = evp2 ? atoi(evp2) : ev1;
1205				if ((uint_t)ev1 >= LS_MAX_EVENTS ||
1206				    (uint_t)ev2 >= LS_MAX_EVENTS || ev1 > ev2)
1207					fail(0, "-e events out of range");
1208				for (i = ev1; i <= ev2; i++)
1209					g_enabled[i] = 1;
1210				evp = strtok_r(NULL, ",", &lastp);
1211			}
1212			events_specified = 1;
1213			break;
1214
1215		case 'c':
1216			g_cflag = 1;
1217			break;
1218
1219		case 'k':
1220			g_kflag = 1;
1221			break;
1222
1223		case 'w':
1224			g_wflag = 1;
1225			break;
1226
1227		case 'W':
1228			g_Wflag = 1;
1229			break;
1230
1231		case 'g':
1232			g_gflag = 1;
1233			break;
1234
1235		case 'C':
1236		case 'E':
1237		case 'H':
1238		case 'I':
1239			for (i = 0; i < LS_MAX_EVENTS; i++)
1240				if (g_event_info[i].ev_type == c)
1241					g_enabled[i] = 1;
1242			events_specified = 1;
1243			break;
1244
1245		case 'A':
1246			for (i = 0; i < LS_MAX_EVENTS; i++)
1247				if (strchr("CH", g_event_info[i].ev_type))
1248					g_enabled[i] = 1;
1249			events_specified = 1;
1250			break;
1251
1252		case 'T':
1253			g_tracing = 1;
1254			break;
1255
1256		case 'D':
1257			if (!isdigit(optarg[0]))
1258				usage();
1259			g_topn = atoi(optarg);
1260			break;
1261
1262		case 'R':
1263			g_rates = 1;
1264			break;
1265
1266		case 'p':
1267			g_pflag = 1;
1268			break;
1269
1270		case 'P':
1271			g_Pflag = 1;
1272			break;
1273
1274		case 'o':
1275			if ((out = fopen(optarg, "w")) == NULL)
1276				fail(1, "error opening file");
1277			break;
1278
1279		case 'V':
1280			g_Vflag = 1;
1281			break;
1282
1283		default:
1284			if (strchr(LOCKSTAT_OPTSTR, c) == NULL)
1285				usage();
1286		}
1287	}
1288
1289	if (filt != NULL) {
1290		predicate_add(&g_predicate, filt, NULL, 0);
1291		filter_destroy(&filt);
1292	}
1293
1294	if (ifilt != NULL) {
1295		predicate_add(&g_ipredicate, ifilt, NULL, 0);
1296		filter_destroy(&ifilt);
1297	}
1298
1299	if (g_recsize == 0) {
1300		if (g_gflag) {
1301			g_stkdepth = LS_MAX_STACK_DEPTH;
1302			g_recsize = LS_STACK(g_stkdepth);
1303		} else {
1304			g_recsize = LS_TIME;
1305		}
1306	}
1307
1308	if (g_gflag && g_recsize <= LS_STACK(0))
1309		fail(0, "'-g' requires at least '-s 1' data gathering");
1310
1311	/*
1312	 * Make sure the alignment is reasonable
1313	 */
1314	g_recsize = -(-g_recsize & -sizeof (uint64_t));
1315
1316	for (i = 0; i < LS_MAX_EVENTS; i++) {
1317		/*
1318		 * If no events were specified, enable -C.
1319		 */
1320		if (!events_specified && g_event_info[i].ev_type == 'C')
1321			g_enabled[i] = 1;
1322	}
1323
1324	for (i = 0; i < LS_MAX_EVENTS; i++) {
1325		if (!g_enabled[i])
1326			continue;
1327
1328		if (g_event_info[i].ev_acquire != NULL) {
1329			/*
1330			 * If we've enabled a hold event, we must explicitly
1331			 * allocate dynamic variable space.
1332			 */
1333			dynvar = 1;
1334		}
1335
1336		dprog_addevent(i);
1337	}
1338
1339	/*
1340	 * Make sure there are remaining arguments to specify a child command
1341	 * to execute.
1342	 */
1343	if (argc <= optind)
1344		usage();
1345
1346	if ((ncpus = sysconf(_SC_NPROCESSORS_ONLN)) == -1)
1347		dfail("couldn't determine number of online CPUs");
1348
1349	/*
1350	 * By default, we set our data buffer size to be the number of records
1351	 * multiplied by the size of the record, doubled to account for some
1352	 * DTrace slop and divided by the number of CPUs.  We silently clamp
1353	 * the aggregation size at both a minimum and a maximum to prevent
1354	 * absurdly low or high values.
1355	 */
1356	if ((aggsize = (g_nrecs * g_recsize * 2) / ncpus) < MIN_AGGSIZE)
1357		aggsize = MIN_AGGSIZE;
1358
1359	if (aggsize > MAX_AGGSIZE)
1360		aggsize = MAX_AGGSIZE;
1361
1362	(void) sprintf(aggstr, "%lld", (long long)aggsize);
1363
1364	if (!g_tracing) {
1365		if (dtrace_setopt(g_dtp, "bufsize", "4k") == -1)
1366			dfail("failed to set 'bufsize'");
1367
1368		if (dtrace_setopt(g_dtp, "aggsize", aggstr) == -1)
1369			dfail("failed to set 'aggsize'");
1370
1371		if (dynvar) {
1372			/*
1373			 * If we're using dynamic variables, we set our
1374			 * dynamic variable size to be one megabyte per CPU,
1375			 * with a hard-limit of 32 megabytes.  This may still
1376			 * be too small in some cases, but it can be tuned
1377			 * manually via -x if need be.
1378			 */
1379			(void) sprintf(aggstr, "%ldm", ncpus < 32 ? ncpus : 32);
1380
1381			if (dtrace_setopt(g_dtp, "dynvarsize", aggstr) == -1)
1382				dfail("failed to set 'dynvarsize'");
1383		}
1384	} else {
1385		if (dtrace_setopt(g_dtp, "bufsize", aggstr) == -1)
1386			dfail("failed to set 'bufsize'");
1387	}
1388
1389	if (dtrace_setopt(g_dtp, "statusrate", "10sec") == -1)
1390		dfail("failed to set 'statusrate'");
1391
1392	optind = 1;
1393	while ((c = getopt(argc, argv, LOCKSTAT_OPTSTR)) != GETOPT_EOF) {
1394		switch (c) {
1395		case 'x':
1396			if ((p = strchr(optarg, '=')) != NULL)
1397				*p++ = '\0';
1398
1399			if (dtrace_setopt(g_dtp, optarg, p) != 0)
1400				dfail("failed to set -x %s", optarg);
1401			break;
1402		}
1403	}
1404
1405	argc -= optind;
1406	argv += optind;
1407
1408	dprog_compile();
1409	status_init();
1410
1411	g_elapsed = -gethrtime();
1412
1413	/*
1414	 * Spawn the specified command and wait for it to complete.
1415	 */
1416	child = fork();
1417	if (child == -1)
1418		fail(1, "cannot fork");
1419	if (child == 0) {
1420		(void) dtrace_close(g_dtp);
1421		(void) execvp(argv[0], &argv[0]);
1422		exec_errno = errno;
1423		exit(127);
1424	}
1425
1426#ifdef illumos
1427	while (waitpid(child, &status, WEXITED) != child)
1428#else
1429	while (waitpid(child, &status, 0) != child)
1430#endif
1431		status_check();
1432
1433	g_elapsed += gethrtime();
1434
1435	if (WIFEXITED(status)) {
1436		if (WEXITSTATUS(status) != 0) {
1437			if (exec_errno != 0) {
1438				errno = exec_errno;
1439				fail(1, "could not execute %s", argv[0]);
1440			}
1441			(void) fprintf(stderr,
1442			    "lockstat: warning: %s exited with code %d\n",
1443			    argv[0], WEXITSTATUS(status));
1444		}
1445	} else {
1446		(void) fprintf(stderr,
1447		    "lockstat: warning: %s died on signal %d\n",
1448		    argv[0], WTERMSIG(status));
1449	}
1450
1451	if (dtrace_stop(g_dtp) == -1)
1452		dfail("failed to stop dtrace");
1453
1454	/*
1455	 * Before we read out the results, we need to allocate our buffer.
1456	 * If we're tracing, then we'll just use the precalculated size.  If
1457	 * we're not, then we'll take a snapshot of the aggregate, and walk
1458	 * it to count the number of records.
1459	 */
1460	if (!g_tracing) {
1461		if (dtrace_aggregate_snap(g_dtp) != 0)
1462			dfail("failed to snap aggregate");
1463
1464		g_nrecs = 0;
1465
1466		if (dtrace_aggregate_walk(g_dtp,
1467		    count_aggregate, &g_nrecs) != 0)
1468			dfail("failed to walk aggregate");
1469	}
1470
1471#ifdef illumos
1472	if ((data_buf = memalign(sizeof (uint64_t),
1473	    (g_nrecs + 1) * g_recsize)) == NULL)
1474#else
1475	if (posix_memalign((void **)&data_buf, sizeof (uint64_t),
1476	    (g_nrecs + 1) * g_recsize) )
1477#endif
1478		fail(1, "Memory allocation failed");
1479
1480	/*
1481	 * Read out the DTrace data.
1482	 */
1483	g_nrecs_used = process_data(out, data_buf);
1484
1485	if (g_nrecs_used > g_nrecs || g_dropped)
1486		(void) fprintf(stderr, "lockstat: warning: "
1487		    "ran out of data records (use -n for more)\n");
1488
1489	/* LINTED - alignment */
1490	for (i = 0, lsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++,
1491	    /* LINTED - alignment */
1492	    lsp = (lsrec_t *)((char *)lsp + g_recsize)) {
1493		ev_count[lsp->ls_event] += lsp->ls_count;
1494		ev_time[lsp->ls_event] += lsp->ls_time;
1495	}
1496
1497	/*
1498	 * If -g was specified, convert stacks into individual records.
1499	 */
1500	if (g_gflag) {
1501		lsrec_t *newlsp, *oldlsp;
1502
1503#ifdef illumos
1504		newlsp = memalign(sizeof (uint64_t),
1505		    g_nrecs_used * LS_TIME * (g_stkdepth + 1));
1506#else
1507		posix_memalign((void **)&newlsp, sizeof (uint64_t),
1508		    g_nrecs_used * LS_TIME * (g_stkdepth + 1));
1509#endif
1510		if (newlsp == NULL)
1511			fail(1, "Cannot allocate space for -g processing");
1512		lsp = newlsp;
1513		/* LINTED - alignment */
1514		for (i = 0, oldlsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++,
1515		    /* LINTED - alignment */
1516		    oldlsp = (lsrec_t *)((char *)oldlsp + g_recsize)) {
1517			int fr;
1518			int caller_in_stack = 0;
1519
1520			if (oldlsp->ls_count == 0)
1521				continue;
1522
1523			for (fr = 0; fr < g_stkdepth; fr++) {
1524				if (oldlsp->ls_stack[fr] == 0)
1525					break;
1526				if (oldlsp->ls_stack[fr] == oldlsp->ls_caller)
1527					caller_in_stack = 1;
1528				bcopy(oldlsp, lsp, LS_TIME);
1529				lsp->ls_caller = oldlsp->ls_stack[fr];
1530				/* LINTED - alignment */
1531				lsp = (lsrec_t *)((char *)lsp + LS_TIME);
1532			}
1533			if (!caller_in_stack) {
1534				bcopy(oldlsp, lsp, LS_TIME);
1535				/* LINTED - alignment */
1536				lsp = (lsrec_t *)((char *)lsp + LS_TIME);
1537			}
1538		}
1539		g_nrecs = g_nrecs_used =
1540		    ((uintptr_t)lsp - (uintptr_t)newlsp) / LS_TIME;
1541		g_recsize = LS_TIME;
1542		g_stkdepth = 0;
1543		free(data_buf);
1544		data_buf = (char *)newlsp;
1545	}
1546
1547	if ((sort_buf = calloc(2 * (g_nrecs + 1),
1548	    sizeof (void *))) == NULL)
1549		fail(1, "Sort buffer allocation failed");
1550	merge_buf = sort_buf + (g_nrecs + 1);
1551
1552	/*
1553	 * Build the sort buffer, discarding zero-count records along the way.
1554	 */
1555	/* LINTED - alignment */
1556	for (i = 0, lsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++,
1557	    /* LINTED - alignment */
1558	    lsp = (lsrec_t *)((char *)lsp + g_recsize)) {
1559		if (lsp->ls_count == 0)
1560			lsp->ls_event = LS_MAX_EVENTS;
1561		sort_buf[i] = lsp;
1562	}
1563
1564	if (g_nrecs_used == 0)
1565		exit(0);
1566
1567	/*
1568	 * Add a sentinel after the last record
1569	 */
1570	sort_buf[i] = lsp;
1571	lsp->ls_event = LS_MAX_EVENTS;
1572
1573	if (g_tracing) {
1574		report_trace(out, sort_buf);
1575		return (0);
1576	}
1577
1578	/*
1579	 * Application of -g may have resulted in multiple records
1580	 * with the same signature; coalesce them.
1581	 */
1582	if (g_gflag) {
1583		mergesort(lockcmp, sort_buf, merge_buf, g_nrecs_used);
1584		coalesce(lockcmp, sort_buf, g_nrecs_used);
1585	}
1586
1587	/*
1588	 * Coalesce locks within the same symbol if -c option specified.
1589	 * Coalesce PCs within the same function if -k option specified.
1590	 */
1591	if (g_cflag || g_kflag) {
1592		for (i = 0; i < g_nrecs_used; i++) {
1593			int fr;
1594			lsp = sort_buf[i];
1595			if (g_cflag)
1596				coalesce_symbol(&lsp->ls_lock);
1597			if (g_kflag) {
1598				for (fr = 0; fr < g_stkdepth; fr++)
1599					coalesce_symbol(&lsp->ls_stack[fr]);
1600				coalesce_symbol(&lsp->ls_caller);
1601			}
1602		}
1603		mergesort(lockcmp, sort_buf, merge_buf, g_nrecs_used);
1604		coalesce(lockcmp, sort_buf, g_nrecs_used);
1605	}
1606
1607	/*
1608	 * Coalesce callers if -w option specified
1609	 */
1610	if (g_wflag) {
1611		mergesort(lock_and_count_cmp_anywhere,
1612		    sort_buf, merge_buf, g_nrecs_used);
1613		coalesce(lockcmp_anywhere, sort_buf, g_nrecs_used);
1614	}
1615
1616	/*
1617	 * Coalesce locks if -W option specified
1618	 */
1619	if (g_Wflag) {
1620		mergesort(site_and_count_cmp_anylock,
1621		    sort_buf, merge_buf, g_nrecs_used);
1622		coalesce(sitecmp_anylock, sort_buf, g_nrecs_used);
1623	}
1624
1625	/*
1626	 * Sort data by contention count (ls_count) or total time (ls_time),
1627	 * depending on g_Pflag.  Override g_Pflag if time wasn't measured.
1628	 */
1629	if (g_recsize < LS_TIME)
1630		g_Pflag = 0;
1631
1632	if (g_Pflag)
1633		mergesort(timecmp, sort_buf, merge_buf, g_nrecs_used);
1634	else
1635		mergesort(countcmp, sort_buf, merge_buf, g_nrecs_used);
1636
1637	/*
1638	 * Display data by event type
1639	 */
1640	first = &sort_buf[0];
1641	while ((event = (*first)->ls_event) < LS_MAX_EVENTS) {
1642		current = first;
1643		while ((lsp = *current)->ls_event == event)
1644			current++;
1645		report_stats(out, first, current - first, ev_count[event],
1646		    ev_time[event]);
1647		first = current;
1648	}
1649
1650	return (0);
1651}
1652
1653static char *
1654format_symbol(char *buf, uintptr_t addr, int show_size)
1655{
1656	uintptr_t symoff;
1657	char *symname;
1658	size_t symsize;
1659
1660	symname = addr_to_sym(addr, &symoff, &symsize);
1661
1662	if (show_size && symoff == 0)
1663		(void) sprintf(buf, "%s[%ld]", symname, (long)symsize);
1664	else if (symoff == 0)
1665		(void) sprintf(buf, "%s", symname);
1666	else if (symoff < 16 && bcmp(symname, "cpu[", 4) == 0)	/* CPU+PIL */
1667#ifdef illumos
1668		(void) sprintf(buf, "%s+%ld", symname, (long)symoff);
1669#else
1670		(void) sprintf(buf, "%s+%s", symname, g_pri_class[(int)symoff]);
1671#endif
1672	else if (symoff <= symsize || (symoff < 256 && addr != symoff))
1673		(void) sprintf(buf, "%s+0x%llx", symname,
1674		    (unsigned long long)symoff);
1675	else
1676		(void) sprintf(buf, "0x%llx", (unsigned long long)addr);
1677	return (buf);
1678}
1679
1680static void
1681report_stats(FILE *out, lsrec_t **sort_buf, size_t nrecs, uint64_t total_count,
1682	uint64_t total_time)
1683{
1684	uint32_t event = sort_buf[0]->ls_event;
1685	lsrec_t *lsp;
1686	double ptotal = 0.0;
1687	double percent;
1688	int i, j, fr;
1689	int displayed;
1690	int first_bin, last_bin, max_bin_count, total_bin_count;
1691	int rectype;
1692	char buf[256];
1693	char lhdr[80], chdr[80];
1694
1695	rectype = g_recsize;
1696
1697	if (g_topn == 0) {
1698		(void) fprintf(out, "%20llu %s\n",
1699		    g_rates == 0 ? total_count :
1700		    ((unsigned long long)total_count * NANOSEC) / g_elapsed,
1701		    g_event_info[event].ev_desc);
1702		return;
1703	}
1704
1705	(void) sprintf(lhdr, "%s%s",
1706	    g_Wflag ? "Hottest " : "", g_event_info[event].ev_lhdr);
1707	(void) sprintf(chdr, "%s%s",
1708	    g_wflag ? "Hottest " : "", "Caller");
1709
1710	if (!g_pflag)
1711		(void) fprintf(out,
1712		    "\n%s: %.0f events in %.3f seconds (%.0f events/sec)\n\n",
1713		    g_event_info[event].ev_desc, (double)total_count,
1714		    (double)g_elapsed / NANOSEC,
1715		    (double)total_count * NANOSEC / g_elapsed);
1716
1717	if (!g_pflag && rectype < LS_HIST) {
1718		(void) sprintf(buf, "%s", g_event_info[event].ev_units);
1719		(void) fprintf(out, "%5s %4s %4s %4s %8s %-22s %-24s\n",
1720		    g_rates ? "ops/s" : "Count",
1721		    g_gflag ? "genr" : "indv",
1722		    "cuml", "rcnt", rectype >= LS_TIME ? buf : "", lhdr, chdr);
1723		(void) fprintf(out, "---------------------------------"
1724		    "----------------------------------------------\n");
1725	}
1726
1727	displayed = 0;
1728	for (i = 0; i < nrecs; i++) {
1729		lsp = sort_buf[i];
1730
1731		if (displayed++ >= g_topn)
1732			break;
1733
1734		if (g_pflag) {
1735			int j;
1736
1737			(void) fprintf(out, "%u %u",
1738			    lsp->ls_event, lsp->ls_count);
1739			(void) fprintf(out, " %s",
1740			    format_symbol(buf, lsp->ls_lock, g_cflag));
1741			(void) fprintf(out, " %s",
1742			    format_symbol(buf, lsp->ls_caller, 0));
1743			(void) fprintf(out, " %f",
1744			    (double)lsp->ls_refcnt / lsp->ls_count);
1745			if (rectype >= LS_TIME)
1746				(void) fprintf(out, " %llu",
1747				    (unsigned long long)lsp->ls_time);
1748			if (rectype >= LS_HIST) {
1749				for (j = 0; j < 64; j++)
1750					(void) fprintf(out, " %u",
1751					    lsp->ls_hist[j]);
1752			}
1753			for (j = 0; j < LS_MAX_STACK_DEPTH; j++) {
1754				if (rectype <= LS_STACK(j) ||
1755				    lsp->ls_stack[j] == 0)
1756					break;
1757				(void) fprintf(out, " %s",
1758				    format_symbol(buf, lsp->ls_stack[j], 0));
1759			}
1760			(void) fprintf(out, "\n");
1761			continue;
1762		}
1763
1764		if (rectype >= LS_HIST) {
1765			(void) fprintf(out, "---------------------------------"
1766			    "----------------------------------------------\n");
1767			(void) sprintf(buf, "%s",
1768			    g_event_info[event].ev_units);
1769			(void) fprintf(out, "%5s %4s %4s %4s %8s %-22s %-24s\n",
1770			    g_rates ? "ops/s" : "Count",
1771			    g_gflag ? "genr" : "indv",
1772			    "cuml", "rcnt", buf, lhdr, chdr);
1773		}
1774
1775		if (g_Pflag && total_time != 0)
1776			percent = (lsp->ls_time * 100.00) / total_time;
1777		else
1778			percent = (lsp->ls_count * 100.00) / total_count;
1779
1780		ptotal += percent;
1781
1782		if (rectype >= LS_TIME)
1783			(void) sprintf(buf, "%llu",
1784			    (unsigned long long)(lsp->ls_time / lsp->ls_count));
1785		else
1786			buf[0] = '\0';
1787
1788		(void) fprintf(out, "%5llu ",
1789		    g_rates == 0 ? lsp->ls_count :
1790		    ((uint64_t)lsp->ls_count * NANOSEC) / g_elapsed);
1791
1792		(void) fprintf(out, "%3.0f%% ", percent);
1793
1794		if (g_gflag)
1795			(void) fprintf(out, "---- ");
1796		else
1797			(void) fprintf(out, "%3.0f%% ", ptotal);
1798
1799		(void) fprintf(out, "%4.2f %8s ",
1800		    (double)lsp->ls_refcnt / lsp->ls_count, buf);
1801
1802		(void) fprintf(out, "%-22s ",
1803		    format_symbol(buf, lsp->ls_lock, g_cflag));
1804
1805		(void) fprintf(out, "%-24s\n",
1806		    format_symbol(buf, lsp->ls_caller, 0));
1807
1808		if (rectype < LS_HIST)
1809			continue;
1810
1811		(void) fprintf(out, "\n");
1812		(void) fprintf(out, "%10s %31s %-9s %-24s\n",
1813		    g_event_info[event].ev_units,
1814		    "------ Time Distribution ------",
1815		    g_rates ? "ops/s" : "count",
1816		    rectype > LS_STACK(0) ? "Stack" : "");
1817
1818		first_bin = 0;
1819		while (lsp->ls_hist[first_bin] == 0)
1820			first_bin++;
1821
1822		last_bin = 63;
1823		while (lsp->ls_hist[last_bin] == 0)
1824			last_bin--;
1825
1826		max_bin_count = 0;
1827		total_bin_count = 0;
1828		for (j = first_bin; j <= last_bin; j++) {
1829			total_bin_count += lsp->ls_hist[j];
1830			if (lsp->ls_hist[j] > max_bin_count)
1831				max_bin_count = lsp->ls_hist[j];
1832		}
1833
1834		/*
1835		 * If we went a few frames below the caller, ignore them
1836		 */
1837		for (fr = 3; fr > 0; fr--)
1838			if (lsp->ls_stack[fr] == lsp->ls_caller)
1839				break;
1840
1841		for (j = first_bin; j <= last_bin; j++) {
1842			uint_t depth = (lsp->ls_hist[j] * 30) / total_bin_count;
1843			(void) fprintf(out, "%10llu |%s%s %-9u ",
1844			    1ULL << j,
1845			    "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + 30 - depth,
1846			    "                              " + depth,
1847			    g_rates == 0 ? lsp->ls_hist[j] :
1848			    (uint_t)(((uint64_t)lsp->ls_hist[j] * NANOSEC) /
1849			    g_elapsed));
1850			if (rectype <= LS_STACK(fr) || lsp->ls_stack[fr] == 0) {
1851				(void) fprintf(out, "\n");
1852				continue;
1853			}
1854			(void) fprintf(out, "%-24s\n",
1855			    format_symbol(buf, lsp->ls_stack[fr], 0));
1856			fr++;
1857		}
1858		while (rectype > LS_STACK(fr) && lsp->ls_stack[fr] != 0) {
1859			(void) fprintf(out, "%15s %-36s %-24s\n", "", "",
1860			    format_symbol(buf, lsp->ls_stack[fr], 0));
1861			fr++;
1862		}
1863	}
1864
1865	if (!g_pflag)
1866		(void) fprintf(out, "---------------------------------"
1867		    "----------------------------------------------\n");
1868
1869	(void) fflush(out);
1870}
1871
1872static void
1873report_trace(FILE *out, lsrec_t **sort_buf)
1874{
1875	lsrec_t *lsp;
1876	int i, fr;
1877	int rectype;
1878	char buf[256], buf2[256];
1879
1880	rectype = g_recsize;
1881
1882	if (!g_pflag) {
1883		(void) fprintf(out, "%5s  %7s  %11s  %-24s  %-24s\n",
1884		    "Event", "Time", "Owner", "Lock", "Caller");
1885		(void) fprintf(out, "---------------------------------"
1886		    "----------------------------------------------\n");
1887	}
1888
1889	for (i = 0; i < g_nrecs_used; i++) {
1890
1891		lsp = sort_buf[i];
1892
1893		if (lsp->ls_event >= LS_MAX_EVENTS || lsp->ls_count == 0)
1894			continue;
1895
1896		(void) fprintf(out, "%2d  %10llu  %11p  %-24s  %-24s\n",
1897		    lsp->ls_event, (unsigned long long)lsp->ls_time,
1898		    (void *)lsp->ls_next,
1899		    format_symbol(buf, lsp->ls_lock, 0),
1900		    format_symbol(buf2, lsp->ls_caller, 0));
1901
1902		if (rectype <= LS_STACK(0))
1903			continue;
1904
1905		/*
1906		 * If we went a few frames below the caller, ignore them
1907		 */
1908		for (fr = 3; fr > 0; fr--)
1909			if (lsp->ls_stack[fr] == lsp->ls_caller)
1910				break;
1911
1912		while (rectype > LS_STACK(fr) && lsp->ls_stack[fr] != 0) {
1913			(void) fprintf(out, "%53s  %-24s\n", "",
1914			    format_symbol(buf, lsp->ls_stack[fr], 0));
1915			fr++;
1916		}
1917		(void) fprintf(out, "\n");
1918	}
1919
1920	(void) fflush(out);
1921}
1922