lockstat.c revision 192867
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#pragma ident	"%Z%%M%	%I%	%E% SMI"
27
28#include <stdio.h>
29#include <stddef.h>
30#include <stdlib.h>
31#include <stdarg.h>
32#include <string.h>
33#include <strings.h>
34#include <ctype.h>
35#include <fcntl.h>
36#include <unistd.h>
37#include <errno.h>
38#include <limits.h>
39#include <sys/types.h>
40#include <sys/modctl.h>
41#include <sys/stat.h>
42#include <sys/wait.h>
43#include <dtrace.h>
44#include <sys/lockstat.h>
45#include <alloca.h>
46#include <signal.h>
47#include <assert.h>
48
49#if !defined(sun)
50/* needed for FreeBSD */
51#include <sys/time.h>
52#include <sys/resource.h>
53typedef uintptr_t pc_t;
54
55#define mergesort(a, b, c, d)	lsmergesort(a, b, c, d)
56#endif
57
58#define	LOCKSTAT_OPTSTR	"x:bths:n:d:i:l:f:e:ckwWgCHEATID:RpPo:V"
59
60#define	LS_MAX_STACK_DEPTH	50
61#define	LS_MAX_EVENTS		64
62
63typedef struct lsrec {
64	struct lsrec	*ls_next;	/* next in hash chain */
65	uintptr_t	ls_lock;	/* lock address */
66	uintptr_t	ls_caller;	/* caller address */
67	uint32_t	ls_count;	/* cumulative event count */
68	uint32_t	ls_event;	/* type of event */
69	uintptr_t	ls_refcnt;	/* cumulative reference count */
70	uint64_t	ls_time;	/* cumulative event duration */
71	uint32_t	ls_hist[64];	/* log2(duration) histogram */
72	uintptr_t	ls_stack[LS_MAX_STACK_DEPTH];
73} lsrec_t;
74
75typedef struct lsdata {
76	struct lsrec	*lsd_next;	/* next available */
77	int		lsd_count;	/* number of records */
78} lsdata_t;
79
80/*
81 * Definitions for the types of experiments which can be run.  They are
82 * listed in increasing order of memory cost and processing time cost.
83 * The numerical value of each type is the number of bytes needed per record.
84 */
85#define	LS_BASIC	offsetof(lsrec_t, ls_time)
86#define	LS_TIME		offsetof(lsrec_t, ls_hist[0])
87#define	LS_HIST		offsetof(lsrec_t, ls_stack[0])
88#define	LS_STACK(depth)	offsetof(lsrec_t, ls_stack[depth])
89
90static void report_stats(FILE *, lsrec_t **, size_t, uint64_t, uint64_t);
91static void report_trace(FILE *, lsrec_t **);
92
93extern int symtab_init(void);
94extern char *addr_to_sym(uintptr_t, uintptr_t *, size_t *);
95extern uintptr_t sym_to_addr(char *name);
96extern size_t sym_size(char *name);
97extern char *strtok_r(char *, const char *, char **);
98
99#define	DEFAULT_NRECS	10000
100#define	DEFAULT_HZ	97
101#define	MAX_HZ		1000
102#define	MIN_AGGSIZE	(16 * 1024)
103#define	MAX_AGGSIZE	(32 * 1024 * 1024)
104
105static int g_stkdepth;
106static int g_topn = INT_MAX;
107static hrtime_t g_elapsed;
108static int g_rates = 0;
109static int g_pflag = 0;
110static int g_Pflag = 0;
111static int g_wflag = 0;
112static int g_Wflag = 0;
113static int g_cflag = 0;
114static int g_kflag = 0;
115static int g_gflag = 0;
116static int g_Vflag = 0;
117static int g_tracing = 0;
118static size_t g_recsize;
119static size_t g_nrecs;
120static int g_nrecs_used;
121static uchar_t g_enabled[LS_MAX_EVENTS];
122static hrtime_t g_min_duration[LS_MAX_EVENTS];
123static dtrace_hdl_t *g_dtp;
124static char *g_predicate;
125static char *g_ipredicate;
126static char *g_prog;
127static int g_proglen;
128static int g_dropped;
129
130typedef struct ls_event_info {
131	char	ev_type;
132	char	ev_lhdr[20];
133	char	ev_desc[80];
134	char	ev_units[10];
135	char	ev_name[DTRACE_NAMELEN];
136	char	*ev_predicate;
137	char	*ev_acquire;
138} ls_event_info_t;
139
140static ls_event_info_t g_event_info[LS_MAX_EVENTS] = {
141	{ 'C',	"Lock",	"Adaptive mutex spin",			"nsec",
142	    "lockstat:::adaptive-spin" },
143	{ 'C',	"Lock",	"Adaptive mutex block",			"nsec",
144	    "lockstat:::adaptive-block" },
145	{ 'C',	"Lock",	"Spin lock spin",			"nsec",
146	    "lockstat:::spin-spin" },
147	{ 'C',	"Lock",	"Thread lock spin",			"nsec",
148	    "lockstat:::thread-spin" },
149	{ 'C',	"Lock",	"R/W writer blocked by writer",		"nsec",
150	    "lockstat:::rw-block", "arg2 == 0 && arg3 == 1" },
151	{ 'C',	"Lock",	"R/W writer blocked by readers",	"nsec",
152	    "lockstat:::rw-block", "arg2 == 0 && arg3 == 0 && arg4" },
153	{ 'C',	"Lock",	"R/W reader blocked by writer",		"nsec",
154	    "lockstat:::rw-block", "arg2 != 0 && arg3 == 1" },
155	{ 'C',	"Lock",	"R/W reader blocked by write wanted",	"nsec",
156	    "lockstat:::rw-block", "arg2 != 0 && arg3 == 0 && arg4" },
157	{ 'C',	"Lock",	"Unknown event (type 8)",		"units"	},
158	{ 'C',	"Lock",	"Unknown event (type 9)",		"units"	},
159	{ 'C',	"Lock",	"Unknown event (type 10)",		"units"	},
160	{ 'C',	"Lock",	"Unknown event (type 11)",		"units"	},
161	{ 'C',	"Lock",	"Unknown event (type 12)",		"units"	},
162	{ 'C',	"Lock",	"Unknown event (type 13)",		"units"	},
163	{ 'C',	"Lock",	"Unknown event (type 14)",		"units"	},
164	{ 'C',	"Lock",	"Unknown event (type 15)",		"units"	},
165	{ 'C',	"Lock",	"Unknown event (type 16)",		"units"	},
166	{ 'C',	"Lock",	"Unknown event (type 17)",		"units"	},
167	{ 'C',	"Lock",	"Unknown event (type 18)",		"units"	},
168	{ 'C',	"Lock",	"Unknown event (type 19)",		"units"	},
169	{ 'C',	"Lock",	"Unknown event (type 20)",		"units"	},
170	{ 'C',	"Lock",	"Unknown event (type 21)",		"units"	},
171	{ 'C',	"Lock",	"Unknown event (type 22)",		"units"	},
172	{ 'C',	"Lock",	"Unknown event (type 23)",		"units"	},
173	{ 'C',	"Lock",	"Unknown event (type 24)",		"units"	},
174	{ 'C',	"Lock",	"Unknown event (type 25)",		"units"	},
175	{ 'C',	"Lock",	"Unknown event (type 26)",		"units"	},
176	{ 'C',	"Lock",	"Unknown event (type 27)",		"units"	},
177	{ 'C',	"Lock",	"Unknown event (type 28)",		"units"	},
178	{ 'C',	"Lock",	"Unknown event (type 29)",		"units"	},
179	{ 'C',	"Lock",	"Unknown event (type 30)",		"units"	},
180	{ 'C',	"Lock",	"Unknown event (type 31)",		"units"	},
181	{ 'H',	"Lock",	"Adaptive mutex hold",			"nsec",
182	    "lockstat:::adaptive-release", NULL,
183	    "lockstat:::adaptive-acquire" },
184	{ 'H',	"Lock",	"Spin lock hold",			"nsec",
185	    "lockstat:::spin-release", NULL,
186	    "lockstat:::spin-acquire" },
187	{ 'H',	"Lock",	"R/W writer hold",			"nsec",
188	    "lockstat:::rw-release", "arg1 == 0",
189	    "lockstat:::rw-acquire" },
190	{ 'H',	"Lock",	"R/W reader hold",			"nsec",
191	    "lockstat:::rw-release", "arg1 != 0",
192	    "lockstat:::rw-acquire" },
193	{ 'H',	"Lock",	"Unknown event (type 36)",		"units"	},
194	{ 'H',	"Lock",	"Unknown event (type 37)",		"units"	},
195	{ 'H',	"Lock",	"Unknown event (type 38)",		"units"	},
196	{ 'H',	"Lock",	"Unknown event (type 39)",		"units"	},
197	{ 'H',	"Lock",	"Unknown event (type 40)",		"units"	},
198	{ 'H',	"Lock",	"Unknown event (type 41)",		"units"	},
199	{ 'H',	"Lock",	"Unknown event (type 42)",		"units"	},
200	{ 'H',	"Lock",	"Unknown event (type 43)",		"units"	},
201	{ 'H',	"Lock",	"Unknown event (type 44)",		"units"	},
202	{ 'H',	"Lock",	"Unknown event (type 45)",		"units"	},
203	{ 'H',	"Lock",	"Unknown event (type 46)",		"units"	},
204	{ 'H',	"Lock",	"Unknown event (type 47)",		"units"	},
205	{ 'H',	"Lock",	"Unknown event (type 48)",		"units"	},
206	{ 'H',	"Lock",	"Unknown event (type 49)",		"units"	},
207	{ 'H',	"Lock",	"Unknown event (type 50)",		"units"	},
208	{ 'H',	"Lock",	"Unknown event (type 51)",		"units"	},
209	{ 'H',	"Lock",	"Unknown event (type 52)",		"units"	},
210	{ 'H',	"Lock",	"Unknown event (type 53)",		"units"	},
211	{ 'H',	"Lock",	"Unknown event (type 54)",		"units"	},
212	{ 'H',	"Lock",	"Unknown event (type 55)",		"units"	},
213#if defined(sun)
214	{ 'I',	"CPU+PIL", "Profiling interrupt",		"nsec",
215#else
216	/* FreeBSD */
217	{ 'I',	"CPU+Pri_Class", "Profiling interrupt",		"nsec",
218#endif
219	    "profile:::profile-97", NULL },
220	{ 'I',	"Lock",	"Unknown event (type 57)",		"units"	},
221	{ 'I',	"Lock",	"Unknown event (type 58)",		"units"	},
222	{ 'I',	"Lock",	"Unknown event (type 59)",		"units"	},
223	{ 'E',	"Lock",	"Recursive lock entry detected",	"(N/A)",
224	    "lockstat:::rw-release", NULL, "lockstat:::rw-acquire" },
225	{ 'E',	"Lock",	"Lockstat enter failure",		"(N/A)"	},
226	{ 'E',	"Lock",	"Lockstat exit failure",		"nsec"	},
227	{ 'E',	"Lock",	"Lockstat record failure",		"(N/A)"	},
228};
229
230#if !defined(sun)
231static char *g_pri_class[] = {
232	"",
233	"Intr",
234	"RealT",
235	"TShar",
236	"Idle"
237};
238#endif
239
240static void
241fail(int do_perror, const char *message, ...)
242{
243	va_list args;
244	int save_errno = errno;
245
246	va_start(args, message);
247	(void) fprintf(stderr, "lockstat: ");
248	(void) vfprintf(stderr, message, args);
249	va_end(args);
250	if (do_perror)
251		(void) fprintf(stderr, ": %s", strerror(save_errno));
252	(void) fprintf(stderr, "\n");
253	exit(2);
254}
255
256static void
257dfail(const char *message, ...)
258{
259	va_list args;
260
261	va_start(args, message);
262	(void) fprintf(stderr, "lockstat: ");
263	(void) vfprintf(stderr, message, args);
264	va_end(args);
265	(void) fprintf(stderr, ": %s\n",
266	    dtrace_errmsg(g_dtp, dtrace_errno(g_dtp)));
267
268	exit(2);
269}
270
271static void
272show_events(char event_type, char *desc)
273{
274	int i, first = -1, last;
275
276	for (i = 0; i < LS_MAX_EVENTS; i++) {
277		ls_event_info_t *evp = &g_event_info[i];
278		if (evp->ev_type != event_type ||
279		    strncmp(evp->ev_desc, "Unknown event", 13) == 0)
280			continue;
281		if (first == -1)
282			first = i;
283		last = i;
284	}
285
286	(void) fprintf(stderr,
287	    "\n%s events (lockstat -%c or lockstat -e %d-%d):\n\n",
288	    desc, event_type, first, last);
289
290	for (i = first; i <= last; i++)
291		(void) fprintf(stderr,
292		    "%4d = %s\n", i, g_event_info[i].ev_desc);
293}
294
295static void
296usage(void)
297{
298	(void) fprintf(stderr,
299	    "Usage: lockstat [options] command [args]\n"
300	    "\nEvent selection options:\n\n"
301	    "  -C              watch contention events [on by default]\n"
302	    "  -E              watch error events [off by default]\n"
303	    "  -H              watch hold events [off by default]\n"
304	    "  -I              watch interrupt events [off by default]\n"
305	    "  -A              watch all lock events [equivalent to -CH]\n"
306	    "  -e event_list   only watch the specified events (shown below);\n"
307	    "                  <event_list> is a comma-separated list of\n"
308	    "                  events or ranges of events, e.g. 1,4-7,35\n"
309	    "  -i rate         interrupt rate for -I [default: %d Hz]\n"
310	    "\nData gathering options:\n\n"
311	    "  -b              basic statistics (lock, caller, event count)\n"
312	    "  -t              timing for all events [default]\n"
313	    "  -h              histograms for event times\n"
314	    "  -s depth        stack traces <depth> deep\n"
315	    "  -x opt[=val]    enable or modify DTrace options\n"
316	    "\nData filtering options:\n\n"
317	    "  -n nrecords     maximum number of data records [default: %d]\n"
318	    "  -l lock[,size]  only watch <lock>, which can be specified as a\n"
319	    "                  symbolic name or hex address; <size> defaults\n"
320	    "                  to the ELF symbol size if available, 1 if not\n"
321	    "  -f func[,size]  only watch events generated by <func>\n"
322	    "  -d duration     only watch events longer than <duration>\n"
323	    "  -T              trace (rather than sample) events\n"
324	    "\nData reporting options:\n\n"
325	    "  -c              coalesce lock data for arrays like pse_mutex[]\n"
326	    "  -k              coalesce PCs within functions\n"
327	    "  -g              show total events generated by function\n"
328	    "  -w              wherever: don't distinguish events by caller\n"
329	    "  -W              whichever: don't distinguish events by lock\n"
330	    "  -R              display rates rather than counts\n"
331	    "  -p              parsable output format (awk(1)-friendly)\n"
332	    "  -P              sort lock data by (count * avg_time) product\n"
333	    "  -D n            only display top <n> events of each type\n"
334	    "  -o filename     send output to <filename>\n",
335	    DEFAULT_HZ, DEFAULT_NRECS);
336
337	show_events('C', "Contention");
338	show_events('H', "Hold-time");
339	show_events('I', "Interrupt");
340	show_events('E', "Error");
341	(void) fprintf(stderr, "\n");
342
343	exit(1);
344}
345
346static int
347lockcmp(lsrec_t *a, lsrec_t *b)
348{
349	int i;
350
351	if (a->ls_event < b->ls_event)
352		return (-1);
353	if (a->ls_event > b->ls_event)
354		return (1);
355
356	for (i = g_stkdepth - 1; i >= 0; i--) {
357		if (a->ls_stack[i] < b->ls_stack[i])
358			return (-1);
359		if (a->ls_stack[i] > b->ls_stack[i])
360			return (1);
361	}
362
363	if (a->ls_caller < b->ls_caller)
364		return (-1);
365	if (a->ls_caller > b->ls_caller)
366		return (1);
367
368	if (a->ls_lock < b->ls_lock)
369		return (-1);
370	if (a->ls_lock > b->ls_lock)
371		return (1);
372
373	return (0);
374}
375
376static int
377countcmp(lsrec_t *a, lsrec_t *b)
378{
379	if (a->ls_event < b->ls_event)
380		return (-1);
381	if (a->ls_event > b->ls_event)
382		return (1);
383
384	return (b->ls_count - a->ls_count);
385}
386
387static int
388timecmp(lsrec_t *a, lsrec_t *b)
389{
390	if (a->ls_event < b->ls_event)
391		return (-1);
392	if (a->ls_event > b->ls_event)
393		return (1);
394
395	if (a->ls_time < b->ls_time)
396		return (1);
397	if (a->ls_time > b->ls_time)
398		return (-1);
399
400	return (0);
401}
402
403static int
404lockcmp_anywhere(lsrec_t *a, lsrec_t *b)
405{
406	if (a->ls_event < b->ls_event)
407		return (-1);
408	if (a->ls_event > b->ls_event)
409		return (1);
410
411	if (a->ls_lock < b->ls_lock)
412		return (-1);
413	if (a->ls_lock > b->ls_lock)
414		return (1);
415
416	return (0);
417}
418
419static int
420lock_and_count_cmp_anywhere(lsrec_t *a, lsrec_t *b)
421{
422	if (a->ls_event < b->ls_event)
423		return (-1);
424	if (a->ls_event > b->ls_event)
425		return (1);
426
427	if (a->ls_lock < b->ls_lock)
428		return (-1);
429	if (a->ls_lock > b->ls_lock)
430		return (1);
431
432	return (b->ls_count - a->ls_count);
433}
434
435static int
436sitecmp_anylock(lsrec_t *a, lsrec_t *b)
437{
438	int i;
439
440	if (a->ls_event < b->ls_event)
441		return (-1);
442	if (a->ls_event > b->ls_event)
443		return (1);
444
445	for (i = g_stkdepth - 1; i >= 0; i--) {
446		if (a->ls_stack[i] < b->ls_stack[i])
447			return (-1);
448		if (a->ls_stack[i] > b->ls_stack[i])
449			return (1);
450	}
451
452	if (a->ls_caller < b->ls_caller)
453		return (-1);
454	if (a->ls_caller > b->ls_caller)
455		return (1);
456
457	return (0);
458}
459
460static int
461site_and_count_cmp_anylock(lsrec_t *a, lsrec_t *b)
462{
463	int i;
464
465	if (a->ls_event < b->ls_event)
466		return (-1);
467	if (a->ls_event > b->ls_event)
468		return (1);
469
470	for (i = g_stkdepth - 1; i >= 0; i--) {
471		if (a->ls_stack[i] < b->ls_stack[i])
472			return (-1);
473		if (a->ls_stack[i] > b->ls_stack[i])
474			return (1);
475	}
476
477	if (a->ls_caller < b->ls_caller)
478		return (-1);
479	if (a->ls_caller > b->ls_caller)
480		return (1);
481
482	return (b->ls_count - a->ls_count);
483}
484
485static void
486lsmergesort(int (*cmp)(lsrec_t *, lsrec_t *), lsrec_t **a, lsrec_t **b, int n)
487{
488	int m = n / 2;
489	int i, j;
490
491	if (m > 1)
492		lsmergesort(cmp, a, b, m);
493	if (n - m > 1)
494		lsmergesort(cmp, a + m, b + m, n - m);
495	for (i = m; i > 0; i--)
496		b[i - 1] = a[i - 1];
497	for (j = m - 1; j < n - 1; j++)
498		b[n + m - j - 2] = a[j + 1];
499	while (i < j)
500		*a++ = cmp(b[i], b[j]) < 0 ? b[i++] : b[j--];
501	*a = b[i];
502}
503
504static void
505coalesce(int (*cmp)(lsrec_t *, lsrec_t *), lsrec_t **lock, int n)
506{
507	int i, j;
508	lsrec_t *target, *current;
509
510	target = lock[0];
511
512	for (i = 1; i < n; i++) {
513		current = lock[i];
514		if (cmp(current, target) != 0) {
515			target = current;
516			continue;
517		}
518		current->ls_event = LS_MAX_EVENTS;
519		target->ls_count += current->ls_count;
520		target->ls_refcnt += current->ls_refcnt;
521		if (g_recsize < LS_TIME)
522			continue;
523		target->ls_time += current->ls_time;
524		if (g_recsize < LS_HIST)
525			continue;
526		for (j = 0; j < 64; j++)
527			target->ls_hist[j] += current->ls_hist[j];
528	}
529}
530
531static void
532coalesce_symbol(uintptr_t *addrp)
533{
534	uintptr_t symoff;
535	size_t symsize;
536
537	if (addr_to_sym(*addrp, &symoff, &symsize) != NULL && symoff < symsize)
538		*addrp -= symoff;
539}
540
541static void
542predicate_add(char **pred, char *what, char *cmp, uintptr_t value)
543{
544	char *new;
545	int len, newlen;
546
547	if (what == NULL)
548		return;
549
550	if (*pred == NULL) {
551		*pred = malloc(1);
552		*pred[0] = '\0';
553	}
554
555	len = strlen(*pred);
556	newlen = len + strlen(what) + 32 + strlen("( && )");
557	new = malloc(newlen);
558
559	if (*pred[0] != '\0') {
560		if (cmp != NULL) {
561			(void) sprintf(new, "(%s) && (%s %s 0x%p)",
562			    *pred, what, cmp, (void *)value);
563		} else {
564			(void) sprintf(new, "(%s) && (%s)", *pred, what);
565		}
566	} else {
567		if (cmp != NULL) {
568			(void) sprintf(new, "%s %s 0x%p",
569			    what, cmp, (void *)value);
570		} else {
571			(void) sprintf(new, "%s", what);
572		}
573	}
574
575	free(*pred);
576	*pred = new;
577}
578
579static void
580predicate_destroy(char **pred)
581{
582	free(*pred);
583	*pred = NULL;
584}
585
586static void
587filter_add(char **filt, char *what, uintptr_t base, uintptr_t size)
588{
589	char buf[256], *c = buf, *new;
590	int len, newlen;
591
592	if (*filt == NULL) {
593		*filt = malloc(1);
594		*filt[0] = '\0';
595	}
596
597#if defined(sun)
598	(void) sprintf(c, "%s(%s >= 0x%p && %s < 0x%p)", *filt[0] != '\0' ?
599	    " || " : "", what, (void *)base, what, (void *)(base + size));
600#else
601	(void) sprintf(c, "%s(%s >= %p && %s < %p)", *filt[0] != '\0' ?
602	    " || " : "", what, (void *)base, what, (void *)(base + size));
603#endif
604
605	newlen = (len = strlen(*filt) + 1) + strlen(c);
606	new = malloc(newlen);
607	bcopy(*filt, new, len);
608	(void) strcat(new, c);
609	free(*filt);
610	*filt = new;
611}
612
613static void
614filter_destroy(char **filt)
615{
616	free(*filt);
617	*filt = NULL;
618}
619
620static void
621dprog_add(const char *fmt, ...)
622{
623	va_list args;
624	int size, offs;
625	char c;
626
627	va_start(args, fmt);
628	size = vsnprintf(&c, 1, fmt, args) + 1;
629	va_end(args);
630
631	if (g_proglen == 0) {
632		offs = 0;
633	} else {
634		offs = g_proglen - 1;
635	}
636
637	g_proglen = offs + size;
638
639	if ((g_prog = realloc(g_prog, g_proglen)) == NULL)
640		fail(1, "failed to reallocate program text");
641
642	va_start(args, fmt);
643	(void) vsnprintf(&g_prog[offs], size, fmt, args);
644	va_end(args);
645}
646
647/*
648 * This function may read like an open sewer, but keep in mind that programs
649 * that generate other programs are rarely pretty.  If one has the unenviable
650 * task of maintaining or -- worse -- extending this code, use the -V option
651 * to examine the D program as generated by this function.
652 */
653static void
654dprog_addevent(int event)
655{
656	ls_event_info_t *info = &g_event_info[event];
657	char *pred = NULL;
658	char stack[20];
659	const char *arg0, *caller;
660	char *arg1 = "arg1";
661	char buf[80];
662	hrtime_t dur;
663	int depth;
664
665	if (info->ev_name[0] == '\0')
666		return;
667
668	if (info->ev_type == 'I') {
669		/*
670		 * For interrupt events, arg0 (normally the lock pointer) is
671		 * the CPU address plus the current pil, and arg1 (normally
672		 * the number of nanoseconds) is the number of nanoseconds
673		 * late -- and it's stored in arg2.
674		 */
675#if defined(sun)
676		arg0 = "(uintptr_t)curthread->t_cpu + \n"
677		    "\t    curthread->t_cpu->cpu_profile_pil";
678#else
679		arg0 = "(uintptr_t)(curthread->td_oncpu << 16) + \n"
680		    "\t    0x01000000 + curthread->td_pri_class";
681#endif
682		caller = "(uintptr_t)arg0";
683		arg1 = "arg2";
684	} else {
685		arg0 = "(uintptr_t)arg0";
686		caller = "caller";
687	}
688
689	if (g_recsize > LS_HIST) {
690		for (depth = 0; g_recsize > LS_STACK(depth); depth++)
691			continue;
692
693		if (g_tracing) {
694			(void) sprintf(stack, "\tstack(%d);\n", depth);
695		} else {
696			(void) sprintf(stack, ", stack(%d)", depth);
697		}
698	} else {
699		(void) sprintf(stack, "");
700	}
701
702	if (info->ev_acquire != NULL) {
703		/*
704		 * If this is a hold event, we need to generate an additional
705		 * clause for the acquire; the clause for the release will be
706		 * generated with the aggregating statement, below.
707		 */
708		dprog_add("%s\n", info->ev_acquire);
709		predicate_add(&pred, info->ev_predicate, NULL, 0);
710		predicate_add(&pred, g_predicate, NULL, 0);
711		if (pred != NULL)
712			dprog_add("/%s/\n", pred);
713
714		dprog_add("{\n");
715		(void) sprintf(buf, "self->ev%d[(uintptr_t)arg0]", event);
716
717		if (info->ev_type == 'H') {
718			dprog_add("\t%s = timestamp;\n", buf);
719		} else {
720			/*
721			 * If this isn't a hold event, it's the recursive
722			 * error event.  For this, we simply bump the
723			 * thread-local, per-lock count.
724			 */
725			dprog_add("\t%s++;\n", buf);
726		}
727
728		dprog_add("}\n\n");
729		predicate_destroy(&pred);
730		pred = NULL;
731
732		if (info->ev_type == 'E') {
733			/*
734			 * If this is the recursive lock error event, we need
735			 * to generate an additional clause to decrement the
736			 * thread-local, per-lock count.  This assures that we
737			 * only execute the aggregating clause if we have
738			 * recursive entry.
739			 */
740			dprog_add("%s\n", info->ev_name);
741			dprog_add("/%s/\n{\n\t%s--;\n}\n\n", buf, buf);
742		}
743
744		predicate_add(&pred, buf, NULL, 0);
745
746		if (info->ev_type == 'H') {
747			(void) sprintf(buf, "timestamp -\n\t    "
748			    "self->ev%d[(uintptr_t)arg0]", event);
749		}
750
751		arg1 = buf;
752	} else {
753		predicate_add(&pred, info->ev_predicate, NULL, 0);
754		if (info->ev_type != 'I')
755			predicate_add(&pred, g_predicate, NULL, 0);
756		else
757			predicate_add(&pred, g_ipredicate, NULL, 0);
758	}
759
760	if ((dur = g_min_duration[event]) != 0)
761		predicate_add(&pred, arg1, ">=", dur);
762
763	dprog_add("%s\n", info->ev_name);
764
765	if (pred != NULL)
766		dprog_add("/%s/\n", pred);
767	predicate_destroy(&pred);
768
769	dprog_add("{\n");
770
771	if (g_tracing) {
772		dprog_add("\ttrace(%dULL);\n", event);
773		dprog_add("\ttrace(%s);\n", arg0);
774		dprog_add("\ttrace(%s);\n", caller);
775		dprog_add(stack);
776	} else {
777		/*
778		 * The ordering here is important:  when we process the
779		 * aggregate, we count on the fact that @avg appears before
780		 * @hist in program order to assure that @avg is assigned the
781		 * first aggregation variable ID and @hist assigned the
782		 * second; see the comment in process_aggregate() for details.
783		 */
784		dprog_add("\t@avg[%dULL, %s, %s%s] = avg(%s);\n",
785		    event, arg0, caller, stack, arg1);
786
787		if (g_recsize >= LS_HIST) {
788			dprog_add("\t@hist[%dULL, %s, %s%s] = quantize"
789			    "(%s);\n", event, arg0, caller, stack, arg1);
790		}
791	}
792
793	if (info->ev_acquire != NULL)
794		dprog_add("\tself->ev%d[arg0] = 0;\n", event);
795
796	dprog_add("}\n\n");
797}
798
799static void
800dprog_compile()
801{
802	dtrace_prog_t *prog;
803	dtrace_proginfo_t info;
804
805	if (g_Vflag) {
806		(void) fprintf(stderr, "lockstat: vvvv D program vvvv\n");
807		(void) fputs(g_prog, stderr);
808		(void) fprintf(stderr, "lockstat: ^^^^ D program ^^^^\n");
809	}
810
811	if ((prog = dtrace_program_strcompile(g_dtp, g_prog,
812	    DTRACE_PROBESPEC_NAME, 0, 0, NULL)) == NULL)
813		dfail("failed to compile program");
814
815	if (dtrace_program_exec(g_dtp, prog, &info) == -1)
816		dfail("failed to enable probes");
817
818	if (dtrace_go(g_dtp) != 0)
819		dfail("couldn't start tracing");
820}
821
822static void
823#if defined(sun)
824status_fire(void)
825#else
826status_fire(int i)
827#endif
828{}
829
830static void
831status_init(void)
832{
833	dtrace_optval_t val, status, agg;
834	struct sigaction act;
835	struct itimerspec ts;
836	struct sigevent ev;
837	timer_t tid;
838
839	if (dtrace_getopt(g_dtp, "statusrate", &status) == -1)
840		dfail("failed to get 'statusrate'");
841
842	if (dtrace_getopt(g_dtp, "aggrate", &agg) == -1)
843		dfail("failed to get 'statusrate'");
844
845	/*
846	 * We would want to awaken at a rate that is the GCD of the statusrate
847	 * and the aggrate -- but that seems a bit absurd.  Instead, we'll
848	 * simply awaken at a rate that is the more frequent of the two, which
849	 * assures that we're never later than the interval implied by the
850	 * more frequent rate.
851	 */
852	val = status < agg ? status : agg;
853
854	(void) sigemptyset(&act.sa_mask);
855	act.sa_flags = 0;
856	act.sa_handler = status_fire;
857	(void) sigaction(SIGUSR1, &act, NULL);
858
859	ev.sigev_notify = SIGEV_SIGNAL;
860	ev.sigev_signo = SIGUSR1;
861
862	if (timer_create(CLOCK_REALTIME, &ev, &tid) == -1)
863		dfail("cannot create CLOCK_REALTIME timer");
864
865	ts.it_value.tv_sec = val / NANOSEC;
866	ts.it_value.tv_nsec = val % NANOSEC;
867	ts.it_interval = ts.it_value;
868
869	if (timer_settime(tid, TIMER_RELTIME, &ts, NULL) == -1)
870		dfail("cannot set time on CLOCK_REALTIME timer");
871}
872
873static void
874status_check(void)
875{
876	if (!g_tracing && dtrace_aggregate_snap(g_dtp) != 0)
877		dfail("failed to snap aggregate");
878
879	if (dtrace_status(g_dtp) == -1)
880		dfail("dtrace_status()");
881}
882
883static void
884lsrec_fill(lsrec_t *lsrec, const dtrace_recdesc_t *rec, int nrecs, caddr_t data)
885{
886	bzero(lsrec, g_recsize);
887	lsrec->ls_count = 1;
888
889	if ((g_recsize > LS_HIST && nrecs < 4) || (nrecs < 3))
890		fail(0, "truncated DTrace record");
891
892	if (rec->dtrd_size != sizeof (uint64_t))
893		fail(0, "bad event size in first record");
894
895	/* LINTED - alignment */
896	lsrec->ls_event = (uint32_t)*((uint64_t *)(data + rec->dtrd_offset));
897	rec++;
898
899	if (rec->dtrd_size != sizeof (uintptr_t))
900		fail(0, "bad lock address size in second record");
901
902	/* LINTED - alignment */
903	lsrec->ls_lock = *((uintptr_t *)(data + rec->dtrd_offset));
904	rec++;
905
906	if (rec->dtrd_size != sizeof (uintptr_t))
907		fail(0, "bad caller size in third record");
908
909	/* LINTED - alignment */
910	lsrec->ls_caller = *((uintptr_t *)(data + rec->dtrd_offset));
911	rec++;
912
913	if (g_recsize > LS_HIST) {
914		int frames, i;
915		pc_t *stack;
916
917		frames = rec->dtrd_size / sizeof (pc_t);
918		/* LINTED - alignment */
919		stack = (pc_t *)(data + rec->dtrd_offset);
920
921		for (i = 1; i < frames; i++)
922			lsrec->ls_stack[i - 1] = stack[i];
923	}
924}
925
926/*ARGSUSED*/
927static int
928count_aggregate(const dtrace_aggdata_t *agg, void *arg)
929{
930	*((size_t *)arg) += 1;
931
932	return (DTRACE_AGGWALK_NEXT);
933}
934
935static int
936process_aggregate(const dtrace_aggdata_t *agg, void *arg)
937{
938	const dtrace_aggdesc_t *aggdesc = agg->dtada_desc;
939	caddr_t data = agg->dtada_data;
940	lsdata_t *lsdata = arg;
941	lsrec_t *lsrec = lsdata->lsd_next;
942	const dtrace_recdesc_t *rec;
943	uint64_t *avg, *quantized;
944	int i, j;
945
946	assert(lsdata->lsd_count < g_nrecs);
947
948	/*
949	 * Aggregation variable IDs are guaranteed to be generated in program
950	 * order, and they are guaranteed to start from DTRACE_AGGVARIDNONE
951	 * plus one.  As "avg" appears before "hist" in program order, we know
952	 * that "avg" will be allocated the first aggregation variable ID, and
953	 * "hist" will be allocated the second aggregation variable ID -- and
954	 * we therefore use the aggregation variable ID to differentiate the
955	 * cases.
956	 */
957	if (aggdesc->dtagd_varid > DTRACE_AGGVARIDNONE + 1) {
958		/*
959		 * If this is the histogram entry.  We'll copy the quantized
960		 * data into lc_hist, and jump over the rest.
961		 */
962		rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1];
963
964		if (aggdesc->dtagd_varid != DTRACE_AGGVARIDNONE + 2)
965			fail(0, "bad variable ID in aggregation record");
966
967		if (rec->dtrd_size !=
968		    DTRACE_QUANTIZE_NBUCKETS * sizeof (uint64_t))
969			fail(0, "bad quantize size in aggregation record");
970
971		/* LINTED - alignment */
972		quantized = (uint64_t *)(data + rec->dtrd_offset);
973
974		for (i = DTRACE_QUANTIZE_ZEROBUCKET, j = 0;
975		    i < DTRACE_QUANTIZE_NBUCKETS; i++, j++)
976			lsrec->ls_hist[j] = quantized[i];
977
978		goto out;
979	}
980
981	lsrec_fill(lsrec, &aggdesc->dtagd_rec[1],
982	    aggdesc->dtagd_nrecs - 1, data);
983
984	rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1];
985
986	if (rec->dtrd_size != 2 * sizeof (uint64_t))
987		fail(0, "bad avg size in aggregation record");
988
989	/* LINTED - alignment */
990	avg = (uint64_t *)(data + rec->dtrd_offset);
991	lsrec->ls_count = (uint32_t)avg[0];
992	lsrec->ls_time = (uintptr_t)avg[1];
993
994	if (g_recsize >= LS_HIST)
995		return (DTRACE_AGGWALK_NEXT);
996
997out:
998	lsdata->lsd_next = (lsrec_t *)((uintptr_t)lsrec + g_recsize);
999	lsdata->lsd_count++;
1000
1001	return (DTRACE_AGGWALK_NEXT);
1002}
1003
1004static int
1005process_trace(const dtrace_probedata_t *pdata, void *arg)
1006{
1007	lsdata_t *lsdata = arg;
1008	lsrec_t *lsrec = lsdata->lsd_next;
1009	dtrace_eprobedesc_t *edesc = pdata->dtpda_edesc;
1010	caddr_t data = pdata->dtpda_data;
1011
1012	if (lsdata->lsd_count >= g_nrecs)
1013		return (DTRACE_CONSUME_NEXT);
1014
1015	lsrec_fill(lsrec, edesc->dtepd_rec, edesc->dtepd_nrecs, data);
1016
1017	lsdata->lsd_next = (lsrec_t *)((uintptr_t)lsrec + g_recsize);
1018	lsdata->lsd_count++;
1019
1020	return (DTRACE_CONSUME_NEXT);
1021}
1022
1023static int
1024process_data(FILE *out, char *data)
1025{
1026	lsdata_t lsdata;
1027
1028	/* LINTED - alignment */
1029	lsdata.lsd_next = (lsrec_t *)data;
1030	lsdata.lsd_count = 0;
1031
1032	if (g_tracing) {
1033		if (dtrace_consume(g_dtp, out,
1034		    process_trace, NULL, &lsdata) != 0)
1035			dfail("failed to consume buffer");
1036
1037		return (lsdata.lsd_count);
1038	}
1039
1040	if (dtrace_aggregate_walk_keyvarsorted(g_dtp,
1041	    process_aggregate, &lsdata) != 0)
1042		dfail("failed to walk aggregate");
1043
1044	return (lsdata.lsd_count);
1045}
1046
1047/*ARGSUSED*/
1048static int
1049drophandler(const dtrace_dropdata_t *data, void *arg)
1050{
1051	g_dropped++;
1052	(void) fprintf(stderr, "lockstat: warning: %s", data->dtdda_msg);
1053	return (DTRACE_HANDLE_OK);
1054}
1055
1056int
1057main(int argc, char **argv)
1058{
1059	char *data_buf;
1060	lsrec_t *lsp, **current, **first, **sort_buf, **merge_buf;
1061	FILE *out = stdout;
1062	char c;
1063	pid_t child;
1064	int status;
1065	int i, j;
1066	hrtime_t duration;
1067	char *addrp, *offp, *sizep, *evp, *lastp, *p;
1068	uintptr_t addr;
1069	size_t size, off;
1070	int events_specified = 0;
1071	int exec_errno = 0;
1072	uint32_t event;
1073	char *filt = NULL, *ifilt = NULL;
1074	static uint64_t ev_count[LS_MAX_EVENTS + 1];
1075	static uint64_t ev_time[LS_MAX_EVENTS + 1];
1076	dtrace_optval_t aggsize;
1077	char aggstr[10];
1078	long ncpus;
1079	int dynvar = 0;
1080	int err;
1081
1082	if ((g_dtp = dtrace_open(DTRACE_VERSION, 0, &err)) == NULL) {
1083		fail(0, "cannot open dtrace library: %s",
1084		    dtrace_errmsg(NULL, err));
1085	}
1086
1087	if (dtrace_handle_drop(g_dtp, &drophandler, NULL) == -1)
1088		dfail("couldn't establish drop handler");
1089
1090	if (symtab_init() == -1)
1091		fail(1, "can't load kernel symbols");
1092
1093	g_nrecs = DEFAULT_NRECS;
1094
1095	while ((c = getopt(argc, argv, LOCKSTAT_OPTSTR)) != EOF) {
1096		switch (c) {
1097		case 'b':
1098			g_recsize = LS_BASIC;
1099			break;
1100
1101		case 't':
1102			g_recsize = LS_TIME;
1103			break;
1104
1105		case 'h':
1106			g_recsize = LS_HIST;
1107			break;
1108
1109		case 's':
1110			if (!isdigit(optarg[0]))
1111				usage();
1112			g_stkdepth = atoi(optarg);
1113			if (g_stkdepth > LS_MAX_STACK_DEPTH)
1114				fail(0, "max stack depth is %d",
1115				    LS_MAX_STACK_DEPTH);
1116			g_recsize = LS_STACK(g_stkdepth);
1117			break;
1118
1119		case 'n':
1120			if (!isdigit(optarg[0]))
1121				usage();
1122			g_nrecs = atoi(optarg);
1123			break;
1124
1125		case 'd':
1126			if (!isdigit(optarg[0]))
1127				usage();
1128			duration = atoll(optarg);
1129
1130			/*
1131			 * XXX -- durations really should be per event
1132			 * since the units are different, but it's hard
1133			 * to express this nicely in the interface.
1134			 * Not clear yet what the cleanest solution is.
1135			 */
1136			for (i = 0; i < LS_MAX_EVENTS; i++)
1137				if (g_event_info[i].ev_type != 'E')
1138					g_min_duration[i] = duration;
1139
1140			break;
1141
1142		case 'i':
1143			if (!isdigit(optarg[0]))
1144				usage();
1145			i = atoi(optarg);
1146			if (i <= 0)
1147				usage();
1148			if (i > MAX_HZ)
1149				fail(0, "max interrupt rate is %d Hz", MAX_HZ);
1150
1151			for (j = 0; j < LS_MAX_EVENTS; j++)
1152				if (strcmp(g_event_info[j].ev_desc,
1153				    "Profiling interrupt") == 0)
1154					break;
1155
1156			(void) sprintf(g_event_info[j].ev_name,
1157			    "profile:::profile-%d", i);
1158			break;
1159
1160		case 'l':
1161		case 'f':
1162			addrp = strtok(optarg, ",");
1163			sizep = strtok(NULL, ",");
1164			addrp = strtok(optarg, ",+");
1165			offp = strtok(NULL, ",");
1166
1167			size = sizep ? strtoul(sizep, NULL, 0) : 1;
1168			off = offp ? strtoul(offp, NULL, 0) : 0;
1169
1170			if (addrp[0] == '0') {
1171				addr = strtoul(addrp, NULL, 16) + off;
1172			} else {
1173				addr = sym_to_addr(addrp) + off;
1174				if (sizep == NULL)
1175					size = sym_size(addrp) - off;
1176				if (addr - off == 0)
1177					fail(0, "symbol '%s' not found", addrp);
1178				if (size == 0)
1179					size = 1;
1180			}
1181
1182
1183			if (c == 'l') {
1184				filter_add(&filt, "arg0", addr, size);
1185			} else {
1186				filter_add(&filt, "caller", addr, size);
1187				filter_add(&ifilt, "arg0", addr, size);
1188			}
1189			break;
1190
1191		case 'e':
1192			evp = strtok_r(optarg, ",", &lastp);
1193			while (evp) {
1194				int ev1, ev2;
1195				char *evp2;
1196
1197				(void) strtok(evp, "-");
1198				evp2 = strtok(NULL, "-");
1199				ev1 = atoi(evp);
1200				ev2 = evp2 ? atoi(evp2) : ev1;
1201				if ((uint_t)ev1 >= LS_MAX_EVENTS ||
1202				    (uint_t)ev2 >= LS_MAX_EVENTS || ev1 > ev2)
1203					fail(0, "-e events out of range");
1204				for (i = ev1; i <= ev2; i++)
1205					g_enabled[i] = 1;
1206				evp = strtok_r(NULL, ",", &lastp);
1207			}
1208			events_specified = 1;
1209			break;
1210
1211		case 'c':
1212			g_cflag = 1;
1213			break;
1214
1215		case 'k':
1216			g_kflag = 1;
1217			break;
1218
1219		case 'w':
1220			g_wflag = 1;
1221			break;
1222
1223		case 'W':
1224			g_Wflag = 1;
1225			break;
1226
1227		case 'g':
1228			g_gflag = 1;
1229			break;
1230
1231		case 'C':
1232		case 'E':
1233		case 'H':
1234		case 'I':
1235			for (i = 0; i < LS_MAX_EVENTS; i++)
1236				if (g_event_info[i].ev_type == c)
1237					g_enabled[i] = 1;
1238			events_specified = 1;
1239			break;
1240
1241		case 'A':
1242			for (i = 0; i < LS_MAX_EVENTS; i++)
1243				if (strchr("CH", g_event_info[i].ev_type))
1244					g_enabled[i] = 1;
1245			events_specified = 1;
1246			break;
1247
1248		case 'T':
1249			g_tracing = 1;
1250			break;
1251
1252		case 'D':
1253			if (!isdigit(optarg[0]))
1254				usage();
1255			g_topn = atoi(optarg);
1256			break;
1257
1258		case 'R':
1259			g_rates = 1;
1260			break;
1261
1262		case 'p':
1263			g_pflag = 1;
1264			break;
1265
1266		case 'P':
1267			g_Pflag = 1;
1268			break;
1269
1270		case 'o':
1271			if ((out = fopen(optarg, "w")) == NULL)
1272				fail(1, "error opening file");
1273			break;
1274
1275		case 'V':
1276			g_Vflag = 1;
1277			break;
1278
1279		default:
1280			if (strchr(LOCKSTAT_OPTSTR, c) == NULL)
1281				usage();
1282		}
1283	}
1284
1285	if (filt != NULL) {
1286		predicate_add(&g_predicate, filt, NULL, 0);
1287		filter_destroy(&filt);
1288	}
1289
1290	if (ifilt != NULL) {
1291		predicate_add(&g_ipredicate, ifilt, NULL, 0);
1292		filter_destroy(&ifilt);
1293	}
1294
1295	if (g_recsize == 0) {
1296		if (g_gflag) {
1297			g_stkdepth = LS_MAX_STACK_DEPTH;
1298			g_recsize = LS_STACK(g_stkdepth);
1299		} else {
1300			g_recsize = LS_TIME;
1301		}
1302	}
1303
1304	if (g_gflag && g_recsize <= LS_STACK(0))
1305		fail(0, "'-g' requires at least '-s 1' data gathering");
1306
1307	/*
1308	 * Make sure the alignment is reasonable
1309	 */
1310	g_recsize = -(-g_recsize & -sizeof (uint64_t));
1311
1312	for (i = 0; i < LS_MAX_EVENTS; i++) {
1313		/*
1314		 * If no events were specified, enable -C.
1315		 */
1316		if (!events_specified && g_event_info[i].ev_type == 'C')
1317			g_enabled[i] = 1;
1318	}
1319
1320	for (i = 0; i < LS_MAX_EVENTS; i++) {
1321		if (!g_enabled[i])
1322			continue;
1323
1324		if (g_event_info[i].ev_acquire != NULL) {
1325			/*
1326			 * If we've enabled a hold event, we must explicitly
1327			 * allocate dynamic variable space.
1328			 */
1329			dynvar = 1;
1330		}
1331
1332		dprog_addevent(i);
1333	}
1334
1335	/*
1336	 * Make sure there are remaining arguments to specify a child command
1337	 * to execute.
1338	 */
1339	if (argc <= optind)
1340		usage();
1341
1342	if ((ncpus = sysconf(_SC_NPROCESSORS_ONLN)) == -1)
1343		dfail("couldn't determine number of online CPUs");
1344
1345	/*
1346	 * By default, we set our data buffer size to be the number of records
1347	 * multiplied by the size of the record, doubled to account for some
1348	 * DTrace slop and divided by the number of CPUs.  We silently clamp
1349	 * the aggregation size at both a minimum and a maximum to prevent
1350	 * absurdly low or high values.
1351	 */
1352	if ((aggsize = (g_nrecs * g_recsize * 2) / ncpus) < MIN_AGGSIZE)
1353		aggsize = MIN_AGGSIZE;
1354
1355	if (aggsize > MAX_AGGSIZE)
1356		aggsize = MAX_AGGSIZE;
1357
1358	(void) sprintf(aggstr, "%lld", (long long)aggsize);
1359
1360	if (!g_tracing) {
1361		if (dtrace_setopt(g_dtp, "bufsize", "4k") == -1)
1362			dfail("failed to set 'bufsize'");
1363
1364		if (dtrace_setopt(g_dtp, "aggsize", aggstr) == -1)
1365			dfail("failed to set 'aggsize'");
1366
1367		if (dynvar) {
1368			/*
1369			 * If we're using dynamic variables, we set our
1370			 * dynamic variable size to be one megabyte per CPU,
1371			 * with a hard-limit of 32 megabytes.  This may still
1372			 * be too small in some cases, but it can be tuned
1373			 * manually via -x if need be.
1374			 */
1375			(void) sprintf(aggstr, "%ldm", ncpus < 32 ? ncpus : 32);
1376
1377			if (dtrace_setopt(g_dtp, "dynvarsize", aggstr) == -1)
1378				dfail("failed to set 'dynvarsize'");
1379		}
1380	} else {
1381		if (dtrace_setopt(g_dtp, "bufsize", aggstr) == -1)
1382			dfail("failed to set 'bufsize'");
1383	}
1384
1385	if (dtrace_setopt(g_dtp, "statusrate", "10sec") == -1)
1386		dfail("failed to set 'statusrate'");
1387
1388	optind = 1;
1389	while ((c = getopt(argc, argv, LOCKSTAT_OPTSTR)) != EOF) {
1390		switch (c) {
1391		case 'x':
1392			if ((p = strchr(optarg, '=')) != NULL)
1393				*p++ = '\0';
1394
1395			if (dtrace_setopt(g_dtp, optarg, p) != 0)
1396				dfail("failed to set -x %s", optarg);
1397			break;
1398		}
1399	}
1400
1401	argc -= optind;
1402	argv += optind;
1403
1404	dprog_compile();
1405	status_init();
1406
1407	g_elapsed = -gethrtime();
1408
1409	/*
1410	 * Spawn the specified command and wait for it to complete.
1411	 */
1412	child = fork();
1413	if (child == -1)
1414		fail(1, "cannot fork");
1415	if (child == 0) {
1416		(void) dtrace_close(g_dtp);
1417		(void) execvp(argv[0], &argv[0]);
1418		exec_errno = errno;
1419		exit(127);
1420	}
1421
1422#if defined(sun)
1423	while (waitpid(child, &status, WEXITED) != child)
1424#else
1425	while (waitpid(child, &status, 0) != child)
1426#endif
1427		status_check();
1428
1429	g_elapsed += gethrtime();
1430
1431	if (WIFEXITED(status)) {
1432		if (WEXITSTATUS(status) != 0) {
1433			if (exec_errno != 0) {
1434				errno = exec_errno;
1435				fail(1, "could not execute %s", argv[0]);
1436			}
1437			(void) fprintf(stderr,
1438			    "lockstat: warning: %s exited with code %d\n",
1439			    argv[0], WEXITSTATUS(status));
1440		}
1441	} else {
1442		(void) fprintf(stderr,
1443		    "lockstat: warning: %s died on signal %d\n",
1444		    argv[0], WTERMSIG(status));
1445	}
1446
1447	if (dtrace_stop(g_dtp) == -1)
1448		dfail("failed to stop dtrace");
1449
1450	/*
1451	 * Before we read out the results, we need to allocate our buffer.
1452	 * If we're tracing, then we'll just use the precalculated size.  If
1453	 * we're not, then we'll take a snapshot of the aggregate, and walk
1454	 * it to count the number of records.
1455	 */
1456	if (!g_tracing) {
1457		if (dtrace_aggregate_snap(g_dtp) != 0)
1458			dfail("failed to snap aggregate");
1459
1460		g_nrecs = 0;
1461
1462		if (dtrace_aggregate_walk(g_dtp,
1463		    count_aggregate, &g_nrecs) != 0)
1464			dfail("failed to walk aggregate");
1465	}
1466
1467#if defined(sun)
1468	if ((data_buf = memalign(sizeof (uint64_t),
1469	    (g_nrecs + 1) * g_recsize)) == NULL)
1470#else
1471	if (posix_memalign((void **)&data_buf, sizeof (uint64_t),
1472	    (g_nrecs + 1) * g_recsize) )
1473#endif
1474		fail(1, "Memory allocation failed");
1475
1476	/*
1477	 * Read out the DTrace data.
1478	 */
1479	g_nrecs_used = process_data(out, data_buf);
1480
1481	if (g_nrecs_used > g_nrecs || g_dropped)
1482		(void) fprintf(stderr, "lockstat: warning: "
1483		    "ran out of data records (use -n for more)\n");
1484
1485	/* LINTED - alignment */
1486	for (i = 0, lsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++,
1487	    /* LINTED - alignment */
1488	    lsp = (lsrec_t *)((char *)lsp + g_recsize)) {
1489		ev_count[lsp->ls_event] += lsp->ls_count;
1490		ev_time[lsp->ls_event] += lsp->ls_time;
1491	}
1492
1493	/*
1494	 * If -g was specified, convert stacks into individual records.
1495	 */
1496	if (g_gflag) {
1497		lsrec_t *newlsp, *oldlsp;
1498
1499#if defined(sun)
1500		newlsp = memalign(sizeof (uint64_t),
1501		    g_nrecs_used * LS_TIME * (g_stkdepth + 1));
1502#else
1503		posix_memalign((void **)&newlsp, sizeof (uint64_t),
1504		    g_nrecs_used * LS_TIME * (g_stkdepth + 1));
1505#endif
1506		if (newlsp == NULL)
1507			fail(1, "Cannot allocate space for -g processing");
1508		lsp = newlsp;
1509		/* LINTED - alignment */
1510		for (i = 0, oldlsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++,
1511		    /* LINTED - alignment */
1512		    oldlsp = (lsrec_t *)((char *)oldlsp + g_recsize)) {
1513			int fr;
1514			int caller_in_stack = 0;
1515
1516			if (oldlsp->ls_count == 0)
1517				continue;
1518
1519			for (fr = 0; fr < g_stkdepth; fr++) {
1520				if (oldlsp->ls_stack[fr] == 0)
1521					break;
1522				if (oldlsp->ls_stack[fr] == oldlsp->ls_caller)
1523					caller_in_stack = 1;
1524				bcopy(oldlsp, lsp, LS_TIME);
1525				lsp->ls_caller = oldlsp->ls_stack[fr];
1526				/* LINTED - alignment */
1527				lsp = (lsrec_t *)((char *)lsp + LS_TIME);
1528			}
1529			if (!caller_in_stack) {
1530				bcopy(oldlsp, lsp, LS_TIME);
1531				/* LINTED - alignment */
1532				lsp = (lsrec_t *)((char *)lsp + LS_TIME);
1533			}
1534		}
1535		g_nrecs = g_nrecs_used =
1536		    ((uintptr_t)lsp - (uintptr_t)newlsp) / LS_TIME;
1537		g_recsize = LS_TIME;
1538		g_stkdepth = 0;
1539		free(data_buf);
1540		data_buf = (char *)newlsp;
1541	}
1542
1543	if ((sort_buf = calloc(2 * (g_nrecs + 1),
1544	    sizeof (void *))) == NULL)
1545		fail(1, "Sort buffer allocation failed");
1546	merge_buf = sort_buf + (g_nrecs + 1);
1547
1548	/*
1549	 * Build the sort buffer, discarding zero-count records along the way.
1550	 */
1551	/* LINTED - alignment */
1552	for (i = 0, lsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++,
1553	    /* LINTED - alignment */
1554	    lsp = (lsrec_t *)((char *)lsp + g_recsize)) {
1555		if (lsp->ls_count == 0)
1556			lsp->ls_event = LS_MAX_EVENTS;
1557		sort_buf[i] = lsp;
1558	}
1559
1560	if (g_nrecs_used == 0)
1561		exit(0);
1562
1563	/*
1564	 * Add a sentinel after the last record
1565	 */
1566	sort_buf[i] = lsp;
1567	lsp->ls_event = LS_MAX_EVENTS;
1568
1569	if (g_tracing) {
1570		report_trace(out, sort_buf);
1571		return (0);
1572	}
1573
1574	/*
1575	 * Application of -g may have resulted in multiple records
1576	 * with the same signature; coalesce them.
1577	 */
1578	if (g_gflag) {
1579		mergesort(lockcmp, sort_buf, merge_buf, g_nrecs_used);
1580		coalesce(lockcmp, sort_buf, g_nrecs_used);
1581	}
1582
1583	/*
1584	 * Coalesce locks within the same symbol if -c option specified.
1585	 * Coalesce PCs within the same function if -k option specified.
1586	 */
1587	if (g_cflag || g_kflag) {
1588		for (i = 0; i < g_nrecs_used; i++) {
1589			int fr;
1590			lsp = sort_buf[i];
1591			if (g_cflag)
1592				coalesce_symbol(&lsp->ls_lock);
1593			if (g_kflag) {
1594				for (fr = 0; fr < g_stkdepth; fr++)
1595					coalesce_symbol(&lsp->ls_stack[fr]);
1596				coalesce_symbol(&lsp->ls_caller);
1597			}
1598		}
1599		mergesort(lockcmp, sort_buf, merge_buf, g_nrecs_used);
1600		coalesce(lockcmp, sort_buf, g_nrecs_used);
1601	}
1602
1603	/*
1604	 * Coalesce callers if -w option specified
1605	 */
1606	if (g_wflag) {
1607		mergesort(lock_and_count_cmp_anywhere,
1608		    sort_buf, merge_buf, g_nrecs_used);
1609		coalesce(lockcmp_anywhere, sort_buf, g_nrecs_used);
1610	}
1611
1612	/*
1613	 * Coalesce locks if -W option specified
1614	 */
1615	if (g_Wflag) {
1616		mergesort(site_and_count_cmp_anylock,
1617		    sort_buf, merge_buf, g_nrecs_used);
1618		coalesce(sitecmp_anylock, sort_buf, g_nrecs_used);
1619	}
1620
1621	/*
1622	 * Sort data by contention count (ls_count) or total time (ls_time),
1623	 * depending on g_Pflag.  Override g_Pflag if time wasn't measured.
1624	 */
1625	if (g_recsize < LS_TIME)
1626		g_Pflag = 0;
1627
1628	if (g_Pflag)
1629		mergesort(timecmp, sort_buf, merge_buf, g_nrecs_used);
1630	else
1631		mergesort(countcmp, sort_buf, merge_buf, g_nrecs_used);
1632
1633	/*
1634	 * Display data by event type
1635	 */
1636	first = &sort_buf[0];
1637	while ((event = (*first)->ls_event) < LS_MAX_EVENTS) {
1638		current = first;
1639		while ((lsp = *current)->ls_event == event)
1640			current++;
1641		report_stats(out, first, current - first, ev_count[event],
1642		    ev_time[event]);
1643		first = current;
1644	}
1645
1646	return (0);
1647}
1648
1649static char *
1650format_symbol(char *buf, uintptr_t addr, int show_size)
1651{
1652	uintptr_t symoff;
1653	char *symname;
1654	size_t symsize;
1655
1656	symname = addr_to_sym(addr, &symoff, &symsize);
1657
1658	if (show_size && symoff == 0)
1659		(void) sprintf(buf, "%s[%ld]", symname, (long)symsize);
1660	else if (symoff == 0)
1661		(void) sprintf(buf, "%s", symname);
1662	else if (symoff < 16 && bcmp(symname, "cpu[", 4) == 0)	/* CPU+PIL */
1663#if defined(sun)
1664		(void) sprintf(buf, "%s+%ld", symname, (long)symoff);
1665#else
1666		(void) sprintf(buf, "%s+%s", symname, g_pri_class[(int)symoff]);
1667#endif
1668	else if (symoff <= symsize || (symoff < 256 && addr != symoff))
1669		(void) sprintf(buf, "%s+0x%llx", symname,
1670		    (unsigned long long)symoff);
1671	else
1672		(void) sprintf(buf, "0x%llx", (unsigned long long)addr);
1673	return (buf);
1674}
1675
1676static void
1677report_stats(FILE *out, lsrec_t **sort_buf, size_t nrecs, uint64_t total_count,
1678	uint64_t total_time)
1679{
1680	uint32_t event = sort_buf[0]->ls_event;
1681	lsrec_t *lsp;
1682	double ptotal = 0.0;
1683	double percent;
1684	int i, j, fr;
1685	int displayed;
1686	int first_bin, last_bin, max_bin_count, total_bin_count;
1687	int rectype;
1688	char buf[256];
1689	char lhdr[80], chdr[80];
1690
1691	rectype = g_recsize;
1692
1693	if (g_topn == 0) {
1694		(void) fprintf(out, "%20llu %s\n",
1695		    g_rates == 0 ? total_count :
1696		    ((unsigned long long)total_count * NANOSEC) / g_elapsed,
1697		    g_event_info[event].ev_desc);
1698		return;
1699	}
1700
1701	(void) sprintf(lhdr, "%s%s",
1702	    g_Wflag ? "Hottest " : "", g_event_info[event].ev_lhdr);
1703	(void) sprintf(chdr, "%s%s",
1704	    g_wflag ? "Hottest " : "", "Caller");
1705
1706	if (!g_pflag)
1707		(void) fprintf(out,
1708		    "\n%s: %.0f events in %.3f seconds (%.0f events/sec)\n\n",
1709		    g_event_info[event].ev_desc, (double)total_count,
1710		    (double)g_elapsed / NANOSEC,
1711		    (double)total_count * NANOSEC / g_elapsed);
1712
1713	if (!g_pflag && rectype < LS_HIST) {
1714		(void) sprintf(buf, "%s", g_event_info[event].ev_units);
1715		(void) fprintf(out, "%5s %4s %4s %4s %8s %-22s %-24s\n",
1716		    g_rates ? "ops/s" : "Count",
1717		    g_gflag ? "genr" : "indv",
1718		    "cuml", "rcnt", rectype >= LS_TIME ? buf : "", lhdr, chdr);
1719		(void) fprintf(out, "---------------------------------"
1720		    "----------------------------------------------\n");
1721	}
1722
1723	displayed = 0;
1724	for (i = 0; i < nrecs; i++) {
1725		lsp = sort_buf[i];
1726
1727		if (displayed++ >= g_topn)
1728			break;
1729
1730		if (g_pflag) {
1731			int j;
1732
1733			(void) fprintf(out, "%u %u",
1734			    lsp->ls_event, lsp->ls_count);
1735			(void) fprintf(out, " %s",
1736			    format_symbol(buf, lsp->ls_lock, g_cflag));
1737			(void) fprintf(out, " %s",
1738			    format_symbol(buf, lsp->ls_caller, 0));
1739			(void) fprintf(out, " %f",
1740			    (double)lsp->ls_refcnt / lsp->ls_count);
1741			if (rectype >= LS_TIME)
1742				(void) fprintf(out, " %llu",
1743				    (unsigned long long)lsp->ls_time);
1744			if (rectype >= LS_HIST) {
1745				for (j = 0; j < 64; j++)
1746					(void) fprintf(out, " %u",
1747					    lsp->ls_hist[j]);
1748			}
1749			for (j = 0; j < LS_MAX_STACK_DEPTH; j++) {
1750				if (rectype <= LS_STACK(j) ||
1751				    lsp->ls_stack[j] == 0)
1752					break;
1753				(void) fprintf(out, " %s",
1754				    format_symbol(buf, lsp->ls_stack[j], 0));
1755			}
1756			(void) fprintf(out, "\n");
1757			continue;
1758		}
1759
1760		if (rectype >= LS_HIST) {
1761			(void) fprintf(out, "---------------------------------"
1762			    "----------------------------------------------\n");
1763			(void) sprintf(buf, "%s",
1764			    g_event_info[event].ev_units);
1765			(void) fprintf(out, "%5s %4s %4s %4s %8s %-22s %-24s\n",
1766			    g_rates ? "ops/s" : "Count",
1767			    g_gflag ? "genr" : "indv",
1768			    "cuml", "rcnt", buf, lhdr, chdr);
1769		}
1770
1771		if (g_Pflag && total_time != 0)
1772			percent = (lsp->ls_time * 100.00) / total_time;
1773		else
1774			percent = (lsp->ls_count * 100.00) / total_count;
1775
1776		ptotal += percent;
1777
1778		if (rectype >= LS_TIME)
1779			(void) sprintf(buf, "%llu",
1780			    (unsigned long long)(lsp->ls_time / lsp->ls_count));
1781		else
1782			buf[0] = '\0';
1783
1784		(void) fprintf(out, "%5llu ",
1785		    g_rates == 0 ? lsp->ls_count :
1786		    ((uint64_t)lsp->ls_count * NANOSEC) / g_elapsed);
1787
1788		(void) fprintf(out, "%3.0f%% ", percent);
1789
1790		if (g_gflag)
1791			(void) fprintf(out, "---- ");
1792		else
1793			(void) fprintf(out, "%3.0f%% ", ptotal);
1794
1795		(void) fprintf(out, "%4.2f %8s ",
1796		    (double)lsp->ls_refcnt / lsp->ls_count, buf);
1797
1798		(void) fprintf(out, "%-22s ",
1799		    format_symbol(buf, lsp->ls_lock, g_cflag));
1800
1801		(void) fprintf(out, "%-24s\n",
1802		    format_symbol(buf, lsp->ls_caller, 0));
1803
1804		if (rectype < LS_HIST)
1805			continue;
1806
1807		(void) fprintf(out, "\n");
1808		(void) fprintf(out, "%10s %31s %-9s %-24s\n",
1809		    g_event_info[event].ev_units,
1810		    "------ Time Distribution ------",
1811		    g_rates ? "ops/s" : "count",
1812		    rectype > LS_STACK(0) ? "Stack" : "");
1813
1814		first_bin = 0;
1815		while (lsp->ls_hist[first_bin] == 0)
1816			first_bin++;
1817
1818		last_bin = 63;
1819		while (lsp->ls_hist[last_bin] == 0)
1820			last_bin--;
1821
1822		max_bin_count = 0;
1823		total_bin_count = 0;
1824		for (j = first_bin; j <= last_bin; j++) {
1825			total_bin_count += lsp->ls_hist[j];
1826			if (lsp->ls_hist[j] > max_bin_count)
1827				max_bin_count = lsp->ls_hist[j];
1828		}
1829
1830		/*
1831		 * If we went a few frames below the caller, ignore them
1832		 */
1833		for (fr = 3; fr > 0; fr--)
1834			if (lsp->ls_stack[fr] == lsp->ls_caller)
1835				break;
1836
1837		for (j = first_bin; j <= last_bin; j++) {
1838			uint_t depth = (lsp->ls_hist[j] * 30) / total_bin_count;
1839			(void) fprintf(out, "%10llu |%s%s %-9u ",
1840			    1ULL << j,
1841			    "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + 30 - depth,
1842			    "                              " + depth,
1843			    g_rates == 0 ? lsp->ls_hist[j] :
1844			    (uint_t)(((uint64_t)lsp->ls_hist[j] * NANOSEC) /
1845			    g_elapsed));
1846			if (rectype <= LS_STACK(fr) || lsp->ls_stack[fr] == 0) {
1847				(void) fprintf(out, "\n");
1848				continue;
1849			}
1850			(void) fprintf(out, "%-24s\n",
1851			    format_symbol(buf, lsp->ls_stack[fr], 0));
1852			fr++;
1853		}
1854		while (rectype > LS_STACK(fr) && lsp->ls_stack[fr] != 0) {
1855			(void) fprintf(out, "%15s %-36s %-24s\n", "", "",
1856			    format_symbol(buf, lsp->ls_stack[fr], 0));
1857			fr++;
1858		}
1859	}
1860
1861	if (!g_pflag)
1862		(void) fprintf(out, "---------------------------------"
1863		    "----------------------------------------------\n");
1864
1865	(void) fflush(out);
1866}
1867
1868static void
1869report_trace(FILE *out, lsrec_t **sort_buf)
1870{
1871	lsrec_t *lsp;
1872	int i, fr;
1873	int rectype;
1874	char buf[256], buf2[256];
1875
1876	rectype = g_recsize;
1877
1878	if (!g_pflag) {
1879		(void) fprintf(out, "%5s  %7s  %11s  %-24s  %-24s\n",
1880		    "Event", "Time", "Owner", "Lock", "Caller");
1881		(void) fprintf(out, "---------------------------------"
1882		    "----------------------------------------------\n");
1883	}
1884
1885	for (i = 0; i < g_nrecs_used; i++) {
1886
1887		lsp = sort_buf[i];
1888
1889		if (lsp->ls_event >= LS_MAX_EVENTS || lsp->ls_count == 0)
1890			continue;
1891
1892		(void) fprintf(out, "%2d  %10llu  %11p  %-24s  %-24s\n",
1893		    lsp->ls_event, (unsigned long long)lsp->ls_time,
1894		    (void *)lsp->ls_next,
1895		    format_symbol(buf, lsp->ls_lock, 0),
1896		    format_symbol(buf2, lsp->ls_caller, 0));
1897
1898		if (rectype <= LS_STACK(0))
1899			continue;
1900
1901		/*
1902		 * If we went a few frames below the caller, ignore them
1903		 */
1904		for (fr = 3; fr > 0; fr--)
1905			if (lsp->ls_stack[fr] == lsp->ls_caller)
1906				break;
1907
1908		while (rectype > LS_STACK(fr) && lsp->ls_stack[fr] != 0) {
1909			(void) fprintf(out, "%53s  %-24s\n", "",
1910			    format_symbol(buf, lsp->ls_stack[fr], 0));
1911			fr++;
1912		}
1913		(void) fprintf(out, "\n");
1914	}
1915
1916	(void) fflush(out);
1917}
1918