1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#pragma ident	"%Z%%M%	%I%	%E% SMI"
27
28#include <stdio.h>
29#include <stddef.h>
30#include <stdlib.h>
31#include <stdarg.h>
32#include <string.h>
33#include <strings.h>
34#include <ctype.h>
35#include <fcntl.h>
36#include <unistd.h>
37#include <errno.h>
38#include <limits.h>
39#include <sys/types.h>
40#include <sys/modctl.h>
41#include <sys/stat.h>
42#include <sys/wait.h>
43#include <dtrace.h>
44#include <sys/lockstat.h>
45#include <alloca.h>
46#include <signal.h>
47#include <assert.h>
48
49#define	LOCKSTAT_OPTSTR	"x:bths:n:d:i:l:f:e:ckwWgCHEATID:RpPo:V"
50
51#define	LS_MAX_STACK_DEPTH	50
52#define	LS_MAX_EVENTS		64
53
54typedef struct lsrec {
55	struct lsrec	*ls_next;	/* next in hash chain */
56	uintptr_t	ls_lock;	/* lock address */
57	uintptr_t	ls_caller;	/* caller address */
58	uint32_t	ls_count;	/* cumulative event count */
59	uint32_t	ls_event;	/* type of event */
60	uintptr_t	ls_refcnt;	/* cumulative reference count */
61	uint64_t	ls_time;	/* cumulative event duration */
62	uint32_t	ls_hist[64];	/* log2(duration) histogram */
63	uintptr_t	ls_stack[LS_MAX_STACK_DEPTH];
64} lsrec_t;
65
66typedef struct lsdata {
67	struct lsrec	*lsd_next;	/* next available */
68	int		lsd_count;	/* number of records */
69} lsdata_t;
70
71/*
72 * Definitions for the types of experiments which can be run.  They are
73 * listed in increasing order of memory cost and processing time cost.
74 * The numerical value of each type is the number of bytes needed per record.
75 */
76#define	LS_BASIC	offsetof(lsrec_t, ls_time)
77#define	LS_TIME		offsetof(lsrec_t, ls_hist[0])
78#define	LS_HIST		offsetof(lsrec_t, ls_stack[0])
79#define	LS_STACK(depth)	offsetof(lsrec_t, ls_stack[depth])
80
81static void report_stats(FILE *, lsrec_t **, size_t, uint64_t, uint64_t);
82static void report_trace(FILE *, lsrec_t **);
83
84extern int symtab_init(void);
85extern char *addr_to_sym(uintptr_t, uintptr_t *, size_t *);
86extern uintptr_t sym_to_addr(char *name);
87extern size_t sym_size(char *name);
88extern char *strtok_r(char *, const char *, char **);
89
90#define	DEFAULT_NRECS	10000
91#define	DEFAULT_HZ	97
92#define	MAX_HZ		1000
93#define	MIN_AGGSIZE	(16 * 1024)
94#define	MAX_AGGSIZE	(32 * 1024 * 1024)
95
96static int g_stkdepth;
97static int g_topn = INT_MAX;
98static hrtime_t g_elapsed;
99static int g_rates = 0;
100static int g_pflag = 0;
101static int g_Pflag = 0;
102static int g_wflag = 0;
103static int g_Wflag = 0;
104static int g_cflag = 0;
105static int g_kflag = 0;
106static int g_gflag = 0;
107static int g_Vflag = 0;
108static int g_tracing = 0;
109static size_t g_recsize;
110static size_t g_nrecs;
111static int g_nrecs_used;
112static uchar_t g_enabled[LS_MAX_EVENTS];
113static hrtime_t g_min_duration[LS_MAX_EVENTS];
114static dtrace_hdl_t *g_dtp;
115static char *g_predicate;
116static char *g_ipredicate;
117static char *g_prog;
118static int g_proglen;
119static int g_dropped;
120
121typedef struct ls_event_info {
122	char	ev_type;
123	char	ev_lhdr[20];
124	char	ev_desc[80];
125	char	ev_units[10];
126	char	ev_name[DTRACE_NAMELEN];
127	char	*ev_predicate;
128	char	*ev_acquire;
129} ls_event_info_t;
130
131static ls_event_info_t g_event_info[LS_MAX_EVENTS] = {
132	{ 'C',	"Lock",	"Adaptive mutex spin",			"nsec",
133	    "lockstat:::adaptive-spin" },
134	{ 'C',	"Lock",	"Adaptive mutex block",			"nsec",
135	    "lockstat:::adaptive-block" },
136	{ 'C',	"Lock",	"Spin lock spin",			"nsec",
137	    "lockstat:::spin-spin" },
138	{ 'C',	"Lock",	"Thread lock spin",			"nsec",
139	    "lockstat:::thread-spin" },
140	{ 'C',	"Lock",	"R/W writer blocked by writer",		"nsec",
141	    "lockstat:::rw-block", "arg2 == 0 && arg3 == 1" },
142	{ 'C',	"Lock",	"R/W writer blocked by readers",	"nsec",
143	    "lockstat:::rw-block", "arg2 == 0 && arg3 == 0 && arg4" },
144	{ 'C',	"Lock",	"R/W reader blocked by writer",		"nsec",
145	    "lockstat:::rw-block", "arg2 != 0 && arg3 == 1" },
146	{ 'C',	"Lock",	"R/W reader blocked by write wanted",	"nsec",
147	    "lockstat:::rw-block", "arg2 != 0 && arg3 == 0 && arg4" },
148	{ 'C',	"Lock",	"Unknown event (type 8)",		"units"	},
149	{ 'C',	"Lock",	"Unknown event (type 9)",		"units"	},
150	{ 'C',	"Lock",	"Unknown event (type 10)",		"units"	},
151	{ 'C',	"Lock",	"Unknown event (type 11)",		"units"	},
152	{ 'C',	"Lock",	"Unknown event (type 12)",		"units"	},
153	{ 'C',	"Lock",	"Unknown event (type 13)",		"units"	},
154	{ 'C',	"Lock",	"Unknown event (type 14)",		"units"	},
155	{ 'C',	"Lock",	"Unknown event (type 15)",		"units"	},
156	{ 'C',	"Lock",	"Unknown event (type 16)",		"units"	},
157	{ 'C',	"Lock",	"Unknown event (type 17)",		"units"	},
158	{ 'C',	"Lock",	"Unknown event (type 18)",		"units"	},
159	{ 'C',	"Lock",	"Unknown event (type 19)",		"units"	},
160	{ 'C',	"Lock",	"Unknown event (type 20)",		"units"	},
161	{ 'C',	"Lock",	"Unknown event (type 21)",		"units"	},
162	{ 'C',	"Lock",	"Unknown event (type 22)",		"units"	},
163	{ 'C',	"Lock",	"Unknown event (type 23)",		"units"	},
164	{ 'C',	"Lock",	"Unknown event (type 24)",		"units"	},
165	{ 'C',	"Lock",	"Unknown event (type 25)",		"units"	},
166	{ 'C',	"Lock",	"Unknown event (type 26)",		"units"	},
167	{ 'C',	"Lock",	"Unknown event (type 27)",		"units"	},
168	{ 'C',	"Lock",	"Unknown event (type 28)",		"units"	},
169	{ 'C',	"Lock",	"Unknown event (type 29)",		"units"	},
170	{ 'C',	"Lock",	"Unknown event (type 30)",		"units"	},
171	{ 'C',	"Lock",	"Unknown event (type 31)",		"units"	},
172	{ 'H',	"Lock",	"Adaptive mutex hold",			"nsec",
173	    "lockstat:::adaptive-release", NULL,
174	    "lockstat:::adaptive-acquire" },
175	{ 'H',	"Lock",	"Spin lock hold",			"nsec",
176	    "lockstat:::spin-release", NULL,
177	    "lockstat:::spin-acquire" },
178	{ 'H',	"Lock",	"R/W writer hold",			"nsec",
179	    "lockstat:::rw-release", "arg1 == 0",
180	    "lockstat:::rw-acquire" },
181	{ 'H',	"Lock",	"R/W reader hold",			"nsec",
182	    "lockstat:::rw-release", "arg1 != 0",
183	    "lockstat:::rw-acquire" },
184	{ 'H',	"Lock",	"Unknown event (type 36)",		"units"	},
185	{ 'H',	"Lock",	"Unknown event (type 37)",		"units"	},
186	{ 'H',	"Lock",	"Unknown event (type 38)",		"units"	},
187	{ 'H',	"Lock",	"Unknown event (type 39)",		"units"	},
188	{ 'H',	"Lock",	"Unknown event (type 40)",		"units"	},
189	{ 'H',	"Lock",	"Unknown event (type 41)",		"units"	},
190	{ 'H',	"Lock",	"Unknown event (type 42)",		"units"	},
191	{ 'H',	"Lock",	"Unknown event (type 43)",		"units"	},
192	{ 'H',	"Lock",	"Unknown event (type 44)",		"units"	},
193	{ 'H',	"Lock",	"Unknown event (type 45)",		"units"	},
194	{ 'H',	"Lock",	"Unknown event (type 46)",		"units"	},
195	{ 'H',	"Lock",	"Unknown event (type 47)",		"units"	},
196	{ 'H',	"Lock",	"Unknown event (type 48)",		"units"	},
197	{ 'H',	"Lock",	"Unknown event (type 49)",		"units"	},
198	{ 'H',	"Lock",	"Unknown event (type 50)",		"units"	},
199	{ 'H',	"Lock",	"Unknown event (type 51)",		"units"	},
200	{ 'H',	"Lock",	"Unknown event (type 52)",		"units"	},
201	{ 'H',	"Lock",	"Unknown event (type 53)",		"units"	},
202	{ 'H',	"Lock",	"Unknown event (type 54)",		"units"	},
203	{ 'H',	"Lock",	"Unknown event (type 55)",		"units"	},
204	{ 'I',	"CPU+PIL", "Profiling interrupt",		"nsec",
205	    "profile:::profile-97", NULL },
206	{ 'I',	"Lock",	"Unknown event (type 57)",		"units"	},
207	{ 'I',	"Lock",	"Unknown event (type 58)",		"units"	},
208	{ 'I',	"Lock",	"Unknown event (type 59)",		"units"	},
209	{ 'E',	"Lock",	"Recursive lock entry detected",	"(N/A)",
210	    "lockstat:::rw-release", NULL, "lockstat:::rw-acquire" },
211	{ 'E',	"Lock",	"Lockstat enter failure",		"(N/A)"	},
212	{ 'E',	"Lock",	"Lockstat exit failure",		"nsec"	},
213	{ 'E',	"Lock",	"Lockstat record failure",		"(N/A)"	},
214};
215
216static void
217fail(int do_perror, const char *message, ...)
218{
219	va_list args;
220	int save_errno = errno;
221
222	va_start(args, message);
223	(void) fprintf(stderr, "lockstat: ");
224	(void) vfprintf(stderr, message, args);
225	va_end(args);
226	if (do_perror)
227		(void) fprintf(stderr, ": %s", strerror(save_errno));
228	(void) fprintf(stderr, "\n");
229	exit(2);
230}
231
232static void
233dfail(const char *message, ...)
234{
235	va_list args;
236
237	va_start(args, message);
238	(void) fprintf(stderr, "lockstat: ");
239	(void) vfprintf(stderr, message, args);
240	va_end(args);
241	(void) fprintf(stderr, ": %s\n",
242	    dtrace_errmsg(g_dtp, dtrace_errno(g_dtp)));
243
244	exit(2);
245}
246
247static void
248show_events(char event_type, char *desc)
249{
250	int i, first = -1, last;
251
252	for (i = 0; i < LS_MAX_EVENTS; i++) {
253		ls_event_info_t *evp = &g_event_info[i];
254		if (evp->ev_type != event_type ||
255		    strncmp(evp->ev_desc, "Unknown event", 13) == 0)
256			continue;
257		if (first == -1)
258			first = i;
259		last = i;
260	}
261
262	(void) fprintf(stderr,
263	    "\n%s events (lockstat -%c or lockstat -e %d-%d):\n\n",
264	    desc, event_type, first, last);
265
266	for (i = first; i <= last; i++)
267		(void) fprintf(stderr,
268		    "%4d = %s\n", i, g_event_info[i].ev_desc);
269}
270
271static void
272usage(void)
273{
274	(void) fprintf(stderr,
275	    "Usage: lockstat [options] command [args]\n"
276	    "\nEvent selection options:\n\n"
277	    "  -C              watch contention events [on by default]\n"
278	    "  -E              watch error events [off by default]\n"
279	    "  -H              watch hold events [off by default]\n"
280	    "  -I              watch interrupt events [off by default]\n"
281	    "  -A              watch all lock events [equivalent to -CH]\n"
282	    "  -e event_list   only watch the specified events (shown below);\n"
283	    "                  <event_list> is a comma-separated list of\n"
284	    "                  events or ranges of events, e.g. 1,4-7,35\n"
285	    "  -i rate         interrupt rate for -I [default: %d Hz]\n"
286	    "\nData gathering options:\n\n"
287	    "  -b              basic statistics (lock, caller, event count)\n"
288	    "  -t              timing for all events [default]\n"
289	    "  -h              histograms for event times\n"
290	    "  -s depth        stack traces <depth> deep\n"
291	    "  -x opt[=val]    enable or modify DTrace options\n"
292	    "\nData filtering options:\n\n"
293	    "  -n nrecords     maximum number of data records [default: %d]\n"
294	    "  -l lock[,size]  only watch <lock>, which can be specified as a\n"
295	    "                  symbolic name or hex address; <size> defaults\n"
296	    "                  to the ELF symbol size if available, 1 if not\n"
297	    "  -f func[,size]  only watch events generated by <func>\n"
298	    "  -d duration     only watch events longer than <duration>\n"
299	    "  -T              trace (rather than sample) events\n"
300	    "\nData reporting options:\n\n"
301	    "  -c              coalesce lock data for arrays like pse_mutex[]\n"
302	    "  -k              coalesce PCs within functions\n"
303	    "  -g              show total events generated by function\n"
304	    "  -w              wherever: don't distinguish events by caller\n"
305	    "  -W              whichever: don't distinguish events by lock\n"
306	    "  -R              display rates rather than counts\n"
307	    "  -p              parsable output format (awk(1)-friendly)\n"
308	    "  -P              sort lock data by (count * avg_time) product\n"
309	    "  -D n            only display top <n> events of each type\n"
310	    "  -o filename     send output to <filename>\n",
311	    DEFAULT_HZ, DEFAULT_NRECS);
312
313	show_events('C', "Contention");
314	show_events('H', "Hold-time");
315	show_events('I', "Interrupt");
316	show_events('E', "Error");
317	(void) fprintf(stderr, "\n");
318
319	exit(1);
320}
321
322static int
323lockcmp(lsrec_t *a, lsrec_t *b)
324{
325	int i;
326
327	if (a->ls_event < b->ls_event)
328		return (-1);
329	if (a->ls_event > b->ls_event)
330		return (1);
331
332	for (i = g_stkdepth - 1; i >= 0; i--) {
333		if (a->ls_stack[i] < b->ls_stack[i])
334			return (-1);
335		if (a->ls_stack[i] > b->ls_stack[i])
336			return (1);
337	}
338
339	if (a->ls_caller < b->ls_caller)
340		return (-1);
341	if (a->ls_caller > b->ls_caller)
342		return (1);
343
344	if (a->ls_lock < b->ls_lock)
345		return (-1);
346	if (a->ls_lock > b->ls_lock)
347		return (1);
348
349	return (0);
350}
351
352static int
353countcmp(lsrec_t *a, lsrec_t *b)
354{
355	if (a->ls_event < b->ls_event)
356		return (-1);
357	if (a->ls_event > b->ls_event)
358		return (1);
359
360	return (b->ls_count - a->ls_count);
361}
362
363static int
364timecmp(lsrec_t *a, lsrec_t *b)
365{
366	if (a->ls_event < b->ls_event)
367		return (-1);
368	if (a->ls_event > b->ls_event)
369		return (1);
370
371	if (a->ls_time < b->ls_time)
372		return (1);
373	if (a->ls_time > b->ls_time)
374		return (-1);
375
376	return (0);
377}
378
379static int
380lockcmp_anywhere(lsrec_t *a, lsrec_t *b)
381{
382	if (a->ls_event < b->ls_event)
383		return (-1);
384	if (a->ls_event > b->ls_event)
385		return (1);
386
387	if (a->ls_lock < b->ls_lock)
388		return (-1);
389	if (a->ls_lock > b->ls_lock)
390		return (1);
391
392	return (0);
393}
394
395static int
396lock_and_count_cmp_anywhere(lsrec_t *a, lsrec_t *b)
397{
398	if (a->ls_event < b->ls_event)
399		return (-1);
400	if (a->ls_event > b->ls_event)
401		return (1);
402
403	if (a->ls_lock < b->ls_lock)
404		return (-1);
405	if (a->ls_lock > b->ls_lock)
406		return (1);
407
408	return (b->ls_count - a->ls_count);
409}
410
411static int
412sitecmp_anylock(lsrec_t *a, lsrec_t *b)
413{
414	int i;
415
416	if (a->ls_event < b->ls_event)
417		return (-1);
418	if (a->ls_event > b->ls_event)
419		return (1);
420
421	for (i = g_stkdepth - 1; i >= 0; i--) {
422		if (a->ls_stack[i] < b->ls_stack[i])
423			return (-1);
424		if (a->ls_stack[i] > b->ls_stack[i])
425			return (1);
426	}
427
428	if (a->ls_caller < b->ls_caller)
429		return (-1);
430	if (a->ls_caller > b->ls_caller)
431		return (1);
432
433	return (0);
434}
435
436static int
437site_and_count_cmp_anylock(lsrec_t *a, lsrec_t *b)
438{
439	int i;
440
441	if (a->ls_event < b->ls_event)
442		return (-1);
443	if (a->ls_event > b->ls_event)
444		return (1);
445
446	for (i = g_stkdepth - 1; i >= 0; i--) {
447		if (a->ls_stack[i] < b->ls_stack[i])
448			return (-1);
449		if (a->ls_stack[i] > b->ls_stack[i])
450			return (1);
451	}
452
453	if (a->ls_caller < b->ls_caller)
454		return (-1);
455	if (a->ls_caller > b->ls_caller)
456		return (1);
457
458	return (b->ls_count - a->ls_count);
459}
460
461static void
462mergesort(int (*cmp)(lsrec_t *, lsrec_t *), lsrec_t **a, lsrec_t **b, int n)
463{
464	int m = n / 2;
465	int i, j;
466
467	if (m > 1)
468		mergesort(cmp, a, b, m);
469	if (n - m > 1)
470		mergesort(cmp, a + m, b + m, n - m);
471	for (i = m; i > 0; i--)
472		b[i - 1] = a[i - 1];
473	for (j = m - 1; j < n - 1; j++)
474		b[n + m - j - 2] = a[j + 1];
475	while (i < j)
476		*a++ = cmp(b[i], b[j]) < 0 ? b[i++] : b[j--];
477	*a = b[i];
478}
479
480static void
481coalesce(int (*cmp)(lsrec_t *, lsrec_t *), lsrec_t **lock, int n)
482{
483	int i, j;
484	lsrec_t *target, *current;
485
486	target = lock[0];
487
488	for (i = 1; i < n; i++) {
489		current = lock[i];
490		if (cmp(current, target) != 0) {
491			target = current;
492			continue;
493		}
494		current->ls_event = LS_MAX_EVENTS;
495		target->ls_count += current->ls_count;
496		target->ls_refcnt += current->ls_refcnt;
497		if (g_recsize < LS_TIME)
498			continue;
499		target->ls_time += current->ls_time;
500		if (g_recsize < LS_HIST)
501			continue;
502		for (j = 0; j < 64; j++)
503			target->ls_hist[j] += current->ls_hist[j];
504	}
505}
506
507static void
508coalesce_symbol(uintptr_t *addrp)
509{
510	uintptr_t symoff;
511	size_t symsize;
512
513	if (addr_to_sym(*addrp, &symoff, &symsize) != NULL && symoff < symsize)
514		*addrp -= symoff;
515}
516
517static void
518predicate_add(char **pred, char *what, char *cmp, uintptr_t value)
519{
520	char *new;
521	int len, newlen;
522
523	if (what == NULL)
524		return;
525
526	if (*pred == NULL) {
527		*pred = malloc(1);
528		*pred[0] = '\0';
529	}
530
531	len = strlen(*pred);
532	newlen = len + strlen(what) + 32 + strlen("( && )");
533	new = malloc(newlen);
534
535	if (*pred[0] != '\0') {
536		if (cmp != NULL) {
537			(void) sprintf(new, "(%s) && (%s %s 0x%p)",
538			    *pred, what, cmp, (void *)value);
539		} else {
540			(void) sprintf(new, "(%s) && (%s)", *pred, what);
541		}
542	} else {
543		if (cmp != NULL) {
544			(void) sprintf(new, "%s %s 0x%p",
545			    what, cmp, (void *)value);
546		} else {
547			(void) sprintf(new, "%s", what);
548		}
549	}
550
551	free(*pred);
552	*pred = new;
553}
554
555static void
556predicate_destroy(char **pred)
557{
558	free(*pred);
559	*pred = NULL;
560}
561
562static void
563filter_add(char **filt, char *what, uintptr_t base, uintptr_t size)
564{
565	char buf[256], *c = buf, *new;
566	int len, newlen;
567
568	if (*filt == NULL) {
569		*filt = malloc(1);
570		*filt[0] = '\0';
571	}
572
573	(void) sprintf(c, "%s(%s >= 0x%p && %s < 0x%p)", *filt[0] != '\0' ?
574	    " || " : "", what, (void *)base, what, (void *)(base + size));
575
576	newlen = (len = strlen(*filt) + 1) + strlen(c);
577	new = malloc(newlen);
578	bcopy(*filt, new, len);
579	(void) strcat(new, c);
580	free(*filt);
581	*filt = new;
582}
583
584static void
585filter_destroy(char **filt)
586{
587	free(*filt);
588	*filt = NULL;
589}
590
591static void
592dprog_add(const char *fmt, ...)
593{
594	va_list args;
595	int size, offs;
596	char c;
597
598	va_start(args, fmt);
599	size = vsnprintf(&c, 1, fmt, args) + 1;
600
601	if (g_proglen == 0) {
602		offs = 0;
603	} else {
604		offs = g_proglen - 1;
605	}
606
607	g_proglen = offs + size;
608
609	if ((g_prog = realloc(g_prog, g_proglen)) == NULL)
610		fail(1, "failed to reallocate program text");
611
612	(void) vsnprintf(&g_prog[offs], size, fmt, args);
613}
614
615/*
616 * This function may read like an open sewer, but keep in mind that programs
617 * that generate other programs are rarely pretty.  If one has the unenviable
618 * task of maintaining or -- worse -- extending this code, use the -V option
619 * to examine the D program as generated by this function.
620 */
621static void
622dprog_addevent(int event)
623{
624	ls_event_info_t *info = &g_event_info[event];
625	char *pred = NULL;
626	char stack[20];
627	const char *arg0, *caller;
628	char *arg1 = "arg1";
629	char buf[80];
630	hrtime_t dur;
631	int depth;
632
633	if (info->ev_name[0] == '\0')
634		return;
635
636	if (info->ev_type == 'I') {
637		/*
638		 * For interrupt events, arg0 (normally the lock pointer) is
639		 * the CPU address plus the current pil, and arg1 (normally
640		 * the number of nanoseconds) is the number of nanoseconds
641		 * late -- and it's stored in arg2.
642		 */
643		arg0 = "(uintptr_t)curthread->t_cpu + \n"
644		    "\t    curthread->t_cpu->cpu_profile_pil";
645		caller = "(uintptr_t)arg0";
646		arg1 = "arg2";
647	} else {
648		arg0 = "(uintptr_t)arg0";
649		caller = "caller";
650	}
651
652	if (g_recsize > LS_HIST) {
653		for (depth = 0; g_recsize > LS_STACK(depth); depth++)
654			continue;
655
656		if (g_tracing) {
657			(void) sprintf(stack, "\tstack(%d);\n", depth);
658		} else {
659			(void) sprintf(stack, ", stack(%d)", depth);
660		}
661	} else {
662		(void) sprintf(stack, "");
663	}
664
665	if (info->ev_acquire != NULL) {
666		/*
667		 * If this is a hold event, we need to generate an additional
668		 * clause for the acquire; the clause for the release will be
669		 * generated with the aggregating statement, below.
670		 */
671		dprog_add("%s\n", info->ev_acquire);
672		predicate_add(&pred, info->ev_predicate, NULL, 0);
673		predicate_add(&pred, g_predicate, NULL, 0);
674		if (pred != NULL)
675			dprog_add("/%s/\n", pred);
676
677		dprog_add("{\n");
678		(void) sprintf(buf, "self->ev%d[(uintptr_t)arg0]", event);
679
680		if (info->ev_type == 'H') {
681			dprog_add("\t%s = timestamp;\n", buf);
682		} else {
683			/*
684			 * If this isn't a hold event, it's the recursive
685			 * error event.  For this, we simply bump the
686			 * thread-local, per-lock count.
687			 */
688			dprog_add("\t%s++;\n", buf);
689		}
690
691		dprog_add("}\n\n");
692		predicate_destroy(&pred);
693		pred = NULL;
694
695		if (info->ev_type == 'E') {
696			/*
697			 * If this is the recursive lock error event, we need
698			 * to generate an additional clause to decrement the
699			 * thread-local, per-lock count.  This assures that we
700			 * only execute the aggregating clause if we have
701			 * recursive entry.
702			 */
703			dprog_add("%s\n", info->ev_name);
704			dprog_add("/%s/\n{\n\t%s--;\n}\n\n", buf, buf);
705		}
706
707		predicate_add(&pred, buf, NULL, 0);
708
709		if (info->ev_type == 'H') {
710			(void) sprintf(buf, "timestamp -\n\t    "
711			    "self->ev%d[(uintptr_t)arg0]", event);
712		}
713
714		arg1 = buf;
715	} else {
716		predicate_add(&pred, info->ev_predicate, NULL, 0);
717		if (info->ev_type != 'I')
718			predicate_add(&pred, g_predicate, NULL, 0);
719		else
720			predicate_add(&pred, g_ipredicate, NULL, 0);
721	}
722
723	if ((dur = g_min_duration[event]) != 0)
724		predicate_add(&pred, arg1, ">=", dur);
725
726	dprog_add("%s\n", info->ev_name);
727
728	if (pred != NULL)
729		dprog_add("/%s/\n", pred);
730	predicate_destroy(&pred);
731
732	dprog_add("{\n");
733
734	if (g_tracing) {
735		dprog_add("\ttrace(%dULL);\n", event);
736		dprog_add("\ttrace(%s);\n", arg0);
737		dprog_add("\ttrace(%s);\n", caller);
738		dprog_add(stack);
739	} else {
740		/*
741		 * The ordering here is important:  when we process the
742		 * aggregate, we count on the fact that @avg appears before
743		 * @hist in program order to assure that @avg is assigned the
744		 * first aggregation variable ID and @hist assigned the
745		 * second; see the comment in process_aggregate() for details.
746		 */
747		dprog_add("\t@avg[%dULL, %s, %s%s] = avg(%s);\n",
748		    event, arg0, caller, stack, arg1);
749
750		if (g_recsize >= LS_HIST) {
751			dprog_add("\t@hist[%dULL, %s, %s%s] = quantize"
752			    "(%s);\n", event, arg0, caller, stack, arg1);
753		}
754	}
755
756	if (info->ev_acquire != NULL)
757		dprog_add("\tself->ev%d[arg0] = 0;\n", event);
758
759	dprog_add("}\n\n");
760}
761
762static void
763dprog_compile()
764{
765	dtrace_prog_t *prog;
766	dtrace_proginfo_t info;
767
768	if (g_Vflag) {
769		(void) fprintf(stderr, "lockstat: vvvv D program vvvv\n");
770		(void) fputs(g_prog, stderr);
771		(void) fprintf(stderr, "lockstat: ^^^^ D program ^^^^\n");
772	}
773
774	if ((prog = dtrace_program_strcompile(g_dtp, g_prog,
775	    DTRACE_PROBESPEC_NAME, 0, 0, NULL)) == NULL)
776		dfail("failed to compile program");
777
778	if (dtrace_program_exec(g_dtp, prog, &info) == -1)
779		dfail("failed to enable probes");
780
781	if (dtrace_go(g_dtp) != 0)
782		dfail("couldn't start tracing");
783}
784
785static void
786status_fire(void)
787{}
788
789static void
790status_init(void)
791{
792	dtrace_optval_t val, status, agg;
793	struct sigaction act;
794	struct itimerspec ts;
795	struct sigevent ev;
796	timer_t tid;
797
798	if (dtrace_getopt(g_dtp, "statusrate", &status) == -1)
799		dfail("failed to get 'statusrate'");
800
801	if (dtrace_getopt(g_dtp, "aggrate", &agg) == -1)
802		dfail("failed to get 'statusrate'");
803
804	/*
805	 * We would want to awaken at a rate that is the GCD of the statusrate
806	 * and the aggrate -- but that seems a bit absurd.  Instead, we'll
807	 * simply awaken at a rate that is the more frequent of the two, which
808	 * assures that we're never later than the interval implied by the
809	 * more frequent rate.
810	 */
811	val = status < agg ? status : agg;
812
813	(void) sigemptyset(&act.sa_mask);
814	act.sa_flags = 0;
815	act.sa_handler = status_fire;
816	(void) sigaction(SIGUSR1, &act, NULL);
817
818	ev.sigev_notify = SIGEV_SIGNAL;
819	ev.sigev_signo = SIGUSR1;
820
821	if (timer_create(CLOCK_REALTIME, &ev, &tid) == -1)
822		dfail("cannot create CLOCK_REALTIME timer");
823
824	ts.it_value.tv_sec = val / NANOSEC;
825	ts.it_value.tv_nsec = val % NANOSEC;
826	ts.it_interval = ts.it_value;
827
828	if (timer_settime(tid, TIMER_RELTIME, &ts, NULL) == -1)
829		dfail("cannot set time on CLOCK_REALTIME timer");
830}
831
832static void
833status_check(void)
834{
835	if (!g_tracing && dtrace_aggregate_snap(g_dtp) != 0)
836		dfail("failed to snap aggregate");
837
838	if (dtrace_status(g_dtp) == -1)
839		dfail("dtrace_status()");
840}
841
842static void
843lsrec_fill(lsrec_t *lsrec, const dtrace_recdesc_t *rec, int nrecs, caddr_t data)
844{
845	bzero(lsrec, g_recsize);
846	lsrec->ls_count = 1;
847
848	if ((g_recsize > LS_HIST && nrecs < 4) || (nrecs < 3))
849		fail(0, "truncated DTrace record");
850
851	if (rec->dtrd_size != sizeof (uint64_t))
852		fail(0, "bad event size in first record");
853
854	/* LINTED - alignment */
855	lsrec->ls_event = (uint32_t)*((uint64_t *)(data + rec->dtrd_offset));
856	rec++;
857
858	if (rec->dtrd_size != sizeof (uintptr_t))
859		fail(0, "bad lock address size in second record");
860
861	/* LINTED - alignment */
862	lsrec->ls_lock = *((uintptr_t *)(data + rec->dtrd_offset));
863	rec++;
864
865	if (rec->dtrd_size != sizeof (uintptr_t))
866		fail(0, "bad caller size in third record");
867
868	/* LINTED - alignment */
869	lsrec->ls_caller = *((uintptr_t *)(data + rec->dtrd_offset));
870	rec++;
871
872	if (g_recsize > LS_HIST) {
873		int frames, i;
874		pc_t *stack;
875
876		frames = rec->dtrd_size / sizeof (pc_t);
877		/* LINTED - alignment */
878		stack = (pc_t *)(data + rec->dtrd_offset);
879
880		for (i = 1; i < frames; i++)
881			lsrec->ls_stack[i - 1] = stack[i];
882	}
883}
884
885/*ARGSUSED*/
886static int
887count_aggregate(const dtrace_aggdata_t *agg, void *arg)
888{
889	*((size_t *)arg) += 1;
890
891	return (DTRACE_AGGWALK_NEXT);
892}
893
894static int
895process_aggregate(const dtrace_aggdata_t *agg, void *arg)
896{
897	const dtrace_aggdesc_t *aggdesc = agg->dtada_desc;
898	caddr_t data = agg->dtada_data;
899	lsdata_t *lsdata = arg;
900	lsrec_t *lsrec = lsdata->lsd_next;
901	const dtrace_recdesc_t *rec;
902	uint64_t *avg, *quantized;
903	int i, j;
904
905	assert(lsdata->lsd_count < g_nrecs);
906
907	/*
908	 * Aggregation variable IDs are guaranteed to be generated in program
909	 * order, and they are guaranteed to start from DTRACE_AGGVARIDNONE
910	 * plus one.  As "avg" appears before "hist" in program order, we know
911	 * that "avg" will be allocated the first aggregation variable ID, and
912	 * "hist" will be allocated the second aggregation variable ID -- and
913	 * we therefore use the aggregation variable ID to differentiate the
914	 * cases.
915	 */
916	if (aggdesc->dtagd_varid > DTRACE_AGGVARIDNONE + 1) {
917		/*
918		 * If this is the histogram entry.  We'll copy the quantized
919		 * data into lc_hist, and jump over the rest.
920		 */
921		rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1];
922
923		if (aggdesc->dtagd_varid != DTRACE_AGGVARIDNONE + 2)
924			fail(0, "bad variable ID in aggregation record");
925
926		if (rec->dtrd_size !=
927		    DTRACE_QUANTIZE_NBUCKETS * sizeof (uint64_t))
928			fail(0, "bad quantize size in aggregation record");
929
930		/* LINTED - alignment */
931		quantized = (uint64_t *)(data + rec->dtrd_offset);
932
933		for (i = DTRACE_QUANTIZE_ZEROBUCKET, j = 0;
934		    i < DTRACE_QUANTIZE_NBUCKETS; i++, j++)
935			lsrec->ls_hist[j] = quantized[i];
936
937		goto out;
938	}
939
940	lsrec_fill(lsrec, &aggdesc->dtagd_rec[1],
941	    aggdesc->dtagd_nrecs - 1, data);
942
943	rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1];
944
945	if (rec->dtrd_size != 2 * sizeof (uint64_t))
946		fail(0, "bad avg size in aggregation record");
947
948	/* LINTED - alignment */
949	avg = (uint64_t *)(data + rec->dtrd_offset);
950	lsrec->ls_count = (uint32_t)avg[0];
951	lsrec->ls_time = (uintptr_t)avg[1];
952
953	if (g_recsize >= LS_HIST)
954		return (DTRACE_AGGWALK_NEXT);
955
956out:
957	lsdata->lsd_next = (lsrec_t *)((uintptr_t)lsrec + g_recsize);
958	lsdata->lsd_count++;
959
960	return (DTRACE_AGGWALK_NEXT);
961}
962
963static int
964process_trace(const dtrace_probedata_t *pdata, void *arg)
965{
966	lsdata_t *lsdata = arg;
967	lsrec_t *lsrec = lsdata->lsd_next;
968	dtrace_eprobedesc_t *edesc = pdata->dtpda_edesc;
969	caddr_t data = pdata->dtpda_data;
970
971	if (lsdata->lsd_count >= g_nrecs)
972		return (DTRACE_CONSUME_NEXT);
973
974	lsrec_fill(lsrec, edesc->dtepd_rec, edesc->dtepd_nrecs, data);
975
976	lsdata->lsd_next = (lsrec_t *)((uintptr_t)lsrec + g_recsize);
977	lsdata->lsd_count++;
978
979	return (DTRACE_CONSUME_NEXT);
980}
981
982static int
983process_data(FILE *out, char *data)
984{
985	lsdata_t lsdata;
986
987	/* LINTED - alignment */
988	lsdata.lsd_next = (lsrec_t *)data;
989	lsdata.lsd_count = 0;
990
991	if (g_tracing) {
992		if (dtrace_consume(g_dtp, out,
993		    process_trace, NULL, &lsdata) != 0)
994			dfail("failed to consume buffer");
995
996		return (lsdata.lsd_count);
997	}
998
999	if (dtrace_aggregate_walk_keyvarsorted(g_dtp,
1000	    process_aggregate, &lsdata) != 0)
1001		dfail("failed to walk aggregate");
1002
1003	return (lsdata.lsd_count);
1004}
1005
1006/*ARGSUSED*/
1007static int
1008drophandler(const dtrace_dropdata_t *data, void *arg)
1009{
1010	g_dropped++;
1011	(void) fprintf(stderr, "lockstat: warning: %s", data->dtdda_msg);
1012	return (DTRACE_HANDLE_OK);
1013}
1014
1015int
1016main(int argc, char **argv)
1017{
1018	char *data_buf;
1019	lsrec_t *lsp, **current, **first, **sort_buf, **merge_buf;
1020	FILE *out = stdout;
1021	char c;
1022	pid_t child;
1023	int status;
1024	int i, j;
1025	hrtime_t duration;
1026	char *addrp, *offp, *sizep, *evp, *lastp, *p;
1027	uintptr_t addr;
1028	size_t size, off;
1029	int events_specified = 0;
1030	int exec_errno = 0;
1031	uint32_t event;
1032	char *filt = NULL, *ifilt = NULL;
1033	static uint64_t ev_count[LS_MAX_EVENTS + 1];
1034	static uint64_t ev_time[LS_MAX_EVENTS + 1];
1035	dtrace_optval_t aggsize;
1036	char aggstr[10];
1037	long ncpus;
1038	int dynvar = 0;
1039	int err;
1040
1041	if ((g_dtp = dtrace_open(DTRACE_VERSION, 0, &err)) == NULL) {
1042		fail(0, "cannot open dtrace library: %s",
1043		    dtrace_errmsg(NULL, err));
1044	}
1045
1046	if (dtrace_handle_drop(g_dtp, &drophandler, NULL) == -1)
1047		dfail("couldn't establish drop handler");
1048
1049	if (symtab_init() == -1)
1050		fail(1, "can't load kernel symbols");
1051
1052	g_nrecs = DEFAULT_NRECS;
1053
1054	while ((c = getopt(argc, argv, LOCKSTAT_OPTSTR)) != EOF) {
1055		switch (c) {
1056		case 'b':
1057			g_recsize = LS_BASIC;
1058			break;
1059
1060		case 't':
1061			g_recsize = LS_TIME;
1062			break;
1063
1064		case 'h':
1065			g_recsize = LS_HIST;
1066			break;
1067
1068		case 's':
1069			if (!isdigit(optarg[0]))
1070				usage();
1071			g_stkdepth = atoi(optarg);
1072			if (g_stkdepth > LS_MAX_STACK_DEPTH)
1073				fail(0, "max stack depth is %d",
1074				    LS_MAX_STACK_DEPTH);
1075			g_recsize = LS_STACK(g_stkdepth);
1076			break;
1077
1078		case 'n':
1079			if (!isdigit(optarg[0]))
1080				usage();
1081			g_nrecs = atoi(optarg);
1082			break;
1083
1084		case 'd':
1085			if (!isdigit(optarg[0]))
1086				usage();
1087			duration = atoll(optarg);
1088
1089			/*
1090			 * XXX -- durations really should be per event
1091			 * since the units are different, but it's hard
1092			 * to express this nicely in the interface.
1093			 * Not clear yet what the cleanest solution is.
1094			 */
1095			for (i = 0; i < LS_MAX_EVENTS; i++)
1096				if (g_event_info[i].ev_type != 'E')
1097					g_min_duration[i] = duration;
1098
1099			break;
1100
1101		case 'i':
1102			if (!isdigit(optarg[0]))
1103				usage();
1104			i = atoi(optarg);
1105			if (i <= 0)
1106				usage();
1107			if (i > MAX_HZ)
1108				fail(0, "max interrupt rate is %d Hz", MAX_HZ);
1109
1110			for (j = 0; j < LS_MAX_EVENTS; j++)
1111				if (strcmp(g_event_info[j].ev_desc,
1112				    "Profiling interrupt") == 0)
1113					break;
1114
1115			(void) sprintf(g_event_info[j].ev_name,
1116			    "profile:::profile-%d", i);
1117			break;
1118
1119		case 'l':
1120		case 'f':
1121			addrp = strtok(optarg, ",");
1122			sizep = strtok(NULL, ",");
1123			addrp = strtok(optarg, ",+");
1124			offp = strtok(NULL, ",");
1125
1126			size = sizep ? strtoul(sizep, NULL, 0) : 1;
1127			off = offp ? strtoul(offp, NULL, 0) : 0;
1128
1129			if (addrp[0] == '0') {
1130				addr = strtoul(addrp, NULL, 16) + off;
1131			} else {
1132				addr = sym_to_addr(addrp) + off;
1133				if (sizep == NULL)
1134					size = sym_size(addrp) - off;
1135				if (addr - off == 0)
1136					fail(0, "symbol '%s' not found", addrp);
1137				if (size == 0)
1138					size = 1;
1139			}
1140
1141
1142			if (c == 'l') {
1143				filter_add(&filt, "arg0", addr, size);
1144			} else {
1145				filter_add(&filt, "caller", addr, size);
1146				filter_add(&ifilt, "arg0", addr, size);
1147			}
1148			break;
1149
1150		case 'e':
1151			evp = strtok_r(optarg, ",", &lastp);
1152			while (evp) {
1153				int ev1, ev2;
1154				char *evp2;
1155
1156				(void) strtok(evp, "-");
1157				evp2 = strtok(NULL, "-");
1158				ev1 = atoi(evp);
1159				ev2 = evp2 ? atoi(evp2) : ev1;
1160				if ((uint_t)ev1 >= LS_MAX_EVENTS ||
1161				    (uint_t)ev2 >= LS_MAX_EVENTS || ev1 > ev2)
1162					fail(0, "-e events out of range");
1163				for (i = ev1; i <= ev2; i++)
1164					g_enabled[i] = 1;
1165				evp = strtok_r(NULL, ",", &lastp);
1166			}
1167			events_specified = 1;
1168			break;
1169
1170		case 'c':
1171			g_cflag = 1;
1172			break;
1173
1174		case 'k':
1175			g_kflag = 1;
1176			break;
1177
1178		case 'w':
1179			g_wflag = 1;
1180			break;
1181
1182		case 'W':
1183			g_Wflag = 1;
1184			break;
1185
1186		case 'g':
1187			g_gflag = 1;
1188			break;
1189
1190		case 'C':
1191		case 'E':
1192		case 'H':
1193		case 'I':
1194			for (i = 0; i < LS_MAX_EVENTS; i++)
1195				if (g_event_info[i].ev_type == c)
1196					g_enabled[i] = 1;
1197			events_specified = 1;
1198			break;
1199
1200		case 'A':
1201			for (i = 0; i < LS_MAX_EVENTS; i++)
1202				if (strchr("CH", g_event_info[i].ev_type))
1203					g_enabled[i] = 1;
1204			events_specified = 1;
1205			break;
1206
1207		case 'T':
1208			g_tracing = 1;
1209			break;
1210
1211		case 'D':
1212			if (!isdigit(optarg[0]))
1213				usage();
1214			g_topn = atoi(optarg);
1215			break;
1216
1217		case 'R':
1218			g_rates = 1;
1219			break;
1220
1221		case 'p':
1222			g_pflag = 1;
1223			break;
1224
1225		case 'P':
1226			g_Pflag = 1;
1227			break;
1228
1229		case 'o':
1230			if ((out = fopen(optarg, "w")) == NULL)
1231				fail(1, "error opening file");
1232			break;
1233
1234		case 'V':
1235			g_Vflag = 1;
1236			break;
1237
1238		default:
1239			if (strchr(LOCKSTAT_OPTSTR, c) == NULL)
1240				usage();
1241		}
1242	}
1243
1244	if (filt != NULL) {
1245		predicate_add(&g_predicate, filt, NULL, 0);
1246		filter_destroy(&filt);
1247	}
1248
1249	if (ifilt != NULL) {
1250		predicate_add(&g_ipredicate, ifilt, NULL, 0);
1251		filter_destroy(&ifilt);
1252	}
1253
1254	if (g_recsize == 0) {
1255		if (g_gflag) {
1256			g_stkdepth = LS_MAX_STACK_DEPTH;
1257			g_recsize = LS_STACK(g_stkdepth);
1258		} else {
1259			g_recsize = LS_TIME;
1260		}
1261	}
1262
1263	if (g_gflag && g_recsize <= LS_STACK(0))
1264		fail(0, "'-g' requires at least '-s 1' data gathering");
1265
1266	/*
1267	 * Make sure the alignment is reasonable
1268	 */
1269	g_recsize = -(-g_recsize & -sizeof (uint64_t));
1270
1271	for (i = 0; i < LS_MAX_EVENTS; i++) {
1272		/*
1273		 * If no events were specified, enable -C.
1274		 */
1275		if (!events_specified && g_event_info[i].ev_type == 'C')
1276			g_enabled[i] = 1;
1277	}
1278
1279	for (i = 0; i < LS_MAX_EVENTS; i++) {
1280		if (!g_enabled[i])
1281			continue;
1282
1283		if (g_event_info[i].ev_acquire != NULL) {
1284			/*
1285			 * If we've enabled a hold event, we must explicitly
1286			 * allocate dynamic variable space.
1287			 */
1288			dynvar = 1;
1289		}
1290
1291		dprog_addevent(i);
1292	}
1293
1294	/*
1295	 * Make sure there are remaining arguments to specify a child command
1296	 * to execute.
1297	 */
1298	if (argc <= optind)
1299		usage();
1300
1301	if ((ncpus = sysconf(_SC_NPROCESSORS_ONLN)) == -1)
1302		dfail("couldn't determine number of online CPUs");
1303
1304	/*
1305	 * By default, we set our data buffer size to be the number of records
1306	 * multiplied by the size of the record, doubled to account for some
1307	 * DTrace slop and divided by the number of CPUs.  We silently clamp
1308	 * the aggregation size at both a minimum and a maximum to prevent
1309	 * absurdly low or high values.
1310	 */
1311	if ((aggsize = (g_nrecs * g_recsize * 2) / ncpus) < MIN_AGGSIZE)
1312		aggsize = MIN_AGGSIZE;
1313
1314	if (aggsize > MAX_AGGSIZE)
1315		aggsize = MAX_AGGSIZE;
1316
1317	(void) sprintf(aggstr, "%lld", (long long)aggsize);
1318
1319	if (!g_tracing) {
1320		if (dtrace_setopt(g_dtp, "bufsize", "4k") == -1)
1321			dfail("failed to set 'bufsize'");
1322
1323		if (dtrace_setopt(g_dtp, "aggsize", aggstr) == -1)
1324			dfail("failed to set 'aggsize'");
1325
1326		if (dynvar) {
1327			/*
1328			 * If we're using dynamic variables, we set our
1329			 * dynamic variable size to be one megabyte per CPU,
1330			 * with a hard-limit of 32 megabytes.  This may still
1331			 * be too small in some cases, but it can be tuned
1332			 * manually via -x if need be.
1333			 */
1334			(void) sprintf(aggstr, "%ldm", ncpus < 32 ? ncpus : 32);
1335
1336			if (dtrace_setopt(g_dtp, "dynvarsize", aggstr) == -1)
1337				dfail("failed to set 'dynvarsize'");
1338		}
1339	} else {
1340		if (dtrace_setopt(g_dtp, "bufsize", aggstr) == -1)
1341			dfail("failed to set 'bufsize'");
1342	}
1343
1344	if (dtrace_setopt(g_dtp, "statusrate", "10sec") == -1)
1345		dfail("failed to set 'statusrate'");
1346
1347	optind = 1;
1348	while ((c = getopt(argc, argv, LOCKSTAT_OPTSTR)) != EOF) {
1349		switch (c) {
1350		case 'x':
1351			if ((p = strchr(optarg, '=')) != NULL)
1352				*p++ = '\0';
1353
1354			if (dtrace_setopt(g_dtp, optarg, p) != 0)
1355				dfail("failed to set -x %s", optarg);
1356			break;
1357		}
1358	}
1359
1360	argc -= optind;
1361	argv += optind;
1362
1363	dprog_compile();
1364	status_init();
1365
1366	g_elapsed = -gethrtime();
1367
1368	/*
1369	 * Spawn the specified command and wait for it to complete.
1370	 */
1371	child = fork();
1372	if (child == -1)
1373		fail(1, "cannot fork");
1374	if (child == 0) {
1375		(void) dtrace_close(g_dtp);
1376		(void) execvp(argv[0], &argv[0]);
1377		exec_errno = errno;
1378		exit(127);
1379	}
1380
1381	while (waitpid(child, &status, WEXITED) != child)
1382		status_check();
1383
1384	g_elapsed += gethrtime();
1385
1386	if (WIFEXITED(status)) {
1387		if (WEXITSTATUS(status) != 0) {
1388			if (exec_errno != 0) {
1389				errno = exec_errno;
1390				fail(1, "could not execute %s", argv[0]);
1391			}
1392			(void) fprintf(stderr,
1393			    "lockstat: warning: %s exited with code %d\n",
1394			    argv[0], WEXITSTATUS(status));
1395		}
1396	} else {
1397		(void) fprintf(stderr,
1398		    "lockstat: warning: %s died on signal %d\n",
1399		    argv[0], WTERMSIG(status));
1400	}
1401
1402	if (dtrace_stop(g_dtp) == -1)
1403		dfail("failed to stop dtrace");
1404
1405	/*
1406	 * Before we read out the results, we need to allocate our buffer.
1407	 * If we're tracing, then we'll just use the precalculated size.  If
1408	 * we're not, then we'll take a snapshot of the aggregate, and walk
1409	 * it to count the number of records.
1410	 */
1411	if (!g_tracing) {
1412		if (dtrace_aggregate_snap(g_dtp) != 0)
1413			dfail("failed to snap aggregate");
1414
1415		g_nrecs = 0;
1416
1417		if (dtrace_aggregate_walk(g_dtp,
1418		    count_aggregate, &g_nrecs) != 0)
1419			dfail("failed to walk aggregate");
1420	}
1421
1422	if ((data_buf = memalign(sizeof (uint64_t),
1423	    (g_nrecs + 1) * g_recsize)) == NULL)
1424		fail(1, "Memory allocation failed");
1425
1426	/*
1427	 * Read out the DTrace data.
1428	 */
1429	g_nrecs_used = process_data(out, data_buf);
1430
1431	if (g_nrecs_used > g_nrecs || g_dropped)
1432		(void) fprintf(stderr, "lockstat: warning: "
1433		    "ran out of data records (use -n for more)\n");
1434
1435	/* LINTED - alignment */
1436	for (i = 0, lsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++,
1437	    /* LINTED - alignment */
1438	    lsp = (lsrec_t *)((char *)lsp + g_recsize)) {
1439		ev_count[lsp->ls_event] += lsp->ls_count;
1440		ev_time[lsp->ls_event] += lsp->ls_time;
1441	}
1442
1443	/*
1444	 * If -g was specified, convert stacks into individual records.
1445	 */
1446	if (g_gflag) {
1447		lsrec_t *newlsp, *oldlsp;
1448
1449		newlsp = memalign(sizeof (uint64_t),
1450		    g_nrecs_used * LS_TIME * (g_stkdepth + 1));
1451		if (newlsp == NULL)
1452			fail(1, "Cannot allocate space for -g processing");
1453		lsp = newlsp;
1454		/* LINTED - alignment */
1455		for (i = 0, oldlsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++,
1456		    /* LINTED - alignment */
1457		    oldlsp = (lsrec_t *)((char *)oldlsp + g_recsize)) {
1458			int fr;
1459			int caller_in_stack = 0;
1460
1461			if (oldlsp->ls_count == 0)
1462				continue;
1463
1464			for (fr = 0; fr < g_stkdepth; fr++) {
1465				if (oldlsp->ls_stack[fr] == 0)
1466					break;
1467				if (oldlsp->ls_stack[fr] == oldlsp->ls_caller)
1468					caller_in_stack = 1;
1469				bcopy(oldlsp, lsp, LS_TIME);
1470				lsp->ls_caller = oldlsp->ls_stack[fr];
1471				/* LINTED - alignment */
1472				lsp = (lsrec_t *)((char *)lsp + LS_TIME);
1473			}
1474			if (!caller_in_stack) {
1475				bcopy(oldlsp, lsp, LS_TIME);
1476				/* LINTED - alignment */
1477				lsp = (lsrec_t *)((char *)lsp + LS_TIME);
1478			}
1479		}
1480		g_nrecs = g_nrecs_used =
1481		    ((uintptr_t)lsp - (uintptr_t)newlsp) / LS_TIME;
1482		g_recsize = LS_TIME;
1483		g_stkdepth = 0;
1484		free(data_buf);
1485		data_buf = (char *)newlsp;
1486	}
1487
1488	if ((sort_buf = calloc(2 * (g_nrecs + 1),
1489	    sizeof (void *))) == NULL)
1490		fail(1, "Sort buffer allocation failed");
1491	merge_buf = sort_buf + (g_nrecs + 1);
1492
1493	/*
1494	 * Build the sort buffer, discarding zero-count records along the way.
1495	 */
1496	/* LINTED - alignment */
1497	for (i = 0, lsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++,
1498	    /* LINTED - alignment */
1499	    lsp = (lsrec_t *)((char *)lsp + g_recsize)) {
1500		if (lsp->ls_count == 0)
1501			lsp->ls_event = LS_MAX_EVENTS;
1502		sort_buf[i] = lsp;
1503	}
1504
1505	if (g_nrecs_used == 0)
1506		exit(0);
1507
1508	/*
1509	 * Add a sentinel after the last record
1510	 */
1511	sort_buf[i] = lsp;
1512	lsp->ls_event = LS_MAX_EVENTS;
1513
1514	if (g_tracing) {
1515		report_trace(out, sort_buf);
1516		return (0);
1517	}
1518
1519	/*
1520	 * Application of -g may have resulted in multiple records
1521	 * with the same signature; coalesce them.
1522	 */
1523	if (g_gflag) {
1524		mergesort(lockcmp, sort_buf, merge_buf, g_nrecs_used);
1525		coalesce(lockcmp, sort_buf, g_nrecs_used);
1526	}
1527
1528	/*
1529	 * Coalesce locks within the same symbol if -c option specified.
1530	 * Coalesce PCs within the same function if -k option specified.
1531	 */
1532	if (g_cflag || g_kflag) {
1533		for (i = 0; i < g_nrecs_used; i++) {
1534			int fr;
1535			lsp = sort_buf[i];
1536			if (g_cflag)
1537				coalesce_symbol(&lsp->ls_lock);
1538			if (g_kflag) {
1539				for (fr = 0; fr < g_stkdepth; fr++)
1540					coalesce_symbol(&lsp->ls_stack[fr]);
1541				coalesce_symbol(&lsp->ls_caller);
1542			}
1543		}
1544		mergesort(lockcmp, sort_buf, merge_buf, g_nrecs_used);
1545		coalesce(lockcmp, sort_buf, g_nrecs_used);
1546	}
1547
1548	/*
1549	 * Coalesce callers if -w option specified
1550	 */
1551	if (g_wflag) {
1552		mergesort(lock_and_count_cmp_anywhere,
1553		    sort_buf, merge_buf, g_nrecs_used);
1554		coalesce(lockcmp_anywhere, sort_buf, g_nrecs_used);
1555	}
1556
1557	/*
1558	 * Coalesce locks if -W option specified
1559	 */
1560	if (g_Wflag) {
1561		mergesort(site_and_count_cmp_anylock,
1562		    sort_buf, merge_buf, g_nrecs_used);
1563		coalesce(sitecmp_anylock, sort_buf, g_nrecs_used);
1564	}
1565
1566	/*
1567	 * Sort data by contention count (ls_count) or total time (ls_time),
1568	 * depending on g_Pflag.  Override g_Pflag if time wasn't measured.
1569	 */
1570	if (g_recsize < LS_TIME)
1571		g_Pflag = 0;
1572
1573	if (g_Pflag)
1574		mergesort(timecmp, sort_buf, merge_buf, g_nrecs_used);
1575	else
1576		mergesort(countcmp, sort_buf, merge_buf, g_nrecs_used);
1577
1578	/*
1579	 * Display data by event type
1580	 */
1581	first = &sort_buf[0];
1582	while ((event = (*first)->ls_event) < LS_MAX_EVENTS) {
1583		current = first;
1584		while ((lsp = *current)->ls_event == event)
1585			current++;
1586		report_stats(out, first, current - first, ev_count[event],
1587		    ev_time[event]);
1588		first = current;
1589	}
1590
1591	return (0);
1592}
1593
1594static char *
1595format_symbol(char *buf, uintptr_t addr, int show_size)
1596{
1597	uintptr_t symoff;
1598	char *symname;
1599	size_t symsize;
1600
1601	symname = addr_to_sym(addr, &symoff, &symsize);
1602
1603	if (show_size && symoff == 0)
1604		(void) sprintf(buf, "%s[%ld]", symname, (long)symsize);
1605	else if (symoff == 0)
1606		(void) sprintf(buf, "%s", symname);
1607	else if (symoff < 16 && bcmp(symname, "cpu[", 4) == 0)	/* CPU+PIL */
1608		(void) sprintf(buf, "%s+%ld", symname, (long)symoff);
1609	else if (symoff <= symsize || (symoff < 256 && addr != symoff))
1610		(void) sprintf(buf, "%s+0x%llx", symname,
1611		    (unsigned long long)symoff);
1612	else
1613		(void) sprintf(buf, "0x%llx", (unsigned long long)addr);
1614	return (buf);
1615}
1616
1617static void
1618report_stats(FILE *out, lsrec_t **sort_buf, size_t nrecs, uint64_t total_count,
1619	uint64_t total_time)
1620{
1621	uint32_t event = sort_buf[0]->ls_event;
1622	lsrec_t *lsp;
1623	double ptotal = 0.0;
1624	double percent;
1625	int i, j, fr;
1626	int displayed;
1627	int first_bin, last_bin, max_bin_count, total_bin_count;
1628	int rectype;
1629	char buf[256];
1630	char lhdr[80], chdr[80];
1631
1632	rectype = g_recsize;
1633
1634	if (g_topn == 0) {
1635		(void) fprintf(out, "%20llu %s\n",
1636		    g_rates == 0 ? total_count :
1637		    ((unsigned long long)total_count * NANOSEC) / g_elapsed,
1638		    g_event_info[event].ev_desc);
1639		return;
1640	}
1641
1642	(void) sprintf(lhdr, "%s%s",
1643	    g_Wflag ? "Hottest " : "", g_event_info[event].ev_lhdr);
1644	(void) sprintf(chdr, "%s%s",
1645	    g_wflag ? "Hottest " : "", "Caller");
1646
1647	if (!g_pflag)
1648		(void) fprintf(out,
1649		    "\n%s: %.0f events in %.3f seconds (%.0f events/sec)\n\n",
1650		    g_event_info[event].ev_desc, (double)total_count,
1651		    (double)g_elapsed / NANOSEC,
1652		    (double)total_count * NANOSEC / g_elapsed);
1653
1654	if (!g_pflag && rectype < LS_HIST) {
1655		(void) sprintf(buf, "%s", g_event_info[event].ev_units);
1656		(void) fprintf(out, "%5s %4s %4s %4s %8s %-22s %-24s\n",
1657		    g_rates ? "ops/s" : "Count",
1658		    g_gflag ? "genr" : "indv",
1659		    "cuml", "rcnt", rectype >= LS_TIME ? buf : "", lhdr, chdr);
1660		(void) fprintf(out, "---------------------------------"
1661		    "----------------------------------------------\n");
1662	}
1663
1664	displayed = 0;
1665	for (i = 0; i < nrecs; i++) {
1666		lsp = sort_buf[i];
1667
1668		if (displayed++ >= g_topn)
1669			break;
1670
1671		if (g_pflag) {
1672			int j;
1673
1674			(void) fprintf(out, "%u %u",
1675			    lsp->ls_event, lsp->ls_count);
1676			(void) fprintf(out, " %s",
1677			    format_symbol(buf, lsp->ls_lock, g_cflag));
1678			(void) fprintf(out, " %s",
1679			    format_symbol(buf, lsp->ls_caller, 0));
1680			(void) fprintf(out, " %f",
1681			    (double)lsp->ls_refcnt / lsp->ls_count);
1682			if (rectype >= LS_TIME)
1683				(void) fprintf(out, " %llu",
1684				    (unsigned long long)lsp->ls_time);
1685			if (rectype >= LS_HIST) {
1686				for (j = 0; j < 64; j++)
1687					(void) fprintf(out, " %u",
1688					    lsp->ls_hist[j]);
1689			}
1690			for (j = 0; j < LS_MAX_STACK_DEPTH; j++) {
1691				if (rectype <= LS_STACK(j) ||
1692				    lsp->ls_stack[j] == 0)
1693					break;
1694				(void) fprintf(out, " %s",
1695				    format_symbol(buf, lsp->ls_stack[j], 0));
1696			}
1697			(void) fprintf(out, "\n");
1698			continue;
1699		}
1700
1701		if (rectype >= LS_HIST) {
1702			(void) fprintf(out, "---------------------------------"
1703			    "----------------------------------------------\n");
1704			(void) sprintf(buf, "%s",
1705			    g_event_info[event].ev_units);
1706			(void) fprintf(out, "%5s %4s %4s %4s %8s %-22s %-24s\n",
1707			    g_rates ? "ops/s" : "Count",
1708			    g_gflag ? "genr" : "indv",
1709			    "cuml", "rcnt", buf, lhdr, chdr);
1710		}
1711
1712		if (g_Pflag && total_time != 0)
1713			percent = (lsp->ls_time * 100.00) / total_time;
1714		else
1715			percent = (lsp->ls_count * 100.00) / total_count;
1716
1717		ptotal += percent;
1718
1719		if (rectype >= LS_TIME)
1720			(void) sprintf(buf, "%llu",
1721			    (unsigned long long)(lsp->ls_time / lsp->ls_count));
1722		else
1723			buf[0] = '\0';
1724
1725		(void) fprintf(out, "%5llu ",
1726		    g_rates == 0 ? lsp->ls_count :
1727		    ((uint64_t)lsp->ls_count * NANOSEC) / g_elapsed);
1728
1729		(void) fprintf(out, "%3.0f%% ", percent);
1730
1731		if (g_gflag)
1732			(void) fprintf(out, "---- ");
1733		else
1734			(void) fprintf(out, "%3.0f%% ", ptotal);
1735
1736		(void) fprintf(out, "%4.2f %8s ",
1737		    (double)lsp->ls_refcnt / lsp->ls_count, buf);
1738
1739		(void) fprintf(out, "%-22s ",
1740		    format_symbol(buf, lsp->ls_lock, g_cflag));
1741
1742		(void) fprintf(out, "%-24s\n",
1743		    format_symbol(buf, lsp->ls_caller, 0));
1744
1745		if (rectype < LS_HIST)
1746			continue;
1747
1748		(void) fprintf(out, "\n");
1749		(void) fprintf(out, "%10s %31s %-9s %-24s\n",
1750		    g_event_info[event].ev_units,
1751		    "------ Time Distribution ------",
1752		    g_rates ? "ops/s" : "count",
1753		    rectype > LS_STACK(0) ? "Stack" : "");
1754
1755		first_bin = 0;
1756		while (lsp->ls_hist[first_bin] == 0)
1757			first_bin++;
1758
1759		last_bin = 63;
1760		while (lsp->ls_hist[last_bin] == 0)
1761			last_bin--;
1762
1763		max_bin_count = 0;
1764		total_bin_count = 0;
1765		for (j = first_bin; j <= last_bin; j++) {
1766			total_bin_count += lsp->ls_hist[j];
1767			if (lsp->ls_hist[j] > max_bin_count)
1768				max_bin_count = lsp->ls_hist[j];
1769		}
1770
1771		/*
1772		 * If we went a few frames below the caller, ignore them
1773		 */
1774		for (fr = 3; fr > 0; fr--)
1775			if (lsp->ls_stack[fr] == lsp->ls_caller)
1776				break;
1777
1778		for (j = first_bin; j <= last_bin; j++) {
1779			uint_t depth = (lsp->ls_hist[j] * 30) / total_bin_count;
1780			(void) fprintf(out, "%10llu |%s%s %-9u ",
1781			    1ULL << j,
1782			    "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + 30 - depth,
1783			    "                              " + depth,
1784			    g_rates == 0 ? lsp->ls_hist[j] :
1785			    (uint_t)(((uint64_t)lsp->ls_hist[j] * NANOSEC) /
1786			    g_elapsed));
1787			if (rectype <= LS_STACK(fr) || lsp->ls_stack[fr] == 0) {
1788				(void) fprintf(out, "\n");
1789				continue;
1790			}
1791			(void) fprintf(out, "%-24s\n",
1792			    format_symbol(buf, lsp->ls_stack[fr], 0));
1793			fr++;
1794		}
1795		while (rectype > LS_STACK(fr) && lsp->ls_stack[fr] != 0) {
1796			(void) fprintf(out, "%15s %-36s %-24s\n", "", "",
1797			    format_symbol(buf, lsp->ls_stack[fr], 0));
1798			fr++;
1799		}
1800	}
1801
1802	if (!g_pflag)
1803		(void) fprintf(out, "---------------------------------"
1804		    "----------------------------------------------\n");
1805
1806	(void) fflush(out);
1807}
1808
1809static void
1810report_trace(FILE *out, lsrec_t **sort_buf)
1811{
1812	lsrec_t *lsp;
1813	int i, fr;
1814	int rectype;
1815	char buf[256], buf2[256];
1816
1817	rectype = g_recsize;
1818
1819	if (!g_pflag) {
1820		(void) fprintf(out, "%5s  %7s  %11s  %-24s  %-24s\n",
1821		    "Event", "Time", "Owner", "Lock", "Caller");
1822		(void) fprintf(out, "---------------------------------"
1823		    "----------------------------------------------\n");
1824	}
1825
1826	for (i = 0; i < g_nrecs_used; i++) {
1827
1828		lsp = sort_buf[i];
1829
1830		if (lsp->ls_event >= LS_MAX_EVENTS || lsp->ls_count == 0)
1831			continue;
1832
1833		(void) fprintf(out, "%2d  %10llu  %11p  %-24s  %-24s\n",
1834		    lsp->ls_event, (unsigned long long)lsp->ls_time,
1835		    (void *)lsp->ls_next,
1836		    format_symbol(buf, lsp->ls_lock, 0),
1837		    format_symbol(buf2, lsp->ls_caller, 0));
1838
1839		if (rectype <= LS_STACK(0))
1840			continue;
1841
1842		/*
1843		 * If we went a few frames below the caller, ignore them
1844		 */
1845		for (fr = 3; fr > 0; fr--)
1846			if (lsp->ls_stack[fr] == lsp->ls_caller)
1847				break;
1848
1849		while (rectype > LS_STACK(fr) && lsp->ls_stack[fr] != 0) {
1850			(void) fprintf(out, "%53s  %-24s\n", "",
1851			    format_symbol(buf, lsp->ls_stack[fr], 0));
1852			fr++;
1853		}
1854		(void) fprintf(out, "\n");
1855	}
1856
1857	(void) fflush(out);
1858}
1859