lockstat.c revision 302408
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#pragma ident	"%Z%%M%	%I%	%E% SMI"
27
28#include <stdio.h>
29#include <stddef.h>
30#include <stdlib.h>
31#include <stdarg.h>
32#include <string.h>
33#include <strings.h>
34#include <ctype.h>
35#include <fcntl.h>
36#include <unistd.h>
37#include <errno.h>
38#include <limits.h>
39#include <sys/types.h>
40#include <sys/modctl.h>
41#include <sys/stat.h>
42#include <sys/wait.h>
43#include <dtrace.h>
44#include <sys/lockstat.h>
45#include <alloca.h>
46#include <signal.h>
47#include <assert.h>
48
49#ifdef illumos
50#define	GETOPT_EOF	EOF
51#else
52#include <sys/time.h>
53#include <sys/resource.h>
54
55#define	mergesort(a, b, c, d)	lsmergesort(a, b, c, d)
56#define	GETOPT_EOF		(-1)
57
58typedef	uintptr_t	pc_t;
59#endif
60
61#define	LOCKSTAT_OPTSTR	"x:bths:n:d:i:l:f:e:ckwWgCHEATID:RpPo:V"
62
63#define	LS_MAX_STACK_DEPTH	50
64#define	LS_MAX_EVENTS		64
65
66typedef struct lsrec {
67	struct lsrec	*ls_next;	/* next in hash chain */
68#ifdef illumos
69	uintptr_t	ls_lock;	/* lock address */
70#else
71	char		*ls_lock;	/* lock name */
72#endif
73	uintptr_t	ls_caller;	/* caller address */
74	uint32_t	ls_count;	/* cumulative event count */
75	uint32_t	ls_event;	/* type of event */
76	uintptr_t	ls_refcnt;	/* cumulative reference count */
77	uint64_t	ls_time;	/* cumulative event duration */
78	uint32_t	ls_hist[64];	/* log2(duration) histogram */
79	uintptr_t	ls_stack[LS_MAX_STACK_DEPTH];
80} lsrec_t;
81
82typedef struct lsdata {
83	struct lsrec	*lsd_next;	/* next available */
84	int		lsd_count;	/* number of records */
85} lsdata_t;
86
87/*
88 * Definitions for the types of experiments which can be run.  They are
89 * listed in increasing order of memory cost and processing time cost.
90 * The numerical value of each type is the number of bytes needed per record.
91 */
92#define	LS_BASIC	offsetof(lsrec_t, ls_time)
93#define	LS_TIME		offsetof(lsrec_t, ls_hist[0])
94#define	LS_HIST		offsetof(lsrec_t, ls_stack[0])
95#define	LS_STACK(depth)	offsetof(lsrec_t, ls_stack[depth])
96
97static void report_stats(FILE *, lsrec_t **, size_t, uint64_t, uint64_t);
98static void report_trace(FILE *, lsrec_t **);
99
100extern int symtab_init(void);
101extern char *addr_to_sym(uintptr_t, uintptr_t *, size_t *);
102extern uintptr_t sym_to_addr(char *name);
103extern size_t sym_size(char *name);
104extern char *strtok_r(char *, const char *, char **);
105
106#define	DEFAULT_NRECS	10000
107#define	DEFAULT_HZ	97
108#define	MAX_HZ		1000
109#define	MIN_AGGSIZE	(16 * 1024)
110#define	MAX_AGGSIZE	(32 * 1024 * 1024)
111
112static int g_stkdepth;
113static int g_topn = INT_MAX;
114static hrtime_t g_elapsed;
115static int g_rates = 0;
116static int g_pflag = 0;
117static int g_Pflag = 0;
118static int g_wflag = 0;
119static int g_Wflag = 0;
120static int g_cflag = 0;
121static int g_kflag = 0;
122static int g_gflag = 0;
123static int g_Vflag = 0;
124static int g_tracing = 0;
125static size_t g_recsize;
126static size_t g_nrecs;
127static int g_nrecs_used;
128static uchar_t g_enabled[LS_MAX_EVENTS];
129static hrtime_t g_min_duration[LS_MAX_EVENTS];
130static dtrace_hdl_t *g_dtp;
131static char *g_predicate;
132static char *g_ipredicate;
133static char *g_prog;
134static int g_proglen;
135static int g_dropped;
136
137typedef struct ls_event_info {
138	char	ev_type;
139	char	ev_lhdr[20];
140	char	ev_desc[80];
141	char	ev_units[10];
142	char	ev_name[DTRACE_NAMELEN];
143	char	*ev_predicate;
144	char	*ev_acquire;
145} ls_event_info_t;
146
147static ls_event_info_t g_event_info[LS_MAX_EVENTS] = {
148	{ 'C',	"Lock",	"Adaptive mutex spin",			"nsec",
149	    "lockstat:::adaptive-spin" },
150	{ 'C',	"Lock",	"Adaptive mutex block",			"nsec",
151	    "lockstat:::adaptive-block" },
152	{ 'C',	"Lock",	"Spin lock spin",			"nsec",
153	    "lockstat:::spin-spin" },
154	{ 'C',	"Lock",	"Thread lock spin",			"nsec",
155	    "lockstat:::thread-spin" },
156	{ 'C',	"Lock",	"R/W writer blocked by writer",		"nsec",
157	    "lockstat:::rw-block", "arg2 == 0 && arg3 == 1" },
158	{ 'C',	"Lock",	"R/W writer blocked by readers",	"nsec",
159	    "lockstat:::rw-block", "arg2 == 0 && arg3 == 0 && arg4" },
160	{ 'C',	"Lock",	"R/W reader blocked by writer",		"nsec",
161	    "lockstat:::rw-block", "arg2 != 0 && arg3 == 1" },
162	{ 'C',	"Lock",	"R/W reader blocked by write wanted",	"nsec",
163	    "lockstat:::rw-block", "arg2 != 0 && arg3 == 0 && arg4" },
164	{ 'C',	"Lock",	"R/W writer spin on writer",		"nsec",
165	    "lockstat:::rw-spin", "arg2 == 0 && arg3 == 1" },
166	{ 'C',	"Lock",	"R/W writer spin on readers",		"nsec",
167	    "lockstat:::rw-spin", "arg2 == 0 && arg3 == 0 && arg4" },
168	{ 'C',	"Lock",	"R/W reader spin on writer",		"nsec",
169	    "lockstat:::rw-spin", "arg2 != 0 && arg3 == 1" },
170	{ 'C',	"Lock",	"R/W reader spin on write wanted",	"nsec",
171	    "lockstat:::rw-spin", "arg2 != 0 && arg3 == 0 && arg4" },
172	{ 'C',	"Lock",	"SX exclusive block",			"nsec",
173	    "lockstat:::sx-block", "arg2 == 0" },
174	{ 'C',	"Lock",	"SX shared block",			"nsec",
175	    "lockstat:::sx-block", "arg2 != 0" },
176	{ 'C',	"Lock",	"SX exclusive spin",			"nsec",
177	    "lockstat:::sx-spin", "arg2 == 0" },
178	{ 'C',	"Lock",	"SX shared spin",			"nsec",
179	    "lockstat:::sx-spin", "arg2 != 0" },
180	{ 'C',	"Lock",	"Unknown event (type 16)",		"units"	},
181	{ 'C',	"Lock",	"Unknown event (type 17)",		"units"	},
182	{ 'C',	"Lock",	"Unknown event (type 18)",		"units"	},
183	{ 'C',	"Lock",	"Unknown event (type 19)",		"units"	},
184	{ 'C',	"Lock",	"Unknown event (type 20)",		"units"	},
185	{ 'C',	"Lock",	"Unknown event (type 21)",		"units"	},
186	{ 'C',	"Lock",	"Unknown event (type 22)",		"units"	},
187	{ 'C',	"Lock",	"Unknown event (type 23)",		"units"	},
188	{ 'C',	"Lock",	"Unknown event (type 24)",		"units"	},
189	{ 'C',	"Lock",	"Unknown event (type 25)",		"units"	},
190	{ 'C',	"Lock",	"Unknown event (type 26)",		"units"	},
191	{ 'C',	"Lock",	"Unknown event (type 27)",		"units"	},
192	{ 'C',	"Lock",	"Unknown event (type 28)",		"units"	},
193	{ 'C',	"Lock",	"Unknown event (type 29)",		"units"	},
194	{ 'C',	"Lock",	"Unknown event (type 30)",		"units"	},
195	{ 'C',	"Lock",	"Unknown event (type 31)",		"units"	},
196	{ 'H',	"Lock",	"Adaptive mutex hold",			"nsec",
197	    "lockstat:::adaptive-release", NULL,
198	    "lockstat:::adaptive-acquire" },
199	{ 'H',	"Lock",	"Spin lock hold",			"nsec",
200	    "lockstat:::spin-release", NULL,
201	    "lockstat:::spin-acquire" },
202	{ 'H',	"Lock",	"R/W writer hold",			"nsec",
203	    "lockstat:::rw-release", "arg1 == 0",
204	    "lockstat:::rw-acquire" },
205	{ 'H',	"Lock",	"R/W reader hold",			"nsec",
206	    "lockstat:::rw-release", "arg1 == 1",
207	    "lockstat:::rw-acquire" },
208	{ 'H',	"Lock",	"SX shared hold",			"nsec",
209	    "lockstat:::sx-release", "arg1 == 0",
210	    "lockstat:::sx-acquire" },
211	{ 'H',	"Lock",	"SX exclusive hold",			"nsec",
212	    "lockstat:::sx-release", "arg1 == 1",
213	    "lockstat:::sx-acquire" },
214	{ 'H',	"Lock",	"Unknown event (type 38)",		"units"	},
215	{ 'H',	"Lock",	"Unknown event (type 39)",		"units"	},
216	{ 'H',	"Lock",	"Unknown event (type 40)",		"units"	},
217	{ 'H',	"Lock",	"Unknown event (type 41)",		"units"	},
218	{ 'H',	"Lock",	"Unknown event (type 42)",		"units"	},
219	{ 'H',	"Lock",	"Unknown event (type 43)",		"units"	},
220	{ 'H',	"Lock",	"Unknown event (type 44)",		"units"	},
221	{ 'H',	"Lock",	"Unknown event (type 45)",		"units"	},
222	{ 'H',	"Lock",	"Unknown event (type 46)",		"units"	},
223	{ 'H',	"Lock",	"Unknown event (type 47)",		"units"	},
224	{ 'H',	"Lock",	"Unknown event (type 48)",		"units"	},
225	{ 'H',	"Lock",	"Unknown event (type 49)",		"units"	},
226	{ 'H',	"Lock",	"Unknown event (type 50)",		"units"	},
227	{ 'H',	"Lock",	"Unknown event (type 51)",		"units"	},
228	{ 'H',	"Lock",	"Unknown event (type 52)",		"units"	},
229	{ 'H',	"Lock",	"Unknown event (type 53)",		"units"	},
230	{ 'H',	"Lock",	"Unknown event (type 54)",		"units"	},
231	{ 'H',	"Lock",	"Unknown event (type 55)",		"units"	},
232#ifdef illumos
233	{ 'I',	"CPU+PIL", "Profiling interrupt",		"nsec",
234#else
235	{ 'I',	"CPU+Pri_Class", "Profiling interrupt",		"nsec",
236#endif
237	    "profile:::profile-97", NULL },
238	{ 'I',	"Lock",	"Unknown event (type 57)",		"units"	},
239	{ 'I',	"Lock",	"Unknown event (type 58)",		"units"	},
240	{ 'I',	"Lock",	"Unknown event (type 59)",		"units"	},
241	{ 'E',	"Lock",	"Recursive lock entry detected",	"(N/A)",
242	    "lockstat:::rw-release", NULL, "lockstat:::rw-acquire" },
243	{ 'E',	"Lock",	"Lockstat enter failure",		"(N/A)"	},
244	{ 'E',	"Lock",	"Lockstat exit failure",		"nsec"	},
245	{ 'E',	"Lock",	"Lockstat record failure",		"(N/A)"	},
246};
247
248#ifndef illumos
249static char *g_pri_class[] = {
250	"",
251	"Intr",
252	"RealT",
253	"TShar",
254	"Idle"
255};
256#endif
257
258static void
259fail(int do_perror, const char *message, ...)
260{
261	va_list args;
262	int save_errno = errno;
263
264	va_start(args, message);
265	(void) fprintf(stderr, "lockstat: ");
266	(void) vfprintf(stderr, message, args);
267	va_end(args);
268	if (do_perror)
269		(void) fprintf(stderr, ": %s", strerror(save_errno));
270	(void) fprintf(stderr, "\n");
271	exit(2);
272}
273
274static void
275dfail(const char *message, ...)
276{
277	va_list args;
278
279	va_start(args, message);
280	(void) fprintf(stderr, "lockstat: ");
281	(void) vfprintf(stderr, message, args);
282	va_end(args);
283	(void) fprintf(stderr, ": %s\n",
284	    dtrace_errmsg(g_dtp, dtrace_errno(g_dtp)));
285
286	exit(2);
287}
288
289static void
290show_events(char event_type, char *desc)
291{
292	int i, first = -1, last;
293
294	for (i = 0; i < LS_MAX_EVENTS; i++) {
295		ls_event_info_t *evp = &g_event_info[i];
296		if (evp->ev_type != event_type ||
297		    strncmp(evp->ev_desc, "Unknown event", 13) == 0)
298			continue;
299		if (first == -1)
300			first = i;
301		last = i;
302	}
303
304	(void) fprintf(stderr,
305	    "\n%s events (lockstat -%c or lockstat -e %d-%d):\n\n",
306	    desc, event_type, first, last);
307
308	for (i = first; i <= last; i++)
309		(void) fprintf(stderr,
310		    "%4d = %s\n", i, g_event_info[i].ev_desc);
311}
312
313static void
314usage(void)
315{
316	(void) fprintf(stderr,
317	    "Usage: lockstat [options] command [args]\n"
318	    "\nGeneral options:\n\n"
319	    "  -V              print the corresponding D program\n"
320	    "\nEvent selection options:\n\n"
321	    "  -C              watch contention events [on by default]\n"
322	    "  -E              watch error events [off by default]\n"
323	    "  -H              watch hold events [off by default]\n"
324	    "  -I              watch interrupt events [off by default]\n"
325	    "  -A              watch all lock events [equivalent to -CH]\n"
326	    "  -e event_list   only watch the specified events (shown below);\n"
327	    "                  <event_list> is a comma-separated list of\n"
328	    "                  events or ranges of events, e.g. 1,4-7,35\n"
329	    "  -i rate         interrupt rate for -I [default: %d Hz]\n"
330	    "\nData gathering options:\n\n"
331	    "  -b              basic statistics (lock, caller, event count)\n"
332	    "  -t              timing for all events [default]\n"
333	    "  -h              histograms for event times\n"
334	    "  -s depth        stack traces <depth> deep\n"
335	    "  -x opt[=val]    enable or modify DTrace options\n"
336	    "\nData filtering options:\n\n"
337	    "  -n nrecords     maximum number of data records [default: %d]\n"
338	    "  -l lock[,size]  only watch <lock>, which can be specified as a\n"
339	    "                  symbolic name or hex address; <size> defaults\n"
340	    "                  to the ELF symbol size if available, 1 if not\n"
341	    "  -f func[,size]  only watch events generated by <func>\n"
342	    "  -d duration     only watch events longer than <duration>\n"
343	    "  -T              trace (rather than sample) events\n"
344	    "\nData reporting options:\n\n"
345#ifdef illumos
346	    "  -c              coalesce lock data for arrays like pse_mutex[]\n"
347#endif
348	    "  -k              coalesce PCs within functions\n"
349	    "  -g              show total events generated by function\n"
350	    "  -w              wherever: don't distinguish events by caller\n"
351	    "  -W              whichever: don't distinguish events by lock\n"
352	    "  -R              display rates rather than counts\n"
353	    "  -p              parsable output format (awk(1)-friendly)\n"
354	    "  -P              sort lock data by (count * avg_time) product\n"
355	    "  -D n            only display top <n> events of each type\n"
356	    "  -o filename     send output to <filename>\n",
357	    DEFAULT_HZ, DEFAULT_NRECS);
358
359	show_events('C', "Contention");
360	show_events('H', "Hold-time");
361	show_events('I', "Interrupt");
362	show_events('E', "Error");
363	(void) fprintf(stderr, "\n");
364
365	exit(1);
366}
367
368static int
369lockcmp(lsrec_t *a, lsrec_t *b)
370{
371	int i;
372
373	if (a->ls_event < b->ls_event)
374		return (-1);
375	if (a->ls_event > b->ls_event)
376		return (1);
377
378	for (i = g_stkdepth - 1; i >= 0; i--) {
379		if (a->ls_stack[i] < b->ls_stack[i])
380			return (-1);
381		if (a->ls_stack[i] > b->ls_stack[i])
382			return (1);
383	}
384
385	if (a->ls_caller < b->ls_caller)
386		return (-1);
387	if (a->ls_caller > b->ls_caller)
388		return (1);
389
390#ifdef illumos
391	if (a->ls_lock < b->ls_lock)
392		return (-1);
393	if (a->ls_lock > b->ls_lock)
394		return (1);
395
396	return (0);
397#else
398	return (strcmp(a->ls_lock, b->ls_lock));
399#endif
400}
401
402static int
403countcmp(lsrec_t *a, lsrec_t *b)
404{
405	if (a->ls_event < b->ls_event)
406		return (-1);
407	if (a->ls_event > b->ls_event)
408		return (1);
409
410	return (b->ls_count - a->ls_count);
411}
412
413static int
414timecmp(lsrec_t *a, lsrec_t *b)
415{
416	if (a->ls_event < b->ls_event)
417		return (-1);
418	if (a->ls_event > b->ls_event)
419		return (1);
420
421	if (a->ls_time < b->ls_time)
422		return (1);
423	if (a->ls_time > b->ls_time)
424		return (-1);
425
426	return (0);
427}
428
429static int
430lockcmp_anywhere(lsrec_t *a, lsrec_t *b)
431{
432	if (a->ls_event < b->ls_event)
433		return (-1);
434	if (a->ls_event > b->ls_event)
435		return (1);
436
437#ifdef illumos
438	if (a->ls_lock < b->ls_lock)
439		return (-1);
440	if (a->ls_lock > b->ls_lock)
441		return (1);
442
443	return (0);
444#else
445	return (strcmp(a->ls_lock, b->ls_lock));
446#endif
447}
448
449static int
450lock_and_count_cmp_anywhere(lsrec_t *a, lsrec_t *b)
451{
452#ifndef illumos
453	int cmp;
454#endif
455
456	if (a->ls_event < b->ls_event)
457		return (-1);
458	if (a->ls_event > b->ls_event)
459		return (1);
460
461#ifdef illumos
462	if (a->ls_lock < b->ls_lock)
463		return (-1);
464	if (a->ls_lock > b->ls_lock)
465		return (1);
466#else
467	cmp = strcmp(a->ls_lock, b->ls_lock);
468	if (cmp != 0)
469		return (cmp);
470#endif
471
472	return (b->ls_count - a->ls_count);
473}
474
475static int
476sitecmp_anylock(lsrec_t *a, lsrec_t *b)
477{
478	int i;
479
480	if (a->ls_event < b->ls_event)
481		return (-1);
482	if (a->ls_event > b->ls_event)
483		return (1);
484
485	for (i = g_stkdepth - 1; i >= 0; i--) {
486		if (a->ls_stack[i] < b->ls_stack[i])
487			return (-1);
488		if (a->ls_stack[i] > b->ls_stack[i])
489			return (1);
490	}
491
492	if (a->ls_caller < b->ls_caller)
493		return (-1);
494	if (a->ls_caller > b->ls_caller)
495		return (1);
496
497	return (0);
498}
499
500static int
501site_and_count_cmp_anylock(lsrec_t *a, lsrec_t *b)
502{
503	int i;
504
505	if (a->ls_event < b->ls_event)
506		return (-1);
507	if (a->ls_event > b->ls_event)
508		return (1);
509
510	for (i = g_stkdepth - 1; i >= 0; i--) {
511		if (a->ls_stack[i] < b->ls_stack[i])
512			return (-1);
513		if (a->ls_stack[i] > b->ls_stack[i])
514			return (1);
515	}
516
517	if (a->ls_caller < b->ls_caller)
518		return (-1);
519	if (a->ls_caller > b->ls_caller)
520		return (1);
521
522	return (b->ls_count - a->ls_count);
523}
524
525static void
526lsmergesort(int (*cmp)(lsrec_t *, lsrec_t *), lsrec_t **a, lsrec_t **b, int n)
527{
528	int m = n / 2;
529	int i, j;
530
531	if (m > 1)
532		lsmergesort(cmp, a, b, m);
533	if (n - m > 1)
534		lsmergesort(cmp, a + m, b + m, n - m);
535	for (i = m; i > 0; i--)
536		b[i - 1] = a[i - 1];
537	for (j = m - 1; j < n - 1; j++)
538		b[n + m - j - 2] = a[j + 1];
539	while (i < j)
540		*a++ = cmp(b[i], b[j]) < 0 ? b[i++] : b[j--];
541	*a = b[i];
542}
543
544static void
545coalesce(int (*cmp)(lsrec_t *, lsrec_t *), lsrec_t **lock, int n)
546{
547	int i, j;
548	lsrec_t *target, *current;
549
550	target = lock[0];
551
552	for (i = 1; i < n; i++) {
553		current = lock[i];
554		if (cmp(current, target) != 0) {
555			target = current;
556			continue;
557		}
558		current->ls_event = LS_MAX_EVENTS;
559		target->ls_count += current->ls_count;
560		target->ls_refcnt += current->ls_refcnt;
561		if (g_recsize < LS_TIME)
562			continue;
563		target->ls_time += current->ls_time;
564		if (g_recsize < LS_HIST)
565			continue;
566		for (j = 0; j < 64; j++)
567			target->ls_hist[j] += current->ls_hist[j];
568	}
569}
570
571static void
572coalesce_symbol(uintptr_t *addrp)
573{
574	uintptr_t symoff;
575	size_t symsize;
576
577	if (addr_to_sym(*addrp, &symoff, &symsize) != NULL && symoff < symsize)
578		*addrp -= symoff;
579}
580
581static void
582predicate_add(char **pred, char *what, char *cmp, uintptr_t value)
583{
584	char *new;
585	int len, newlen;
586
587	if (what == NULL)
588		return;
589
590	if (*pred == NULL) {
591		*pred = malloc(1);
592		*pred[0] = '\0';
593	}
594
595	len = strlen(*pred);
596	newlen = len + strlen(what) + 32 + strlen("( && )");
597	new = malloc(newlen);
598
599	if (*pred[0] != '\0') {
600		if (cmp != NULL) {
601			(void) sprintf(new, "(%s) && (%s %s 0x%p)",
602			    *pred, what, cmp, (void *)value);
603		} else {
604			(void) sprintf(new, "(%s) && (%s)", *pred, what);
605		}
606	} else {
607		if (cmp != NULL) {
608			(void) sprintf(new, "%s %s 0x%p",
609			    what, cmp, (void *)value);
610		} else {
611			(void) sprintf(new, "%s", what);
612		}
613	}
614
615	free(*pred);
616	*pred = new;
617}
618
619static void
620predicate_destroy(char **pred)
621{
622	free(*pred);
623	*pred = NULL;
624}
625
626static void
627filter_add(char **filt, char *what, uintptr_t base, uintptr_t size)
628{
629	char buf[256], *c = buf, *new;
630	int len, newlen;
631
632	if (*filt == NULL) {
633		*filt = malloc(1);
634		*filt[0] = '\0';
635	}
636
637#ifdef illumos
638	(void) sprintf(c, "%s(%s >= 0x%p && %s < 0x%p)", *filt[0] != '\0' ?
639	    " || " : "", what, (void *)base, what, (void *)(base + size));
640#else
641	(void) sprintf(c, "%s(%s >= %p && %s < %p)", *filt[0] != '\0' ?
642	    " || " : "", what, (void *)base, what, (void *)(base + size));
643#endif
644
645	newlen = (len = strlen(*filt) + 1) + strlen(c);
646	new = malloc(newlen);
647	bcopy(*filt, new, len);
648	(void) strcat(new, c);
649	free(*filt);
650	*filt = new;
651}
652
653static void
654filter_destroy(char **filt)
655{
656	free(*filt);
657	*filt = NULL;
658}
659
660static void
661dprog_add(const char *fmt, ...)
662{
663	va_list args;
664	int size, offs;
665	char c;
666
667	va_start(args, fmt);
668	size = vsnprintf(&c, 1, fmt, args) + 1;
669	va_end(args);
670
671	if (g_proglen == 0) {
672		offs = 0;
673	} else {
674		offs = g_proglen - 1;
675	}
676
677	g_proglen = offs + size;
678
679	if ((g_prog = realloc(g_prog, g_proglen)) == NULL)
680		fail(1, "failed to reallocate program text");
681
682	va_start(args, fmt);
683	(void) vsnprintf(&g_prog[offs], size, fmt, args);
684	va_end(args);
685}
686
687/*
688 * This function may read like an open sewer, but keep in mind that programs
689 * that generate other programs are rarely pretty.  If one has the unenviable
690 * task of maintaining or -- worse -- extending this code, use the -V option
691 * to examine the D program as generated by this function.
692 */
693static void
694dprog_addevent(int event)
695{
696	ls_event_info_t *info = &g_event_info[event];
697	char *pred = NULL;
698	char stack[20];
699	const char *arg0, *caller;
700	char *arg1 = "arg1";
701	char buf[80];
702	hrtime_t dur;
703	int depth;
704
705	if (info->ev_name[0] == '\0')
706		return;
707
708	if (info->ev_type == 'I') {
709		/*
710		 * For interrupt events, arg0 (normally the lock pointer) is
711		 * the CPU address plus the current pil, and arg1 (normally
712		 * the number of nanoseconds) is the number of nanoseconds
713		 * late -- and it's stored in arg2.
714		 */
715#ifdef illumos
716		arg0 = "(uintptr_t)curthread->t_cpu + \n"
717		    "\t    curthread->t_cpu->cpu_profile_pil";
718#else
719		arg0 = "(uintptr_t)(curthread->td_oncpu << 16) + \n"
720		    "\t    0x01000000 + curthread->td_pri_class";
721#endif
722		caller = "(uintptr_t)arg0";
723		arg1 = "arg2";
724	} else {
725#ifdef illumos
726		arg0 = "(uintptr_t)arg0";
727#else
728		arg0 = "stringof(args[0]->lock_object.lo_name)";
729#endif
730		caller = "caller";
731	}
732
733	if (g_recsize > LS_HIST) {
734		for (depth = 0; g_recsize > LS_STACK(depth); depth++)
735			continue;
736
737		if (g_tracing) {
738			(void) sprintf(stack, "\tstack(%d);\n", depth);
739		} else {
740			(void) sprintf(stack, ", stack(%d)", depth);
741		}
742	} else {
743		(void) sprintf(stack, "");
744	}
745
746	if (info->ev_acquire != NULL) {
747		/*
748		 * If this is a hold event, we need to generate an additional
749		 * clause for the acquire; the clause for the release will be
750		 * generated with the aggregating statement, below.
751		 */
752		dprog_add("%s\n", info->ev_acquire);
753		predicate_add(&pred, info->ev_predicate, NULL, 0);
754		predicate_add(&pred, g_predicate, NULL, 0);
755		if (pred != NULL)
756			dprog_add("/%s/\n", pred);
757
758		dprog_add("{\n");
759		(void) sprintf(buf, "self->ev%d[(uintptr_t)arg0]", event);
760
761		if (info->ev_type == 'H') {
762			dprog_add("\t%s = timestamp;\n", buf);
763		} else {
764			/*
765			 * If this isn't a hold event, it's the recursive
766			 * error event.  For this, we simply bump the
767			 * thread-local, per-lock count.
768			 */
769			dprog_add("\t%s++;\n", buf);
770		}
771
772		dprog_add("}\n\n");
773		predicate_destroy(&pred);
774		pred = NULL;
775
776		if (info->ev_type == 'E') {
777			/*
778			 * If this is the recursive lock error event, we need
779			 * to generate an additional clause to decrement the
780			 * thread-local, per-lock count.  This assures that we
781			 * only execute the aggregating clause if we have
782			 * recursive entry.
783			 */
784			dprog_add("%s\n", info->ev_name);
785			dprog_add("/%s/\n{\n\t%s--;\n}\n\n", buf, buf);
786		}
787
788		predicate_add(&pred, buf, NULL, 0);
789
790		if (info->ev_type == 'H') {
791			(void) sprintf(buf, "timestamp -\n\t    "
792			    "self->ev%d[(uintptr_t)arg0]", event);
793		}
794
795		arg1 = buf;
796	} else {
797		predicate_add(&pred, info->ev_predicate, NULL, 0);
798		if (info->ev_type != 'I')
799			predicate_add(&pred, g_predicate, NULL, 0);
800		else
801			predicate_add(&pred, g_ipredicate, NULL, 0);
802	}
803
804	if ((dur = g_min_duration[event]) != 0)
805		predicate_add(&pred, arg1, ">=", dur);
806
807	dprog_add("%s\n", info->ev_name);
808
809	if (pred != NULL)
810		dprog_add("/%s/\n", pred);
811	predicate_destroy(&pred);
812
813	dprog_add("{\n");
814
815	if (g_tracing) {
816		dprog_add("\ttrace(%dULL);\n", event);
817		dprog_add("\ttrace(%s);\n", arg0);
818		dprog_add("\ttrace(%s);\n", caller);
819		dprog_add(stack);
820	} else {
821		/*
822		 * The ordering here is important:  when we process the
823		 * aggregate, we count on the fact that @avg appears before
824		 * @hist in program order to assure that @avg is assigned the
825		 * first aggregation variable ID and @hist assigned the
826		 * second; see the comment in process_aggregate() for details.
827		 */
828		dprog_add("\t@avg[%dULL, %s, %s%s] = avg(%s);\n",
829		    event, arg0, caller, stack, arg1);
830
831		if (g_recsize >= LS_HIST) {
832			dprog_add("\t@hist[%dULL, %s, %s%s] = quantize"
833			    "(%s);\n", event, arg0, caller, stack, arg1);
834		}
835	}
836
837	if (info->ev_acquire != NULL)
838		dprog_add("\tself->ev%d[arg0] = 0;\n", event);
839
840	dprog_add("}\n\n");
841}
842
843static void
844dprog_compile()
845{
846	dtrace_prog_t *prog;
847	dtrace_proginfo_t info;
848
849	if (g_Vflag) {
850		(void) fprintf(stderr, "lockstat: vvvv D program vvvv\n");
851		(void) fputs(g_prog, stderr);
852		(void) fprintf(stderr, "lockstat: ^^^^ D program ^^^^\n");
853	}
854
855	if ((prog = dtrace_program_strcompile(g_dtp, g_prog,
856	    DTRACE_PROBESPEC_NAME, 0, 0, NULL)) == NULL)
857		dfail("failed to compile program");
858
859	if (dtrace_program_exec(g_dtp, prog, &info) == -1)
860		dfail("failed to enable probes");
861
862	if (dtrace_go(g_dtp) != 0)
863		dfail("couldn't start tracing");
864}
865
866static void
867#ifdef illumos
868status_fire(void)
869#else
870status_fire(int i)
871#endif
872{}
873
874static void
875status_init(void)
876{
877	dtrace_optval_t val, status, agg;
878	struct sigaction act;
879	struct itimerspec ts;
880	struct sigevent ev;
881	timer_t tid;
882
883	if (dtrace_getopt(g_dtp, "statusrate", &status) == -1)
884		dfail("failed to get 'statusrate'");
885
886	if (dtrace_getopt(g_dtp, "aggrate", &agg) == -1)
887		dfail("failed to get 'statusrate'");
888
889	/*
890	 * We would want to awaken at a rate that is the GCD of the statusrate
891	 * and the aggrate -- but that seems a bit absurd.  Instead, we'll
892	 * simply awaken at a rate that is the more frequent of the two, which
893	 * assures that we're never later than the interval implied by the
894	 * more frequent rate.
895	 */
896	val = status < agg ? status : agg;
897
898	(void) sigemptyset(&act.sa_mask);
899	act.sa_flags = 0;
900	act.sa_handler = status_fire;
901	(void) sigaction(SIGUSR1, &act, NULL);
902
903	ev.sigev_notify = SIGEV_SIGNAL;
904	ev.sigev_signo = SIGUSR1;
905
906	if (timer_create(CLOCK_REALTIME, &ev, &tid) == -1)
907		dfail("cannot create CLOCK_REALTIME timer");
908
909	ts.it_value.tv_sec = val / NANOSEC;
910	ts.it_value.tv_nsec = val % NANOSEC;
911	ts.it_interval = ts.it_value;
912
913	if (timer_settime(tid, TIMER_RELTIME, &ts, NULL) == -1)
914		dfail("cannot set time on CLOCK_REALTIME timer");
915}
916
917static void
918status_check(void)
919{
920	if (!g_tracing && dtrace_aggregate_snap(g_dtp) != 0)
921		dfail("failed to snap aggregate");
922
923	if (dtrace_status(g_dtp) == -1)
924		dfail("dtrace_status()");
925}
926
927static void
928lsrec_fill(lsrec_t *lsrec, const dtrace_recdesc_t *rec, int nrecs, caddr_t data)
929{
930	bzero(lsrec, g_recsize);
931	lsrec->ls_count = 1;
932
933	if ((g_recsize > LS_HIST && nrecs < 4) || (nrecs < 3))
934		fail(0, "truncated DTrace record");
935
936	if (rec->dtrd_size != sizeof (uint64_t))
937		fail(0, "bad event size in first record");
938
939	/* LINTED - alignment */
940	lsrec->ls_event = (uint32_t)*((uint64_t *)(data + rec->dtrd_offset));
941	rec++;
942
943#ifdef illumos
944	if (rec->dtrd_size != sizeof (uintptr_t))
945		fail(0, "bad lock address size in second record");
946
947	/* LINTED - alignment */
948	lsrec->ls_lock = *((uintptr_t *)(data + rec->dtrd_offset));
949	rec++;
950#else
951	lsrec->ls_lock = strdup((const char *)(data + rec->dtrd_offset));
952	rec++;
953#endif
954
955	if (rec->dtrd_size != sizeof (uintptr_t))
956		fail(0, "bad caller size in third record");
957
958	/* LINTED - alignment */
959	lsrec->ls_caller = *((uintptr_t *)(data + rec->dtrd_offset));
960	rec++;
961
962	if (g_recsize > LS_HIST) {
963		int frames, i;
964		pc_t *stack;
965
966		frames = rec->dtrd_size / sizeof (pc_t);
967		/* LINTED - alignment */
968		stack = (pc_t *)(data + rec->dtrd_offset);
969
970		for (i = 1; i < frames; i++)
971			lsrec->ls_stack[i - 1] = stack[i];
972	}
973}
974
975/*ARGSUSED*/
976static int
977count_aggregate(const dtrace_aggdata_t *agg, void *arg)
978{
979	*((size_t *)arg) += 1;
980
981	return (DTRACE_AGGWALK_NEXT);
982}
983
984static int
985process_aggregate(const dtrace_aggdata_t *agg, void *arg)
986{
987	const dtrace_aggdesc_t *aggdesc = agg->dtada_desc;
988	caddr_t data = agg->dtada_data;
989	lsdata_t *lsdata = arg;
990	lsrec_t *lsrec = lsdata->lsd_next;
991	const dtrace_recdesc_t *rec;
992	uint64_t *avg, *quantized;
993	int i, j;
994
995	assert(lsdata->lsd_count < g_nrecs);
996
997	/*
998	 * Aggregation variable IDs are guaranteed to be generated in program
999	 * order, and they are guaranteed to start from DTRACE_AGGVARIDNONE
1000	 * plus one.  As "avg" appears before "hist" in program order, we know
1001	 * that "avg" will be allocated the first aggregation variable ID, and
1002	 * "hist" will be allocated the second aggregation variable ID -- and
1003	 * we therefore use the aggregation variable ID to differentiate the
1004	 * cases.
1005	 */
1006	if (aggdesc->dtagd_varid > DTRACE_AGGVARIDNONE + 1) {
1007		/*
1008		 * If this is the histogram entry.  We'll copy the quantized
1009		 * data into lc_hist, and jump over the rest.
1010		 */
1011		rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1];
1012
1013		if (aggdesc->dtagd_varid != DTRACE_AGGVARIDNONE + 2)
1014			fail(0, "bad variable ID in aggregation record");
1015
1016		if (rec->dtrd_size !=
1017		    DTRACE_QUANTIZE_NBUCKETS * sizeof (uint64_t))
1018			fail(0, "bad quantize size in aggregation record");
1019
1020		/* LINTED - alignment */
1021		quantized = (uint64_t *)(data + rec->dtrd_offset);
1022
1023		for (i = DTRACE_QUANTIZE_ZEROBUCKET, j = 0;
1024		    i < DTRACE_QUANTIZE_NBUCKETS; i++, j++)
1025			lsrec->ls_hist[j] = quantized[i];
1026
1027		goto out;
1028	}
1029
1030	lsrec_fill(lsrec, &aggdesc->dtagd_rec[1],
1031	    aggdesc->dtagd_nrecs - 1, data);
1032
1033	rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1];
1034
1035	if (rec->dtrd_size != 2 * sizeof (uint64_t))
1036		fail(0, "bad avg size in aggregation record");
1037
1038	/* LINTED - alignment */
1039	avg = (uint64_t *)(data + rec->dtrd_offset);
1040	lsrec->ls_count = (uint32_t)avg[0];
1041	lsrec->ls_time = (uintptr_t)avg[1];
1042
1043	if (g_recsize >= LS_HIST)
1044		return (DTRACE_AGGWALK_NEXT);
1045
1046out:
1047	lsdata->lsd_next = (lsrec_t *)((uintptr_t)lsrec + g_recsize);
1048	lsdata->lsd_count++;
1049
1050	return (DTRACE_AGGWALK_NEXT);
1051}
1052
1053static int
1054process_trace(const dtrace_probedata_t *pdata, void *arg)
1055{
1056	lsdata_t *lsdata = arg;
1057	lsrec_t *lsrec = lsdata->lsd_next;
1058	dtrace_eprobedesc_t *edesc = pdata->dtpda_edesc;
1059	caddr_t data = pdata->dtpda_data;
1060
1061	if (lsdata->lsd_count >= g_nrecs)
1062		return (DTRACE_CONSUME_NEXT);
1063
1064	lsrec_fill(lsrec, edesc->dtepd_rec, edesc->dtepd_nrecs, data);
1065
1066	lsdata->lsd_next = (lsrec_t *)((uintptr_t)lsrec + g_recsize);
1067	lsdata->lsd_count++;
1068
1069	return (DTRACE_CONSUME_NEXT);
1070}
1071
1072static int
1073process_data(FILE *out, char *data)
1074{
1075	lsdata_t lsdata;
1076
1077	/* LINTED - alignment */
1078	lsdata.lsd_next = (lsrec_t *)data;
1079	lsdata.lsd_count = 0;
1080
1081	if (g_tracing) {
1082		if (dtrace_consume(g_dtp, out,
1083		    process_trace, NULL, &lsdata) != 0)
1084			dfail("failed to consume buffer");
1085
1086		return (lsdata.lsd_count);
1087	}
1088
1089	if (dtrace_aggregate_walk_keyvarsorted(g_dtp,
1090	    process_aggregate, &lsdata) != 0)
1091		dfail("failed to walk aggregate");
1092
1093	return (lsdata.lsd_count);
1094}
1095
1096/*ARGSUSED*/
1097static int
1098drophandler(const dtrace_dropdata_t *data, void *arg)
1099{
1100	g_dropped++;
1101	(void) fprintf(stderr, "lockstat: warning: %s", data->dtdda_msg);
1102	return (DTRACE_HANDLE_OK);
1103}
1104
1105int
1106main(int argc, char **argv)
1107{
1108	char *data_buf;
1109	lsrec_t *lsp, **current, **first, **sort_buf, **merge_buf;
1110	FILE *out = stdout;
1111	int c;
1112	pid_t child;
1113	int status;
1114	int i, j;
1115	hrtime_t duration;
1116	char *addrp, *offp, *sizep, *evp, *lastp, *p;
1117	uintptr_t addr;
1118	size_t size, off;
1119	int events_specified = 0;
1120	int exec_errno = 0;
1121	uint32_t event;
1122	char *filt = NULL, *ifilt = NULL;
1123	static uint64_t ev_count[LS_MAX_EVENTS + 1];
1124	static uint64_t ev_time[LS_MAX_EVENTS + 1];
1125	dtrace_optval_t aggsize;
1126	char aggstr[10];
1127	long ncpus;
1128	int dynvar = 0;
1129	int err;
1130
1131	if ((g_dtp = dtrace_open(DTRACE_VERSION, 0, &err)) == NULL) {
1132		fail(0, "cannot open dtrace library: %s",
1133		    dtrace_errmsg(NULL, err));
1134	}
1135
1136	if (dtrace_handle_drop(g_dtp, &drophandler, NULL) == -1)
1137		dfail("couldn't establish drop handler");
1138
1139	if (symtab_init() == -1)
1140		fail(1, "can't load kernel symbols");
1141
1142	g_nrecs = DEFAULT_NRECS;
1143
1144	while ((c = getopt(argc, argv, LOCKSTAT_OPTSTR)) != GETOPT_EOF) {
1145		switch (c) {
1146		case 'b':
1147			g_recsize = LS_BASIC;
1148			break;
1149
1150		case 't':
1151			g_recsize = LS_TIME;
1152			break;
1153
1154		case 'h':
1155			g_recsize = LS_HIST;
1156			break;
1157
1158		case 's':
1159			if (!isdigit(optarg[0]))
1160				usage();
1161			g_stkdepth = atoi(optarg);
1162			if (g_stkdepth > LS_MAX_STACK_DEPTH)
1163				fail(0, "max stack depth is %d",
1164				    LS_MAX_STACK_DEPTH);
1165			g_recsize = LS_STACK(g_stkdepth);
1166			break;
1167
1168		case 'n':
1169			if (!isdigit(optarg[0]))
1170				usage();
1171			g_nrecs = atoi(optarg);
1172			break;
1173
1174		case 'd':
1175			if (!isdigit(optarg[0]))
1176				usage();
1177			duration = atoll(optarg);
1178
1179			/*
1180			 * XXX -- durations really should be per event
1181			 * since the units are different, but it's hard
1182			 * to express this nicely in the interface.
1183			 * Not clear yet what the cleanest solution is.
1184			 */
1185			for (i = 0; i < LS_MAX_EVENTS; i++)
1186				if (g_event_info[i].ev_type != 'E')
1187					g_min_duration[i] = duration;
1188
1189			break;
1190
1191		case 'i':
1192			if (!isdigit(optarg[0]))
1193				usage();
1194			i = atoi(optarg);
1195			if (i <= 0)
1196				usage();
1197			if (i > MAX_HZ)
1198				fail(0, "max interrupt rate is %d Hz", MAX_HZ);
1199
1200			for (j = 0; j < LS_MAX_EVENTS; j++)
1201				if (strcmp(g_event_info[j].ev_desc,
1202				    "Profiling interrupt") == 0)
1203					break;
1204
1205			(void) sprintf(g_event_info[j].ev_name,
1206			    "profile:::profile-%d", i);
1207			break;
1208
1209		case 'l':
1210		case 'f':
1211			addrp = strtok(optarg, ",");
1212			sizep = strtok(NULL, ",");
1213			addrp = strtok(optarg, ",+");
1214			offp = strtok(NULL, ",");
1215
1216			size = sizep ? strtoul(sizep, NULL, 0) : 1;
1217			off = offp ? strtoul(offp, NULL, 0) : 0;
1218
1219			if (addrp[0] == '0') {
1220				addr = strtoul(addrp, NULL, 16) + off;
1221			} else {
1222				addr = sym_to_addr(addrp) + off;
1223				if (sizep == NULL)
1224					size = sym_size(addrp) - off;
1225				if (addr - off == 0)
1226					fail(0, "symbol '%s' not found", addrp);
1227				if (size == 0)
1228					size = 1;
1229			}
1230
1231
1232			if (c == 'l') {
1233				filter_add(&filt, "arg0", addr, size);
1234			} else {
1235				filter_add(&filt, "caller", addr, size);
1236				filter_add(&ifilt, "arg0", addr, size);
1237			}
1238			break;
1239
1240		case 'e':
1241			evp = strtok_r(optarg, ",", &lastp);
1242			while (evp) {
1243				int ev1, ev2;
1244				char *evp2;
1245
1246				(void) strtok(evp, "-");
1247				evp2 = strtok(NULL, "-");
1248				ev1 = atoi(evp);
1249				ev2 = evp2 ? atoi(evp2) : ev1;
1250				if ((uint_t)ev1 >= LS_MAX_EVENTS ||
1251				    (uint_t)ev2 >= LS_MAX_EVENTS || ev1 > ev2)
1252					fail(0, "-e events out of range");
1253				for (i = ev1; i <= ev2; i++)
1254					g_enabled[i] = 1;
1255				evp = strtok_r(NULL, ",", &lastp);
1256			}
1257			events_specified = 1;
1258			break;
1259
1260#ifdef illumos
1261		case 'c':
1262			g_cflag = 1;
1263			break;
1264#endif
1265
1266		case 'k':
1267			g_kflag = 1;
1268			break;
1269
1270		case 'w':
1271			g_wflag = 1;
1272			break;
1273
1274		case 'W':
1275			g_Wflag = 1;
1276			break;
1277
1278		case 'g':
1279			g_gflag = 1;
1280			break;
1281
1282		case 'C':
1283		case 'E':
1284		case 'H':
1285		case 'I':
1286			for (i = 0; i < LS_MAX_EVENTS; i++)
1287				if (g_event_info[i].ev_type == c)
1288					g_enabled[i] = 1;
1289			events_specified = 1;
1290			break;
1291
1292		case 'A':
1293			for (i = 0; i < LS_MAX_EVENTS; i++)
1294				if (strchr("CH", g_event_info[i].ev_type))
1295					g_enabled[i] = 1;
1296			events_specified = 1;
1297			break;
1298
1299		case 'T':
1300			g_tracing = 1;
1301			break;
1302
1303		case 'D':
1304			if (!isdigit(optarg[0]))
1305				usage();
1306			g_topn = atoi(optarg);
1307			break;
1308
1309		case 'R':
1310			g_rates = 1;
1311			break;
1312
1313		case 'p':
1314			g_pflag = 1;
1315			break;
1316
1317		case 'P':
1318			g_Pflag = 1;
1319			break;
1320
1321		case 'o':
1322			if ((out = fopen(optarg, "w")) == NULL)
1323				fail(1, "error opening file");
1324			break;
1325
1326		case 'V':
1327			g_Vflag = 1;
1328			break;
1329
1330		default:
1331			if (strchr(LOCKSTAT_OPTSTR, c) == NULL)
1332				usage();
1333		}
1334	}
1335
1336	if (filt != NULL) {
1337		predicate_add(&g_predicate, filt, NULL, 0);
1338		filter_destroy(&filt);
1339	}
1340
1341	if (ifilt != NULL) {
1342		predicate_add(&g_ipredicate, ifilt, NULL, 0);
1343		filter_destroy(&ifilt);
1344	}
1345
1346	if (g_recsize == 0) {
1347		if (g_gflag) {
1348			g_stkdepth = LS_MAX_STACK_DEPTH;
1349			g_recsize = LS_STACK(g_stkdepth);
1350		} else {
1351			g_recsize = LS_TIME;
1352		}
1353	}
1354
1355	if (g_gflag && g_recsize <= LS_STACK(0))
1356		fail(0, "'-g' requires at least '-s 1' data gathering");
1357
1358	/*
1359	 * Make sure the alignment is reasonable
1360	 */
1361	g_recsize = -(-g_recsize & -sizeof (uint64_t));
1362
1363	for (i = 0; i < LS_MAX_EVENTS; i++) {
1364		/*
1365		 * If no events were specified, enable -C.
1366		 */
1367		if (!events_specified && g_event_info[i].ev_type == 'C')
1368			g_enabled[i] = 1;
1369	}
1370
1371	for (i = 0; i < LS_MAX_EVENTS; i++) {
1372		if (!g_enabled[i])
1373			continue;
1374
1375		if (g_event_info[i].ev_acquire != NULL) {
1376			/*
1377			 * If we've enabled a hold event, we must explicitly
1378			 * allocate dynamic variable space.
1379			 */
1380			dynvar = 1;
1381		}
1382
1383		dprog_addevent(i);
1384	}
1385
1386	/*
1387	 * Make sure there are remaining arguments to specify a child command
1388	 * to execute.
1389	 */
1390	if (argc <= optind)
1391		usage();
1392
1393	if ((ncpus = sysconf(_SC_NPROCESSORS_ONLN)) == -1)
1394		dfail("couldn't determine number of online CPUs");
1395
1396	/*
1397	 * By default, we set our data buffer size to be the number of records
1398	 * multiplied by the size of the record, doubled to account for some
1399	 * DTrace slop and divided by the number of CPUs.  We silently clamp
1400	 * the aggregation size at both a minimum and a maximum to prevent
1401	 * absurdly low or high values.
1402	 */
1403	if ((aggsize = (g_nrecs * g_recsize * 2) / ncpus) < MIN_AGGSIZE)
1404		aggsize = MIN_AGGSIZE;
1405
1406	if (aggsize > MAX_AGGSIZE)
1407		aggsize = MAX_AGGSIZE;
1408
1409	(void) sprintf(aggstr, "%lld", (long long)aggsize);
1410
1411	if (!g_tracing) {
1412		if (dtrace_setopt(g_dtp, "bufsize", "4k") == -1)
1413			dfail("failed to set 'bufsize'");
1414
1415		if (dtrace_setopt(g_dtp, "aggsize", aggstr) == -1)
1416			dfail("failed to set 'aggsize'");
1417
1418		if (dynvar) {
1419			/*
1420			 * If we're using dynamic variables, we set our
1421			 * dynamic variable size to be one megabyte per CPU,
1422			 * with a hard-limit of 32 megabytes.  This may still
1423			 * be too small in some cases, but it can be tuned
1424			 * manually via -x if need be.
1425			 */
1426			(void) sprintf(aggstr, "%ldm", ncpus < 32 ? ncpus : 32);
1427
1428			if (dtrace_setopt(g_dtp, "dynvarsize", aggstr) == -1)
1429				dfail("failed to set 'dynvarsize'");
1430		}
1431	} else {
1432		if (dtrace_setopt(g_dtp, "bufsize", aggstr) == -1)
1433			dfail("failed to set 'bufsize'");
1434	}
1435
1436	if (dtrace_setopt(g_dtp, "statusrate", "10sec") == -1)
1437		dfail("failed to set 'statusrate'");
1438
1439	optind = 1;
1440	while ((c = getopt(argc, argv, LOCKSTAT_OPTSTR)) != GETOPT_EOF) {
1441		switch (c) {
1442		case 'x':
1443			if ((p = strchr(optarg, '=')) != NULL)
1444				*p++ = '\0';
1445
1446			if (dtrace_setopt(g_dtp, optarg, p) != 0)
1447				dfail("failed to set -x %s", optarg);
1448			break;
1449		}
1450	}
1451
1452	argc -= optind;
1453	argv += optind;
1454
1455	dprog_compile();
1456	status_init();
1457
1458	g_elapsed = -gethrtime();
1459
1460	/*
1461	 * Spawn the specified command and wait for it to complete.
1462	 */
1463	child = fork();
1464	if (child == -1)
1465		fail(1, "cannot fork");
1466	if (child == 0) {
1467		(void) dtrace_close(g_dtp);
1468		(void) execvp(argv[0], &argv[0]);
1469		exec_errno = errno;
1470		exit(127);
1471	}
1472
1473#ifdef illumos
1474	while (waitpid(child, &status, WEXITED) != child)
1475#else
1476	while (waitpid(child, &status, 0) != child)
1477#endif
1478		status_check();
1479
1480	g_elapsed += gethrtime();
1481
1482	if (WIFEXITED(status)) {
1483		if (WEXITSTATUS(status) != 0) {
1484			if (exec_errno != 0) {
1485				errno = exec_errno;
1486				fail(1, "could not execute %s", argv[0]);
1487			}
1488			(void) fprintf(stderr,
1489			    "lockstat: warning: %s exited with code %d\n",
1490			    argv[0], WEXITSTATUS(status));
1491		}
1492	} else {
1493		(void) fprintf(stderr,
1494		    "lockstat: warning: %s died on signal %d\n",
1495		    argv[0], WTERMSIG(status));
1496	}
1497
1498	if (dtrace_stop(g_dtp) == -1)
1499		dfail("failed to stop dtrace");
1500
1501	/*
1502	 * Before we read out the results, we need to allocate our buffer.
1503	 * If we're tracing, then we'll just use the precalculated size.  If
1504	 * we're not, then we'll take a snapshot of the aggregate, and walk
1505	 * it to count the number of records.
1506	 */
1507	if (!g_tracing) {
1508		if (dtrace_aggregate_snap(g_dtp) != 0)
1509			dfail("failed to snap aggregate");
1510
1511		g_nrecs = 0;
1512
1513		if (dtrace_aggregate_walk(g_dtp,
1514		    count_aggregate, &g_nrecs) != 0)
1515			dfail("failed to walk aggregate");
1516	}
1517
1518#ifdef illumos
1519	if ((data_buf = memalign(sizeof (uint64_t),
1520	    (g_nrecs + 1) * g_recsize)) == NULL)
1521#else
1522	if (posix_memalign((void **)&data_buf, sizeof (uint64_t),
1523	    (g_nrecs + 1) * g_recsize) )
1524#endif
1525		fail(1, "Memory allocation failed");
1526
1527	/*
1528	 * Read out the DTrace data.
1529	 */
1530	g_nrecs_used = process_data(out, data_buf);
1531
1532	if (g_nrecs_used > g_nrecs || g_dropped)
1533		(void) fprintf(stderr, "lockstat: warning: "
1534		    "ran out of data records (use -n for more)\n");
1535
1536	/* LINTED - alignment */
1537	for (i = 0, lsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++,
1538	    /* LINTED - alignment */
1539	    lsp = (lsrec_t *)((char *)lsp + g_recsize)) {
1540		ev_count[lsp->ls_event] += lsp->ls_count;
1541		ev_time[lsp->ls_event] += lsp->ls_time;
1542	}
1543
1544	/*
1545	 * If -g was specified, convert stacks into individual records.
1546	 */
1547	if (g_gflag) {
1548		lsrec_t *newlsp, *oldlsp;
1549
1550#ifdef illumos
1551		newlsp = memalign(sizeof (uint64_t),
1552		    g_nrecs_used * LS_TIME * (g_stkdepth + 1));
1553#else
1554		posix_memalign((void **)&newlsp, sizeof (uint64_t),
1555		    g_nrecs_used * LS_TIME * (g_stkdepth + 1));
1556#endif
1557		if (newlsp == NULL)
1558			fail(1, "Cannot allocate space for -g processing");
1559		lsp = newlsp;
1560		/* LINTED - alignment */
1561		for (i = 0, oldlsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++,
1562		    /* LINTED - alignment */
1563		    oldlsp = (lsrec_t *)((char *)oldlsp + g_recsize)) {
1564			int fr;
1565			int caller_in_stack = 0;
1566
1567			if (oldlsp->ls_count == 0)
1568				continue;
1569
1570			for (fr = 0; fr < g_stkdepth; fr++) {
1571				if (oldlsp->ls_stack[fr] == 0)
1572					break;
1573				if (oldlsp->ls_stack[fr] == oldlsp->ls_caller)
1574					caller_in_stack = 1;
1575				bcopy(oldlsp, lsp, LS_TIME);
1576				lsp->ls_caller = oldlsp->ls_stack[fr];
1577#ifndef illumos
1578				lsp->ls_lock = strdup(oldlsp->ls_lock);
1579#endif
1580				/* LINTED - alignment */
1581				lsp = (lsrec_t *)((char *)lsp + LS_TIME);
1582			}
1583			if (!caller_in_stack) {
1584				bcopy(oldlsp, lsp, LS_TIME);
1585				/* LINTED - alignment */
1586				lsp = (lsrec_t *)((char *)lsp + LS_TIME);
1587			}
1588#ifndef illumos
1589			free(oldlsp->ls_lock);
1590#endif
1591		}
1592		g_nrecs = g_nrecs_used =
1593		    ((uintptr_t)lsp - (uintptr_t)newlsp) / LS_TIME;
1594		g_recsize = LS_TIME;
1595		g_stkdepth = 0;
1596		free(data_buf);
1597		data_buf = (char *)newlsp;
1598	}
1599
1600	if ((sort_buf = calloc(2 * (g_nrecs + 1),
1601	    sizeof (void *))) == NULL)
1602		fail(1, "Sort buffer allocation failed");
1603	merge_buf = sort_buf + (g_nrecs + 1);
1604
1605	/*
1606	 * Build the sort buffer, discarding zero-count records along the way.
1607	 */
1608	/* LINTED - alignment */
1609	for (i = 0, lsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++,
1610	    /* LINTED - alignment */
1611	    lsp = (lsrec_t *)((char *)lsp + g_recsize)) {
1612		if (lsp->ls_count == 0)
1613			lsp->ls_event = LS_MAX_EVENTS;
1614		sort_buf[i] = lsp;
1615	}
1616
1617	if (g_nrecs_used == 0)
1618		exit(0);
1619
1620	/*
1621	 * Add a sentinel after the last record
1622	 */
1623	sort_buf[i] = lsp;
1624	lsp->ls_event = LS_MAX_EVENTS;
1625
1626	if (g_tracing) {
1627		report_trace(out, sort_buf);
1628		return (0);
1629	}
1630
1631	/*
1632	 * Application of -g may have resulted in multiple records
1633	 * with the same signature; coalesce them.
1634	 */
1635	if (g_gflag) {
1636		mergesort(lockcmp, sort_buf, merge_buf, g_nrecs_used);
1637		coalesce(lockcmp, sort_buf, g_nrecs_used);
1638	}
1639
1640	/*
1641	 * Coalesce locks within the same symbol if -c option specified.
1642	 * Coalesce PCs within the same function if -k option specified.
1643	 */
1644	if (g_cflag || g_kflag) {
1645		for (i = 0; i < g_nrecs_used; i++) {
1646			int fr;
1647			lsp = sort_buf[i];
1648#ifdef illumos
1649			if (g_cflag)
1650				coalesce_symbol(&lsp->ls_lock);
1651#endif
1652			if (g_kflag) {
1653				for (fr = 0; fr < g_stkdepth; fr++)
1654					coalesce_symbol(&lsp->ls_stack[fr]);
1655				coalesce_symbol(&lsp->ls_caller);
1656			}
1657		}
1658		mergesort(lockcmp, sort_buf, merge_buf, g_nrecs_used);
1659		coalesce(lockcmp, sort_buf, g_nrecs_used);
1660	}
1661
1662	/*
1663	 * Coalesce callers if -w option specified
1664	 */
1665	if (g_wflag) {
1666		mergesort(lock_and_count_cmp_anywhere,
1667		    sort_buf, merge_buf, g_nrecs_used);
1668		coalesce(lockcmp_anywhere, sort_buf, g_nrecs_used);
1669	}
1670
1671	/*
1672	 * Coalesce locks if -W option specified
1673	 */
1674	if (g_Wflag) {
1675		mergesort(site_and_count_cmp_anylock,
1676		    sort_buf, merge_buf, g_nrecs_used);
1677		coalesce(sitecmp_anylock, sort_buf, g_nrecs_used);
1678	}
1679
1680	/*
1681	 * Sort data by contention count (ls_count) or total time (ls_time),
1682	 * depending on g_Pflag.  Override g_Pflag if time wasn't measured.
1683	 */
1684	if (g_recsize < LS_TIME)
1685		g_Pflag = 0;
1686
1687	if (g_Pflag)
1688		mergesort(timecmp, sort_buf, merge_buf, g_nrecs_used);
1689	else
1690		mergesort(countcmp, sort_buf, merge_buf, g_nrecs_used);
1691
1692	/*
1693	 * Display data by event type
1694	 */
1695	first = &sort_buf[0];
1696	while ((event = (*first)->ls_event) < LS_MAX_EVENTS) {
1697		current = first;
1698		while ((lsp = *current)->ls_event == event)
1699			current++;
1700		report_stats(out, first, current - first, ev_count[event],
1701		    ev_time[event]);
1702		first = current;
1703	}
1704
1705#ifndef illumos
1706	/*
1707	 * Free lock name buffers
1708	 */
1709	for (i = 0, lsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++,
1710	    lsp = (lsrec_t *)((char *)lsp + g_recsize))
1711		free(lsp->ls_lock);
1712#endif
1713
1714	return (0);
1715}
1716
1717static char *
1718format_symbol(char *buf, uintptr_t addr, int show_size)
1719{
1720	uintptr_t symoff;
1721	char *symname;
1722	size_t symsize;
1723
1724	symname = addr_to_sym(addr, &symoff, &symsize);
1725
1726	if (show_size && symoff == 0)
1727		(void) sprintf(buf, "%s[%ld]", symname, (long)symsize);
1728	else if (symoff == 0)
1729		(void) sprintf(buf, "%s", symname);
1730	else if (symoff < 16 && bcmp(symname, "cpu[", 4) == 0)	/* CPU+PIL */
1731#ifdef illumos
1732		(void) sprintf(buf, "%s+%ld", symname, (long)symoff);
1733#else
1734		(void) sprintf(buf, "%s+%s", symname, g_pri_class[(int)symoff]);
1735#endif
1736	else if (symoff <= symsize || (symoff < 256 && addr != symoff))
1737		(void) sprintf(buf, "%s+0x%llx", symname,
1738		    (unsigned long long)symoff);
1739	else
1740		(void) sprintf(buf, "0x%llx", (unsigned long long)addr);
1741	return (buf);
1742}
1743
1744static void
1745report_stats(FILE *out, lsrec_t **sort_buf, size_t nrecs, uint64_t total_count,
1746	uint64_t total_time)
1747{
1748	uint32_t event = sort_buf[0]->ls_event;
1749	lsrec_t *lsp;
1750	double ptotal = 0.0;
1751	double percent;
1752	int i, j, fr;
1753	int displayed;
1754	int first_bin, last_bin, max_bin_count, total_bin_count;
1755	int rectype;
1756	char buf[256];
1757	char lhdr[80], chdr[80];
1758
1759	rectype = g_recsize;
1760
1761	if (g_topn == 0) {
1762		(void) fprintf(out, "%20llu %s\n",
1763		    g_rates == 0 ? total_count :
1764		    ((unsigned long long)total_count * NANOSEC) / g_elapsed,
1765		    g_event_info[event].ev_desc);
1766		return;
1767	}
1768
1769	(void) sprintf(lhdr, "%s%s",
1770	    g_Wflag ? "Hottest " : "", g_event_info[event].ev_lhdr);
1771	(void) sprintf(chdr, "%s%s",
1772	    g_wflag ? "Hottest " : "", "Caller");
1773
1774	if (!g_pflag)
1775		(void) fprintf(out,
1776		    "\n%s: %.0f events in %.3f seconds (%.0f events/sec)\n\n",
1777		    g_event_info[event].ev_desc, (double)total_count,
1778		    (double)g_elapsed / NANOSEC,
1779		    (double)total_count * NANOSEC / g_elapsed);
1780
1781	if (!g_pflag && rectype < LS_HIST) {
1782		(void) sprintf(buf, "%s", g_event_info[event].ev_units);
1783		(void) fprintf(out, "%5s %4s %4s %4s %8s %-22s %-24s\n",
1784		    g_rates ? "ops/s" : "Count",
1785		    g_gflag ? "genr" : "indv",
1786		    "cuml", "rcnt", rectype >= LS_TIME ? buf : "", lhdr, chdr);
1787		(void) fprintf(out, "---------------------------------"
1788		    "----------------------------------------------\n");
1789	}
1790
1791	displayed = 0;
1792	for (i = 0; i < nrecs; i++) {
1793		lsp = sort_buf[i];
1794
1795		if (displayed++ >= g_topn)
1796			break;
1797
1798		if (g_pflag) {
1799			int j;
1800
1801			(void) fprintf(out, "%u %u",
1802			    lsp->ls_event, lsp->ls_count);
1803#ifdef illumos
1804			(void) fprintf(out, " %s",
1805			    format_symbol(buf, lsp->ls_lock, g_cflag));
1806#else
1807			(void) fprintf(out, " %s", lsp->ls_lock);
1808#endif
1809			(void) fprintf(out, " %s",
1810			    format_symbol(buf, lsp->ls_caller, 0));
1811			(void) fprintf(out, " %f",
1812			    (double)lsp->ls_refcnt / lsp->ls_count);
1813			if (rectype >= LS_TIME)
1814				(void) fprintf(out, " %llu",
1815				    (unsigned long long)lsp->ls_time);
1816			if (rectype >= LS_HIST) {
1817				for (j = 0; j < 64; j++)
1818					(void) fprintf(out, " %u",
1819					    lsp->ls_hist[j]);
1820			}
1821			for (j = 0; j < LS_MAX_STACK_DEPTH; j++) {
1822				if (rectype <= LS_STACK(j) ||
1823				    lsp->ls_stack[j] == 0)
1824					break;
1825				(void) fprintf(out, " %s",
1826				    format_symbol(buf, lsp->ls_stack[j], 0));
1827			}
1828			(void) fprintf(out, "\n");
1829			continue;
1830		}
1831
1832		if (rectype >= LS_HIST) {
1833			(void) fprintf(out, "---------------------------------"
1834			    "----------------------------------------------\n");
1835			(void) sprintf(buf, "%s",
1836			    g_event_info[event].ev_units);
1837			(void) fprintf(out, "%5s %4s %4s %4s %8s %-22s %-24s\n",
1838			    g_rates ? "ops/s" : "Count",
1839			    g_gflag ? "genr" : "indv",
1840			    "cuml", "rcnt", buf, lhdr, chdr);
1841		}
1842
1843		if (g_Pflag && total_time != 0)
1844			percent = (lsp->ls_time * 100.00) / total_time;
1845		else
1846			percent = (lsp->ls_count * 100.00) / total_count;
1847
1848		ptotal += percent;
1849
1850		if (rectype >= LS_TIME)
1851			(void) sprintf(buf, "%llu",
1852			    (unsigned long long)(lsp->ls_time / lsp->ls_count));
1853		else
1854			buf[0] = '\0';
1855
1856		(void) fprintf(out, "%5llu ",
1857		    g_rates == 0 ? lsp->ls_count :
1858		    ((uint64_t)lsp->ls_count * NANOSEC) / g_elapsed);
1859
1860		(void) fprintf(out, "%3.0f%% ", percent);
1861
1862		if (g_gflag)
1863			(void) fprintf(out, "---- ");
1864		else
1865			(void) fprintf(out, "%3.0f%% ", ptotal);
1866
1867		(void) fprintf(out, "%4.2f %8s ",
1868		    (double)lsp->ls_refcnt / lsp->ls_count, buf);
1869
1870#ifdef illumos
1871		(void) fprintf(out, "%-22s ",
1872		    format_symbol(buf, lsp->ls_lock, g_cflag));
1873#else
1874		(void) fprintf(out, "%-22s ", lsp->ls_lock);
1875#endif
1876
1877		(void) fprintf(out, "%-24s\n",
1878		    format_symbol(buf, lsp->ls_caller, 0));
1879
1880		if (rectype < LS_HIST)
1881			continue;
1882
1883		(void) fprintf(out, "\n");
1884		(void) fprintf(out, "%10s %31s %-9s %-24s\n",
1885		    g_event_info[event].ev_units,
1886		    "------ Time Distribution ------",
1887		    g_rates ? "ops/s" : "count",
1888		    rectype > LS_STACK(0) ? "Stack" : "");
1889
1890		first_bin = 0;
1891		while (lsp->ls_hist[first_bin] == 0)
1892			first_bin++;
1893
1894		last_bin = 63;
1895		while (lsp->ls_hist[last_bin] == 0)
1896			last_bin--;
1897
1898		max_bin_count = 0;
1899		total_bin_count = 0;
1900		for (j = first_bin; j <= last_bin; j++) {
1901			total_bin_count += lsp->ls_hist[j];
1902			if (lsp->ls_hist[j] > max_bin_count)
1903				max_bin_count = lsp->ls_hist[j];
1904		}
1905
1906		/*
1907		 * If we went a few frames below the caller, ignore them
1908		 */
1909		for (fr = 3; fr > 0; fr--)
1910			if (lsp->ls_stack[fr] == lsp->ls_caller)
1911				break;
1912
1913		for (j = first_bin; j <= last_bin; j++) {
1914			uint_t depth = (lsp->ls_hist[j] * 30) / total_bin_count;
1915			(void) fprintf(out, "%10llu |%s%s %-9u ",
1916			    1ULL << j,
1917			    "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + 30 - depth,
1918			    "                              " + depth,
1919			    g_rates == 0 ? lsp->ls_hist[j] :
1920			    (uint_t)(((uint64_t)lsp->ls_hist[j] * NANOSEC) /
1921			    g_elapsed));
1922			if (rectype <= LS_STACK(fr) || lsp->ls_stack[fr] == 0) {
1923				(void) fprintf(out, "\n");
1924				continue;
1925			}
1926			(void) fprintf(out, "%-24s\n",
1927			    format_symbol(buf, lsp->ls_stack[fr], 0));
1928			fr++;
1929		}
1930		while (rectype > LS_STACK(fr) && lsp->ls_stack[fr] != 0) {
1931			(void) fprintf(out, "%15s %-36s %-24s\n", "", "",
1932			    format_symbol(buf, lsp->ls_stack[fr], 0));
1933			fr++;
1934		}
1935	}
1936
1937	if (!g_pflag)
1938		(void) fprintf(out, "---------------------------------"
1939		    "----------------------------------------------\n");
1940
1941	(void) fflush(out);
1942}
1943
1944static void
1945report_trace(FILE *out, lsrec_t **sort_buf)
1946{
1947	lsrec_t *lsp;
1948	int i, fr;
1949	int rectype;
1950	char buf[256], buf2[256];
1951
1952	rectype = g_recsize;
1953
1954	if (!g_pflag) {
1955		(void) fprintf(out, "%5s  %7s  %11s  %-24s  %-24s\n",
1956		    "Event", "Time", "Owner", "Lock", "Caller");
1957		(void) fprintf(out, "---------------------------------"
1958		    "----------------------------------------------\n");
1959	}
1960
1961	for (i = 0; i < g_nrecs_used; i++) {
1962
1963		lsp = sort_buf[i];
1964
1965		if (lsp->ls_event >= LS_MAX_EVENTS || lsp->ls_count == 0)
1966			continue;
1967
1968		(void) fprintf(out, "%2d  %10llu  %11p  %-24s  %-24s\n",
1969		    lsp->ls_event, (unsigned long long)lsp->ls_time,
1970		    (void *)lsp->ls_next,
1971#ifdef illumos
1972		    format_symbol(buf, lsp->ls_lock, 0),
1973#else
1974		    lsp->ls_lock,
1975#endif
1976		    format_symbol(buf2, lsp->ls_caller, 0));
1977
1978		if (rectype <= LS_STACK(0))
1979			continue;
1980
1981		/*
1982		 * If we went a few frames below the caller, ignore them
1983		 */
1984		for (fr = 3; fr > 0; fr--)
1985			if (lsp->ls_stack[fr] == lsp->ls_caller)
1986				break;
1987
1988		while (rectype > LS_STACK(fr) && lsp->ls_stack[fr] != 0) {
1989			(void) fprintf(out, "%53s  %-24s\n", "",
1990			    format_symbol(buf, lsp->ls_stack[fr], 0));
1991			fr++;
1992		}
1993		(void) fprintf(out, "\n");
1994	}
1995
1996	(void) fflush(out);
1997}
1998