1#!/usr/bin/ksh
2#
3# dexplorer - DTrace system explorer, runs a collection of scripts.
4#             Written using DTrace (Solaris 10 3/05).
5#
6# This program automatically runs a collection of DTrace scripts to examine
7# many areas of the system, and places the output in a meaningful directory
8# structure that is tar'd and gzip'd.
9#
10# $Id: dexplorer 3 2007-08-01 10:50:08Z brendan $
11#
# USAGE:	dexplorer [-qyDT] [-d outputdir] [-i interval]
13#
14#                  -q              # quiet mode
15#                  -y              # "yes", don't prompt for confirmation
16#                  -D              # don't delete output dir
17#                  -T              # don't create output tar.gz
18#                  -d outputdir    # output directory
19#                  -i interval     # interval for each sample
20#    eg,
21#               dexplorer          # default is 5 second samples
22#               dexplorer -y -i30  # no prompting, with 30 second samples
23#
24# SEE ALSO:	DTraceToolkit
25#
# THANKS: David Visser, et al. for the idea and encouragement.
27#
28# COPYRIGHT: Copyright (c) 2005 Brendan Gregg.
29#
30# CDDL HEADER START
31#
32#  The contents of this file are subject to the terms of the
33#  Common Development and Distribution License, Version 1.0 only
34#  (the "License").  You may not use this file except in compliance
35#  with the License.
36#
37#  You can obtain a copy of the license at Docs/cddl1.txt
38#  or http://www.opensolaris.org/os/licensing.
39#  See the License for the specific language governing permissions
40#  and limitations under the License.
41#
42# CDDL HEADER END
43#
44# CODE:
45#
#  This is currently a monolithic script, and while it contains only
#  a few dozen straightforward DTrace scripts I think it's desirable to
#  keep it that way. The scripts themselves have been designed to be very
#  generic (eg, switching on all sdt:::), and are aggregations to keep a
#  limit on the size of the output.
51#
52# Author: Brendan Gregg  [Sydney, Australia]
53#
54# 23-Jun-2005	Brendan Gregg	Created this.
55# 28-Jun-2005	   "      "	Last update.
56
#
#  Default variables
#
#  These are plain globals; the option parser may override interval, verbose,
#  prompt, tar, delete and root.
#
interval=5				# time of each sample, in seconds
verbose=1				# print screen output
prompt=1				# prompt before run
tar=1					# create tar file
delete=1				# delete output dirs
dtrace=/usr/sbin/dtrace			# path to dtrace
root=.					# default output dir
PATH=/usr/bin:/usr/sbin			# safe path (set before running uname/date)
dir="de_$(uname -n)_$(date +%Y%m%d%H%M)"	# OUTPUT FILENAME
samples=20				# max number of tests
current=0				# current sample
71
#
#  Process options
#
#  -d and -i take an argument; -q, -y, -D and -T are flags. -h, and any
#  option getopts does not recognise, print the usage text to stderr and
#  exit with status 1.
#
while getopts d:hi:qyDT name
do
	case $name in
	d)	root=$OPTARG ;;
	i)	interval=$OPTARG ;;
	q)	verbose=0 ;;
	y)	prompt=0 ;;
	D)	delete=0 ;;
	T)	tar=0 ;;
	h|*)	cat <<-END >&2
		USAGE: dexplorer [-qyDT] [-d outputdir] [-i interval]
		 
		        -q               # quiet mode
		        -y               # "yes", don't prompt for confirmation
		        -D               # don't delete output dir
		        -T               # don't create output tar.gz
		        -d outputdir     # output directory
		        -i interval      # interval for each sample
		   eg,
		       dexplorer         # default is 5 second samples
		       dexplorer -y -i30 # no prompting, with 30 second samples
		END
		exit 1
		;;
	esac
done
# Drop the parsed options so only operands (if any) remain in "$@".
shift $(( OPTIND - 1 ))
101
#
#  Confirm path
#
#  Unless -y was given, show the output directory and let the user either
#  accept it with a bare enter or type a replacement path.
#
if [[ "$prompt" == "1" ]] ; then
	if [[ "$root" == "." ]]; then
		print "Output dir will be the current dir ($PWD)."
	else
		print "Output dir will be $root"
	fi
	print -n "Hit enter for yes, or type path: "
	# -r keeps backslashes in a typed path literal; anything after the
	# first word is discarded into junk.
	read -r ans junk
	if [[ "$ans" == [yY] || "$ans" == [yY]es ]]; then
		print "WARNING: I didn't ask for \"$ans\"!"
		print "\tI was asking for the path or just enter."
		print "\tignoring \"$ans\"..."
		# Really ignore it: without this the "y"/"yes" answer would
		# be used as the output directory below.
		ans=""
	fi
	if [[ "$ans" != "" ]]; then
		root=$ans
		print "Output is now $root."
	fi
fi
123
#
#  Sanity checks
#
#  Each failure prints an ERRORn message and exits with status n.
#

# The interval is pasted into the D programs (tick-<interval>sec and a
# printf argument), so it must be a plain decimal integer: reject anything
# that contains a non-digit, not just letters.
if [[ "$interval" == *[!0-9]* ]]; then
	print "ERROR2: Invalid interval $interval.\n"
	print "Please use a number of seconds."
	exit 2
fi
# Drop leading zeros so the value is not read as octal by the shell
# arithmetic below or by the D compiler.
while [[ "$interval" == 0?* ]]; do
	interval=${interval#0}
done
# Empty or zero intervals are too short.
if [[ -z "$interval" ]] || (( interval < 1 )); then
	print "ERROR3: Length of interval $interval too short.\n"
	print "Minimum 1 second."
	exit 3
fi
if [[ ! -d "$root" ]]; then
	print "ERROR4: Output directory \"$root\" does not exist.\n"
	print "Perhaps try a mkdir first?"
	print "or use an existing dir, eg \"/tmp\""
	exit 4
fi
if [[ ! -w "$root" ]]; then
	print "ERROR5: Can't write to output directory \"$root\".\n"
	print "Are you logged in as root?"
	print "Perhaps try another directory, eg \"/tmp\""
	exit 5
fi
# Smoke test: a trivial D program that traces its pid and exits. No output
# means dtrace could not be run.
if [[ -z "$("$dtrace" -b1k -qn 'BEGIN { trace(pid); exit(0); }')" ]]; then
	print "ERROR6: Unable to run dtrace!\n"
	print "Perhaps this is a permission problem? Try running as root."
	exit 6
fi
154
# calculate total time
# total ends up as a display string: whole minutes when the run exceeds
# 180 seconds, otherwise seconds.
total=$(( interval * samples ))
if (( total > 180 )); then
	total="$(( total / 60 )) minutes"
else
	total="$total seconds"
fi
163
#
#  Common Functions
#

# decho - print a progress message unless quiet mode is on.
#
# Globals:   verbose (read) - non-zero enables output
# Arguments: the message words; they are joined with single spaces
# Outputs:   the message and a newline on stdout, printed verbatim
#            (backslashes and a leading "-" are not interpreted)
function decho {
	if (( verbose )); then printf '%s\n' "$*"; fi
}
# clean - filter command that deletes empty lines. It is expanded unquoted
# at each use so that it splits into "sed" and its script.
clean='sed /^$/d'

# header - D clauses prepended to every test. The BEGIN clause prints a
# timestamp, the utsname fields and the interval; the tick probe exits
# dtrace when the interval has elapsed. $interval is spliced in here, when
# the string is built.
header='dtrace:::BEGIN {
		printf("%Y, ", walltimestamp);
		printf("%s %s %s %s %s, ", `utsname.sysname, `utsname.nodename,
		    `utsname.release, `utsname.version, `utsname.machine);
		printf("%d secs\n",'"$interval"');
	}
	profile:::tick-'"$interval"'sec { exit(0); }
	'
# dstatus - print a progress line "NNN% message" and advance the counter.
#
# Globals:   verbose (read), samples (read), current (read and incremented),
#            percent (written)
# Arguments: the message words; they are joined with single spaces
# Outputs:   one line on stdout when verbose is non-zero
#
# Nothing is printed and current is left alone in quiet mode.
function dstatus {
	if (( verbose )); then
		(( percent = current * 100 / samples ))
		# The message goes through %s so that a "%" in the text is
		# printed literally rather than read as a format directive.
		printf "%3d%% %s\n" "$percent" "$*"
		(( current = current + 1 ))
	fi
}
186
187########################################
188#  START                               #
189########################################
190
#
#  Make dirs
#
#  Create $dir under $root and move into it. err accumulates the exit
#  statuses of the three steps. Paths are quoted so that whitespace cannot
#  split them (a two-argument cd means substitution in ksh).
#
err=0
cd "$root"
(( err = err + $? ))
mkdir "$dir"
(( err = err + $? ))
cd "$dir"
(( err = err + $? ))
# Double check that we really are inside the new directory before creating
# files relative to the current directory.
base1=${PWD##*/}
base2=${dir##*/}
if [[ "$base1" != "$base2" || "$err" != "0" ]]; then
	print "ERROR7: tried to mkdir $dir from $root, but something failed.\n"
	print "Check directories before rerunning."
	exit 7
fi
# One sub-directory per area examined; stop here if any cannot be made,
# since every later redirect depends on them.
mkdir Cpu Disk Mem Net Proc Info || {
	print "ERROR7: unable to create sample directories under $PWD.\n"
	print "Check directories before rerunning."
	exit 7
}
214
#
#  Create Log
#
#  Announce the run on screen (unless quiet) and start the log file in the
#  current directory with the version, uname output and start time.
#
decho "Starting dexplorer ver 0.76."
decho "Sample interval is $interval seconds. Total run is > $total."
( print "dexplorer ver 0.76\n------------------"
print -n "System: "
uname -a
print -n "Start:  "
date ) > log
225
#
#  Capture Standard Info
#
#  Snapshot the output of standard commands into Info/, one file each.
#
# Column list for ps -o, built in two steps to keep the lines short.
args='pid,ppid,uid,gid,projid,zoneid,pset,pri,nice,'
args=$args'class,vsz,rss,time,pcpu,pmem,args'
uname -a > Info/uname-a		# System
psrinfo -v > Info/psrinfo-v	# CPU
prtconf > Info/prtconf		# Memory (+ devices)
df -k > Info/df-k		# Disk
ifconfig -a > Info/ifconfig-a	# Network
ps -eo "$args" > Info/ps-o	# Processes
uptime > Info/uptime		# Load
238
#
#  Cpu Tests, DTrace
#
#  Every test follows the same shape: dstatus reports progress, dtrace runs
#  "$header" (banner plus timed exit) followed by the test's own clauses,
#  and the output is passed through $clean into a file under Cpu/.
#

# Count sdt interrupt-start firings, keyed by CPU.
dstatus "Interrupts by CPU..."
"$dtrace" -qn "$header"'
	sdt:::interrupt-start { @num[cpu] = count(); }
	dtrace:::END
	{
		printf("%-16s %16s\n", "CPU", "INTERRUPTS");
		printa("%-16d %@16d\n", @num);
	}
' | $clean > Cpu/interrupt_by_cpu

# Sum the vtimestamp delta between interrupt-start and interrupt-complete,
# keyed by driver name and instance taken from the dev_info in arg0.
dstatus "Interrupt times..."
"$dtrace" -qn "$header"'
	sdt:::interrupt-start { self->ts = vtimestamp; }
	sdt:::interrupt-complete
	/self->ts && arg0 != 0/
	{
		this->devi = (struct dev_info *)arg0;
		self->name = this->devi != 0 ?
		    stringof(`devnamesp[this->devi->devi_major].dn_name) : "?";
		this->inst = this->devi != 0 ? this->devi->devi_instance : 0;
		@num[self->name, this->inst] = sum(vtimestamp - self->ts);
		self->name = 0;
	}
	sdt:::interrupt-complete { self->ts = 0; }
	dtrace:::END
	{
		printf("%11s    %16s\n", "DEVICE", "TIME (ns)");
		printa("%10s%-3d %@16d\n", @num);
	}
' | $clean > Cpu/interrupt_time

# Sample disp_nrunnable of the current CPU on the profile-1000 probe and
# build a linear distribution per CPU.
dstatus "Dispatcher queue length by CPU..."
"$dtrace" -qn "$header"'
	profile:::profile-1000
	{
		this->num = curthread->t_cpu->cpu_disp->disp_nrunnable;
		@length[cpu] = lquantize(this->num, 0, 100, 1);
	}
	dtrace:::END { printa(" CPU %d%@d\n", @length); }
' | $clean > Cpu/dispqlen_by_cpu

# Count every sdt probe firing, keyed by probe function and name.
dstatus "Sdt counts..."
"$dtrace" -qn "$header"'
	sdt:::{ @num[probefunc, probename] = count(); }
	dtrace:::END
	{
		printf("%-32s %-32s %10s\n", "FUNC", "NAME", "COUNT");
		printa("%-32s %-32s %@10d\n", @num);
	}
' | $clean > Cpu/sdt_count
293
#
#  Disk Tests, DTrace
#
#  Each test runs "$header" plus its own clauses and writes the cleaned
#  output to a file under Disk/.
#

# Sum arg0 of the vminfo pgpgin probe, keyed by pid and command name.
dstatus "Pages paged in by process..."
"$dtrace" -qn "$header"'
	vminfo:::pgpgin { @pg[pid, execname] = sum(arg0); }
	dtrace:::END
	{
		printf("%6s %-16s %16s\n", "PID", "CMD", "PAGES");
		printa("%6d %-16s %@16d\n", @pg);
	}
' | $clean > Disk/pgpgin_by_process

# Count successful open* calls by pathname. The return value is cast to
# int before the comparison with -1, the same way the syscall error test
# in this script does it.
dstatus "Files opened successfully count..."
"$dtrace" -qn "$header"'
	syscall::open*:entry { self->file = copyinstr(arg0); self->ok = 1; }
	syscall::open*:return /self->ok && (int)arg0 != -1/
	{
		@num[self->file] = count();
	}
	syscall::open*:return /self->ok/ { self->file = 0; self->ok = 0; }
	dtrace:::END
	{
		printf("%-64s %8s\n", "FILE", "COUNT");
		printa("%-64s %@8d\n", @num);
	}
' | $clean > Disk/fileopen_count

# Power-of-two distribution of b_bcount at io:::start, keyed by pid and
# command name. There is no END clause, so dtrace prints the aggregation
# in its default format on exit.
dstatus "Disk I/O size distribution by process..."
"$dtrace" -qn "$header"'
	io:::start { @size[pid, execname] = quantize(args[0]->b_bcount); }
' | $clean > Disk/sizedist_by_process
327
#
#  Mem Tests, DTrace
#
#  Each test runs "$header" plus its own clauses and writes the cleaned
#  output to a file under Mem/.
#

# Sum arg0 of the vminfo as_fault probe, keyed by pid and command name.
dstatus "Minor faults by process..."
"$dtrace" -qn "$header"'
	vminfo:::as_fault { @mem[pid, execname] = sum(arg0); }
	dtrace:::END
	{
		printf("%6s %-16s %16s\n", "PID", "CMD", "MINFAULTS");
		printa("%6d %-16s %@16d\n", @mem);
	}
' | $clean > Mem/minf_by_process


# Sum arg0 of every vminfo probe, keyed by pid, command name and probe name.
dstatus "Vminfo data by process..."
"$dtrace" -qn "$header"'
	vminfo::: { @data[pid, execname, probename] = sum(arg0); }
	dtrace:::END
	{
		printf("%6s %-16s %-16s %16s\n",
		    "PID", "CMD", "STATISTIC", "VALUE");
		printa("%6d %-16s %-16s %@16d\n", @data);
	}
' | $clean > Mem/vminfo_by_process
353
#
#  Net Tests, DTrace
#
#  Each test runs "$header" plus its own clauses and writes the cleaned
#  output to a file under Net/.
#

# Sum arg0 of every mib probe, keyed by probe name.
dstatus "Mib data by mib statistic..."
"$dtrace" -qn "$header"'
	mib::: { @data[probename] = sum(arg0); }
	dtrace:::END
	{
		printf("%-32s %16s\n", "STATISTIC", "VALUE");
		printa("%-32s %@16d\n", @data);
	}
' | $clean > Net/mib_data

# Sum msgdsize() of the second argument to tcp_output (fbt, module ip),
# keyed by pid and command name.
dstatus "TCP write bytes by process..."
"$dtrace" -qn "$header"'
	fbt:ip:tcp_output:entry
	{
		this->size = msgdsize(args[1]);
		@size[pid, execname] = sum(this->size);
	}
	dtrace:::END
	{
		printf("%6s %-16s %12s\n", "PID", "CMD", "BYTES");
		printa("%6d %-16s %@12d\n", @size);
	}
' | $clean > Net/tcpw_by_process
381
#
#  Proc Tests, DTrace
#
#  Each test runs "$header" plus its own clauses and writes the cleaned
#  output to a file under Proc/.
#

# Count profile-1000 firings, keyed by pid and process arguments.
dstatus "Sample process @ 1000 Hz..."
"$dtrace" -qn "$header"'
	profile:::profile-1000
	{
		@num[pid, curpsinfo->pr_psargs] = count();
	}
	dtrace:::END
	{
		printf("%6s %12s %s\n", "PID", "SAMPLES", "ARGS");
		printa("%6d %@12d %S\n", @num);
	}
' | $clean > Proc/sample_process

# Count system call entries, keyed by pid, command name and syscall.
dstatus "Syscall count by process..."
"$dtrace" -qn "$header"'
	syscall:::entry { @num[pid, execname, probefunc] = count(); }
	dtrace:::END
	{
		printf("%6s %-24s %-24s %8s\n",
		    "PID", "CMD", "SYSCALL", "COUNT");
		printa("%6d %-24s %-24s %@8d\n", @num);
	}
' | $clean > Proc/syscall_by_process

# Count system call entries, keyed by syscall only.
dstatus "Syscall count by syscall..."
"$dtrace" -qn "$header"'
	syscall:::entry { @num[probefunc] = count(); }
	dtrace:::END
	{
		printf("%-32s %16s\n", "SYSCALL", "COUNT");
		printa("%-32s %@16d\n", @num);
	}
' | $clean > Proc/syscall_count

# Sum arg0 of the sysinfo readch probe, keyed by pid and command name.
dstatus "Read bytes by process..."
"$dtrace" -qn "$header"'
	sysinfo:::readch { @bytes[pid, execname] = sum(arg0); }
	dtrace:::END
	{
		printf("%6s %-16s %16s\n", "PID", "CMD", "BYTES");
		printa("%6d %-16s %@16d\n", @bytes);
	}
' | $clean > Proc/readb_by_process

# Sum arg0 of the sysinfo writech probe, keyed by pid and command name.
dstatus "Write bytes by process..."
"$dtrace" -qn "$header"'
	sysinfo:::writech { @bytes[pid, execname] = sum(arg0); }
	dtrace:::END
	{
		printf("%6s %-16s %16s\n", "PID", "CMD", "BYTES");
		printa("%6d %-16s %@16d\n", @bytes);
	}
' | $clean > Proc/writeb_by_process

# Sum arg0 of every sysinfo probe, keyed by pid, command name and probe name.
dstatus "Sysinfo counts by process..."
"$dtrace" -qn "$header"'
	sysinfo::: { @num[pid, execname, probename] = sum(arg0); }
	dtrace:::END
	{
		printf("%6s %-16s %-16s %16s\n",
		    "PID", "CMD", "STATISTIC", "COUNT");
		printa("%6d %-16s %-16s %@16d\n", @num);
	}
' | $clean > Proc/sysinfo_by_process

# Count exec-success firings, keyed by pid, ppid and process arguments.
dstatus "New process counts with arguments..."
"$dtrace" -qn "$header"'
	proc:::exec-success
	{
		@num[pid, ppid, curpsinfo->pr_psargs] = count();
	}
	dtrace:::END
	{
		printf("%6s %6s %8s %s\n", "PID", "PPID", "COUNT", "ARGS");
		printa("%6d %6d %@8d %S\n", @num);
	}
' | $clean > Proc/newprocess_count

# Count signal-send firings, keyed by sending command, signal number
# (args[2]) and receiving command (args[1]->pr_fname).
dstatus "Signal counts..."
"$dtrace" -qn "$header"'
	proc:::signal-send {
		@num[execname,args[2],stringof(args[1]->pr_fname)] = count();
	}
	dtrace:::END
	{
		printf("%-16s %-8s %-16s %8s\n",
		    "FROM", "SIG", "TO", "COUNT");
		printa("%-16s %-8d %-16s %@8d\n", @num);
	}
' | $clean > Proc/signal_count

# Count system calls returning -1, keyed by pid, command name, syscall
# and errno.
dstatus "Syscall error counts..."
"$dtrace" -qn "$header"'
	syscall:::return /(int)arg0 == -1/
	{
		@num[pid, execname, probefunc, errno] = count();
	}
	dtrace:::END
	{
		printf("%6s %-16s %-32s %-6s %8s\n",
		    "PID", "CMD", "SYSCALL", "ERRNO", "COUNT");
		printa("%6d %-16s %-32s %-6d %@8d\n", @num);
	}
' | $clean > Proc/syscall_errors
490
491
###########
#  Done
#
#  Finish the log, optionally archive the output directory as
#  $dir.tar.gz, and optionally remove the directory afterwards.
#
( print -n "End:    "
date ) >> log
decho "100% Done."

# archived is set only when both tar and gzip succeed, so that a failed
# archive never leads to the collected data being deleted below.
archived=0
if (( tar )); then
	cd ..
	if tar cf "$dir.tar" "$dir" && gzip "$dir.tar"; then
		archived=1
		decho "File is $dir.tar.gz"
	else
		print "ERROR8: unable to create $dir.tar.gz, keeping $dir." >&2
	fi
fi
if (( delete && archived )); then
	# this could be all an "rm -r $dir", but since it will be run
	# as root on production servers - let's be deliberately cautious
	# and remove only the files this script created, by name. Nothing
	# is removed if we cannot get into the directory first.
	if cd "$dir"; then
		rm Cpu/interrupt_by_cpu
		rm Cpu/interrupt_time
		rm Cpu/dispqlen_by_cpu
		rm Cpu/sdt_count
		rm Disk/pgpgin_by_process
		rm Disk/fileopen_count
		rm Disk/sizedist_by_process
		rm Mem/minf_by_process
		rm Mem/vminfo_by_process
		rm Net/mib_data
		rm Net/tcpw_by_process
		rm Proc/sample_process
		rm Proc/syscall_by_process
		rm Proc/syscall_count
		rm Proc/readb_by_process
		rm Proc/writeb_by_process
		rm Proc/sysinfo_by_process
		rm Proc/newprocess_count
		rm Proc/signal_count
		rm Proc/syscall_errors
		rmdir Cpu
		rmdir Disk
		rmdir Mem
		rmdir Net
		rmdir Proc
		rm Info/uname-a
		rm Info/psrinfo-v
		rm Info/prtconf
		rm Info/df-k
		rm Info/ifconfig-a
		rm Info/ps-o
		rm Info/uptime
		rmdir Info
		rm log
		cd ..
		rmdir "$dir"
	fi
else
	decho "Directory is $dir"
fi
547
548