1#!/usr/bin/ksh
2#
3# dexplorer - DTrace system explorer, runs a collection of scripts.
4#             Written using DTrace (Solaris 10 3/05).
5#
6# This program automatically runs a collection of DTrace scripts to examine
7# many areas of the system, and places the output in a meaningful directory
8# structure that is tar'd and gzip'd.
9#
10# 28-Jun-2005, ver 0.76		(check for newer versions)
11#
# USAGE:	dexplorer [-qyDT] [-d outputdir] [-i interval]
13#
14#                  -q              # quiet mode
15#                  -y              # "yes", don't prompt for confirmation
16#                  -D              # don't delete output dir
17#                  -T              # don't create output tar.gz
18#                  -d outputdir    # output directory
19#                  -i interval     # interval for each sample
20#    eg,
21#               dexplorer          # default is 5 second samples
22#               dexplorer -y -i30  # no prompting, with 30 second samples
23#
24# SEE ALSO:	DTraceToolkit
25#
# THANKS: David Visser, et al. for the idea and encouragement.
27#
28# COPYRIGHT: Copyright (c) 2005 Brendan Gregg.
29#
30# CDDL HEADER START
31#
32#  The contents of this file are subject to the terms of the
33#  Common Development and Distribution License, Version 1.0 only
34#  (the "License").  You may not use this file except in compliance
35#  with the License.
36#
37#  You can obtain a copy of the license at Docs/cddl1.txt
38#  or http://www.opensolaris.org/os/licensing.
39#  See the License for the specific language governing permissions
40#  and limitations under the License.
41#
42# CDDL HEADER END
43#
44# CODE:
45#
#  This is currently a monolithic script, and while it contains only
#  a few dozen straightforward DTrace scripts I think it's desirable to
#  keep it that way. The scripts themselves have been designed to be very
#  generic (eg, switching on all sdt:::), and are aggregations to keep a
#  limit on the size of the output.
51#
52# Author: Brendan Gregg  [Sydney, Australia]
53#
54# 23-Jun-2005  Brendan Gregg   Created this.
55
56#
57#  Default variables
58#
interval=5				# seconds each DTrace sample runs for (-i)
verbose=1				# 1 = print progress to the screen (-q clears)
prompt=1				# 1 = confirm output dir before running (-y clears)
tar=1					# 1 = create a tar.gz of the output (-T clears)
delete=1				# 1 = delete output dir once archived (-D clears)
dtrace=/usr/sbin/dtrace			# path to dtrace
root=.					# directory the output dir is created in (-d)
PATH=/usr/bin:/usr/sbin			# safe path; must be set before uname/date below
dir=de_$(uname -n)_$(date +%Y%m%d%H%M)	# OUTPUT FILENAME: de_<nodename>_<YYYYMMDDhhmm>
samples=20				# number of tests; drives the % and total-time display
current=0				# tests started so far (advanced by dstatus)
70
71#
72#  Process options
73#
# Walk the command line: each recognised flag overrides one of the defaults
# set above. -h, and anything getopts rejects (it then reports "?"), prints
# the usage text on stderr and exits with status 1.
while getopts d:hi:qyDT opt
do
	case $opt in
	d)	root=$OPTARG ;;
	i)	interval=$OPTARG ;;
	q)	verbose=0 ;;
	y)	prompt=0 ;;
	D)	delete=0 ;;
	T)	tar=0 ;;
	*)	cat <<-END >&2
		USAGE: dexplorer [-qyDT] [-d outputdir] [-i interval]
		 
		        -q               # quiet mode
		        -y               # "yes", don't prompt for confirmation
		        -D               # don't delete output dir
		        -T               # don't create output tar.gz
		        -d outputdir     # output directory
		        -i interval      # interval for each sample
		   eg,
		       dexplorer         # default is 5 second samples
		       dexplorer -y -i30 # no prompting, with 30 second samples
		END
		exit 1
		;;
	esac
done
# Drop the options that were consumed, leaving any operands in "$@".
(( consumed = OPTIND - 1 ))
shift $consumed
100
101#
102#  Confirm path
103#
# confirm_root - show the output directory and let the user replace it.
#
# Reads one line from stdin:
#   empty line      keep $root as it is
#   y, Y, yes, Yes  warn that a path was expected and keep $root (the answer
#                   really is ignored; it used to be stored in $root anyway,
#                   which then failed the directory check). A directory that
#                   is literally named "y" can still be given as "./y".
#   anything else   the first word becomes the new $root
# read is used without -r on purpose so that a backslash can still escape a
# space in the typed path.
function confirm_root {
	typeset ans junk

	if [[ "$root" == "." ]]; then
		print "Output dir will be the current dir ($PWD)."
	else
		print "Output dir will be $root"
	fi
	print -n "Hit enter for yes, or type path: "
	read ans junk
	if [[ "$ans" == [yY] || "$ans" == [yY]es ]]; then
		print "WARNING: I didn't ask for \"$ans\"!"
		print "\tI was asking for the path or just enter."
		print "\tignoring \"$ans\"..."
	elif [[ "$ans" != "" ]]; then
		root=$ans
		print "Output is now $root."
	fi
}

if [[ "$prompt" == "1" ]]; then
	confirm_root
fi
122
123#
124#  Sanity checks
125#
# The interval is spliced into the D probe name "tick-<interval>sec" and is
# used in shell arithmetic, so it has to be a whole number of seconds.
# Rejecting every non-digit also catches "5.5", "-3" and stray punctuation,
# not just letters. Diagnostics go to stderr, like the usage text.
if [[ "$interval" == *[!0-9]* ]]; then
	print -u2 "ERROR2: Invalid interval $interval.\n"
	print -u2 "Please use a number of seconds."
	exit 2
fi
# Only digits remain possible here, so "no digit 1-9" means the value is
# empty or zero.
if [[ "$interval" != *[1-9]* ]]; then
	print -u2 "ERROR3: Length of interval $interval too short.\n"
	print -u2 "Minimum 1 second."
	exit 3
fi
if [[ ! -d "$root" ]]; then
	print -u2 "ERROR4: Output directory \"$root\" does not exist.\n"
	print -u2 "Perhaps try a mkdir first?"
	print -u2 "or use an existing dir, eg \"/tmp\""
	exit 4
fi
if [[ ! -w "$root" ]]; then
	print -u2 "ERROR5: Can't write to output directory \"$root\".\n"
	print -u2 "Are you logged in as root?"
	print -u2 "Perhaps try another directory, eg \"/tmp\""
	exit 5
fi
# Smoke test: a trivial D program that prints its pid. No output at all
# means dtrace could not be run.
if [[ -z "$($dtrace -b1k -qn 'BEGIN { trace(pid); exit(0); }')" ]]; then
	print -u2 "ERROR6: Unable to run dtrace!\n"
	print -u2 "Perhaps this is a permission problem? Try running as root."
	exit 6
fi
153
154# calculate total time
# Minimum run time for display: one interval per test. Shown in seconds up
# to 180, otherwise in whole minutes (integer division).
(( run_secs = interval * samples ))
if (( run_secs <= 180 )); then
	total="$run_secs seconds"
else
	total="$(( run_secs / 60 )) minutes"
fi
162
163#
164#  Common Functions
165#
# decho - print a message on stdout unless quiet mode (-q) cleared $verbose.
#
# All arguments are joined with spaces and written as one line. The text is
# printed literally: printf '%s\n' is used instead of print so that a
# backslash or a leading "-" in the message is not interpreted.
function decho {
	if (( verbose )); then
		printf '%s\n' "$*"
	fi
}
# Output filter for every dtrace pipeline: delete empty lines. It is meant
# to be expanded unquoted ($clean) so it splits into "sed" and its script.
clean='sed /^$/d'
# D clauses placed in front of every test program. BEGIN prints one line
# with the wall-clock time, the utsname fields and the sample length; the
# tick probe ends the run after $interval seconds.
header='dtrace:::BEGIN {
		printf("%Y, ", walltimestamp);
		printf("%s %s %s %s %s, ", `utsname.sysname, `utsname.nodename,
		    `utsname.release, `utsname.version, `utsname.machine);
		printf("%d secs\n",'"$interval"');
	}
	profile:::tick-'"$interval"'sec { exit(0); }
	'
# dstatus - print a progress line for the test that is about to start.
#
# Writes "<percent>% <message>" to stdout, where percent is
# current * 100 / samples, then advances $current. In quiet mode
# ($verbose is 0) nothing is printed and $current is left alone.
#
# The message is passed to printf as data (%s), not as part of the format
# string, so a "%" or backslash in the text is printed as-is.
function dstatus {
	typeset percent

	if (( verbose )); then
		(( percent = current * 100 / samples ))
		printf '%3d%% %s\n' "$percent" "$*"
		(( current = current + 1 ))
	fi
}
185
186########################################
187#  START                               #
188########################################
189
190#
191#  Make dirs
192#
# Create $root/$dir with one subdirectory per category and leave the shell
# inside $dir; every output path used later is relative to it.
# The steps are chained with && so that a failed cd stops the chain before
# anything is created in the wrong place. Paths are quoted so a directory
# name containing spaces is passed as one argument.
err=0
cd "$root" && mkdir "$dir" && cd "$dir" &&
    mkdir Cpu Disk Mem Net Proc Info || err=1
# Cross-check: the last component of the current directory must be $dir.
base1=${PWD##*/}
base2=${dir##*/}
if [[ "$base1" != "$base2" || "$err" != "0" ]]; then
	print -u2 "ERROR7: tried to mkdir $dir from $root, but something failed.\n"
	print -u2 "Check directories before rerunning."
	exit 7
fi
213
214#
215#  Create Log
216#
# Announce the run on screen (unless quiet), then start the log file with
# the version banner, the system identification and the start time.
decho "Starting dexplorer ver 0.76."
decho "Sample interval is $interval seconds. Total run is > $total."
{
	print "dexplorer ver 0.76\n------------------"
	print -n "System: "
	uname -a
	print -n "Start:  "
	date
} > log
224
225#
226#  Capture Standard Info
227#
# Snapshot of the standard system tools, one file per command under Info/.
# $args is the column list handed to "ps -o".
args='pid,ppid,uid,gid,projid,zoneid,pset,pri,nice,class,vsz,rss,time,pcpu,pmem,args'

uname -a	> Info/uname-a		# System
psrinfo -v	> Info/psrinfo-v	# CPU
prtconf		> Info/prtconf		# Memory (+ devices)
df -k		> Info/df-k		# Disk
ifconfig -a	> Info/ifconfig-a	# Network
ps -eo $args	> Info/ps-o		# Processes
uptime		> Info/uptime		# Load
237
238#
239#  Cpu Tests, DTrace
240#
241
# dsample - run one DTrace test and store its output.
#
#   $1  status text shown on screen through dstatus
#   $2  output file, relative to the output directory
#   $3  D clauses for this test; they are appended to $header, which
#       supplies the BEGIN banner and the tick probe that ends the run
#
# The dtrace output is passed through $clean before it is written.
function dsample {
	dstatus "$1"
	$dtrace -qn "$header$3" | $clean > "$2"
}

# Count of sdt interrupt-start firings, keyed by CPU.
dsample "Interrupts by CPU..." Cpu/interrupt_by_cpu '
	sdt:::interrupt-start { @num[cpu] = count(); }
	dtrace:::END
	{ 
		printf("%-16s %16s\n", "CPU", "INTERRUPTS");
		printa("%-16d %@16d\n", @num);
	}
'

# vtimestamp delta between interrupt-start and interrupt-complete, summed
# by driver name and instance.
dsample "Interrupt times..." Cpu/interrupt_time '
	sdt:::interrupt-start { self->ts = vtimestamp; }
	sdt:::interrupt-complete
	/self->ts && arg0 != 0/
	{
		this->devi = (struct dev_info *)arg0;
		self->name = this->devi != 0 ?
		    stringof(`devnamesp[this->devi->devi_major].dn_name) : "?";
		this->inst = this->devi != 0 ? this->devi->devi_instance : 0;
		@num[self->name, this->inst] = sum(vtimestamp - self->ts);
		self->name = 0;
	}
	sdt:::interrupt-complete { self->ts = 0; }
	dtrace:::END
	{ 
		printf("%11s    %16s\n", "DEVICE", "TIME (ns)");
		printa("%10s%-3d %@16d\n", @num);
	}
'

# disp_nrunnable of the current CPU, sampled by profile-1000 and shown as a
# linear distribution per CPU.
dsample "Dispatcher queue length by CPU..." Cpu/dispqlen_by_cpu '
	profile:::profile-1000
	{
		this->num = curthread->t_cpu->cpu_disp->disp_nrunnable;
		@length[cpu] = lquantize(this->num, 0, 100, 1);
	}
	dtrace:::END { printa(" CPU %d%@d\n", @length); }
'

# Count of every sdt probe firing, keyed by probe function and name.
dsample "Sdt counts..." Cpu/sdt_count '
	sdt:::{ @num[probefunc, probename] = count(); }
	dtrace:::END
	{ 
		printf("%-32s %-32s %10s\n", "FUNC", "NAME", "COUNT");
		printa("%-32s %-32s %@10d\n", @num);
	}
'

#
#  Disk Tests, DTrace
#

# Sum of vminfo pgpgin arg0, keyed by pid and command name.
dsample "Pages paged in by process..." Disk/pgpgin_by_process '
	vminfo:::pgpgin { @pg[pid, execname] = sum(arg0); }
	dtrace:::END
	{ 
		printf("%6s %-16s %16s\n", "PID", "CMD", "PAGES");
		printa("%6d %-16s %@16d\n", @pg);
	}
'

# Count of open* calls that did not return -1, keyed by the path argument.
dsample "Files opened successfully count..." Disk/fileopen_count '
	syscall::open*:entry { self->file = copyinstr(arg0); self->ok = 1; }
	syscall::open*:return /self->ok && arg0 != -1/ 
	{ 
		@num[self->file] = count();
	}
	syscall::open*:return /self->ok/ { self->file = 0; self->ok = 0; }
	dtrace:::END
	{ 
		printf("%-64s %8s\n", "FILE", "COUNT");
		printa("%-64s %@8d\n", @num);
	}
'

# Power-of-two distribution of b_bcount at io:::start, per process. There is
# no END clause: the aggregation is printed by dtrace itself on exit.
dsample "Disk I/O size distribution by process..." Disk/sizedist_by_process '
	io:::start { @size[pid, execname] = quantize(args[0]->b_bcount); }
'

#
#  Mem Tests, DTrace
#

# Sum of vminfo as_fault arg0, keyed by pid and command name.
dsample "Minor faults by process..." Mem/minf_by_process '
	vminfo:::as_fault { @mem[pid, execname] = sum(arg0); }
	dtrace:::END
	{ 
		printf("%6s %-16s %16s\n", "PID", "CMD", "MINFAULTS");
		printa("%6d %-16s %@16d\n", @mem);
	}
'

# Sum of arg0 for every vminfo probe, keyed by pid, command and probe name.
dsample "Vminfo data by process..." Mem/vminfo_by_process '
	vminfo::: { @data[pid, execname, probename] = sum(arg0); }
	dtrace:::END
	{ 
		printf("%6s %-16s %-16s %16s\n",
		    "PID", "CMD", "STATISTIC", "VALUE");
		printa("%6d %-16s %-16s %@16d\n", @data);
	}
'

#
#  Net Tests, DTrace
#

# Sum of arg0 for every mib probe, keyed by probe name.
dsample "Mib data by mib statistic..." Net/mib_data '
	mib::: { @data[probename] = sum(arg0); }
	dtrace:::END
	{ 
		printf("%-32s %16s\n", "STATISTIC", "VALUE");
		printa("%-32s %@16d\n", @data);
	}
'

# Sum of msgdsize(args[1]) at tcp_output entry, keyed by pid and command.
dsample "TCP write bytes by process..." Net/tcpw_by_process '
	fbt:ip:tcp_output:entry
	{
		this->size = msgdsize(args[1]);
		@size[pid, execname] = sum(this->size);
	}
	dtrace:::END
	{ 
		printf("%6s %-16s %12s\n", "PID", "CMD", "BYTES");
		printa("%6d %-16s %@12d\n", @size);
	}
'

#
#  Proc Tests, DTrace
#

# Count of profile-1000 samples, keyed by pid and process arguments.
dsample "Sample process @ 1000 Hz..." Proc/sample_process '
	profile:::profile-1000
	{
		@num[pid, curpsinfo->pr_psargs] = count();
	}
	dtrace:::END
	{ 
		printf("%6s %12s %s\n", "PID", "SAMPLES", "ARGS");
		printa("%6d %@12d %S\n", @num);
	}
'

# Count of system call entries, keyed by pid, command and syscall name.
dsample "Syscall count by process..." Proc/syscall_by_process '
	syscall:::entry { @num[pid, execname, probefunc] = count(); }
	dtrace:::END
	{ 
		printf("%6s %-24s %-24s %8s\n",
		    "PID", "CMD", "SYSCALL", "COUNT");
		printa("%6d %-24s %-24s %@8d\n", @num);
	}
'

# Count of system call entries, keyed by syscall name only.
dsample "Syscall count by syscall..." Proc/syscall_count '
	syscall:::entry { @num[probefunc] = count(); }
	dtrace:::END
	{ 
		printf("%-32s %16s\n", "SYSCALL", "COUNT");
		printa("%-32s %@16d\n", @num);
	}
'

# Sum of sysinfo readch arg0, keyed by pid and command name.
dsample "Read bytes by process..." Proc/readb_by_process '
	sysinfo:::readch { @bytes[pid, execname] = sum(arg0); }
	dtrace:::END
	{ 
		printf("%6s %-16s %16s\n", "PID", "CMD", "BYTES");
		printa("%6d %-16s %@16d\n", @bytes);
	}
'

# Sum of sysinfo writech arg0, keyed by pid and command name.
dsample "Write bytes by process..." Proc/writeb_by_process '
	sysinfo:::writech { @bytes[pid, execname] = sum(arg0); }
	dtrace:::END
	{ 
		printf("%6s %-16s %16s\n", "PID", "CMD", "BYTES");
		printa("%6d %-16s %@16d\n", @bytes);
	}
'

# Sum of arg0 for every sysinfo probe, keyed by pid, command and probe name.
dsample "Sysinfo counts by process..." Proc/sysinfo_by_process '
	sysinfo::: { @num[pid, execname, probename] = sum(arg0); }
	dtrace:::END
	{ 
		printf("%6s %-16s %-16s %16s\n", 
		    "PID", "CMD", "STATISTIC", "COUNT");
		printa("%6d %-16s %-16s %@16d\n", @num);
	}
'

# Count of exec-success firings, keyed by pid, ppid and process arguments.
dsample "New process counts with arguments..." Proc/newprocess_count '
	proc:::exec-success
	{
		@num[pid, ppid, curpsinfo->pr_psargs] = count();
	}
	dtrace:::END
	{ 
		printf("%6s %6s %8s %s\n", "PID", "PPID", "COUNT", "ARGS");
		printa("%6d %6d %@8d %S\n", @num);
	}
'

# Count of signal-send firings, keyed by sender command, args[2] (printed
# as SIG) and the pr_fname of args[1] (printed as TO).
dsample "Signal counts..." Proc/signal_count '
	proc:::signal-send { 
		@num[execname,args[2],stringof(args[1]->pr_fname)] = count();
	}
	dtrace:::END
	{ 
		printf("%-16s %-8s %-16s %8s\n",
		    "FROM", "SIG", "TO", "COUNT");
		printa("%-16s %-8d %-16s %@8d\n", @num);
	}
'

# Count of system calls returning -1, keyed by pid, command, syscall, errno.
dsample "Syscall error counts..." Proc/syscall_errors '
	syscall:::return /(int)arg0 == -1/
	{
		@num[pid, execname, probefunc, errno] = count();
	}
	dtrace:::END
	{ 
		printf("%6s %-16s %-32s %-6s %8s\n",
		    "PID", "CMD", "SYSCALL", "ERRNO", "COUNT");
		printa("%6d %-16s %-32s %-6d %@8d\n", @num);
	}
'
489
490
491###########
492#  Done
493#
# Close the log with the end time.
{
	print -n "End:    "
	date
} >> log
decho "100% Done."

# Archive the output directory. $archived is set only when both tar and
# gzip succeed, so a failed archive can never lead to the raw output being
# deleted below.
rc=0
archived=0
if (( tar )); then
	cd ..
	if tar cf "$dir.tar" "$dir" && gzip "$dir.tar"; then
		archived=1
		decho "File is $dir.tar.gz"
	else
		print -u2 "ERROR8: Unable to create $dir.tar.gz.\n"
		print -u2 "The output directory has been kept."
		rc=8
	fi
fi

if (( delete && archived )); then
	# The removals below use relative paths, so stop if the cd fails
	# rather than deleting same-named files (eg, "log") somewhere else.
	cd "$dir" || {
		print -u2 "ERROR9: Can't cd to $dir, nothing was deleted.\n"
		exit 9
	}
	# This could all be an "rm -r $dir", but since it will be run as
	# root on production servers, every file is named explicitly so
	# that nothing unexpected can be removed.
	rm Cpu/interrupt_by_cpu Cpu/interrupt_time Cpu/dispqlen_by_cpu \
	    Cpu/sdt_count
	rm Disk/pgpgin_by_process Disk/fileopen_count \
	    Disk/sizedist_by_process
	rm Mem/minf_by_process Mem/vminfo_by_process
	rm Net/mib_data Net/tcpw_by_process
	rm Proc/sample_process Proc/syscall_by_process Proc/syscall_count \
	    Proc/readb_by_process Proc/writeb_by_process \
	    Proc/sysinfo_by_process Proc/newprocess_count \
	    Proc/signal_count Proc/syscall_errors
	rm Info/uname-a Info/psrinfo-v Info/prtconf Info/df-k \
	    Info/ifconfig-a Info/ps-o Info/uptime
	rmdir Cpu Disk Mem Net Proc Info
	rm log
	cd ..
	rmdir "$dir"
else
	decho "Directory is $dir"
fi
exit $rc
546
547