1#!/bin/ksh
2##!/usr/bin/ksh
3#
4# iotop - display top disk I/O events by process.
5#         Written using DTrace (Solaris 10 3/05).
6#
7# This is measuring disk events that have made it past system caches.
8#
9# 20-Apr-2006, ver 0.76
10#
11# USAGE:	iotop [-C] [-D|-o|-P] [-j|-Z] [-d device] [-f filename] 
12#		      [-m mount_point] [-t top] [interval [count]]
13#
14#		iotop   	# default output, 5 second intervals
15#
16#		-C		# don't clear the screen
17#		-D		# print delta times, elapsed, us
18#		-j		# print project ID
19#		-o		# print disk delta times, us
20#		-P		# print %I/O (disk delta times)
21#		-Z		# print zone ID
22#		-d device	# instance name to snoop (eg, dad0)
23#		-f filename	# full pathname of file to snoop
24#		-m mount_point	# this FS only (will skip raw events)
25#		-t top		# print top number only
26#	eg,
27#		iotop 1  	# 1 second samples
28#		iotop -C	# don't clear the screen
29#		iotop -P	# print %I/O (time based)
30#		iotop -j	# print project IDs
31#		iotop -Z 	# print zone IDs
32#		iotop -t 20 	# print top 20 lines only
33#		iotop -C 5 12	# print 12 x 5 second samples
34# 	
35# FIELDS:
36#		UID		user ID
37#		PID		process ID
38#		PPID		parent process ID
39#		PROJ		project ID
40#		ZONE		zone ID
41#		CMD		process command name
42#		DEVICE  	device name
43#		MAJ     	device major number
44#		MIN     	device minor number
45#		D		direction, Read or Write
46#		BYTES		total size of operations, bytes
47#		ELAPSED		total elapsed from request to completion, us
48#		DISKTIME	total time for disk to complete request, us
49#		%I/O		percent disk I/O, based on time (DISKTIME)
50#		load		1 min load average
51#		disk_r		total disk read Kbytes for sample
52#		disk_w		total disk write Kbytes for sample
53# 
54# NOTE:
55# * There are two different delta times reported. -D prints the
56#   elapsed time from the disk request (strategy) to the disk completion
57#   (iodone); -o prints the time for the disk to complete that event 
#   since its last event (time between iodones), or, the time to the
59#   strategy if the disk had been idle. 
60# * The %I/O value can exceed 100%. It represents how busy a process is
61#   making the disks, in terms of a single disk. A value of 200% could 
62#   mean 2 disks are busy at 100%, or 4 disks at 50%...
63#
64# SEE ALSO: iosnoop
65#	    BigAdmin: DTrace, http://www.sun.com/bigadmin/content/dtrace
66#	    Solaris Dynamic Tracing Guide, http://docs.sun.com
67#	    DTrace Tools, http://www.brendangregg.com/dtrace.html
68#
69# INSPIRATION:  top(1) by William LeFebvre
70#
71# COPYRIGHT: Copyright (c) 2005, 2006 Brendan Gregg.
72#
73# CDDL HEADER START
74#
75#  The contents of this file are subject to the terms of the
76#  Common Development and Distribution License, Version 1.0 only
77#  (the "License").  You may not use this file except in compliance
78#  with the License.
79#
80#  You can obtain a copy of the license at Docs/cddl1.txt
81#  or http://www.opensolaris.org/os/licensing.
82#  See the License for the specific language governing permissions
83#  and limitations under the License.
84#
85# CDDL HEADER END
86#
87# KNOWN BUGS: 
88# - This can print errors while running on servers with Veritas volumes.
89#
90# Author: Brendan Gregg  [Sydney, Australia]
91#
92# 15-Jul-2005	Brendan Gregg	Created this.
93#
94
95
96##############################
97# --- Process Arguments ---
98#
99
### default variables

# display switches (1 = on, 0 = off)
opt_clear=1		# clear the screen before each report (-C turns off)
opt_def=1		# first column is the UID (turned off by -j and -Z)
opt_proj=0		# -j
opt_zone=0		# -Z
opt_bytes=1		# report bytes (turned off by -D, -o and -P)
opt_elapsed=0		# -D
opt_dtime=0		# -o, also set by -P
opt_percent=0		# -P
opt_top=0		# -t given
top=0			# argument of -t

# event filters; the "." values are placeholders for "not set"
filter=0		# becomes 1 when any of -d, -f, -m is used
opt_device=0
device=.
opt_file=0
filename=.
opt_mount=0
mount=.

# sampling
interval=5		# seconds per report
count=-1		# number of reports
104
### process options

# usage - write the help text to stderr.
usage()
{
	cat <<-END >&2
		USAGE: iotop [-C] [-D|-o|-P] [-j|-Z] [-d device] [-f filename]
		             [-m mount_point] [-t top] [interval [count]]
 
		                -C      # don't clear the screen
		                -D      # print delta times, elapsed, us
		                -j      # print project ID
		                -o      # print disk delta times, us
		                -P      # print %I/O (disk delta times)
		                -Z      # print zone ID
		                -d device       # instance name to snoop 
		                -f filename     # snoop this file only
		                -m mount_point  # this FS only 
		                -t top  	# print top number only
		   eg,
		        iotop         # default output, 5 second samples
		        iotop 1       # 1 second samples
		        iotop -P      # print %I/O (time based)
		        iotop -m /    # snoop events on filesystem / only
		        iotop -t 20   # print top 20 lines only
		        iotop -C 5 12 # print 12 x 5 second samples
	END
}

# Each option only sets shell variables; they are substituted into the
# D program further down.
while getopts CDd:f:hjm:oPt:Z name
do
	case $name in
	C)	opt_clear=0 ;;
	D)	opt_elapsed=1; opt_bytes=0 ;;
	d)	opt_device=1; device=$OPTARG ;;
	f)	opt_file=1; filename=$OPTARG ;;
	j)	opt_proj=1; opt_def=0 ;;
	m)	opt_mount=1; mount=$OPTARG ;;
	o)	opt_dtime=1; opt_bytes=0 ;;
	P)	opt_percent=1; opt_dtime=1; opt_bytes=0 ;;
	t)	# $top is pasted into the D source as an int, so anything
		# but an (optionally signed) integer must be refused here
		case $OPTARG in
		''|-|*[!0-9-]*|?*-*)
			echo "ERROR: -t needs an integer, got \"$OPTARG\"" >&2
			exit 1 ;;
		esac
		opt_top=1; top=$OPTARG ;;
	Z)	opt_zone=1; opt_def=0 ;;
	h|?)	usage
		exit 1 ;;
	esac
done

# drop the parsed options, leaving [interval [count]]
shift $(( $OPTIND - 1 ))
146
### option logic

# is_posint - succeed only if $1 is a decimal integer greater than zero.
# Done with patterns rather than arithmetic so that strings such as "08"
# are never handed to the shell's arithmetic evaluator.
is_posint()
{
	case $1 in
	''|*[!0-9]*)	return 1 ;;	# empty, or contains a non-digit
	*[1-9]*)	return 0 ;;	# all digits, at least one non-zero
	esac
	return 1			# all zeros
}

# Positional arguments: [interval [count]].  Both are substituted into the
# D program as ints, so they are checked here.  (A string comparison such
# as [[ "$1" > 0 ]] would let values like "abc" or "0.5" through.)
if [[ -n "$1" ]]; then
	if ! is_posint "$1"; then
		echo "ERROR: interval must be a positive integer, got \"$1\"" >&2
		exit 1
	fi
	interval=$1; shift
fi
if [[ -n "$1" ]]; then
	if ! is_posint "$1"; then
		echo "ERROR: count must be a positive integer, got \"$1\"" >&2
		exit 1
	fi
	count=$1; shift
fi

# -Z wins over -j
if (( opt_proj && opt_zone )); then
	opt_proj=0
fi

# -o / -P win over -D
if (( opt_elapsed && opt_dtime )); then
	opt_elapsed=0
fi

# any of -d, -m, -f switches event filtering on
if (( opt_device || opt_mount || opt_file )); then
	filter=1
fi

# capture the terminal clear sequence once; "." is a placeholder
if (( opt_clear )); then
	clearstr=$(clear)
else
	clearstr=.
fi
168
169
170
171#################################
172# --- Main Program, DTrace ---
173#
/usr/sbin/dtrace -n '
 /*
  * Command line arguments
  *
  * Each shell variable is spliced in between the quoted pieces of D text.
  * The expansions are double-quoted so that a value containing spaces or
  * glob characters (a file or mount path, for example) stays inside this
  * one program argument instead of being split by the shell.
  *
  * NOTE(review): the string values are not escaped for D, so a double
  * quote or backslash inside them will still break compilation.
  */
 inline int OPT_def 	= '"$opt_def"';
 inline int OPT_proj 	= '"$opt_proj"';
 inline int OPT_zone 	= '"$opt_zone"';
 inline int OPT_clear 	= '"$opt_clear"';
 inline int OPT_bytes 	= '"$opt_bytes"';
 inline int OPT_elapsed = '"$opt_elapsed"';
 inline int OPT_dtime 	= '"$opt_dtime"';
 inline int OPT_percent	= '"$opt_percent"';
 inline int OPT_device 	= '"$opt_device"';
 inline int OPT_mount 	= '"$opt_mount"';
 inline int OPT_file 	= '"$opt_file"';
 inline int OPT_top 	= '"$opt_top"';
 inline int INTERVAL 	= '"$interval"';
 inline int COUNTER 	= '"$count"';
 inline int FILTER 	= '"$filter"';
 inline int TOP 	= '"$top"';
 inline string DEVICE 	= "'"$device"'";
 inline string FILENAME = "'"$filename"'";
 inline string MOUNT 	= "'"$mount"'";
 inline string CLEAR 	= "'"$clearstr"'";
 
 #pragma D option quiet
200
201 /*
202  * Print header
203  */
204 dtrace:::BEGIN 
205 {
206	last_event[""] = 0;
207
208        /* starting values */
209        counts = COUNTER;
210        secs = INTERVAL;
211        disk_r = 0;
212        disk_w = 0;
213
214        printf("Tracing... Please wait.\n");
215 }
216
217 /*
218  * Check event is being traced
219  */
220 io:::start,
221 io:::done 
222 { 
223	/* default is to trace unless filtering, */
224	this->ok = FILTER ? 0 : 1;
225
226	/* check each filter, */
227	(OPT_device == 1 && DEVICE == args[1]->dev_statname)? this->ok = 1 : 1;
228	(OPT_file == 1 && FILENAME == args[2]->fi_pathname) ? this->ok = 1 : 1;
229	(OPT_mount == 1 && MOUNT == args[2]->fi_mount)  ? this->ok = 1 : 1;
230 }
231
232 /*
233  * Reset last_event for disk idle -> start
234  * this prevents idle time being counted as disk time.
235  */
236 io:::start
237 /! pending[args[1]->dev_statname]/
238 {
239	/* save last disk event */
240	last_event[args[1]->dev_statname] = timestamp;
241 }
242
243 /*
244  * Store entry details
245  */
246 io:::start
247 /this->ok/
248 {
249	/* these are used as a unique disk event key, */
250 	this->dev = args[0]->b_edev;
251 	this->blk = args[0]->b_blkno;
252
253	/* save disk event details, */
254 	start_uid[this->dev, this->blk] = uid;
255 	start_pid[this->dev, this->blk] = pid;
256 	start_ppid[this->dev, this->blk] = ppid;
257 	start_comm[this->dev, this->blk] = execname;
258 	start_time[this->dev, this->blk] = timestamp;
259 	start_proj[this->dev, this->blk] = curpsinfo->pr_projid;
260 	start_zone[this->dev, this->blk] = curpsinfo->pr_zoneid;
261 	start_rw[this->dev, this->blk] = args[0]->b_flags & B_READ ? "R" : "W";
262	disk_r += args[0]->b_flags & B_READ ? args[0]->b_bcount : 0;
263	disk_w += args[0]->b_flags & B_READ ? 0 : args[0]->b_bcount;
264
265	/* increase disk event pending count */
266	pending[args[1]->dev_statname]++;
267 }
268
269 /*
270  * Process and Print completion
271  */
272 io:::done
273 /this->ok/
274 {
275	/* decrease disk event pending count */
276	pending[args[1]->dev_statname]--;
277
278	/*
279	 * Process details
280	 */
281
282 	/* fetch entry values */
283 	this->dev = args[0]->b_edev;
284 	this->blk = args[0]->b_blkno;
285 	this->suid = (int)start_uid[this->dev, this->blk];
286 	this->spid = start_pid[this->dev, this->blk];
287 	this->sppid = start_ppid[this->dev, this->blk];
288 	this->sproj = (int)start_proj[this->dev, this->blk];
289 	this->szone = (int)start_zone[this->dev, this->blk];
290 	self->scomm = start_comm[this->dev, this->blk];
291 	this->stime = start_time[this->dev, this->blk];
292	this->etime = timestamp; /* endtime */
293	this->elapsed = this->etime - this->stime;
294 	self->rw = start_rw[this->dev, this->blk];
295	this->dtime = last_event[args[1]->dev_statname] == 0 ? 0 :
296	    timestamp - last_event[args[1]->dev_statname];
297
298 	/* memory cleanup */
299 	start_uid[this->dev, this->blk]  = 0;
300 	start_pid[this->dev, this->blk]  = 0;
301 	start_ppid[this->dev, this->blk] = 0;
302 	start_time[this->dev, this->blk] = 0;
303 	start_comm[this->dev, this->blk] = 0;
304 	start_zone[this->dev, this->blk] = 0;
305 	start_proj[this->dev, this->blk] = 0;
306 	start_rw[this->dev, this->blk]   = 0;
307
308	/*
309	 * Choose statistic to track
310	 */
311	OPT_bytes   ? this->value = args[0]->b_bcount    : 1;
312	OPT_elapsed ? this->value = this->elapsed / 1000 : 1;
313	OPT_dtime   ? this->value = this->dtime / 1000   : 1;
314	
315	/*
316	 * Save details
317	 */
318	OPT_def ? @out[this->suid, this->spid, this->sppid, self->scomm,
319	    args[1]->dev_statname, args[1]->dev_major, args[1]->dev_minor,
320	    self->rw] = sum(this->value) : 1;
321	OPT_proj ? @out[this->sproj, this->spid, this->sppid, self->scomm,
322	    args[1]->dev_statname, args[1]->dev_major, args[1]->dev_minor,
323	    self->rw] = sum(this->value) : 1;
324	OPT_zone ? @out[this->szone, this->spid, this->sppid, self->scomm,
325	    args[1]->dev_statname, args[1]->dev_major, args[1]->dev_minor,
326	    self->rw] = sum(this->value) : 1;
327
328	/* save last disk event */
329	last_event[args[1]->dev_statname] = timestamp;
330
331	self->scomm = 0;
332	self->rw = 0;
333 }
334
335 /*
336  * Prevent pending from underflowing
337  * this can happen if this program is started during disk events.
338  */
339 io:::done
340 /pending[args[1]->dev_statname] < 0/
341 {
342	pending[args[1]->dev_statname] = 0;
343 }
344
345 /*
346  * Timer
347  */
348 profile:::tick-1sec
349 {
350	secs--;
351 }
352
353 /*
354  * Print Report
355  */
356 profile:::tick-1sec
357 /secs == 0/
358 {
359	/* fetch 1 min load average */
360	/*
361	this->load1a  = `hp_avenrun[0] / 65536;
362	this->load1b  = ((`hp_avenrun[0] % 65536) * 100) / 65536;
363	*/
364	this->fscale = `averunnable.fscale;
365	this->load1a  = `averunnable.ldavg[0] / this->fscale;
366	this->load1b  = ((`averunnable.ldavg[0] % this->fscale) * 100) / this->fscale;
367
368	/* convert counters to Kbytes */
369	disk_r /= 1024;
370	disk_w /= 1024;
371
372	/* print status */
373	OPT_clear ? printf("%s", CLEAR) : 1;
374	printf("%Y,  load: %d.%02d,  disk_r: %6d KB,  disk_w: %6d KB\n\n",
375	    walltimestamp, this->load1a, this->load1b, disk_r, disk_w);
376
377	/* print headers */
378	OPT_def  ? printf("  UID ") : 1;
379	OPT_proj ? printf(" PROJ ") : 1;
380	OPT_zone ? printf(" ZONE ") : 1;
381	printf("%6s %6s %-16s %-7s %3s %3s %1s",
382	    "PID", "PPID", "CMD", "DEVICE", "MAJ", "MIN", "D");
383	OPT_bytes   ? printf(" %16s\n", "BYTES") : 1;
384	OPT_elapsed ? printf(" %16s\n", "ELAPSED") : 1;
385	OPT_dtime && ! OPT_percent  ? printf(" %16s\n", "DISKTIME") : 1;
386	OPT_dtime && OPT_percent    ? printf(" %6s\n", "%I/O") : 1;
387
388	/* truncate to top lines if needed */
389	OPT_top ? trunc(@out, TOP) : 1;
390
391	/* normalise to percentage if needed */
392	OPT_percent ? normalize(@out, INTERVAL * 10000) : 1;
393
394	/* print data */
395	! OPT_percent ? 
396	    printa("%5d %6d %6d %-16s %-7s %3d %3d %1s %16@d\n", @out) :
397	    printa("%5d %6d %6d %-16s %-7s %3d %3d %1s %6@d\n", @out);
398	printf("\n");
399
400	/* clear data */
401	trunc(@out);
402	disk_r = 0;
403	disk_w = 0;
404	secs = INTERVAL;
405	counts--;
406 }
407
408 /*
409  * End of program
410  */
411 profile:::tick-1sec
412 /counts == 0/
413 {
414	exit(0);
415 }
416
417 /*
418  * Cleanup for Ctrl-C
419  */
420 dtrace:::END
421 {
422	trunc(@out);
423 }
424'
425
426