1#!/usr/sbin/dtrace -s
2/*
3 * fsrw.d - file system read/write event tracing.
4 *          Written using DTrace (Solaris 10 3/05)
5 *
 * This traces file related activity: system call reads and writes,
 * vnode logical reads and writes (fop), and disk I/O. It can be used
 * to examine the behaviour of each I/O layer, from the syscall
 * interface to what the disk is doing. Behaviour such as read-ahead and
 * max I/O size breakup can be observed.
11 *
12 * 23-Apr-2006, ver 0.50
13 *
14 * USAGE:	fsrw.d
15 *
16 * FIELDS:
17 *		Event		Traced event (see EVENTS below)
18 *		Device		Device, for disk I/O
19 *		RW		Either Read or Write
20 *		Size		Size of I/O in bytes
21 *		Offset		Offset of I/O in kilobytes
22 *		Path		Path to file on disk
23 *
24 * EVENTS:
25 *		sc-read		System call read
26 *		sc-write	System call write
27 *		fop_read	Logical read
28 *		fop_write	Logical write
29 *		disk_io		Physical disk I/O
30 *		disk_ra		Physical disk I/O, read ahead
31 *
32 * The events are drawn with a level of indentation, which can sometimes
33 * help identify related events.
34 *
35 * SEE ALSO: fspaging.d
36 *
37 * IDEA: Richard McDougall, Solaris Internals 2nd Ed, FS Chapter.
38 *
39 * COPYRIGHT: Copyright (c) 2006 Brendan Gregg.
40 *
41 * CDDL HEADER START
42 *
43 *  The contents of this file are subject to the terms of the
44 *  Common Development and Distribution License, Version 1.0 only
45 *  (the "License").  You may not use this file except in compliance
46 *  with the License.
47 *
48 *  You can obtain a copy of the license at Docs/cddl1.txt
49 *  or http://www.opensolaris.org/os/licensing.
50 *  See the License for the specific language governing permissions
51 *  and limitations under the License.
52 *
53 * CDDL HEADER END
54 *
55 * ToDo: readv()
56 *
57 * 20-Mar-2006  Brendan Gregg   Created this.
58 */
59
60#pragma D option quiet
61#pragma D option switchrate=10hz
62
/*
 * Emit the column headings once, when tracing begins. The column widths
 * (12, 10, 2, 8, 6) are the same ones used by the per-event printf()
 * calls in the other clauses of this script.
 */
dtrace:::BEGIN
{
	printf("%-12s %10s %2s %8s %6s %s\n",
	    "Event",
	    "Device",
	    "RW",
	    "Size",
	    "Offset",
	    "Path");
}
68
/*
 * On entry to a read or write system call, resolve the file descriptor
 * (arg0) to its file_t and vnode_t and record, in thread-local variables
 * for the clauses that follow:
 *
 *	self->offset	current file offset (f_offset), or 0 if unknown
 *	self->vpath	cleaned vnode path, or "<unknown>"
 *	self->sc_trace	1 if the vnode is a regular file or directory
 *
 * Every pointer in the chain is checked before it is dereferenced, so a
 * bad or unopened descriptor yields "<unknown>" and sc_trace == 0 rather
 * than a DTrace invalid-address error.
 */
syscall::*read:entry,
syscall::*write*:entry
{
	/*
	 * starting with a file descriptor, dig out useful info
	 * from the corresponding file_t and vnode_t.
	 */
	this->fd = (int)arg0;
	this->nfiles = curthread->t_procp->p_user.u_finfo.fi_nfiles;
	this->filistp = curthread->t_procp->p_user.u_finfo.fi_list;

	/* only index the descriptor table when fd is inside its bounds */
	this->ufentryp = (this->fd >= 0 && this->fd < this->nfiles) ?
	    (uf_entry_t *)((uint64_t)this->filistp +
	    (uint64_t)this->fd * (uint64_t)sizeof (uf_entry_t)) : 0;

	/* uf_file is NULL for a slot that has no open file */
	this->filep = this->ufentryp != 0 ? this->ufentryp->uf_file : 0;
	self->offset = this->filep != 0 ? this->filep->f_offset : 0;
	this->vnodep = this->filep != 0 ? this->filep->f_vnode : 0;
	self->vpath = this->vnodep != 0 ? (this->vnodep->v_path != 0 ?
	    cleanpath(this->vnodep->v_path) : "<unknown>") : "<unknown>";

	/*
	 * only trace activity to regular files and directories
	 * (NOTE(review): presumably to skip sockets, pipes and ttys, which
	 * have no file system I/O to follow - confirm).
	 */
	self->sc_trace = this->vnodep != 0 ?
	    ((this->vnodep->v_type == VREG ||
	    this->vnodep->v_type == VDIR) ? 1 : 0) : 0;
}
89
/*
 * Print one "sc-<syscall>" line for a read system call on a file that
 * the preceding entry clause flagged for tracing. The Device column is
 * a "." placeholder; the offset is printed divided by 1024.
 */
syscall::*read:entry
/self->sc_trace/
{
	printf("sc-%-9s %10s %2s %8d %6d %s\n",
	    probefunc,
	    ".",
	    "R",
	    (int)arg2,
	    self->offset / 1024,
	    self->vpath);
}
96
/*
 * Print one "sc-<syscall>" line for a write system call on a file that
 * the preceding entry clause flagged for tracing. The Device column is
 * a "." placeholder; the offset is printed divided by 1024.
 *
 * NOTE(review): the "*write*" glob presumably also matches writev(), for
 * which arg2 may not be a byte count - confirm before trusting the Size
 * column for that call.
 */
syscall::*write*:entry
/self->sc_trace/
{
	printf("sc-%-9s %10s %2s %8d %6d %s\n",
	    probefunc,
	    ".",
	    "W",
	    (int)arg2,
	    self->offset / 1024,
	    self->vpath);
}
103
/*
 * When the system call returns, reset the thread-local state set up by
 * the entry clause so that it does not carry over to later activity on
 * this thread.
 */
syscall::*read:return,
syscall::*write*:return
{
	self->sc_trace = 0;
	self->offset = 0;
	self->vpath = 0;
}
111
/*
 * Print one line for a vnode-level (fop) read or write, indented two
 * spaces. This fires only while a traced system call is in progress on
 * this thread and the vnode has a path. Size comes from the uio's
 * residual count and offset from the uio's offset, divided by 1024.
 */
fbt::fop_read:entry,
fbt::fop_write:entry
/self->sc_trace && args[0]->v_path/
{
	printf("  %-10s %10s %2s %8d %6d %s\n",
	    probefunc,
	    ".",
	    probefunc == "fop_read" ? "R" : "W",
	    args[1]->uio_resid,
	    args[1]->_uio_offset._f / 1024,
	    cleanpath(args[0]->v_path));
}
120
/*
 * Flag that this thread is inside ufs_getpage_ra() so that disk I/O
 * issued from here is reported as "disk_ra", and remember the offset to
 * report for it.
 */
fbt:ufs:ufs_getpage_ra:entry
{
	self->read_ahead = 1;

	/* fetch the real offset (file_t is unaware of this) */
	this->inodep = (inode_t *)args[0]->v_data;
	self->ra_offset = this->inodep->i_nextrio;
}
127
/*
 * Leaving ufs_getpage_ra(): clear the read-ahead state set by the
 * matching entry clause.
 */
fbt:ufs:ufs_getpage_ra:return
{
	self->ra_offset = 0;
	self->read_ahead = 0;
}
133
/*
 * Print one line per disk I/O, indented four spaces. The event is
 * labelled "disk_ra" when issued from within ufs_getpage_ra() on this
 * thread, otherwise "disk_io".
 *
 * it would seem to make sense to only trace disk events during
 * an fop event, easily coded with a self->fop_trace flag. However
 * writes are asynchronous to the fop_write calls (they are flushed
 * at some later time), and so this approach will miss tracing
 * most of the disk writes.
 */
io::bdev_strategy:start
{
	/* during read-ahead use the offset saved at ufs_getpage_ra entry */
	this->disk_off = self->read_ahead ?
	    self->ra_offset : args[2]->fi_offset;

	printf("    %-8s %10s %2s %8d %6d %s\n",
	    self->read_ahead ? "disk_ra" : "disk_io",
	    args[1]->dev_statname,
	    args[0]->b_flags & B_READ ? "R" : "W",
	    args[0]->b_bcount,
	    this->disk_off / 1024,
	    args[2]->fi_pathname);
}
149