vdev_queue.c revision 185029
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/avl.h>

/*
 * These tunables are for performance analysis.
 */
/*
 * zfs_vdev_max_pending is the maximum number of i/os concurrently
 * pending to each device.  zfs_vdev_min_pending is the initial number
 * of i/os pending to each device (before it starts ramping up to
 * max_pending).
 */
int zfs_vdev_max_pending = 35;
int zfs_vdev_min_pending = 4;

/* deadline = pri + (LBOLT >> time_shift) */
int zfs_vdev_time_shift = 6;

/* exponential I/O issue ramp-up rate */
int zfs_vdev_ramp_rate = 2;

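/*
 * Each completed i/o allows up to zfs_vdev_ramp_rate additional i/os to be
 * issued (see vdev_queue_io_done()), so the number of outstanding i/os grows
 * from zfs_vdev_min_pending toward zfs_vdev_max_pending while the device
 * keeps up.
 */
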
/*
 * i/os will be aggregated into a single large i/o up to
 * zfs_vdev_aggregation_limit bytes long.
 */
int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE;

SYSCTL_DECL(_vfs_zfs_vdev);
TUNABLE_INT("vfs.zfs.vdev.max_pending", &zfs_vdev_max_pending);
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, max_pending, CTLFLAG_RDTUN,
    &zfs_vdev_max_pending, 0, "Maximum I/O requests pending on each device");
TUNABLE_INT("vfs.zfs.vdev.min_pending", &zfs_vdev_min_pending);
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, min_pending, CTLFLAG_RDTUN,
    &zfs_vdev_min_pending, 0,
    "Initial number of I/O requests pending to each device");
TUNABLE_INT("vfs.zfs.vdev.time_shift", &zfs_vdev_time_shift);
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, time_shift, CTLFLAG_RDTUN,
    &zfs_vdev_time_shift, 0, "Used for calculating I/O request deadline");
TUNABLE_INT("vfs.zfs.vdev.ramp_rate", &zfs_vdev_ramp_rate);
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, ramp_rate, CTLFLAG_RDTUN,
    &zfs_vdev_ramp_rate, 0, "Exponential I/O issue ramp-up rate");
TUNABLE_INT("vfs.zfs.vdev.aggregation_limit", &zfs_vdev_aggregation_limit);
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, aggregation_limit, CTLFLAG_RDTUN,
    &zfs_vdev_aggregation_limit, 0,
    "I/O requests are aggregated up to this size");

/*
 * Virtual device vector for disk I/O scheduling.
 */
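/*
 * Compare queued i/os by deadline, then by offset, then by address, giving
 * the deadline tree a total ordering with the most urgent i/o first.
 */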
int
vdev_queue_deadline_compare(const void *x1, const void *x2)
{
	const zio_t *z1 = x1;
	const zio_t *z2 = x2;

	if (z1->io_deadline < z2->io_deadline)
		return (-1);
	if (z1->io_deadline > z2->io_deadline)
		return (1);

	if (z1->io_offset < z2->io_offset)
		return (-1);
	if (z1->io_offset > z2->io_offset)
		return (1);

	if (z1 < z2)
		return (-1);
	if (z1 > z2)
		return (1);

	return (0);
}

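/*
 * Compare queued i/os by offset, then by address; used for the per-type
 * (read/write) trees and the pending tree, which are kept in LBA order.
 */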
int
vdev_queue_offset_compare(const void *x1, const void *x2)
{
	const zio_t *z1 = x1;
	const zio_t *z2 = x2;

	if (z1->io_offset < z2->io_offset)
		return (-1);
	if (z1->io_offset > z2->io_offset)
		return (1);

	if (z1 < z2)
		return (-1);
	if (z1 > z2)
		return (1);

	return (0);
}

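/*
 * Initialize the per-vdev queue: its lock, the deadline tree, the per-type
 * read and write trees, and the tree of i/os currently issued to the device.
 */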
void
vdev_queue_init(vdev_t *vd)
{
	vdev_queue_t *vq = &vd->vdev_queue;

	mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);

	avl_create(&vq->vq_deadline_tree, vdev_queue_deadline_compare,
	    sizeof (zio_t), offsetof(struct zio, io_deadline_node));

	avl_create(&vq->vq_read_tree, vdev_queue_offset_compare,
	    sizeof (zio_t), offsetof(struct zio, io_offset_node));

	avl_create(&vq->vq_write_tree, vdev_queue_offset_compare,
	    sizeof (zio_t), offsetof(struct zio, io_offset_node));

	avl_create(&vq->vq_pending_tree, vdev_queue_offset_compare,
	    sizeof (zio_t), offsetof(struct zio, io_offset_node));
}

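/*
 * Tear down the per-vdev queue created by vdev_queue_init().
 */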
void
vdev_queue_fini(vdev_t *vd)
{
	vdev_queue_t *vq = &vd->vdev_queue;

	avl_destroy(&vq->vq_deadline_tree);
	avl_destroy(&vq->vq_read_tree);
	avl_destroy(&vq->vq_write_tree);
	avl_destroy(&vq->vq_pending_tree);

	mutex_destroy(&vq->vq_lock);
}

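/*
 * Insert a zio into the deadline tree and into its per-type (read or write)
 * offset tree.
 */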
static void
vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
{
	avl_add(&vq->vq_deadline_tree, zio);
	avl_add(zio->io_vdev_tree, zio);
}

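/*
 * Remove a zio from both trees it was added to by vdev_queue_io_add().
 */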
static void
vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
{
	avl_remove(&vq->vq_deadline_tree, zio);
	avl_remove(zio->io_vdev_tree, zio);
}

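/*
 * Completion callback for an aggregated i/o: for reads, copy the data back
 * into each delegated child i/o, propagate the parent's error, execute the
 * children, and free the temporary aggregation buffer.
 */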
static void
vdev_queue_agg_io_done(zio_t *aio)
{
	zio_t *dio;
	uint64_t offset = 0;

	while ((dio = aio->io_delegate_list) != NULL) {
		if (aio->io_type == ZIO_TYPE_READ)
			bcopy((char *)aio->io_data + offset, dio->io_data,
			    dio->io_size);
		offset += dio->io_size;
		aio->io_delegate_list = dio->io_delegate_next;
		dio->io_delegate_next = NULL;
		dio->io_error = aio->io_error;
		zio_execute(dio);
	}
	ASSERT3U(offset, ==, aio->io_size);

	zio_buf_free(aio->io_data, aio->io_size);
}

#define	IS_ADJACENT(io, nio) \
	((io)->io_offset + (io)->io_size == (nio)->io_offset)

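/*
 * Select the next i/o to issue to the device.  Returns NULL if the device
 * already has pending_limit i/os outstanding or the queue is empty.
 * Otherwise take the i/o with the earliest deadline and try to merge it with
 * LBA-adjacent i/os of the same type (up to zfs_vdev_aggregation_limit bytes)
 * into a single aggregated i/o before moving it to the pending tree.
 */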
static zio_t *
vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
{
	zio_t *fio, *lio, *aio, *dio;
	avl_tree_t *tree;
	uint64_t size;

	ASSERT(MUTEX_HELD(&vq->vq_lock));

	if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit ||
	    avl_numnodes(&vq->vq_deadline_tree) == 0)
		return (NULL);

	fio = lio = avl_first(&vq->vq_deadline_tree);

	tree = fio->io_vdev_tree;
	size = fio->io_size;

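	/*
	 * Walk backward through i/os that end exactly where the current front
	 * of the run begins, extending the aggregation toward lower offsets
	 * while aggregation is permitted and the size limit holds.
	 */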
	while ((dio = AVL_PREV(tree, fio)) != NULL && IS_ADJACENT(dio, fio) &&
	    !((dio->io_flags | fio->io_flags) & ZIO_FLAG_DONT_AGGREGATE) &&
	    size + dio->io_size <= zfs_vdev_aggregation_limit) {
		dio->io_delegate_next = fio;
		fio = dio;
		size += dio->io_size;
	}

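	/*
	 * Likewise walk forward through i/os that begin exactly where the
	 * current end of the run ends, extending it toward higher offsets.
	 */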
	while ((dio = AVL_NEXT(tree, lio)) != NULL && IS_ADJACENT(lio, dio) &&
	    !((lio->io_flags | dio->io_flags) & ZIO_FLAG_DONT_AGGREGATE) &&
	    size + dio->io_size <= zfs_vdev_aggregation_limit) {
		lio->io_delegate_next = dio;
		lio = dio;
		size += dio->io_size;
	}

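	/*
	 * If more than one i/o was gathered, build a single aggregated i/o
	 * covering the whole run: copy write data into the shared buffer,
	 * bypass the delegated children, and issue the aggregate instead.
	 */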
	if (fio != lio) {
		char *buf = zio_buf_alloc(size);
		uint64_t offset = 0;

		ASSERT(size <= zfs_vdev_aggregation_limit);

		aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset,
		    buf, size, fio->io_type, ZIO_PRIORITY_NOW,
		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
		    vdev_queue_agg_io_done, NULL);

		aio->io_delegate_list = fio;

		for (dio = fio; dio != NULL; dio = dio->io_delegate_next) {
			ASSERT(dio->io_type == aio->io_type);
			ASSERT(dio->io_vdev_tree == tree);
			if (dio->io_type == ZIO_TYPE_WRITE)
				bcopy(dio->io_data, buf + offset, dio->io_size);
			offset += dio->io_size;
			vdev_queue_io_remove(vq, dio);
			zio_vdev_io_bypass(dio);
		}

		ASSERT(offset == size);

		avl_add(&vq->vq_pending_tree, aio);

		return (aio);
	}

	ASSERT(fio->io_vdev_tree == tree);
	vdev_queue_io_remove(vq, fio);

	avl_add(&vq->vq_pending_tree, fio);

	return (fio);
}

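/*
 * Queue an incoming read or write, stamp its deadline from its priority and
 * the current lbolt, and return the next i/o (if any) that should be issued
 * while no more than zfs_vdev_min_pending i/os are outstanding.  Aggregated
 * i/os are issued directly via zio_nowait() and NULL is returned instead.
 */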
zio_t *
vdev_queue_io(zio_t *zio)
{
	vdev_queue_t *vq = &zio->io_vd->vdev_queue;
	zio_t *nio;

	ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);

	if (zio->io_flags & ZIO_FLAG_DONT_QUEUE)
		return (zio);

	zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE;

	if (zio->io_type == ZIO_TYPE_READ)
		zio->io_vdev_tree = &vq->vq_read_tree;
	else
		zio->io_vdev_tree = &vq->vq_write_tree;

	mutex_enter(&vq->vq_lock);

	zio->io_deadline = (lbolt64 >> zfs_vdev_time_shift) + zio->io_priority;

	vdev_queue_io_add(vq, zio);

	nio = vdev_queue_io_to_issue(vq, zfs_vdev_min_pending);

	mutex_exit(&vq->vq_lock);

	if (nio == NULL)
		return (NULL);

	if (nio->io_done == vdev_queue_agg_io_done) {
		zio_nowait(nio);
		return (NULL);
	}

	return (nio);
}

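/*
 * Called when an i/o to the device completes: remove it from the pending
 * tree and issue up to zfs_vdev_ramp_rate new i/os, allowing as many as
 * zfs_vdev_max_pending to be outstanding at once.
 */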
void
vdev_queue_io_done(zio_t *zio)
{
	vdev_queue_t *vq = &zio->io_vd->vdev_queue;

	mutex_enter(&vq->vq_lock);

	avl_remove(&vq->vq_pending_tree, zio);

	for (int i = 0; i < zfs_vdev_ramp_rate; i++) {
		zio_t *nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending);
		if (nio == NULL)
			break;
		mutex_exit(&vq->vq_lock);
		if (nio->io_done == vdev_queue_agg_io_done) {
			zio_nowait(nio);
		} else {
			zio_vdev_io_reissue(nio);
			zio_execute(nio);
		}
		mutex_enter(&vq->vq_lock);
	}

	mutex_exit(&vq->vq_lock);
}