vdev_queue.c revision 168404
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
25168404Spjd
26168404Spjd#pragma ident	"%Z%%M%	%I%	%E% SMI"
27168404Spjd
28168404Spjd#include <sys/zfs_context.h>
29168404Spjd#include <sys/spa.h>
30168404Spjd#include <sys/vdev_impl.h>
31168404Spjd#include <sys/zio.h>
32168404Spjd#include <sys/avl.h>
33168404Spjd
34168404Spjd/*
35168404Spjd * These tunables are for performance analysis.
36168404Spjd */
37168404Spjd/*
38168404Spjd * zfs_vdev_max_pending is the maximum number of i/os concurrently
39168404Spjd * pending to each device.  zfs_vdev_min_pending is the initial number
40168404Spjd * of i/os pending to each device (before it starts ramping up to
41168404Spjd * max_pending).
42168404Spjd */
43168404Spjdint zfs_vdev_max_pending = 35;
44168404Spjdint zfs_vdev_min_pending = 4;
45168404Spjd
46168404Spjd/* deadline = pri + (lbolt >> time_shift) */
47168404Spjdint zfs_vdev_time_shift = 6;
48168404Spjd
49168404Spjd/* exponential I/O issue ramp-up rate */
50168404Spjdint zfs_vdev_ramp_rate = 2;
51168404Spjd
52168404Spjd/*
53168404Spjd * i/os will be aggregated into a single large i/o up to
54168404Spjd * zfs_vdev_aggregation_limit bytes long.
55168404Spjd */
56168404Spjdint zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE;
57168404Spjd
/*
 * Virtual device vector for disk I/O scheduling.
 */
61168404Spjdint
62168404Spjdvdev_queue_deadline_compare(const void *x1, const void *x2)
63168404Spjd{
64168404Spjd	const zio_t *z1 = x1;
65168404Spjd	const zio_t *z2 = x2;
66168404Spjd
67168404Spjd	if (z1->io_deadline < z2->io_deadline)
68168404Spjd		return (-1);
69168404Spjd	if (z1->io_deadline > z2->io_deadline)
70168404Spjd		return (1);
71168404Spjd
72168404Spjd	if (z1->io_offset < z2->io_offset)
73168404Spjd		return (-1);
74168404Spjd	if (z1->io_offset > z2->io_offset)
75168404Spjd		return (1);
76168404Spjd
77168404Spjd	if (z1 < z2)
78168404Spjd		return (-1);
79168404Spjd	if (z1 > z2)
80168404Spjd		return (1);
81168404Spjd
82168404Spjd	return (0);
83168404Spjd}
84168404Spjd
85168404Spjdint
86168404Spjdvdev_queue_offset_compare(const void *x1, const void *x2)
87168404Spjd{
88168404Spjd	const zio_t *z1 = x1;
89168404Spjd	const zio_t *z2 = x2;
90168404Spjd
91168404Spjd	if (z1->io_offset < z2->io_offset)
92168404Spjd		return (-1);
93168404Spjd	if (z1->io_offset > z2->io_offset)
94168404Spjd		return (1);
95168404Spjd
96168404Spjd	if (z1 < z2)
97168404Spjd		return (-1);
98168404Spjd	if (z1 > z2)
99168404Spjd		return (1);
100168404Spjd
101168404Spjd	return (0);
102168404Spjd}
103168404Spjd
104168404Spjdvoid
105168404Spjdvdev_queue_init(vdev_t *vd)
106168404Spjd{
107168404Spjd	vdev_queue_t *vq = &vd->vdev_queue;
108168404Spjd
109168404Spjd	mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);
110168404Spjd
111168404Spjd	avl_create(&vq->vq_deadline_tree, vdev_queue_deadline_compare,
112168404Spjd	    sizeof (zio_t), offsetof(struct zio, io_deadline_node));
113168404Spjd
114168404Spjd	avl_create(&vq->vq_read_tree, vdev_queue_offset_compare,
115168404Spjd	    sizeof (zio_t), offsetof(struct zio, io_offset_node));
116168404Spjd
117168404Spjd	avl_create(&vq->vq_write_tree, vdev_queue_offset_compare,
118168404Spjd	    sizeof (zio_t), offsetof(struct zio, io_offset_node));
119168404Spjd
120168404Spjd	avl_create(&vq->vq_pending_tree, vdev_queue_offset_compare,
121168404Spjd	    sizeof (zio_t), offsetof(struct zio, io_offset_node));
122168404Spjd}
123168404Spjd
124168404Spjdvoid
125168404Spjdvdev_queue_fini(vdev_t *vd)
126168404Spjd{
127168404Spjd	vdev_queue_t *vq = &vd->vdev_queue;
128168404Spjd
129168404Spjd	avl_destroy(&vq->vq_deadline_tree);
130168404Spjd	avl_destroy(&vq->vq_read_tree);
131168404Spjd	avl_destroy(&vq->vq_write_tree);
132168404Spjd	avl_destroy(&vq->vq_pending_tree);
133168404Spjd
134168404Spjd	mutex_destroy(&vq->vq_lock);
135168404Spjd}
136168404Spjd
137168404Spjdstatic void
138168404Spjdvdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
139168404Spjd{
140168404Spjd	avl_add(&vq->vq_deadline_tree, zio);
141168404Spjd	avl_add(zio->io_vdev_tree, zio);
142168404Spjd}
143168404Spjd
144168404Spjdstatic void
145168404Spjdvdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
146168404Spjd{
147168404Spjd	avl_remove(&vq->vq_deadline_tree, zio);
148168404Spjd	avl_remove(zio->io_vdev_tree, zio);
149168404Spjd}
150168404Spjd
151168404Spjdstatic void
152168404Spjdvdev_queue_agg_io_done(zio_t *aio)
153168404Spjd{
154168404Spjd	zio_t *dio;
155168404Spjd	uint64_t offset = 0;
156168404Spjd
157168404Spjd	while ((dio = aio->io_delegate_list) != NULL) {
158168404Spjd		if (aio->io_type == ZIO_TYPE_READ)
159168404Spjd			bcopy((char *)aio->io_data + offset, dio->io_data,
160168404Spjd			    dio->io_size);
161168404Spjd		offset += dio->io_size;
162168404Spjd		aio->io_delegate_list = dio->io_delegate_next;
163168404Spjd		dio->io_delegate_next = NULL;
164168404Spjd		dio->io_error = aio->io_error;
165168404Spjd		zio_next_stage(dio);
166168404Spjd	}
167168404Spjd	ASSERT3U(offset, ==, aio->io_size);
168168404Spjd
169168404Spjd	zio_buf_free(aio->io_data, aio->io_size);
170168404Spjd}
171168404Spjd
172168404Spjd#define	IS_ADJACENT(io, nio) \
173168404Spjd	((io)->io_offset + (io)->io_size == (nio)->io_offset)
174168404Spjd
175168404Spjdtypedef void zio_issue_func_t(zio_t *);
176168404Spjd
177168404Spjdstatic zio_t *
178168404Spjdvdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit,
179168404Spjd	zio_issue_func_t **funcp)
180168404Spjd{
181168404Spjd	zio_t *fio, *lio, *aio, *dio;
182168404Spjd	avl_tree_t *tree;
183168404Spjd	uint64_t size;
184168404Spjd
185168404Spjd	ASSERT(MUTEX_HELD(&vq->vq_lock));
186168404Spjd
187168404Spjd	*funcp = NULL;
188168404Spjd
189168404Spjd	if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit ||
190168404Spjd	    avl_numnodes(&vq->vq_deadline_tree) == 0)
191168404Spjd		return (NULL);
192168404Spjd
193168404Spjd	fio = lio = avl_first(&vq->vq_deadline_tree);
194168404Spjd
195168404Spjd	tree = fio->io_vdev_tree;
196168404Spjd	size = fio->io_size;
197168404Spjd
198168404Spjd	while ((dio = AVL_PREV(tree, fio)) != NULL && IS_ADJACENT(dio, fio) &&
199168404Spjd	    size + dio->io_size <= zfs_vdev_aggregation_limit) {
200168404Spjd		dio->io_delegate_next = fio;
201168404Spjd		fio = dio;
202168404Spjd		size += dio->io_size;
203168404Spjd	}
204168404Spjd
205168404Spjd	while ((dio = AVL_NEXT(tree, lio)) != NULL && IS_ADJACENT(lio, dio) &&
206168404Spjd	    size + dio->io_size <= zfs_vdev_aggregation_limit) {
207168404Spjd		lio->io_delegate_next = dio;
208168404Spjd		lio = dio;
209168404Spjd		size += dio->io_size;
210168404Spjd	}
211168404Spjd
212168404Spjd	if (fio != lio) {
213168404Spjd		char *buf = zio_buf_alloc(size);
214168404Spjd		uint64_t offset = 0;
215168404Spjd		int nagg = 0;
216168404Spjd
217168404Spjd		ASSERT(size <= zfs_vdev_aggregation_limit);
218168404Spjd
219168404Spjd		aio = zio_vdev_child_io(fio, NULL, fio->io_vd,
220168404Spjd		    fio->io_offset, buf, size, fio->io_type,
221168404Spjd		    ZIO_PRIORITY_NOW, ZIO_FLAG_DONT_QUEUE |
222168404Spjd		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE |
223168404Spjd		    ZIO_FLAG_NOBOOKMARK,
224168404Spjd		    vdev_queue_agg_io_done, NULL);
225168404Spjd
226168404Spjd		aio->io_delegate_list = fio;
227168404Spjd
228168404Spjd		for (dio = fio; dio != NULL; dio = dio->io_delegate_next) {
229168404Spjd			ASSERT(dio->io_type == aio->io_type);
230168404Spjd			ASSERT(dio->io_vdev_tree == tree);
231168404Spjd			if (dio->io_type == ZIO_TYPE_WRITE)
232168404Spjd				bcopy(dio->io_data, buf + offset, dio->io_size);
233168404Spjd			offset += dio->io_size;
234168404Spjd			vdev_queue_io_remove(vq, dio);
235168404Spjd			zio_vdev_io_bypass(dio);
236168404Spjd			nagg++;
237168404Spjd		}
238168404Spjd
239168404Spjd		ASSERT(offset == size);
240168404Spjd
241168404Spjd		dprintf("%5s  T=%llu  off=%8llx  agg=%3d  "
242168404Spjd		    "old=%5llx  new=%5llx\n",
243168404Spjd		    zio_type_name[fio->io_type],
244168404Spjd		    fio->io_deadline, fio->io_offset, nagg, fio->io_size, size);
245168404Spjd
246168404Spjd		avl_add(&vq->vq_pending_tree, aio);
247168404Spjd
248168404Spjd		*funcp = zio_nowait;
249168404Spjd		return (aio);
250168404Spjd	}
251168404Spjd
252168404Spjd	ASSERT(fio->io_vdev_tree == tree);
253168404Spjd	vdev_queue_io_remove(vq, fio);
254168404Spjd
255168404Spjd	avl_add(&vq->vq_pending_tree, fio);
256168404Spjd
257168404Spjd	*funcp = zio_next_stage;
258168404Spjd
259168404Spjd	return (fio);
260168404Spjd}
261168404Spjd
262168404Spjdzio_t *
263168404Spjdvdev_queue_io(zio_t *zio)
264168404Spjd{
265168404Spjd	vdev_queue_t *vq = &zio->io_vd->vdev_queue;
266168404Spjd	zio_t *nio;
267168404Spjd	zio_issue_func_t *func;
268168404Spjd
269168404Spjd	ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
270168404Spjd
271168404Spjd	if (zio->io_flags & ZIO_FLAG_DONT_QUEUE)
272168404Spjd		return (zio);
273168404Spjd
274168404Spjd	zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE;
275168404Spjd
276168404Spjd	if (zio->io_type == ZIO_TYPE_READ)
277168404Spjd		zio->io_vdev_tree = &vq->vq_read_tree;
278168404Spjd	else
279168404Spjd		zio->io_vdev_tree = &vq->vq_write_tree;
280168404Spjd
281168404Spjd	mutex_enter(&vq->vq_lock);
282168404Spjd
283168404Spjd	zio->io_deadline = (zio->io_timestamp >> zfs_vdev_time_shift) +
284168404Spjd	    zio->io_priority;
285168404Spjd
286168404Spjd	vdev_queue_io_add(vq, zio);
287168404Spjd
288168404Spjd	nio = vdev_queue_io_to_issue(vq, zfs_vdev_min_pending, &func);
289168404Spjd
290168404Spjd	mutex_exit(&vq->vq_lock);
291168404Spjd
292168404Spjd	if (nio == NULL || func != zio_nowait)
293168404Spjd		return (nio);
294168404Spjd
295168404Spjd	func(nio);
296168404Spjd	return (NULL);
297168404Spjd}
298168404Spjd
299168404Spjdvoid
300168404Spjdvdev_queue_io_done(zio_t *zio)
301168404Spjd{
302168404Spjd	vdev_queue_t *vq = &zio->io_vd->vdev_queue;
303168404Spjd	zio_t *nio;
304168404Spjd	zio_issue_func_t *func;
305168404Spjd	int i;
306168404Spjd
307168404Spjd	mutex_enter(&vq->vq_lock);
308168404Spjd
309168404Spjd	avl_remove(&vq->vq_pending_tree, zio);
310168404Spjd
311168404Spjd	for (i = 0; i < zfs_vdev_ramp_rate; i++) {
312168404Spjd		nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending, &func);
313168404Spjd		if (nio == NULL)
314168404Spjd			break;
315168404Spjd		mutex_exit(&vq->vq_lock);
316168404Spjd		if (func == zio_next_stage)
317168404Spjd			zio_vdev_io_reissue(nio);
318168404Spjd		func(nio);
319168404Spjd		mutex_enter(&vq->vq_lock);
320168404Spjd	}
321168404Spjd
322168404Spjd	mutex_exit(&vq->vq_lock);
323168404Spjd}
324