vdev_queue.c revision 209962
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/avl.h>

/*
 * These tunables are for performance analysis.
 */
/*
 * zfs_vdev_max_pending is the maximum number of i/os concurrently
 * pending to each device.  zfs_vdev_min_pending is the initial number
 * of i/os pending to each device (before it starts ramping up to
 * max_pending).
 */
int zfs_vdev_max_pending = 35;
int zfs_vdev_min_pending = 4;

/* deadline = pri + (LBOLT >> time_shift) */
int zfs_vdev_time_shift = 6;

/* exponential I/O issue ramp-up rate */
int zfs_vdev_ramp_rate = 2;

/*
 * To reduce IOPs, we aggregate small adjacent i/os into one large i/o.
 * For read i/os, we also aggregate across small adjacency gaps.
 */
int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE;
int zfs_vdev_read_gap_limit = 32 << 10;

SYSCTL_DECL(_vfs_zfs_vdev);
TUNABLE_INT("vfs.zfs.vdev.max_pending", &zfs_vdev_max_pending);
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, max_pending, CTLFLAG_RDTUN,
    &zfs_vdev_max_pending, 0, "Maximum I/O requests pending on each device");
TUNABLE_INT("vfs.zfs.vdev.min_pending", &zfs_vdev_min_pending);
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, min_pending, CTLFLAG_RDTUN,
    &zfs_vdev_min_pending, 0,
    "Initial number of I/O requests pending to each device");
TUNABLE_INT("vfs.zfs.vdev.time_shift", &zfs_vdev_time_shift);
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, time_shift, CTLFLAG_RDTUN,
    &zfs_vdev_time_shift, 0, "Used for calculating I/O request deadline");
TUNABLE_INT("vfs.zfs.vdev.ramp_rate", &zfs_vdev_ramp_rate);
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, ramp_rate, CTLFLAG_RDTUN,
    &zfs_vdev_ramp_rate, 0, "Exponential I/O issue ramp-up rate");
TUNABLE_INT("vfs.zfs.vdev.aggregation_limit", &zfs_vdev_aggregation_limit);
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, aggregation_limit, CTLFLAG_RDTUN,
    &zfs_vdev_aggregation_limit, 0,
    "I/O requests are aggregated up to this size");
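
/*
 * Illustrative example, assuming the default tunable values above (this
 * is added commentary, not tuning guidance): an idle vdev starts with at
 * most zfs_vdev_min_pending = 4 i/os in flight.  Each completion may
 * issue up to zfs_vdev_ramp_rate = 2 replacements, so the in-flight
 * count can roughly double with each generation of completions
 * (4, 8, 16, 32) until capped at zfs_vdev_max_pending = 35.  Likewise,
 * with zfs_vdev_time_shift = 6 the deadline pri + (LBOLT >> 6) advances
 * its age term once every 64 ticks, so an i/o whose priority is worse
 * by one sorts even with an otherwise identical i/o queued 64 ticks
 * later.
 */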
/*
 * Virtual device vector for disk I/O scheduling.
 */
int
vdev_queue_deadline_compare(const void *x1, const void *x2)
{
	const zio_t *z1 = x1;
	const zio_t *z2 = x2;

	if (z1->io_deadline < z2->io_deadline)
		return (-1);
	if (z1->io_deadline > z2->io_deadline)
		return (1);

	if (z1->io_offset < z2->io_offset)
		return (-1);
	if (z1->io_offset > z2->io_offset)
		return (1);

	/* Tie-break on address so that distinct zios never compare equal. */
	if (z1 < z2)
		return (-1);
	if (z1 > z2)
		return (1);

	return (0);
}

int
vdev_queue_offset_compare(const void *x1, const void *x2)
{
	const zio_t *z1 = x1;
	const zio_t *z2 = x2;

	if (z1->io_offset < z2->io_offset)
		return (-1);
	if (z1->io_offset > z2->io_offset)
		return (1);

	/* Tie-break on address so that distinct zios never compare equal. */
	if (z1 < z2)
		return (-1);
	if (z1 > z2)
		return (1);

	return (0);
}

void
vdev_queue_init(vdev_t *vd)
{
	vdev_queue_t *vq = &vd->vdev_queue;

	mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);

	avl_create(&vq->vq_deadline_tree, vdev_queue_deadline_compare,
	    sizeof (zio_t), offsetof(struct zio, io_deadline_node));

	avl_create(&vq->vq_read_tree, vdev_queue_offset_compare,
	    sizeof (zio_t), offsetof(struct zio, io_offset_node));

	avl_create(&vq->vq_write_tree, vdev_queue_offset_compare,
	    sizeof (zio_t), offsetof(struct zio, io_offset_node));

	avl_create(&vq->vq_pending_tree, vdev_queue_offset_compare,
	    sizeof (zio_t), offsetof(struct zio, io_offset_node));
}

void
vdev_queue_fini(vdev_t *vd)
{
	vdev_queue_t *vq = &vd->vdev_queue;

	avl_destroy(&vq->vq_deadline_tree);
	avl_destroy(&vq->vq_read_tree);
	avl_destroy(&vq->vq_write_tree);
	avl_destroy(&vq->vq_pending_tree);

	mutex_destroy(&vq->vq_lock);
}

static void
vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
{
	avl_add(&vq->vq_deadline_tree, zio);
	avl_add(zio->io_vdev_tree, zio);
}

static void
vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
{
	avl_remove(&vq->vq_deadline_tree, zio);
	avl_remove(zio->io_vdev_tree, zio);
}

static void
vdev_queue_agg_io_done(zio_t *aio)
{
	zio_t *pio;

	/* For reads, copy each parent's portion out of the aggregated buffer. */
	while ((pio = zio_walk_parents(aio)) != NULL)
		if (aio->io_type == ZIO_TYPE_READ)
			bcopy((char *)aio->io_data + (pio->io_offset -
			    aio->io_offset), pio->io_data, pio->io_size);

	zio_buf_free(aio->io_data, aio->io_size);
}

/*
 * Compute the range spanned by two i/os, which is the endpoint of the last
 * (lio->io_offset + lio->io_size) minus start of the first (fio->io_offset).
 * Conveniently, the gap between fio and lio is given by -IO_SPAN(lio, fio);
 * thus fio and lio are adjacent if and only if IO_SPAN(lio, fio) == 0.
 */
#define	IO_SPAN(fio, lio) ((lio)->io_offset + (lio)->io_size - (fio)->io_offset)
#define	IO_GAP(fio, lio) (-IO_SPAN(lio, fio))
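
/*
 * Illustrative example of the macros above (added commentary): if fio
 * covers [0, 4K) and lio covers [6K, 8K), then
 * IO_SPAN(fio, lio) = 6K + 2K - 0 = 8K, and
 * IO_GAP(fio, lio) = -IO_SPAN(lio, fio) = -(0 + 4K - 6K) = 2K,
 * which is exactly the size of the hole between the two i/os.
 */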
static zio_t *
vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
{
	zio_t *fio, *lio, *aio, *dio, *nio;
	avl_tree_t *t;
	int flags;
	uint64_t maxspan = zfs_vdev_aggregation_limit;
	uint64_t maxgap;

	ASSERT(MUTEX_HELD(&vq->vq_lock));

	if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit ||
	    avl_numnodes(&vq->vq_deadline_tree) == 0)
		return (NULL);

	fio = lio = avl_first(&vq->vq_deadline_tree);

	t = fio->io_vdev_tree;
	flags = fio->io_flags & ZIO_FLAG_AGG_INHERIT;
	maxgap = (t == &vq->vq_read_tree) ? zfs_vdev_read_gap_limit : 0;

	if (!(flags & ZIO_FLAG_DONT_AGGREGATE)) {
		/*
		 * We can aggregate I/Os that are adjacent and of the
		 * same flavor, as expressed by the AGG_INHERIT flags.
		 * The latter is necessary so that certain attributes
		 * of the I/O, such as whether it's a normal I/O or a
		 * scrub/resilver, can be preserved in the aggregate.
		 */
		while ((dio = AVL_PREV(t, fio)) != NULL &&
		    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
		    IO_SPAN(dio, lio) <= maxspan && IO_GAP(dio, fio) <= maxgap)
			fio = dio;

		while ((dio = AVL_NEXT(t, lio)) != NULL &&
		    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
		    IO_SPAN(fio, dio) <= maxspan && IO_GAP(lio, dio) <= maxgap)
			lio = dio;
	}

	if (fio != lio) {
		uint64_t size = IO_SPAN(fio, lio);
		ASSERT(size <= zfs_vdev_aggregation_limit);

		aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset,
		    zio_buf_alloc(size), size, fio->io_type, ZIO_PRIORITY_NOW,
		    flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
		    vdev_queue_agg_io_done, NULL);

		/*
		 * The aggregate becomes a child of each constituent i/o,
		 * which is then bypassed and pushed through the pipeline.
		 */
		nio = fio;
		do {
			dio = nio;
			nio = AVL_NEXT(t, dio);
			ASSERT(dio->io_type == aio->io_type);
			ASSERT(dio->io_vdev_tree == t);

			if (dio->io_type == ZIO_TYPE_WRITE)
				bcopy(dio->io_data, (char *)aio->io_data +
				    (dio->io_offset - aio->io_offset),
				    dio->io_size);

			zio_add_child(dio, aio);
			vdev_queue_io_remove(vq, dio);
			zio_vdev_io_bypass(dio);
			zio_execute(dio);
		} while (dio != lio);

		avl_add(&vq->vq_pending_tree, aio);

		return (aio);
	}

	ASSERT(fio->io_vdev_tree == t);
	vdev_queue_io_remove(vq, fio);

	avl_add(&vq->vq_pending_tree, fio);

	return (fio);
}

zio_t *
vdev_queue_io(zio_t *zio)
{
	vdev_queue_t *vq = &zio->io_vd->vdev_queue;
	zio_t *nio;

	ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);

	if (zio->io_flags & ZIO_FLAG_DONT_QUEUE)
		return (zio);

	zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE;

	if (zio->io_type == ZIO_TYPE_READ)
		zio->io_vdev_tree = &vq->vq_read_tree;
	else
		zio->io_vdev_tree = &vq->vq_write_tree;

	mutex_enter(&vq->vq_lock);

	zio->io_deadline = (lbolt64 >> zfs_vdev_time_shift) + zio->io_priority;

	vdev_queue_io_add(vq, zio);

	nio = vdev_queue_io_to_issue(vq, zfs_vdev_min_pending);

	mutex_exit(&vq->vq_lock);

	if (nio == NULL)
		return (NULL);

	if (nio->io_done == vdev_queue_agg_io_done) {
		zio_nowait(nio);
		return (NULL);
	}

	return (nio);
}

void
vdev_queue_io_done(zio_t *zio)
{
	vdev_queue_t *vq = &zio->io_vd->vdev_queue;

	mutex_enter(&vq->vq_lock);

	avl_remove(&vq->vq_pending_tree, zio);

	/* Each completion may issue up to zfs_vdev_ramp_rate new i/os. */
	for (int i = 0; i < zfs_vdev_ramp_rate; i++) {
		zio_t *nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending);
		if (nio == NULL)
			break;
		/* Drop vq_lock while handing the i/o to the pipeline. */
		mutex_exit(&vq->vq_lock);
		if (nio->io_done == vdev_queue_agg_io_done) {
			zio_nowait(nio);
		} else {
			zio_vdev_io_reissue(nio);
			zio_execute(nio);
		}
		mutex_enter(&vq->vq_lock);
	}

	mutex_exit(&vq->vq_lock);
}
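
/*
 * Summary of the queue lifecycle, as implemented above (added
 * commentary):
 *
 *   vdev_queue_init()        - create the deadline, read, write, and
 *                              pending AVL trees for a vdev.
 *   vdev_queue_io()          - stamp a deadline on an incoming zio,
 *                              queue it, and issue an i/o if fewer than
 *                              zfs_vdev_min_pending are in flight.
 *   vdev_queue_io_to_issue() - pick the i/o with the earliest deadline
 *                              and aggregate adjacent queued i/os
 *                              around it.
 *   vdev_queue_io_done()     - on completion, issue up to
 *                              zfs_vdev_ramp_rate more i/os, letting
 *                              the pending count ramp toward
 *                              zfs_vdev_max_pending.
 *   vdev_queue_fini()        - tear the trees down.
 */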