vdev_queue.c (185029) | vdev_queue.c (209962) |
---|---|
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE --- 5 unchanged lines hidden (view full) --- 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* | 1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE --- 5 unchanged lines hidden (view full) --- 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* |
22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. | 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. |
23 * Use is subject to license terms. 24 */ 25 26#include <sys/zfs_context.h> 27#include <sys/spa.h> 28#include <sys/vdev_impl.h> 29#include <sys/zio.h> 30#include <sys/avl.h> --- 12 unchanged lines hidden (view full) --- 43 44/* deadline = pri + (LBOLT >> time_shift) */ 45int zfs_vdev_time_shift = 6; 46 47/* exponential I/O issue ramp-up rate */ 48int zfs_vdev_ramp_rate = 2; 49 50/* | 23 * Use is subject to license terms. 24 */ 25 26#include <sys/zfs_context.h> 27#include <sys/spa.h> 28#include <sys/vdev_impl.h> 29#include <sys/zio.h> 30#include <sys/avl.h> --- 12 unchanged lines hidden (view full) --- 43 44/* deadline = pri + (LBOLT >> time_shift) */ 45int zfs_vdev_time_shift = 6; 46 47/* exponential I/O issue ramp-up rate */ 48int zfs_vdev_ramp_rate = 2; 49 50/* |
51 * i/os will be aggregated into a single large i/o up to 52 * zfs_vdev_aggregation_limit bytes long. | 51 * To reduce IOPs, we aggregate small adjacent i/os into one large i/o. 52 * For read i/os, we also aggregate across small adjacency gaps. |
53 */ 54int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE; | 53 */ 54int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE; |
55int zfs_vdev_read_gap_limit = 32 << 10; |
|
55 56SYSCTL_DECL(_vfs_zfs_vdev); 57TUNABLE_INT("vfs.zfs.vdev.max_pending", &zfs_vdev_max_pending); 58SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, max_pending, CTLFLAG_RDTUN, 59 &zfs_vdev_max_pending, 0, "Maximum I/O requests pending on each device"); 60TUNABLE_INT("vfs.zfs.vdev.min_pending", &zfs_vdev_min_pending); 61SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, min_pending, CTLFLAG_RDTUN, 62 &zfs_vdev_min_pending, 0, --- 100 unchanged lines hidden (view full) --- 163{ 164 avl_remove(&vq->vq_deadline_tree, zio); 165 avl_remove(zio->io_vdev_tree, zio); 166} 167 168static void 169vdev_queue_agg_io_done(zio_t *aio) 170{ | 56 57SYSCTL_DECL(_vfs_zfs_vdev); 58TUNABLE_INT("vfs.zfs.vdev.max_pending", &zfs_vdev_max_pending); 59SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, max_pending, CTLFLAG_RDTUN, 60 &zfs_vdev_max_pending, 0, "Maximum I/O requests pending on each device"); 61TUNABLE_INT("vfs.zfs.vdev.min_pending", &zfs_vdev_min_pending); 62SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, min_pending, CTLFLAG_RDTUN, 63 &zfs_vdev_min_pending, 0, --- 100 unchanged lines hidden (view full) --- 164{ 165 avl_remove(&vq->vq_deadline_tree, zio); 166 avl_remove(zio->io_vdev_tree, zio); 167} 168 169static void 170vdev_queue_agg_io_done(zio_t *aio) 171{ |
171 zio_t *dio; 172 uint64_t offset = 0; | 172 zio_t *pio; |
173 | 173 |
174 while ((dio = aio->io_delegate_list) != NULL) { | 174 while ((pio = zio_walk_parents(aio)) != NULL) |
175 if (aio->io_type == ZIO_TYPE_READ) | 175 if (aio->io_type == ZIO_TYPE_READ) |
176 bcopy((char *)aio->io_data + offset, dio->io_data, 177 dio->io_size); 178 offset += dio->io_size; 179 aio->io_delegate_list = dio->io_delegate_next; 180 dio->io_delegate_next = NULL; 181 dio->io_error = aio->io_error; 182 zio_execute(dio); 183 } 184 ASSERT3U(offset, ==, aio->io_size); | 176 bcopy((char *)aio->io_data + (pio->io_offset - 177 aio->io_offset), pio->io_data, pio->io_size); |
185 186 zio_buf_free(aio->io_data, aio->io_size); 187} 188 | 178 179 zio_buf_free(aio->io_data, aio->io_size); 180} 181 |
189#define IS_ADJACENT(io, nio) \ 190 ((io)->io_offset + (io)->io_size == (nio)->io_offset) | 182/* 183 * Compute the range spanned by two i/os, which is the endpoint of the last 184 * (lio->io_offset + lio->io_size) minus start of the first (fio->io_offset). 185 * Conveniently, the gap between fio and lio is given by -IO_SPAN(lio, fio); 186 * thus fio and lio are adjacent if and only if IO_SPAN(lio, fio) == 0. 187 */ 188#define IO_SPAN(fio, lio) ((lio)->io_offset + (lio)->io_size - (fio)->io_offset) 189#define IO_GAP(fio, lio) (-IO_SPAN(lio, fio)) |
191 192static zio_t * 193vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit) 194{ | 190 191static zio_t * 192vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit) 193{ |
195 zio_t *fio, *lio, *aio, *dio; 196 avl_tree_t *tree; 197 uint64_t size; | 194 zio_t *fio, *lio, *aio, *dio, *nio; 195 avl_tree_t *t; 196 int flags; 197 uint64_t maxspan = zfs_vdev_aggregation_limit; 198 uint64_t maxgap; |
198 199 ASSERT(MUTEX_HELD(&vq->vq_lock)); 200 201 if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit || 202 avl_numnodes(&vq->vq_deadline_tree) == 0) 203 return (NULL); 204 205 fio = lio = avl_first(&vq->vq_deadline_tree); 206 | 199 200 ASSERT(MUTEX_HELD(&vq->vq_lock)); 201 202 if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit || 203 avl_numnodes(&vq->vq_deadline_tree) == 0) 204 return (NULL); 205 206 fio = lio = avl_first(&vq->vq_deadline_tree); 207 |
207 tree = fio->io_vdev_tree; 208 size = fio->io_size; | 208 t = fio->io_vdev_tree; 209 flags = fio->io_flags & ZIO_FLAG_AGG_INHERIT; 210 maxgap = (t == &vq->vq_read_tree) ? zfs_vdev_read_gap_limit : 0; |
209 | 211 |
210 while ((dio = AVL_PREV(tree, fio)) != NULL && IS_ADJACENT(dio, fio) && 211 !((dio->io_flags | fio->io_flags) & ZIO_FLAG_DONT_AGGREGATE) && 212 size + dio->io_size <= zfs_vdev_aggregation_limit) { 213 dio->io_delegate_next = fio; 214 fio = dio; 215 size += dio->io_size; 216 } | 212 if (!(flags & ZIO_FLAG_DONT_AGGREGATE)) { 213 /* 214 * We can aggregate I/Os that are adjacent and of the 215 * same flavor, as expressed by the AGG_INHERIT flags. 216 * The latter is necessary so that certain attributes 217 * of the I/O, such as whether it's a normal I/O or a 218 * scrub/resilver, can be preserved in the aggregate. 219 */ 220 while ((dio = AVL_PREV(t, fio)) != NULL && 221 (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && 222 IO_SPAN(dio, lio) <= maxspan && IO_GAP(dio, fio) <= maxgap) 223 fio = dio; |
217 | 224 |
218 while ((dio = AVL_NEXT(tree, lio)) != NULL && IS_ADJACENT(lio, dio) && 219 !((lio->io_flags | dio->io_flags) & ZIO_FLAG_DONT_AGGREGATE) && 220 size + dio->io_size <= zfs_vdev_aggregation_limit) { 221 lio->io_delegate_next = dio; 222 lio = dio; 223 size += dio->io_size; | 225 while ((dio = AVL_NEXT(t, lio)) != NULL && 226 (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && 227 IO_SPAN(fio, dio) <= maxspan && IO_GAP(lio, dio) <= maxgap) 228 lio = dio; |
224 } 225 226 if (fio != lio) { | 229 } 230 231 if (fio != lio) { |
227 char *buf = zio_buf_alloc(size); 228 uint64_t offset = 0; 229 | 232 uint64_t size = IO_SPAN(fio, lio); |
230 ASSERT(size <= zfs_vdev_aggregation_limit); 231 232 aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset, | 233 ASSERT(size <= zfs_vdev_aggregation_limit); 234 235 aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset, |
233 buf, size, fio->io_type, ZIO_PRIORITY_NOW, 234 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, | 236 zio_buf_alloc(size), size, fio->io_type, ZIO_PRIORITY_NOW, 237 flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, |
235 vdev_queue_agg_io_done, NULL); 236 | 238 vdev_queue_agg_io_done, NULL); 239 |
237 aio->io_delegate_list = fio; 238 239 for (dio = fio; dio != NULL; dio = dio->io_delegate_next) { | 240 nio = fio; 241 do { 242 dio = nio; 243 nio = AVL_NEXT(t, dio); |
240 ASSERT(dio->io_type == aio->io_type); | 244 ASSERT(dio->io_type == aio->io_type); |
241 ASSERT(dio->io_vdev_tree == tree); | 245 ASSERT(dio->io_vdev_tree == t); 246 |
242 if (dio->io_type == ZIO_TYPE_WRITE) | 247 if (dio->io_type == ZIO_TYPE_WRITE) |
243 bcopy(dio->io_data, buf + offset, dio->io_size); 244 offset += dio->io_size; | 248 bcopy(dio->io_data, (char *)aio->io_data + 249 (dio->io_offset - aio->io_offset), 250 dio->io_size); 251 252 zio_add_child(dio, aio); |
245 vdev_queue_io_remove(vq, dio); 246 zio_vdev_io_bypass(dio); | 253 vdev_queue_io_remove(vq, dio); 254 zio_vdev_io_bypass(dio); |
247 } | 255 zio_execute(dio); 256 } while (dio != lio); |
248 | 257 |
249 ASSERT(offset == size); 250 | |
251 avl_add(&vq->vq_pending_tree, aio); 252 253 return (aio); 254 } 255 | 258 avl_add(&vq->vq_pending_tree, aio); 259 260 return (aio); 261 } 262 |
256 ASSERT(fio->io_vdev_tree == tree); | 263 ASSERT(fio->io_vdev_tree == t); |
257 vdev_queue_io_remove(vq, fio); 258 259 avl_add(&vq->vq_pending_tree, fio); 260 261 return (fio); 262} 263 264zio_t * --- 63 unchanged lines hidden --- | 264 vdev_queue_io_remove(vq, fio); 265 266 avl_add(&vq->vq_pending_tree, fio); 267 268 return (fio); 269} 270 271zio_t * --- 63 unchanged lines hidden --- |