/*	$NetBSD: bufq_priocscan.c,v 1.21 2017/05/04 11:03:27 kamil Exp $	*/

/*-
 * Copyright (c)2004,2005,2006,2008,2009,2011,2012 YAMAMOTO Takashi,
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: bufq_priocscan.c,v 1.21 2017/05/04 11:03:27 kamil Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/bufq.h>
#include <sys/bufq_impl.h>
#include <sys/kmem.h>
#include <sys/rbtree.h>
#include <sys/module.h>

#undef PRIOCSCAN_USE_GLOBAL_POSITION

/*
 * Cyclical scan (CSCAN)
 */

struct cscan_key {
	daddr_t k_rawblkno;
	int k_cylinder;
};

struct cscan_queue {
	rb_tree_t cq_buffers;		/* ordered tree of buffers */
#if !defined(PRIOCSCAN_USE_GLOBAL_POSITION)
	struct cscan_key cq_lastkey;	/* key of last request */
#endif /* !defined(PRIOCSCAN_USE_GLOBAL_POSITION) */
	int cq_sortby;			/* BUFQ_SORT_MASK */
	rb_tree_ops_t cq_ops;
};

static signed int
buf_cmp(const struct buf *b1, const struct buf *b2, int sortby)
{

	if (buf_inorder(b2, b1, sortby)) {
		return 1;	/* b1 > b2 */
	}
	if (buf_inorder(b1, b2, sortby)) {
		return -1;	/* b1 < b2 */
	}
	return 0;
}

/* return positive if n1 > n2 */
static signed int
cscan_tree_compare_nodes(void *context, const void *n1, const void *n2)
{
	const struct cscan_queue * const q = context;
	const struct buf * const b1 = n1;
	const struct buf * const b2 = n2;
	const int sortby = q->cq_sortby;
	const int diff = buf_cmp(b1, b2, sortby);

	if (diff != 0) {
		return diff;
	}

	/*
	 * XXX rawblkno/cylinder might not be unique.  eg. unbuffered i/o.
	 * fall back to comparing the buffer addresses so that the order
	 * is a strict total order.
	 */
	if (b1 > b2) {
		return 1;
	}
	if (b1 < b2) {
		return -1;
	}
	return 0;
}
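/*
 * Example (illustrative only): under BUFQ_SORT_RAWBLOCK, a buffer with
 * b_rawblkno 10 compares as -1 ("b1 < b2") against one with b_rawblkno 20,
 * while two distinct buffers with equal keys are ordered by their
 * addresses.  the comparator therefore never reports two distinct nodes
 * as equal, so rb_tree_insert_node() never finds a matching node already
 * in the tree and cscan_put() below can assert obp == bp.
 */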
/* return positive if n1 > k2 */
static signed int
cscan_tree_compare_key(void *context, const void *n1, const void *k2)
{
	const struct cscan_queue * const q = context;
	const struct buf * const b1 = n1;
	const struct cscan_key * const key = k2;
	const struct buf tmp = {
		.b_rawblkno = key->k_rawblkno,
		.b_cylinder = key->k_cylinder,
	};
	const struct buf *b2 = &tmp;
	const int sortby = q->cq_sortby;

	return buf_cmp(b1, b2, sortby);
}

static void __unused
cscan_dump(struct cscan_queue *cq)
{
	const int sortby = cq->cq_sortby;
	struct buf *bp;

	RB_TREE_FOREACH(bp, &cq->cq_buffers) {
		if (sortby == BUFQ_SORT_RAWBLOCK) {
			printf(" %jd", (intmax_t)bp->b_rawblkno);
		} else {
			printf(" %jd/%jd",
			    (intmax_t)bp->b_cylinder, (intmax_t)bp->b_rawblkno);
		}
	}
}

static inline bool
cscan_empty(struct cscan_queue *q)
{

	/* XXX this might do more work than necessary */
	return rb_tree_iterate(&q->cq_buffers, NULL, RB_DIR_LEFT) == NULL;
}

static void
cscan_put(struct cscan_queue *q, struct buf *bp)
{
	struct buf *obp __diagused;

	obp = rb_tree_insert_node(&q->cq_buffers, bp);
	KASSERT(obp == bp); /* see cscan_tree_compare_nodes */
}

static struct buf *
cscan_get(struct cscan_queue *q, int remove, struct cscan_key *key)
{
	struct buf *bp;

	bp = rb_tree_find_node_geq(&q->cq_buffers, key);
	KDASSERT(bp == NULL || cscan_tree_compare_key(q, bp, key) >= 0);
	if (bp == NULL) {
		bp = rb_tree_iterate(&q->cq_buffers, NULL, RB_DIR_LEFT);
		KDASSERT(cscan_tree_compare_key(q, bp, key) < 0);
	}
	if (bp != NULL && remove) {
#if defined(DEBUG)
		struct buf *nbp;
#endif /* defined(DEBUG) */

		rb_tree_remove_node(&q->cq_buffers, bp);
		/*
		 * remember the head position.
		 */
		key->k_cylinder = bp->b_cylinder;
		key->k_rawblkno = bp->b_rawblkno + (bp->b_bcount >> DEV_BSHIFT);
#if defined(DEBUG)
		nbp = rb_tree_find_node_geq(&q->cq_buffers, key);
		if (nbp != NULL && cscan_tree_compare_nodes(q, nbp, bp) < 0) {
			panic("%s: wrong order %p < %p\n", __func__,
			    nbp, bp);
		}
#endif /* defined(DEBUG) */
	}
	return bp;
}

static void
cscan_init(struct cscan_queue *q, int sortby)
{
	static const rb_tree_ops_t cscan_tree_ops = {
		.rbto_compare_nodes = cscan_tree_compare_nodes,
		.rbto_compare_key = cscan_tree_compare_key,
		.rbto_node_offset = offsetof(struct buf, b_u.u_rbnode),
		.rbto_context = NULL,
	};

	q->cq_sortby = sortby;
	/* XXX copy ops to workaround rbtree.h API limitation */
	q->cq_ops = cscan_tree_ops;
	q->cq_ops.rbto_context = q;
	rb_tree_init(&q->cq_buffers, &q->cq_ops);
}
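/*
 * A worked example of the scan order (illustrative only, assuming
 * BUFQ_SORT_RAWBLOCK and ignoring request lengths): with requests
 * pending at rawblkno 10, 30 and 50 and cq_lastkey pointing at 20,
 * successive calls of
 *
 *	bp = cscan_get(q, 1, &q->cq_lastkey);
 *
 * serve 30, then 50, and then wrap around to 10.  a request behind the
 * head position is only served once the scan reaches the end of the
 * disk, which is what bounds seek-order starvation under CSCAN.
 */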
/*
 * Per-priority CSCAN.
 *
 * XXX probably we should have a way to raise
 * priority of the on-queue requests.
 */
#define	PRIOCSCAN_NQUEUE	3

struct priocscan_queue {
	struct cscan_queue q_queue;
	unsigned int q_burst;
};

struct bufq_priocscan {
	struct priocscan_queue bq_queue[PRIOCSCAN_NQUEUE];

#if defined(PRIOCSCAN_USE_GLOBAL_POSITION)
	/*
	 * XXX using a "global" head position can reduce positioning time
	 * when switching between queues, although it might hurt fairness.
	 */
	struct cscan_key bq_lastkey;
#endif
};

/*
 * how many requests to serve from a queue before switching to another
 * queue which also has pending requests.
 *
 * XXX tune
 * be careful: while making these values larger likely
 * increases the total throughput, it can also increase latencies
 * for some workloads.
 */
const int priocscan_burst[] = {
	64, 16, 4
};

static void bufq_priocscan_init(struct bufq_state *);
static void bufq_priocscan_put(struct bufq_state *, struct buf *);
static struct buf *bufq_priocscan_get(struct bufq_state *, int);

BUFQ_DEFINE(priocscan, 40, bufq_priocscan_init);

static inline struct cscan_queue *bufq_priocscan_selectqueue(
    struct bufq_priocscan *, const struct buf *);

static inline struct cscan_queue *
bufq_priocscan_selectqueue(struct bufq_priocscan *q, const struct buf *bp)
{
	static const int priocscan_priomap[] = {
		[BPRIO_TIMENONCRITICAL] = 2,
		[BPRIO_TIMELIMITED] = 1,
		[BPRIO_TIMECRITICAL] = 0
	};

	return &q->bq_queue[priocscan_priomap[BIO_GETPRIO(bp)]].q_queue;
}

static void
bufq_priocscan_put(struct bufq_state *bufq, struct buf *bp)
{
	struct bufq_priocscan *q = bufq_private(bufq);
	struct cscan_queue *cq;

	cq = bufq_priocscan_selectqueue(q, bp);
	cscan_put(cq, bp);
}

static struct buf *
bufq_priocscan_get(struct bufq_state *bufq, int remove)
{
	struct bufq_priocscan *q = bufq_private(bufq);
	struct priocscan_queue *pq, *npq;
	struct priocscan_queue *first; /* highest priority non-empty queue */
	const struct priocscan_queue *epq;
	struct buf *bp;
	bool single; /* true if there's only one non-empty queue */

	/*
	 * find the highest priority non-empty queue.
	 */
	pq = &q->bq_queue[0];
	epq = pq + PRIOCSCAN_NQUEUE;
	for (; pq < epq; pq++) {
		if (!cscan_empty(&pq->q_queue)) {
			break;
		}
	}
	if (pq == epq) {
		/*
		 * all our queues are empty.  there's nothing to serve.
		 */
		return NULL;
	}
	first = pq;

	/*
	 * scan the rest of the queues.
	 *
	 * if we have two or more non-empty queues, we serve the highest
	 * priority one with a non-zero burst count.
	 */
	single = true;
	for (npq = pq + 1; npq < epq; npq++) {
		if (!cscan_empty(&npq->q_queue)) {
			/*
			 * we found another non-empty queue.
			 * it means that a queue needs to consume its burst
			 * count to be served.
			 */
			single = false;

			/*
			 * check if our current candidate queue has already
			 * exhausted its burst count.
			 */
			if (pq->q_burst > 0) {
				break;
			}
			pq = npq;
		}
	}
	if (single) {
		/*
		 * there's only one non-empty queue.
		 * just serve it without consuming its burst count.
		 */
		KASSERT(pq == first);
	} else {
		/*
		 * there are two or more non-empty queues.
		 */
		if (pq->q_burst == 0) {
			/*
			 * no queue can be served because they have already
			 * exhausted their burst counts.
			 */
			unsigned int i;
#ifdef DEBUG
			for (i = 0; i < PRIOCSCAN_NQUEUE; i++) {
				pq = &q->bq_queue[i];
				if (!cscan_empty(&pq->q_queue) && pq->q_burst) {
					panic("%s: inconsistent burst counts",
					    __func__);
				}
			}
#endif /* DEBUG */
			/*
			 * reset the burst counts.
			 */
			if (remove) {
				for (i = 0; i < PRIOCSCAN_NQUEUE; i++) {
					pq = &q->bq_queue[i];
					pq->q_burst = priocscan_burst[i];
				}
			}

			/*
			 * serve the highest priority non-empty queue.
			 */
			pq = first;
		}
		/*
		 * consume the burst count.
		 *
		 * XXX we account only by the number of requests.
		 * is that good enough?
		 */
		if (remove) {
			KASSERT(pq->q_burst > 0);
			pq->q_burst--;
		}
	}

	/*
	 * finally, get a request from the selected queue.
	 */
	KDASSERT(!cscan_empty(&pq->q_queue));
	bp = cscan_get(&pq->q_queue, remove,
#if defined(PRIOCSCAN_USE_GLOBAL_POSITION)
	    &q->bq_lastkey
#else /* defined(PRIOCSCAN_USE_GLOBAL_POSITION) */
	    &pq->q_queue.cq_lastkey
#endif /* defined(PRIOCSCAN_USE_GLOBAL_POSITION) */
	    );
	KDASSERT(bp != NULL);
	KDASSERT(&pq->q_queue == bufq_priocscan_selectqueue(q, bp));

	return bp;
}
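/*
 * A worked example of the burst accounting (illustrative only): if all
 * three queues stay non-empty, each cycle serves up to
 * priocscan_burst[0] == 64 time-critical requests, then 16 time-limited
 * and finally 4 time-noncritical ones before every burst count is
 * exhausted and the counters are reset.  ie. busy queues get roughly a
 * 64:16:4 share of requests instead of strict priority, so low-priority
 * i/o cannot be starved indefinitely.
 */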
static struct buf *
bufq_priocscan_cancel(struct bufq_state *bufq, struct buf *bp)
{
	struct bufq_priocscan * const q = bufq_private(bufq);
	unsigned int i;

	for (i = 0; i < PRIOCSCAN_NQUEUE; i++) {
		struct cscan_queue * const cq = &q->bq_queue[i].q_queue;
		struct buf *it;

		/*
		 * XXX probably could be faster but the cancel functionality
		 * is not widely used anyway.
		 */
		RB_TREE_FOREACH(it, &cq->cq_buffers) {
			if (it == bp) {
				rb_tree_remove_node(&cq->cq_buffers, bp);
				return bp;
			}
		}
	}
	return NULL;
}

static void
bufq_priocscan_fini(struct bufq_state *bufq)
{

	KASSERT(bufq->bq_private != NULL);
	kmem_free(bufq->bq_private, sizeof(struct bufq_priocscan));
}

static void
bufq_priocscan_init(struct bufq_state *bufq)
{
	struct bufq_priocscan *q;
	const int sortby = bufq->bq_flags & BUFQ_SORT_MASK;
	unsigned int i;

	bufq->bq_get = bufq_priocscan_get;
	bufq->bq_put = bufq_priocscan_put;
	bufq->bq_cancel = bufq_priocscan_cancel;
	bufq->bq_fini = bufq_priocscan_fini;
	bufq->bq_private = kmem_zalloc(sizeof(struct bufq_priocscan), KM_SLEEP);

	q = bufq->bq_private;
	for (i = 0; i < PRIOCSCAN_NQUEUE; i++) {
		struct cscan_queue *cq = &q->bq_queue[i].q_queue;

		cscan_init(cq, sortby);
	}
}

MODULE(MODULE_CLASS_BUFQ, bufq_priocscan, NULL);

static int
bufq_priocscan_modcmd(modcmd_t cmd, void *opaque)
{

	switch (cmd) {
	case MODULE_CMD_INIT:
		return bufq_register(&bufq_strat_priocscan);
	case MODULE_CMD_FINI:
		return bufq_unregister(&bufq_strat_priocscan);
	default:
		return ENOTTY;
	}
}
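/*
 * Usage sketch (illustrative, not part of this file; see bufq(9)):
 * a disk driver selects this strategy by name when allocating its
 * request queue, falling back to the system default strategy if the
 * module is unavailable:
 *
 *	struct bufq_state *bufq;
 *	int error;
 *
 *	error = bufq_alloc(&bufq, "priocscan", BUFQ_SORT_RAWBLOCK);
 *	...
 *	bufq_put(bufq, bp);	enqueue a request
 *	bp = bufq_get(bufq);	dequeue the next request in priocscan order
 *	...
 *	bufq_free(bufq);	release the queue on detach
 */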