/*-
 * Copyright (c) 2009-2010 Fabio Checconi
 * Copyright (c) 2009-2010 Luigi Rizzo, Universita` di Pisa
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * $Id$
 * $FreeBSD: head/sys/geom/sched/gs_rr.c 217324 2011-01-12 19:54:07Z mdf $
 *
 * A round-robin (RR) anticipatory scheduler, with per-client queues.
 *
 * The goal of this implementation is to improve throughput compared
 * to the pure elevator algorithm, and ensure some fairness among
 * clients.
 *
 * Requests coming from the same client are put in the same queue.
 * We use anticipation to help reduce seeks, and each queue
 * is never served continuously for more than a given amount of
 * time or data.  Queues are then served in a round-robin fashion.
 *
 * Each queue can be in any of the following states:
 *	READY	immediately serve the first pending request;
 *	BUSY	one request is under service, wait for completion;
 *	IDLING	do not serve incoming requests immediately, unless
 *		they are "eligible" as defined later.
 *
 * Scheduling is done by looking at the status of all queues;
 * the first one in round-robin order is privileged.
 */
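/*
 * Summary of a queue's lifecycle as implemented below (a rough sketch,
 * not a formal state machine):
 *
 *	READY  -> BUSY    g_rr_next() dispatches the queue's last pending
 *			  request and the anticipation heuristic says the
 *			  client is worth waiting for;
 *	BUSY   -> IDLING  g_rr_done() sees that request complete and arms
 *			  the anticipation timer;
 *	IDLING -> READY   g_rr_start() receives a new request from the
 *			  same client and stops the timer;
 *	IDLING -> (gone)  g_rr_wait_timeout() fires, the queue reference
 *			  is dropped and the next queue in RR order is
 *			  served.
 */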

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/bio.h>
#include <sys/callout.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/sysctl.h>
#include "gs_scheduler.h"

/* possible states of the scheduler */
enum g_rr_state {
	G_QUEUE_READY = 0,	/* Ready to dispatch. */
	G_QUEUE_BUSY,		/* Waiting for a completion. */
	G_QUEUE_IDLING		/* Waiting for a new request. */
};

/* possible queue flags */
enum g_rr_flags {
	G_FLAG_COMPLETED = 1,	/* Completed a req. in the current budget. */
};

struct g_rr_softc;

/*
 * Queue descriptor, containing reference count, scheduling
 * state, a queue of pending requests, configuration parameters.
 * Queues with pending request(s) and not under service are also
 * stored in a Round Robin (RR) list.
 */
struct g_rr_queue {
	struct g_rr_softc *q_sc;	/* link to the parent */

	enum g_rr_state	q_status;
	unsigned int	q_service;	/* service received so far */
	int		q_slice_end;	/* actual slice end in ticks */
	enum g_rr_flags	q_flags;	/* queue flags */
	struct bio_queue_head q_bioq;

	/* Scheduling parameters */
	unsigned int	q_budget;	/* slice size in bytes */
	unsigned int	q_slice_duration; /* slice size in ticks */
	unsigned int	q_wait_ticks;	/* wait time for anticipation */

	/* Stats to drive the various heuristics. */
	struct g_savg	q_thinktime;	/* Thinktime average. */
	struct g_savg	q_seekdist;	/* Seek distance average. */

	int		q_bionum;	/* Number of requests. */

	off_t		q_lastoff;	/* Last submitted req. offset. */
	int		q_lastsub;	/* Last submitted req. time. */

	/* Expiration deadline for an empty queue. */
	int		q_expire;

	TAILQ_ENTRY(g_rr_queue) q_tailq; /* RR list link field */
};

/* List types. */
TAILQ_HEAD(g_rr_tailq, g_rr_queue);

/* list of scheduler instances */
LIST_HEAD(g_scheds, g_rr_softc);

/* Default quantum for RR between queues. */
#define	G_RR_DEFAULT_BUDGET	0x00800000
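/* 0x00800000 bytes = 8 MiB, the same figure as the quantum_kb default below. */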

/*
 * Per device descriptor, holding the Round Robin list of queues
 * accessing the disk, a reference to the geom, and the timer.
 */
struct g_rr_softc {
	struct g_geom	*sc_geom;

	/*
	 * sc_active is the queue we are anticipating for.
	 * It is set only in g_rr_next(), and possibly cleared
	 * only in g_rr_next() or on a timeout.
	 * The active queue is never in the Round Robin list
	 * even if it has requests queued.
	 */
	struct g_rr_queue *sc_active;
	struct callout	sc_wait;	/* timer for sc_active */

	struct g_rr_tailq sc_rr_tailq;	/* the round-robin list */
	int		sc_nqueues;	/* number of queues */

	/* Statistics */
	int		sc_in_flight;	/* requests in the driver */

	LIST_ENTRY(g_rr_softc) sc_next;
};

/* Descriptor for bounded values, min and max are constant. */
struct x_bound {
	const int	x_min;
	int		x_cur;
	const int	x_max;
};

/*
 * Parameters, configuration and statistics.
 */
struct g_rr_params {
	int	queues;			/* total number of queues */
	int	w_anticipate;		/* anticipate writes */
	int	bypass;			/* bypass the scheduler entirely */

	int	units;			/* how many instances */
	/* sc_head is used for debugging */
	struct g_scheds	sc_head;	/* first scheduler instance */

	struct x_bound queue_depth;	/* max parallel requests */
	struct x_bound wait_ms;		/* wait time, milliseconds */
	struct x_bound quantum_ms;	/* quantum size, milliseconds */
	struct x_bound quantum_kb;	/* quantum size, KiB (1024 bytes) */

	/* statistics */
	int	wait_hit;		/* success in anticipation */
	int	wait_miss;		/* failure in anticipation */
};

/*
 * Default parameters for the scheduler.  The quantum sizes target
 * an 80 MB/s disk; if the hardware is faster or slower, the smaller
 * of the two budgets (time or bytes) takes effect: the clients will
 * still be isolated, but fairness may be limited.  A complete solution
 * would involve on-line measurement of the actual disk throughput to
 * derive these parameters.  Or we may just choose to ignore service
 * domain fairness and accept what can be achieved with time-only
 * budgets.
 */
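/*
 * A rough worked example with the defaults below: quantum_ms = 100 ms
 * and quantum_kb = 8192 KiB (8 MiB).  A disk streaming at about 80 MB/s
 * moves roughly 8 MB in 100 ms, so the time and byte budgets expire
 * together; on a faster disk the byte budget runs out first, on a
 * slower one the time budget does.
 */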
static struct g_rr_params me = {
	.sc_head = LIST_HEAD_INITIALIZER(&me.sc_head),
	.w_anticipate =	1,
	.queue_depth =	{ 1,	1,	50 },
	.wait_ms =	{ 1,	10,	30 },
	.quantum_ms =	{ 1,	100,	500 },
	.quantum_kb =	{ 16,	8192,	65536 },
};

struct g_rr_params *gs_rr_me = &me;

SYSCTL_DECL(_kern_geom_sched);
SYSCTL_NODE(_kern_geom_sched, OID_AUTO, rr, CTLFLAG_RW, 0,
    "GEOM_SCHED ROUND ROBIN stuff");
SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, units, CTLFLAG_RD,
    &me.units, 0, "Scheduler instances");
SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, queues, CTLFLAG_RD,
    &me.queues, 0, "Total rr queues");
SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, wait_ms, CTLFLAG_RW,
    &me.wait_ms.x_cur, 0, "Wait time milliseconds");
SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, quantum_ms, CTLFLAG_RW,
    &me.quantum_ms.x_cur, 0, "Quantum size milliseconds");
SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, bypass, CTLFLAG_RW,
    &me.bypass, 0, "Bypass scheduler");
SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, w_anticipate, CTLFLAG_RW,
    &me.w_anticipate, 0, "Do anticipation on writes");
SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, quantum_kb, CTLFLAG_RW,
    &me.quantum_kb.x_cur, 0, "Quantum size Kbytes");
SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, queue_depth, CTLFLAG_RW,
    &me.queue_depth.x_cur, 0, "Maximum simultaneous requests");
SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, wait_hit, CTLFLAG_RW,
    &me.wait_hit, 0, "Hits in anticipation");
SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, wait_miss, CTLFLAG_RW,
    &me.wait_miss, 0, "Misses in anticipation");

#ifdef DEBUG_QUEUES
/* print the status of a queue */
static void
gs_rr_dump_q(struct g_rr_queue *qp, int index)
{
	int l = 0;
	struct bio *bp;

	TAILQ_FOREACH(bp, &(qp->q_bioq.queue), bio_queue) {
		l++;
	}
	printf("--- rr queue %d %p status %d len %d ---\n",
	    index, qp, qp->q_status, l);
}

/*
 * Dump the scheduler status when writing to this sysctl variable.
 * XXX right now we only dump the status of the last instance created,
 * not a severe issue because this is only for debugging.
 */
static int
gs_rr_sysctl_status(SYSCTL_HANDLER_ARGS)
{
	int error, val = 0;
	struct g_rr_softc *sc;

	error = sysctl_handle_int(oidp, &val, 0, req);
	if (error || !req->newptr)
		return (error);

	printf("called %s\n", __FUNCTION__);

	LIST_FOREACH(sc, &me.sc_head, sc_next) {
		int i, tot = 0;
		printf("--- sc %p active %p nqueues %d "
		    "callout %d in_flight %d ---\n",
		    sc, sc->sc_active, sc->sc_nqueues,
		    callout_active(&sc->sc_wait),
		    sc->sc_in_flight);
		/*
		 * XXX the loop below still refers to a per-instance hash
		 * table (sc_hash, q_hash, G_RR_HASH_SIZE) that is not part
		 * of struct g_rr_softc anymore, so this debugging code is
		 * stale and will not build as-is with DEBUG_QUEUES defined.
		 */
		for (i = 0; i < G_RR_HASH_SIZE; i++) {
			struct g_rr_queue *qp;
			LIST_FOREACH(qp, &sc->sc_hash[i], q_hash) {
				gs_rr_dump_q(qp, tot);
				tot++;
			}
		}
	}
	return (0);
}

SYSCTL_PROC(_kern_geom_sched_rr, OID_AUTO, status,
    CTLTYPE_UINT | CTLFLAG_RW,
    0, sizeof(int), gs_rr_sysctl_status, "I", "status");

#endif	/* DEBUG_QUEUES */

/*
 * Get a bounded value, optionally converted to a minimum of t_min ticks.
 * For example, get_bounded(&me.wait_ms, 2) with the default wait_ms of 10
 * returns 10 ticks when hz = 1000; with hz = 100 the conversion yields
 * 1 tick, which is then raised to the minimum of 2.
 */
static int
get_bounded(struct x_bound *v, int t_min)
{
	int x;

	x = v->x_cur;
	if (x < v->x_min)
		x = v->x_min;
	else if (x > v->x_max)
		x = v->x_max;
	if (t_min) {
		x = x * hz / 1000;	/* convert to ticks */
		if (x < t_min)
			x = t_min;
	}
	return (x);
}

/*
 * Get a reference to the queue for bp, using the generic
 * classification mechanism.
 */
static struct g_rr_queue *
g_rr_queue_get(struct g_rr_softc *sc, struct bio *bp)
{

	return (g_sched_get_class(sc->sc_geom, bp));
}

static int
g_rr_init_class(void *data, void *priv)
{
	struct g_rr_softc *sc = data;
	struct g_rr_queue *qp = priv;

	gs_bioq_init(&qp->q_bioq);

	/*
	 * Set the initial parameters for the client:
	 * slice size in bytes and ticks, and wait ticks.
	 * Right now these are constant, but we could have
	 * autoconfiguration code to adjust the values based on
	 * the actual workload.
	 */
	qp->q_budget = 1024 * get_bounded(&me.quantum_kb, 0);
	qp->q_slice_duration = get_bounded(&me.quantum_ms, 2);
	qp->q_wait_ticks = get_bounded(&me.wait_ms, 2);

	qp->q_sc = sc;		/* link to the parent */
	qp->q_sc->sc_nqueues++;
	me.queues++;

	return (0);
}

/*
 * Release a reference to the queue.
 */
static void
g_rr_queue_put(struct g_rr_queue *qp)
{

	g_sched_put_class(qp->q_sc->sc_geom, qp);
}

static void
g_rr_fini_class(void *data, void *priv)
{
	struct g_rr_queue *qp = priv;

	KASSERT(gs_bioq_first(&qp->q_bioq) == NULL,
	    ("released nonempty queue"));
	qp->q_sc->sc_nqueues--;
	me.queues--;
}

/*
 * A queue has expired when it has exhausted its byte budget or,
 * after completing at least one request in this slice, its time slice.
 */
static inline int
g_rr_queue_expired(struct g_rr_queue *qp)
{

	if (qp->q_service >= qp->q_budget)
		return (1);

	if ((qp->q_flags & G_FLAG_COMPLETED) &&
	    ticks - qp->q_slice_end >= 0)
		return (1);

	return (0);
}

/*
 * Decide whether it is worth idling (anticipating) for this queue
 * after dispatching bp.
 */
static inline int
g_rr_should_anticipate(struct g_rr_queue *qp, struct bio *bp)
{
	int wait = get_bounded(&me.wait_ms, 2);

	if (!me.w_anticipate && (bp->bio_cmd & BIO_WRITE))
		return (0);

	if (g_savg_valid(&qp->q_thinktime) &&
	    g_savg_read(&qp->q_thinktime) > wait)
		return (0);

	if (g_savg_valid(&qp->q_seekdist) &&
	    g_savg_read(&qp->q_seekdist) > 8192)
		return (0);

	return (1);
}
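/*
 * In other words, g_rr_should_anticipate() only asks to idle for clients
 * that look like quick, sequential readers: the next request typically
 * lands close to the previous one (small q_seekdist average) and arrives
 * shortly after the previous submission (q_thinktime below the wait
 * window).  Writes are anticipated only when w_anticipate is set.
 */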
/*
 * Called on a request arrival, timeout or completion.
 * Try to serve a request among those queued.
 */
static struct bio *
g_rr_next(void *data, int force)
{
	struct g_rr_softc *sc = data;
	struct g_rr_queue *qp;
	struct bio *bp, *next;
	int expired;

	qp = sc->sc_active;
	if (me.bypass == 0 && !force) {
		if (sc->sc_in_flight >= get_bounded(&me.queue_depth, 0))
			return (NULL);

		/* Try with the queue under service first. */
		if (qp != NULL && qp->q_status != G_QUEUE_READY) {
			/*
			 * Queue is anticipating, ignore request.
			 * We should check that we are not past
			 * the timeout, but in that case the timeout
			 * will fire immediately afterwards so we
			 * don't bother.
			 */
			return (NULL);
		}
	} else if (qp != NULL && qp->q_status != G_QUEUE_READY) {
		g_rr_queue_put(qp);
		sc->sc_active = qp = NULL;
	}

	/*
	 * No queue under service, look for the first in RR order.
	 * If we find one, select it as sc_active, clear service
	 * and record the end time of the slice.
	 */
	if (qp == NULL) {
		qp = TAILQ_FIRST(&sc->sc_rr_tailq);
		if (qp == NULL)
			return (NULL);	/* no queues at all, return */
		/* otherwise select the new queue for service */
		TAILQ_REMOVE(&sc->sc_rr_tailq, qp, q_tailq);
		sc->sc_active = qp;
		qp->q_service = 0;
		qp->q_flags &= ~G_FLAG_COMPLETED;
	}

	bp = gs_bioq_takefirst(&qp->q_bioq);	/* surely not NULL */
	qp->q_service += bp->bio_length;	/* charge the service */

	/*
	 * The request at the head of the active queue is always
	 * dispatched, and g_rr_next() will be called again
	 * immediately.
	 * We need to prepare for what to do next:
	 *
	 * 1. have we reached the end of the (time or service) slice?
	 *    If so, clear sc_active and possibly requeue the previous
	 *    active queue if it has more requests pending;
	 * 2. do we have more requests in sc_active?
	 *    If yes, do not anticipate, as g_rr_next() will run again;
	 *    if no, decide whether or not to anticipate depending
	 *    on the type of request (e.g., anticipate only on reads).
	 */
	expired = g_rr_queue_expired(qp);	/* are we expired ? */
	next = gs_bioq_first(&qp->q_bioq);	/* do we have one more ? */
	if (expired) {
		sc->sc_active = NULL;
		/* Either requeue or release reference. */
		if (next != NULL)
			TAILQ_INSERT_TAIL(&sc->sc_rr_tailq, qp, q_tailq);
		else
			g_rr_queue_put(qp);
	} else if (next != NULL) {
		qp->q_status = G_QUEUE_READY;
	} else {
		if (!force && g_rr_should_anticipate(qp, bp)) {
			/* anticipate */
			qp->q_status = G_QUEUE_BUSY;
		} else {
			/* do not anticipate, release reference */
			g_rr_queue_put(qp);
			sc->sc_active = NULL;
		}
	}
	/* If sc_active != NULL, its q_status is always correct. */

	sc->sc_in_flight++;

	return (bp);
}

static inline void
g_rr_update_thinktime(struct g_rr_queue *qp)
{
	int delta = ticks - qp->q_lastsub, wait = get_bounded(&me.wait_ms, 2);

	if (qp->q_sc->sc_active != qp)
		return;

	qp->q_lastsub = ticks;
	delta = (delta > 2 * wait) ? 2 * wait : delta;
	if (qp->q_bionum > 7)
		g_savg_add_sample(&qp->q_thinktime, delta);
}

static inline void
g_rr_update_seekdist(struct g_rr_queue *qp, struct bio *bp)
{
	off_t dist;

	if (qp->q_lastoff > bp->bio_offset)
		dist = qp->q_lastoff - bp->bio_offset;
	else
		dist = bp->bio_offset - qp->q_lastoff;

	if (dist > (8192 * 8))
		dist = 8192 * 8;

	qp->q_lastoff = bp->bio_offset + bp->bio_length;

	if (qp->q_bionum > 7)
		g_savg_add_sample(&qp->q_seekdist, dist);
}

/*
 * Called when a real request for disk I/O arrives.
 * Locate the queue associated with the client.
 * If the queue is the one we are anticipating for, reset its timeout;
 * if the queue is not in the round robin list, insert it in the list.
 * On any error, do not queue the request and return -1, the caller
 * will take care of this request.
 */
static int
g_rr_start(void *data, struct bio *bp)
{
	struct g_rr_softc *sc = data;
	struct g_rr_queue *qp;

	if (me.bypass)
		return (-1);	/* bypass the scheduler */

	/* Get the queue for the request. */
	qp = g_rr_queue_get(sc, bp);
	if (qp == NULL)
		return (-1);	/* allocation failed, tell upstream */

	if (gs_bioq_first(&qp->q_bioq) == NULL) {
		/*
		 * We are inserting into an empty queue.
		 * Reset its state if it is sc_active,
		 * otherwise insert it in the RR list.
		 */
		if (qp == sc->sc_active) {
			qp->q_status = G_QUEUE_READY;
			callout_stop(&sc->sc_wait);
		} else {
			g_sched_priv_ref(qp);
			TAILQ_INSERT_TAIL(&sc->sc_rr_tailq, qp, q_tailq);
		}
	}

	/*
	 * q_bionum grows by one per request and saturates around 8, so
	 * the thinktime/seekdist averages above are only fed (q_bionum > 7)
	 * after the client has issued a sustained run of requests.
	 */
	qp->q_bionum = 1 + qp->q_bionum - (qp->q_bionum >> 3);

	g_rr_update_thinktime(qp);
	g_rr_update_seekdist(qp, bp);

	/* Inherit the reference returned by g_rr_queue_get(). */
	bp->bio_caller1 = qp;
	gs_bioq_disksort(&qp->q_bioq, bp);

	return (0);
}

/*
 * Callout executed when a queue times out anticipating a new request.
 */
static void
g_rr_wait_timeout(void *data)
{
	struct g_rr_softc *sc = data;
	struct g_geom *geom = sc->sc_geom;

	g_sched_lock(geom);
	/*
	 * We can race with other events, so check if
	 * sc_active is still valid.
	 */
	if (sc->sc_active != NULL) {
		/* Release the reference to the queue. */
		g_rr_queue_put(sc->sc_active);
		sc->sc_active = NULL;
		me.wait_hit--;
		me.wait_miss++;	/* record the miss */
	}
	g_sched_dispatch(geom);
	g_sched_unlock(geom);
}

/*
 * Module glue: allocate descriptor, initialize its fields.
 */
static void *
g_rr_init(struct g_geom *geom)
{
	struct g_rr_softc *sc;

	/* XXX check whether we can sleep */
	sc = malloc(sizeof *sc, M_GEOM_SCHED, M_NOWAIT | M_ZERO);
	if (sc == NULL)
		return (NULL);	/* let the caller deal with the failure */
	sc->sc_geom = geom;
	TAILQ_INIT(&sc->sc_rr_tailq);
	callout_init(&sc->sc_wait, CALLOUT_MPSAFE);
	LIST_INSERT_HEAD(&me.sc_head, sc, sc_next);
	me.units++;

	return (sc);
}

/*
 * Module glue -- drain the callout structure and free the descriptor.
 */
static void
g_rr_fini(void *data)
{
	struct g_rr_softc *sc = data;

	callout_drain(&sc->sc_wait);
	KASSERT(sc->sc_active == NULL, ("still a queue under service"));
	KASSERT(TAILQ_EMPTY(&sc->sc_rr_tailq), ("still scheduled queues"));

	LIST_REMOVE(sc, sc_next);
	me.units--;
	free(sc, M_GEOM_SCHED);
}
/*
 * Called when the request under service terminates.
 * Start the anticipation timer if needed.
 */
static void
g_rr_done(void *data, struct bio *bp)
{
	struct g_rr_softc *sc = data;
	struct g_rr_queue *qp;

	sc->sc_in_flight--;

	qp = bp->bio_caller1;
	if (qp == sc->sc_active && qp->q_status == G_QUEUE_BUSY) {
		if (!(qp->q_flags & G_FLAG_COMPLETED)) {
			qp->q_flags |= G_FLAG_COMPLETED;
			/* in case we want to make the slice adaptive */
			qp->q_slice_duration = get_bounded(&me.quantum_ms, 2);
			qp->q_slice_end = ticks + qp->q_slice_duration;
		}

		/* The queue is trying anticipation, start the timer. */
		qp->q_status = G_QUEUE_IDLING;
		/* may make this adaptive */
		qp->q_wait_ticks = get_bounded(&me.wait_ms, 2);
		me.wait_hit++;
		callout_reset(&sc->sc_wait, qp->q_wait_ticks,
		    g_rr_wait_timeout, sc);
	} else
		g_sched_dispatch(sc->sc_geom);

	/* Release a reference to the queue. */
	g_rr_queue_put(qp);
}

static void
g_rr_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
    struct g_consumer *cp, struct g_provider *pp)
{
	if (indent == NULL) {	/* plaintext */
		sbuf_printf(sb, " units %d queues %d",
		    me.units, me.queues);
	}
}

static struct g_gsched g_rr = {
	.gs_name = "rr",
	.gs_priv_size = sizeof(struct g_rr_queue),
	.gs_init = g_rr_init,
	.gs_fini = g_rr_fini,
	.gs_start = g_rr_start,
	.gs_done = g_rr_done,
	.gs_next = g_rr_next,
	.gs_dumpconf = g_rr_dumpconf,
	.gs_init_class = g_rr_init_class,
	.gs_fini_class = g_rr_fini_class,
};

DECLARE_GSCHED_MODULE(rr, &g_rr);