/*-
 * Copyright (c) 2009-2010 Fabio Checconi
 * Copyright (c) 2009-2010 Luigi Rizzo, Universita` di Pisa
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * $Id$
 * $FreeBSD: head/sys/geom/sched/gs_rr.c 218675 2011-02-14 08:09:02Z luigi $
 *
 * A round-robin (RR) anticipatory scheduler, with per-client queues.
 *
 * The goal of this implementation is to improve throughput compared
 * to the pure elevator algorithm, and insure some fairness among
 * clients.
 *
 * Requests coming from the same client are put in the same queue.
 * We use anticipation to help reducing seeks, and each queue
 * is never served continuously for more than a given amount of
 * time or data. Queues are then served in a round-robin fashion.
 *
 * Each queue can be in any of the following states:
 *     READY	immediately serve the first pending request;
 *     BUSY	one request is under service, wait for completion;
 *     IDLING	do not serve incoming requests immediately, unless
 * 		they are "eligible" as defined later.
 *
 * Scheduling is made looking at the status of all queues,
 * and the first one in round-robin order is privileged.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/bio.h>
#include <sys/callout.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/sysctl.h>
#include "gs_scheduler.h"

/* possible states of the scheduler */
enum g_rr_state {
	G_QUEUE_READY = 0,	/* Ready to dispatch. */
	G_QUEUE_BUSY,		/* Waiting for a completion. */
	G_QUEUE_IDLING		/* Waiting for a new request. */
};

/* possible queue flags */
enum g_rr_flags {
	/* G_FLAG_COMPLETED means that the field q_slice_end is valid. */
	G_FLAG_COMPLETED = 1,	/* Completed a req. in the current budget. */
};

struct g_rr_softc;

/*
 * Queue descriptor, containing reference count, scheduling
 * state, a queue of pending requests, configuration parameters.
 * Queues with pending request(s) and not under service are also
 * stored in a Round Robin (RR) list.
 */
struct g_rr_queue {
	struct g_rr_softc *q_sc;	/* link to the parent */

	enum g_rr_state	q_status;
	unsigned int	q_service;	/* service received so far */
	int		q_slice_end;	/* actual slice end time, in ticks */
	enum g_rr_flags	q_flags;	/* queue flags */
	struct bio_queue_head q_bioq;

	/* Scheduling parameters */
	unsigned int	q_budget;	/* slice size in bytes */
	unsigned int	q_slice_duration; /* slice size in ticks */
	unsigned int	q_wait_ticks;	/* wait time for anticipation */

	/* Stats to drive the various heuristics. */
	struct g_savg	q_thinktime;	/* Thinktime average. */
	struct g_savg	q_seekdist;	/* Seek distance average. */

	/*
	 * Decaying request counter: updated in g_rr_start() as
	 * b = 1 + b - b/8, whose fixed point is 8. The thinktime and
	 * seekdist averages only collect samples once this exceeds 7,
	 * i.e. after the queue has shown sustained activity.
	 */
	int		q_bionum;	/* Number of requests. */

	off_t		q_lastoff;	/* Last submitted req. offset. */
	int		q_lastsub;	/* Last submitted req. time. */

	/* Expiration deadline for an empty queue. */
	int		q_expire;

	TAILQ_ENTRY(g_rr_queue) q_tailq; /* RR list link field */
};

/* List types. */
TAILQ_HEAD(g_rr_tailq, g_rr_queue);

/* list of scheduler instances */
LIST_HEAD(g_scheds, g_rr_softc);

/* Default quantum for RR between queues. */
#define	G_RR_DEFAULT_BUDGET	0x00800000

/*
 * Per device descriptor, holding the Round Robin list of queues
 * accessing the disk, a reference to the geom, and the timer.
 */
struct g_rr_softc {
	struct g_geom	*sc_geom;

	/*
	 * sc_active is the queue we are anticipating for.
	 * It is set only in gs_rr_next(), and possibly cleared
	 * only in gs_rr_next() or on a timeout.
	 * The active queue is never in the Round Robin list
	 * even if it has requests queued.
	 */
	struct g_rr_queue *sc_active;
	struct callout	sc_wait;	/* timer for sc_active */

	struct g_rr_tailq sc_rr_tailq;	/* the round-robin list */
	int		sc_nqueues;	/* number of queues */

	/* Statistics */
	int		sc_in_flight;	/* requests in the driver */

	LIST_ENTRY(g_rr_softc) sc_next;
};

/* Descriptor for bounded values, min and max are constant. */
struct x_bound {
	const int	x_min;
	int		x_cur;
	const int	x_max;
};

/*
 * parameters, config and stats
 */
struct g_rr_params {
	int	queues;			/* total number of queues */
	int	w_anticipate;		/* anticipate writes */
	int	bypass;			/* bypass scheduling writes */

	int	units;			/* how many instances */
	/* sc_head is used for debugging */
	struct g_scheds	sc_head;	/* first scheduler instance */

	struct x_bound queue_depth;	/* max parallel requests */
	struct x_bound wait_ms;		/* wait time, milliseconds */
	struct x_bound quantum_ms;	/* quantum size, milliseconds */
	struct x_bound quantum_kb;	/* quantum size, Kb (1024 bytes) */

	/* statistics */
	int	wait_hit;		/* success in anticipation */
	int	wait_miss;		/* failure in anticipation */
};

/*
 * Default parameters for the scheduler. The quantum sizes target
 * a 80MB/s disk; if the hw is faster or slower the minimum of the
 * two will have effect: the clients will still be isolated but
 * the fairness may be limited. A complete solution would involve
 * the on-line measurement of the actual disk throughput to derive
 * these parameters. Or we may just choose to ignore service domain
 * fairness and accept what can be achieved with time-only budgets.
 */
static struct g_rr_params me = {
	.sc_head = LIST_HEAD_INITIALIZER(&me.sc_head),
	.w_anticipate =	1,
	.queue_depth =	{ 1,	1,	50 },
	.wait_ms =	{ 1, 	10,	30 },
	.quantum_ms =	{ 1, 	100,	500 },
	.quantum_kb =	{ 16, 	8192,	65536 },
};

struct g_rr_params *gs_rr_me = &me;

SYSCTL_DECL(_kern_geom_sched);
SYSCTL_NODE(_kern_geom_sched, OID_AUTO, rr, CTLFLAG_RW, 0,
    "GEOM_SCHED ROUND ROBIN stuff");
SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, units, CTLFLAG_RD,
    &me.units, 0, "Scheduler instances");
SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, queues, CTLFLAG_RD,
    &me.queues, 0, "Total rr queues");
SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, wait_ms, CTLFLAG_RW,
    &me.wait_ms.x_cur, 0, "Wait time milliseconds");
SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, quantum_ms, CTLFLAG_RW,
    &me.quantum_ms.x_cur, 0, "Quantum size milliseconds");
SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, bypass, CTLFLAG_RW,
    &me.bypass, 0, "Bypass scheduler");
SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, w_anticipate, CTLFLAG_RW,
    &me.w_anticipate, 0, "Do anticipation on writes");
SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, quantum_kb, CTLFLAG_RW,
    &me.quantum_kb.x_cur, 0, "Quantum size Kbytes");
SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, queue_depth, CTLFLAG_RW,
    &me.queue_depth.x_cur, 0, "Maximum simultaneous requests");
SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, wait_hit, CTLFLAG_RW,
    &me.wait_hit, 0, "Hits in anticipation");
SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, wait_miss, CTLFLAG_RW,
    &me.wait_miss, 0, "Misses in anticipation");

#ifdef DEBUG_QUEUES
/*
 * print the status of a queue
 *
 * NOTE(review): this DEBUG_QUEUES code refers to sc->sc_hash and
 * G_RR_HASH_SIZE below, which are not defined anywhere in this file;
 * it appears stale (likely predating the move of classification into
 * gs_scheduler) and will not compile if DEBUG_QUEUES is defined.
 */
static void
gs_rr_dump_q(struct g_rr_queue *qp, int index)
{
	int l = 0;
	struct bio *bp;

	TAILQ_FOREACH(bp, &(qp->q_bioq.queue), bio_queue) {
		l++;
	}
	printf("--- rr queue %d %p status %d len %d ---\n",
	    index, qp, qp->q_status, l);
}

/*
 * Dump the scheduler status when writing to this sysctl variable.
 * XXX right now we only dump the status of the last instance created.
 * not a severe issue because this is only for debugging
 */
static int
gs_rr_sysctl_status(SYSCTL_HANDLER_ARGS)
{
	int error, val = 0;
	struct g_rr_softc *sc;

	error = sysctl_handle_int(oidp, &val, 0, req);
	if (error || !req->newptr )
		return (error);

	printf("called %s\n", __FUNCTION__);

	LIST_FOREACH(sc, &me.sc_head, sc_next) {
		int i, tot = 0;
		printf("--- sc %p active %p nqueues %d "
		    "callout %d in_flight %d ---\n",
		    sc, sc->sc_active, sc->sc_nqueues,
		    callout_active(&sc->sc_wait),
		    sc->sc_in_flight);
		for (i = 0; i < G_RR_HASH_SIZE; i++) {
			struct g_rr_queue *qp;
			LIST_FOREACH(qp, &sc->sc_hash[i], q_hash) {
				gs_rr_dump_q(qp, tot);
				tot++;
			}
		}
	}
	return (0);
}

SYSCTL_PROC(_kern_geom_sched_rr, OID_AUTO, status,
	CTLTYPE_UINT | CTLFLAG_RW,
    0, sizeof(int), gs_rr_sysctl_status, "I", "status");

#endif	/* DEBUG_QUEUES */

/*
 * Get a bounded value, optionally convert to a min of t_min ticks.
 * The value is clamped into [v->x_min, v->x_max]; when t_min is
 * non-zero the (millisecond) value is converted to ticks and floored
 * at t_min so very small settings still yield a usable timeout.
 */
static int
get_bounded(struct x_bound *v, int t_min)
{
	int x;

	x = v->x_cur;
	if (x < v->x_min)
		x = v->x_min;
	else if (x > v->x_max)
		x = v->x_max;
	if (t_min) {
		x = x * hz / 1000;	/* convert to ticks */
		if (x < t_min)
			x = t_min;
	}
	return x;
}

/*
 * Get a reference to the queue for bp, using the generic
 * classification mechanism.
 */
static struct g_rr_queue *
g_rr_queue_get(struct g_rr_softc *sc, struct bio *bp)
{

	return (g_sched_get_class(sc->sc_geom, bp));
}

/*
 * Initialize a newly allocated queue (called through the scheduler's
 * gs_init_class hook). data is the per-device softc, priv the queue.
 */
static int
g_rr_init_class(void *data, void *priv)
{
	struct g_rr_softc *sc = data;
	struct g_rr_queue *qp = priv;

	gs_bioq_init(&qp->q_bioq);

	/*
	 * Set the initial parameters for the client:
	 * slice size in bytes and ticks, and wait ticks.
	 * Right now these are constant, but we could have
	 * autoconfiguration code to adjust the values based on
	 * the actual workload.
	 */
	qp->q_budget = 1024 * get_bounded(&me.quantum_kb, 0);
	qp->q_slice_duration = get_bounded(&me.quantum_ms, 2);
	qp->q_wait_ticks = get_bounded(&me.wait_ms, 2);

	qp->q_sc = sc;		/* link to the parent */
	qp->q_sc->sc_nqueues++;
	me.queues++;

	return (0);
}

/*
 * Release a reference to the queue.
 */
static void
g_rr_queue_put(struct g_rr_queue *qp)
{

	g_sched_put_class(qp->q_sc->sc_geom, qp);
}

/*
 * Tear down a queue when its last reference goes away (gs_fini_class
 * hook). The queue must be empty by then.
 */
static void
g_rr_fini_class(void *data, void *priv)
{
	struct g_rr_queue *qp = priv;

	KASSERT(gs_bioq_first(&qp->q_bioq) == NULL,
	    ("released nonempty queue"));
	qp->q_sc->sc_nqueues--;
	me.queues--;
}

/*
 * Return non-zero when the queue has exhausted its slice, either in
 * the service (bytes) domain or, once a request has completed and
 * q_slice_end is valid, in the time domain.
 */
static inline int
g_rr_queue_expired(struct g_rr_queue *qp)
{

	if (qp->q_service >= qp->q_budget)
		return (1);

	/* "ticks - q_slice_end >= 0" is the wrap-safe form of
	 * "ticks >= q_slice_end". */
	if ((qp->q_flags & G_FLAG_COMPLETED) &&
	    ticks - qp->q_slice_end >= 0)
		return (1);

	return (0);
}

/*
 * Decide whether it is worth anticipating (idling) for another
 * request from this queue after dispatching bp: do not anticipate
 * writes unless w_anticipate is set, nor clients that historically
 * think longer than the wait window or seek far between requests.
 */
static inline int
g_rr_should_anticipate(struct g_rr_queue *qp, struct bio *bp)
{
	int wait = get_bounded(&me.wait_ms, 2);

	if (!me.w_anticipate && (bp->bio_cmd & BIO_WRITE))
		return (0);

	if (g_savg_valid(&qp->q_thinktime) &&
	    g_savg_read(&qp->q_thinktime) > wait)
		return (0);

	if (g_savg_valid(&qp->q_seekdist) &&
	    g_savg_read(&qp->q_seekdist) > 8192)
		return (0);

	return (1);
}

/*
 * Called on a request arrival, timeout or completion.
 * Try to serve a request among those queued.
 */
static struct bio *
g_rr_next(void *data, int force)
{
	struct g_rr_softc *sc = data;
	struct g_rr_queue *qp;
	struct bio *bp, *next;
	int expired;

	qp = sc->sc_active;
	if (me.bypass == 0 && !force) {
		if (sc->sc_in_flight >= get_bounded(&me.queue_depth, 0))
			return (NULL);

		/* Try with the queue under service first. */
		if (qp != NULL && qp->q_status != G_QUEUE_READY) {
			/*
			 * Queue is anticipating, ignore request.
			 * We should check that we are not past
			 * the timeout, but in that case the timeout
			 * will fire immediately afterwards so we
			 * don't bother.
			 */
			return (NULL);
		}
	} else if (qp != NULL && qp->q_status != G_QUEUE_READY) {
		g_rr_queue_put(qp);
		sc->sc_active = qp = NULL;
	}

	/*
	 * No queue under service, look for the first in RR order.
	 * If we find it, select if as sc_active, clear service
	 * and record the end time of the slice.
	 */
	if (qp == NULL) {
		qp = TAILQ_FIRST(&sc->sc_rr_tailq);
		if (qp == NULL)
			return (NULL); /* no queues at all, return */
		/* otherwise select the new queue for service. */
		TAILQ_REMOVE(&sc->sc_rr_tailq, qp, q_tailq);
		sc->sc_active = qp;
		qp->q_service = 0;
		qp->q_flags &= ~G_FLAG_COMPLETED;
	}

	bp = gs_bioq_takefirst(&qp->q_bioq);	/* surely not NULL */
	qp->q_service += bp->bio_length;	/* charge the service */

	/*
	 * The request at the head of the active queue is always
	 * dispatched, and gs_rr_next() will be called again
	 * immediately.
	 * We need to prepare for what to do next:
	 *
	 * 1. have we reached the end of the (time or service) slice ?
	 *    If so, clear sc_active and possibly requeue the previous
	 *    active queue if it has more requests pending;
	 * 2. do we have more requests in sc_active ?
	 *    If yes, do not anticipate, as gs_rr_next() will run again;
	 *    if no, decide whether or not to anticipate depending
	 *    on read or writes (e.g., anticipate only on reads).
	 */
	expired = g_rr_queue_expired(qp);	/* are we expired ? */
	next = gs_bioq_first(&qp->q_bioq);	/* do we have one more ? */
	if (expired) {
		sc->sc_active = NULL;
		/* Either requeue or release reference. */
		if (next != NULL)
			TAILQ_INSERT_TAIL(&sc->sc_rr_tailq, qp, q_tailq);
		else
			g_rr_queue_put(qp);
	} else if (next != NULL) {
		qp->q_status = G_QUEUE_READY;
	} else {
		if (!force && g_rr_should_anticipate(qp, bp)) {
			/* anticipate */
			qp->q_status = G_QUEUE_BUSY;
		} else {
			/* do not anticipate, release reference */
			g_rr_queue_put(qp);
			sc->sc_active = NULL;
		}
	}
	/* If sc_active != NULL, its q_status is always correct. */

	sc->sc_in_flight++;

	return (bp);
}

/*
 * Update the thinktime average with the time elapsed since the
 * queue's last submission, clamped to twice the wait window; samples
 * are only collected for the active queue with sustained activity
 * (q_bionum > 7).
 */
static inline void
g_rr_update_thinktime(struct g_rr_queue *qp)
{
	int delta = ticks - qp->q_lastsub, wait = get_bounded(&me.wait_ms, 2);

	if (qp->q_sc->sc_active != qp)
		return;

	qp->q_lastsub = ticks;
	delta = (delta > 2 * wait) ? 2 * wait : delta;
	if (qp->q_bionum > 7)
		g_savg_add_sample(&qp->q_thinktime, delta);
}

/*
 * Update the seek distance average with the absolute distance between
 * the new request and the end of the previous one, capped at 8192*8.
 */
static inline void
g_rr_update_seekdist(struct g_rr_queue *qp, struct bio *bp)
{
	off_t dist;

	if (qp->q_lastoff > bp->bio_offset)
		dist = qp->q_lastoff - bp->bio_offset;
	else
		dist = bp->bio_offset - qp->q_lastoff;

	if (dist > (8192 * 8))
		dist = 8192 * 8;

	qp->q_lastoff = bp->bio_offset + bp->bio_length;

	if (qp->q_bionum > 7)
		g_savg_add_sample(&qp->q_seekdist, dist);
}

/*
 * Called when a real request for disk I/O arrives.
 * Locate the queue associated with the client.
 * If the queue is the one we are anticipating for, reset its timeout;
 * if the queue is not in the round robin list, insert it in the list.
 * On any error, do not queue the request and return -1, the caller
 * will take care of this request.
 */
static int
g_rr_start(void *data, struct bio *bp)
{
	struct g_rr_softc *sc = data;
	struct g_rr_queue *qp;

	if (me.bypass)
		return (-1);	/* bypass the scheduler */

	/* Get the queue for the request. */
	qp = g_rr_queue_get(sc, bp);
	if (qp == NULL)
		return (-1); /* allocation failed, tell upstream */

	if (gs_bioq_first(&qp->q_bioq) == NULL) {
		/*
		 * We are inserting into an empty queue.
		 * Reset its state if it is sc_active,
		 * otherwise insert it in the RR list.
		 */
		if (qp == sc->sc_active) {
			qp->q_status = G_QUEUE_READY;
			callout_stop(&sc->sc_wait);
		} else {
			g_sched_priv_ref(qp);
			TAILQ_INSERT_TAIL(&sc->sc_rr_tailq, qp, q_tailq);
		}
	}

	/* Decaying activity counter, fixed point at 8 (see q_bionum). */
	qp->q_bionum = 1 + qp->q_bionum - (qp->q_bionum >> 3);

	g_rr_update_thinktime(qp);
	g_rr_update_seekdist(qp, bp);

	/* Inherit the reference returned by g_rr_queue_get(). */
	bp->bio_caller1 = qp;
	gs_bioq_disksort(&qp->q_bioq, bp);

	return (0);
}

/*
 * Callout executed when a queue times out anticipating a new request.
 */
static void
g_rr_wait_timeout(void *data)
{
	struct g_rr_softc *sc = data;
	struct g_geom *geom = sc->sc_geom;

	g_sched_lock(geom);
	/*
	 * We can race with other events, so check if
	 * sc_active is still valid.
	 */
	if (sc->sc_active != NULL) {
		/* Release the reference to the queue. */
		g_rr_queue_put(sc->sc_active);
		sc->sc_active = NULL;
		/*
		 * g_rr_done() optimistically counted a hit when it
		 * armed the timer; the timeout firing means it was
		 * actually a miss, so undo the hit and record the miss.
		 */
		me.wait_hit--;
		me.wait_miss++;	/* record the miss */
	}
	g_sched_dispatch(geom);
	g_sched_unlock(geom);
}

/*
 * Module glue: allocate descriptor, initialize its fields.
 */
static void *
g_rr_init(struct g_geom *geom)
{
	struct g_rr_softc *sc;

	/* XXX check whether we can sleep */
	sc = malloc(sizeof *sc, M_GEOM_SCHED, M_NOWAIT | M_ZERO);
	sc->sc_geom = geom;
	TAILQ_INIT(&sc->sc_rr_tailq);
	callout_init(&sc->sc_wait, CALLOUT_MPSAFE);
	LIST_INSERT_HEAD(&me.sc_head, sc, sc_next);
	me.units++;

	return (sc);
}

/*
 * Module glue -- drain the callout structure, destroy the
 * hash table and its element, and free the descriptor.
 */
static void
g_rr_fini(void *data)
{
	struct g_rr_softc *sc = data;

	callout_drain(&sc->sc_wait);
	KASSERT(sc->sc_active == NULL, ("still a queue under service"));
	KASSERT(TAILQ_EMPTY(&sc->sc_rr_tailq), ("still scheduled queues"));

	LIST_REMOVE(sc, sc_next);
	me.units--;
	free(sc, M_GEOM_SCHED);
}

/*
 * Called when the request under service terminates.
 * Start the anticipation timer if needed.
 */
static void
g_rr_done(void *data, struct bio *bp)
{
	struct g_rr_softc *sc = data;
	struct g_rr_queue *qp;

	sc->sc_in_flight--;

	qp = bp->bio_caller1;

	/*
	 * When the first request for this queue completes, update the
	 * duration and end of the slice. We do not do it when the
	 * slice starts to avoid charging to the queue the time for
	 * the first seek.
	 */
	if (!(qp->q_flags & G_FLAG_COMPLETED)) {
		qp->q_flags |= G_FLAG_COMPLETED;
		/*
		 * recompute the slice duration, in case we want
		 * to make it adaptive. This is not used right now.
		 * XXX should we do the same for q_quantum and q_wait_ticks ?
		 */
		qp->q_slice_duration = get_bounded(&me.quantum_ms, 2);
		qp->q_slice_end = ticks + qp->q_slice_duration;
	}

	if (qp == sc->sc_active && qp->q_status == G_QUEUE_BUSY) {
		/* The queue is trying anticipation, start the timer. */
		qp->q_status = G_QUEUE_IDLING;
		/* may make this adaptive */
		qp->q_wait_ticks = get_bounded(&me.wait_ms, 2);
		/*
		 * Counted as a hit now; g_rr_wait_timeout() converts
		 * it into a miss if the timer fires.
		 */
		me.wait_hit++;
		callout_reset(&sc->sc_wait, qp->q_wait_ticks,
		    g_rr_wait_timeout, sc);
	} else
		g_sched_dispatch(sc->sc_geom);

	/* Release a reference to the queue. */
	g_rr_queue_put(qp);
}

/*
 * Report scheduler statistics in the XML/plaintext geom config dump;
 * only the plaintext form (indent == NULL) is produced.
 */
static void
g_rr_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
    struct g_consumer *cp, struct g_provider *pp)
{
	if (indent == NULL) {	/* plaintext */
		sbuf_printf(sb, " units %d queues %d",
		    me.units, me.queues);
	}
}

/* Method table binding this scheduler into the GEOM_SCHED framework. */
static struct g_gsched g_rr = {
	.gs_name = "rr",
	.gs_priv_size = sizeof(struct g_rr_queue),
	.gs_init = g_rr_init,
	.gs_fini = g_rr_fini,
	.gs_start = g_rr_start,
	.gs_done = g_rr_done,
	.gs_next = g_rr_next,
	.gs_dumpconf = g_rr_dumpconf,
	.gs_init_class = g_rr_init_class,
	.gs_fini_class = g_rr_fini_class,
};

DECLARE_GSCHED_MODULE(rr, &g_rr);