1206497Sluigi/*- 2206552Sluigi * Copyright (c) 2009-2010 Fabio Checconi 3206552Sluigi * Copyright (c) 2009-2010 Luigi Rizzo, Universita` di Pisa 4206497Sluigi * All rights reserved. 5206497Sluigi * 6206497Sluigi * Redistribution and use in source and binary forms, with or without 7206497Sluigi * modification, are permitted provided that the following conditions 8206497Sluigi * are met: 9206497Sluigi * 1. Redistributions of source code must retain the above copyright 10206497Sluigi * notice, this list of conditions and the following disclaimer. 11206497Sluigi * 2. Redistributions in binary form must reproduce the above copyright 12206497Sluigi * notice, this list of conditions and the following disclaimer in the 13206497Sluigi * documentation and/or other materials provided with the distribution. 14206497Sluigi * 15206497Sluigi * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16206497Sluigi * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17206497Sluigi * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18206497Sluigi * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19206497Sluigi * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20206497Sluigi * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21206497Sluigi * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22206497Sluigi * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23206497Sluigi * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24206497Sluigi * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25206497Sluigi * SUCH DAMAGE. 26206497Sluigi */ 27206497Sluigi 28206497Sluigi/* 29206497Sluigi * $Id$ 30206497Sluigi * $FreeBSD$ 31206497Sluigi * 32206497Sluigi * A round-robin (RR) anticipatory scheduler, with per-client queues. 
 *
 * The goal of this implementation is to improve throughput compared
 * to the pure elevator algorithm, and ensure some fairness among
 * clients.
 *
 * Requests coming from the same client are put in the same queue.
 * We use anticipation to help reducing seeks, and each queue
 * is never served continuously for more than a given amount of
 * time or data. Queues are then served in a round-robin fashion.
 *
 * Each queue can be in any of the following states:
 *	READY	immediately serve the first pending request;
 *	BUSY	one request is under service, wait for completion;
 *	IDLING	do not serve incoming requests immediately, unless
 *		they are "eligible" as defined later.
 *
 * Scheduling is made looking at the status of all queues,
 * and the first one in round-robin order is privileged.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/bio.h>
#include <sys/callout.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/sbuf.h>
#include <sys/sysctl.h>
#include "gs_scheduler.h"

/* Possible states of a queue (see the state machine described above). */
enum g_rr_state {
	G_QUEUE_READY = 0,	/* Ready to dispatch. */
	G_QUEUE_BUSY,		/* Waiting for a completion. */
	G_QUEUE_IDLING		/* Waiting for a new request. */
};

/* possible queue flags */
enum g_rr_flags {
	/* G_FLAG_COMPLETED means that the field q_slice_end is valid.
	 */
	G_FLAG_COMPLETED = 1,	/* Completed a req. in the current budget. */
};

struct g_rr_softc;

/*
 * Queue descriptor, containing reference count, scheduling
 * state, a queue of pending requests, configuration parameters.
 * Queues with pending request(s) and not under service are also
 * stored in a Round Robin (RR) list.
 */
struct g_rr_queue {
	struct g_rr_softc *q_sc;	/* link to the parent */

	enum g_rr_state	q_status;
	unsigned int	q_service;	/* service received so far */
	int		q_slice_end;	/* actual slice end time, in ticks */
	enum g_rr_flags	q_flags;	/* queue flags */
	struct bio_queue_head q_bioq;	/* pending requests, disksorted */

	/* Scheduling parameters */
	unsigned int	q_budget;	/* slice size in bytes */
	unsigned int	q_slice_duration; /* slice size in ticks */
	unsigned int	q_wait_ticks;	/* wait time for anticipation */

	/* Stats to drive the various heuristics. */
	struct g_savg	q_thinktime;	/* Thinktime average. */
	struct g_savg	q_seekdist;	/* Seek distance average. */

	int		q_bionum;	/* Number of requests (EWMA-style). */

	off_t		q_lastoff;	/* Last submitted req. offset. */
	int		q_lastsub;	/* Last submitted req. time. */

	/*
	 * Expiration deadline for an empty queue.
	 * NOTE(review): q_expire is not referenced anywhere in this
	 * file; presumably used by the gs_scheduler framework — verify.
	 */
	int		q_expire;

	TAILQ_ENTRY(g_rr_queue) q_tailq; /* RR list link field */
};

/* List types.
 */
TAILQ_HEAD(g_rr_tailq, g_rr_queue);

/* list of scheduler instances */
LIST_HEAD(g_scheds, g_rr_softc);

/* Default quantum for RR between queues. */
#define	G_RR_DEFAULT_BUDGET	0x00800000

/*
 * Per device descriptor, holding the Round Robin list of queues
 * accessing the disk, a reference to the geom, and the timer.
 */
struct g_rr_softc {
	struct g_geom	*sc_geom;

	/*
	 * sc_active is the queue we are anticipating for.
	 * It is set only in gs_rr_next(), and possibly cleared
	 * only in gs_rr_next() or on a timeout.
	 * The active queue is never in the Round Robin list
	 * even if it has requests queued.
	 */
	struct g_rr_queue *sc_active;
	struct callout	sc_wait;	/* timer for sc_active */

	struct g_rr_tailq sc_rr_tailq;	/* the round-robin list */
	int		sc_nqueues;	/* number of queues */

	/* Statistics */
	int		sc_in_flight;	/* requests in the driver */

	LIST_ENTRY(g_rr_softc) sc_next;	/* link in the global me.sc_head */
};

/* Descriptor for bounded values, min and max are constant.
 */
struct x_bound {
	const int	x_min;
	int		x_cur;
	const int	x_max;
};

/*
 * parameters, config and stats
 */
struct g_rr_params {
	int	queues;			/* total number of queues */
	int	w_anticipate;		/* anticipate writes */
	int	bypass;			/* bypass scheduling writes */

	int	units;			/* how many instances */
	/* sc_head is used for debugging */
	struct g_scheds	sc_head;	/* first scheduler instance */

	struct x_bound queue_depth;	/* max parallel requests */
	struct x_bound wait_ms;		/* wait time, milliseconds */
	struct x_bound quantum_ms;	/* quantum size, milliseconds */
	struct x_bound quantum_kb;	/* quantum size, Kb (1024 bytes) */

	/* statistics */
	int	wait_hit;		/* success in anticipation */
	int	wait_miss;		/* failure in anticipation */
};

/*
 * Default parameters for the scheduler. The quantum sizes target
 * a 80MB/s disk; if the hw is faster or slower the minimum of the
 * two will have effect: the clients will still be isolated but
 * the fairness may be limited. A complete solution would involve
 * the on-line measurement of the actual disk throughput to derive
 * these parameters. Or we may just choose to ignore service domain
 * fairness and accept what can be achieved with time-only budgets.
 */
static struct g_rr_params me = {
	.sc_head = LIST_HEAD_INITIALIZER(&me.sc_head),
	.w_anticipate =	1,
	.queue_depth =	{ 1,	1,	50 },
	.wait_ms =	{ 1, 	10,	30 },
	.quantum_ms =	{ 1, 	100,	500 },
	.quantum_kb =	{ 16, 	8192,	65536 },
};

struct g_rr_params *gs_rr_me = &me;

SYSCTL_DECL(_kern_geom_sched);
static SYSCTL_NODE(_kern_geom_sched, OID_AUTO, rr, CTLFLAG_RW, 0,
    "GEOM_SCHED ROUND ROBIN stuff");
SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, units, CTLFLAG_RD,
    &me.units, 0, "Scheduler instances");
SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, queues, CTLFLAG_RD,
    &me.queues, 0, "Total rr queues");
SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, wait_ms, CTLFLAG_RW,
    &me.wait_ms.x_cur, 0, "Wait time milliseconds");
SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, quantum_ms, CTLFLAG_RW,
    &me.quantum_ms.x_cur, 0, "Quantum size milliseconds");
SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, bypass, CTLFLAG_RW,
    &me.bypass, 0, "Bypass scheduler");
SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, w_anticipate, CTLFLAG_RW,
    &me.w_anticipate, 0, "Do anticipation on writes");
SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, quantum_kb, CTLFLAG_RW,
    &me.quantum_kb.x_cur, 0, "Quantum size Kbytes");
SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, queue_depth, CTLFLAG_RW,
    &me.queue_depth.x_cur, 0, "Maximum simultaneous requests");
SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, wait_hit, CTLFLAG_RW,
    &me.wait_hit, 0, "Hits in anticipation");
SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, wait_miss, CTLFLAG_RW,
    &me.wait_miss, 0, "Misses in anticipation");

#ifdef DEBUG_QUEUES
/* print the status of a queue */
static void
gs_rr_dump_q(struct g_rr_queue *qp, int index)
{
	int l = 0;
	struct bio *bp;

	TAILQ_FOREACH(bp, &(qp->q_bioq.queue), bio_queue) {
		l++;
	}
	printf("--- rr queue %d %p status %d len %d ---\n",
	    index, qp, qp->q_status, l);
}

/*
 * Dump the scheduler status when writing to this sysctl variable.
 * XXX right now we only dump the status of the last instance created.
 * not a severe issue because this is only for debugging
 *
 * NOTE(review): this debug code references sc->sc_hash and
 * G_RR_HASH_SIZE, neither of which exists in struct g_rr_softc as
 * declared above — it will not compile if DEBUG_QUEUES is defined.
 * It appears to predate the removal of the per-softc hash table.
 */
static int
gs_rr_sysctl_status(SYSCTL_HANDLER_ARGS)
{
	int error, val = 0;
	struct g_rr_softc *sc;

	error = sysctl_handle_int(oidp, &val, 0, req);
	if (error || !req->newptr )
		return (error);

	printf("called %s\n", __FUNCTION__);

	LIST_FOREACH(sc, &me.sc_head, sc_next) {
		int i, tot = 0;
		printf("--- sc %p active %p nqueues %d "
		    "callout %d in_flight %d ---\n",
		    sc, sc->sc_active, sc->sc_nqueues,
		    callout_active(&sc->sc_wait),
		    sc->sc_in_flight);
		for (i = 0; i < G_RR_HASH_SIZE; i++) {
			struct g_rr_queue *qp;
			LIST_FOREACH(qp, &sc->sc_hash[i], q_hash) {
				gs_rr_dump_q(qp, tot);
				tot++;
			}
		}
	}
	return (0);
}

SYSCTL_PROC(_kern_geom_sched_rr, OID_AUTO, status,
	CTLTYPE_UINT |
	CTLFLAG_RW,
    0, sizeof(int), gs_rr_sysctl_status, "I", "status");

#endif	/* DEBUG_QUEUES */

/*
 * Get a bounded value, optionally convert to a min of t_min ticks.
 * Clamps v->x_cur into [x_min, x_max]; if t_min != 0 the result is
 * converted from milliseconds to ticks and floored at t_min.
 */
static int
get_bounded(struct x_bound *v, int t_min)
{
	int x;

	x = v->x_cur;
	if (x < v->x_min)
		x = v->x_min;
	else if (x > v->x_max)
		x = v->x_max;
	if (t_min) {
		x = x * hz / 1000;	/* convert to ticks */
		if (x < t_min)
			x = t_min;
	}
	return x;
}

/*
 * Get a reference to the queue for bp, using the generic
 * classification mechanism.
 */
static struct g_rr_queue *
g_rr_queue_get(struct g_rr_softc *sc, struct bio *bp)
{

	return (g_sched_get_class(sc->sc_geom, bp));
}

/*
 * Class constructor callback: initialize a freshly allocated queue
 * and account for it in the softc and the global counters.
 * Always returns 0 (success).
 */
static int
g_rr_init_class(void *data, void *priv)
{
	struct g_rr_softc *sc = data;
	struct g_rr_queue *qp = priv;

	bioq_init(&qp->q_bioq);

	/*
	 * Set the initial parameters for the client:
	 * slice size in bytes and ticks, and wait ticks.
	 * Right now these are constant, but we could have
	 * autoconfiguration code to adjust the values based on
	 * the actual workload.
	 */
	qp->q_budget = 1024 * get_bounded(&me.quantum_kb, 0);
	qp->q_slice_duration = get_bounded(&me.quantum_ms, 2);
	qp->q_wait_ticks = get_bounded(&me.wait_ms, 2);

	qp->q_sc = sc;		/* link to the parent */
	qp->q_sc->sc_nqueues++;
	me.queues++;

	return (0);
}

/*
 * Release a reference to the queue.
 */
static void
g_rr_queue_put(struct g_rr_queue *qp)
{

	g_sched_put_class(qp->q_sc->sc_geom, qp);
}

/*
 * Class destructor callback: the queue must be empty by now;
 * undo the accounting done in g_rr_init_class().
 */
static void
g_rr_fini_class(void *data, void *priv)
{
	struct g_rr_queue *qp = priv;

	KASSERT(bioq_first(&qp->q_bioq) == NULL,
	    ("released nonempty queue"));
	qp->q_sc->sc_nqueues--;
	me.queues--;
}

/*
 * A queue is expired when it has consumed its byte budget, or when
 * (after at least one completion, so q_slice_end is valid) its time
 * slice has elapsed.  The "ticks - q_slice_end >= 0" form is safe
 * against tick-counter wraparound.
 */
static inline int
g_rr_queue_expired(struct g_rr_queue *qp)
{

	if (qp->q_service >= qp->q_budget)
		return (1);

	if ((qp->q_flags & G_FLAG_COMPLETED) &&
	    ticks - qp->q_slice_end >= 0)
		return (1);

	return (0);
}

/*
 * Decide whether it is worth anticipating a new request from this
 * queue: not for writes when write-anticipation is disabled, and
 * not when the client historically thinks too long or seeks too far.
 */
static inline int
g_rr_should_anticipate(struct g_rr_queue *qp, struct bio *bp)
{
	int wait = get_bounded(&me.wait_ms, 2);

	if (!me.w_anticipate && (bp->bio_cmd == BIO_WRITE))
		return (0);

	if (g_savg_valid(&qp->q_thinktime) &&
	    g_savg_read(&qp->q_thinktime) > wait)
		return (0);

	if (g_savg_valid(&qp->q_seekdist) &&
	    g_savg_read(&qp->q_seekdist) > 8192)
		return (0);

	return (1);
}

/*
 * Called on a request arrival, timeout or completion.
 * Try to serve a request among those queued.
 * Returns the bio to dispatch, or NULL if nothing can be served now.
 */
static struct bio *
g_rr_next(void *data, int force)
{
	struct g_rr_softc *sc = data;
	struct g_rr_queue *qp;
	struct bio *bp, *next;
	int expired;

	qp = sc->sc_active;
	if (me.bypass == 0 && !force) {
		/* Respect the configured bound on outstanding requests. */
		if (sc->sc_in_flight >= get_bounded(&me.queue_depth, 0))
			return (NULL);

		/* Try with the queue under service first. */
		if (qp != NULL && qp->q_status != G_QUEUE_READY) {
			/*
			 * Queue is anticipating, ignore request.
			 * We should check that we are not past
			 * the timeout, but in that case the timeout
			 * will fire immediately afterwards so we
			 * don't bother.
			 */
			return (NULL);
		}
	} else if (qp != NULL && qp->q_status != G_QUEUE_READY) {
		/* Forced/bypass mode: drop the anticipated queue. */
		g_rr_queue_put(qp);
		sc->sc_active = qp = NULL;
	}

	/*
	 * No queue under service, look for the first in RR order.
	 * If we find it, select it as sc_active, clear service
	 * and record the end time of the slice.
	 */
	if (qp == NULL) {
		qp = TAILQ_FIRST(&sc->sc_rr_tailq);
		if (qp == NULL)
			return (NULL);	/* no queues at all, return */
		/* otherwise select the new queue for service.
		 */
		TAILQ_REMOVE(&sc->sc_rr_tailq, qp, q_tailq);
		sc->sc_active = qp;
		qp->q_service = 0;
		qp->q_flags &= ~G_FLAG_COMPLETED;
	}

	bp = bioq_takefirst(&qp->q_bioq);	/* surely not NULL */
	qp->q_service += bp->bio_length;	/* charge the service */

	/*
	 * The request at the head of the active queue is always
	 * dispatched, and gs_rr_next() will be called again
	 * immediately.
	 * We need to prepare for what to do next:
	 *
	 * 1. have we reached the end of the (time or service) slice ?
	 *    If so, clear sc_active and possibly requeue the previous
	 *    active queue if it has more requests pending;
	 * 2. do we have more requests in sc_active ?
	 *    If yes, do not anticipate, as gs_rr_next() will run again;
	 *    if no, decide whether or not to anticipate depending
	 *    on read or writes (e.g., anticipate only on reads).
	 */
	expired = g_rr_queue_expired(qp);	/* are we expired ? */
	next = bioq_first(&qp->q_bioq);	/* do we have one more ? */
	if (expired) {
		sc->sc_active = NULL;
		/* Either requeue or release reference.
		 */
		if (next != NULL)
			TAILQ_INSERT_TAIL(&sc->sc_rr_tailq, qp, q_tailq);
		else
			g_rr_queue_put(qp);
	} else if (next != NULL) {
		qp->q_status = G_QUEUE_READY;
	} else {
		if (!force && g_rr_should_anticipate(qp, bp)) {
			/* anticipate */
			qp->q_status = G_QUEUE_BUSY;
		} else {
			/* do not anticipate, release reference */
			g_rr_queue_put(qp);
			sc->sc_active = NULL;
		}
	}
	/* If sc_active != NULL, its q_status is always correct. */

	sc->sc_in_flight++;

	return (bp);
}

/*
 * Update the per-queue thinktime average with the time elapsed since
 * the previous submission, but only for the queue currently under
 * service (otherwise the gap is not "thinktime").
 */
static inline void
g_rr_update_thinktime(struct g_rr_queue *qp)
{
	int delta = ticks - qp->q_lastsub, wait = get_bounded(&me.wait_ms, 2);

	if (qp->q_sc->sc_active != qp)
		return;

	qp->q_lastsub = ticks;
	/* Clamp the inter-arrival time to twice the anticipation wait. */
	delta = (delta > 2 * wait) ?
	    2 * wait : delta;
	/* Start sampling only after a few requests have been seen. */
	if (qp->q_bionum > 7)
		g_savg_add_sample(&qp->q_thinktime, delta);
}

/*
 * Update the per-queue seek-distance average with the absolute
 * distance between this request and the end of the previous one,
 * capped at 64KB (8192 * 8).
 */
static inline void
g_rr_update_seekdist(struct g_rr_queue *qp, struct bio *bp)
{
	off_t dist;

	if (qp->q_lastoff > bp->bio_offset)
		dist = qp->q_lastoff - bp->bio_offset;
	else
		dist = bp->bio_offset - qp->q_lastoff;

	if (dist > (8192 * 8))
		dist = 8192 * 8;

	qp->q_lastoff = bp->bio_offset + bp->bio_length;

	/* Start sampling only after a few requests have been seen. */
	if (qp->q_bionum > 7)
		g_savg_add_sample(&qp->q_seekdist, dist);
}

/*
 * Called when a real request for disk I/O arrives.
 * Locate the queue associated with the client.
 * If the queue is the one we are anticipating for, reset its timeout;
 * if the queue is not in the round robin list, insert it in the list.
 * On any error, do not queue the request and return -1, the caller
 * will take care of this request.
 */
static int
g_rr_start(void *data, struct bio *bp)
{
	struct g_rr_softc *sc = data;
	struct g_rr_queue *qp;

	if (me.bypass)
		return (-1);	/* bypass the scheduler */

	/* Get the queue for the request. */
	qp = g_rr_queue_get(sc, bp);
	if (qp == NULL)
		return (-1);	/* allocation failed, tell upstream */

	if (bioq_first(&qp->q_bioq) == NULL) {
		/*
		 * We are inserting into an empty queue.
		 * Reset its state if it is sc_active,
		 * otherwise insert it in the RR list.
		 */
		if (qp == sc->sc_active) {
			/* Anticipation succeeded: cancel the timeout. */
			qp->q_status = G_QUEUE_READY;
			callout_stop(&sc->sc_wait);
		} else {
			g_sched_priv_ref(qp);
			TAILQ_INSERT_TAIL(&sc->sc_rr_tailq, qp, q_tailq);
		}
	}

	/* Exponentially-decayed request counter (drives sampling above). */
	qp->q_bionum = 1 + qp->q_bionum - (qp->q_bionum >> 3);

	g_rr_update_thinktime(qp);
	g_rr_update_seekdist(qp, bp);

	/*
	 * Inherit the reference returned by g_rr_queue_get();
	 * it is released in g_rr_done() via bio_caller1.
	 */
	bp->bio_caller1 = qp;
	bioq_disksort(&qp->q_bioq, bp);

	return (0);
}

/*
 * Callout executed when a queue times out anticipating a new request.
 */
static void
g_rr_wait_timeout(void *data)
{
	struct g_rr_softc *sc = data;
	struct g_geom *geom = sc->sc_geom;

	g_sched_lock(geom);
	/*
	 * We can race with other events, so check if
	 * sc_active is still valid.
	 */
	if (sc->sc_active != NULL) {
		/* Release the reference to the queue. */
		g_rr_queue_put(sc->sc_active);
		sc->sc_active = NULL;
		/*
		 * g_rr_done() optimistically counted a hit when it
		 * started the timer; undo that and record the miss.
		 */
		me.wait_hit--;
		me.wait_miss++;	/* record the miss */
	}
	g_sched_dispatch(geom);
	g_sched_unlock(geom);
}

/*
 * Module glue: allocate descriptor, initialize its fields.
595206497Sluigi */ 596206497Sluigistatic void * 597206497Sluigig_rr_init(struct g_geom *geom) 598206497Sluigi{ 599206497Sluigi struct g_rr_softc *sc; 600206497Sluigi 601206497Sluigi /* XXX check whether we can sleep */ 602206497Sluigi sc = malloc(sizeof *sc, M_GEOM_SCHED, M_NOWAIT | M_ZERO); 603206497Sluigi sc->sc_geom = geom; 604206497Sluigi TAILQ_INIT(&sc->sc_rr_tailq); 605283291Sjkim callout_init(&sc->sc_wait, 1); 606206497Sluigi LIST_INSERT_HEAD(&me.sc_head, sc, sc_next); 607206497Sluigi me.units++; 608206497Sluigi 609206497Sluigi return (sc); 610206497Sluigi} 611206497Sluigi 612206497Sluigi/* 613206497Sluigi * Module glue -- drain the callout structure, destroy the 614206497Sluigi * hash table and its element, and free the descriptor. 615206497Sluigi */ 616206497Sluigistatic void 617206497Sluigig_rr_fini(void *data) 618206497Sluigi{ 619206497Sluigi struct g_rr_softc *sc = data; 620206497Sluigi 621206497Sluigi callout_drain(&sc->sc_wait); 622206497Sluigi KASSERT(sc->sc_active == NULL, ("still a queue under service")); 623206497Sluigi KASSERT(TAILQ_EMPTY(&sc->sc_rr_tailq), ("still scheduled queues")); 624206497Sluigi 625206497Sluigi LIST_REMOVE(sc, sc_next); 626206497Sluigi me.units--; 627206497Sluigi free(sc, M_GEOM_SCHED); 628206497Sluigi} 629206497Sluigi 630206497Sluigi/* 631206497Sluigi * Called when the request under service terminates. 632206497Sluigi * Start the anticipation timer if needed. 633206497Sluigi */ 634206497Sluigistatic void 635206497Sluigig_rr_done(void *data, struct bio *bp) 636206497Sluigi{ 637206497Sluigi struct g_rr_softc *sc = data; 638206497Sluigi struct g_rr_queue *qp; 639206497Sluigi 640206497Sluigi sc->sc_in_flight--; 641206497Sluigi 642206497Sluigi qp = bp->bio_caller1; 643218675Sluigi 644218675Sluigi /* 645218675Sluigi * When the first request for this queue completes, update the 646218675Sluigi * duration and end of the slice. 
We do not do it when the 647218675Sluigi * slice starts to avoid charging to the queue the time for 648218675Sluigi * the first seek. 649218675Sluigi */ 650218675Sluigi if (!(qp->q_flags & G_FLAG_COMPLETED)) { 651218675Sluigi qp->q_flags |= G_FLAG_COMPLETED; 652218675Sluigi /* 653218675Sluigi * recompute the slice duration, in case we want 654218675Sluigi * to make it adaptive. This is not used right now. 655218675Sluigi * XXX should we do the same for q_quantum and q_wait_ticks ? 656218675Sluigi */ 657218675Sluigi qp->q_slice_duration = get_bounded(&me.quantum_ms, 2); 658218675Sluigi qp->q_slice_end = ticks + qp->q_slice_duration; 659218675Sluigi } 660218675Sluigi 661206497Sluigi if (qp == sc->sc_active && qp->q_status == G_QUEUE_BUSY) { 662206497Sluigi /* The queue is trying anticipation, start the timer. */ 663206497Sluigi qp->q_status = G_QUEUE_IDLING; 664206497Sluigi /* may make this adaptive */ 665206497Sluigi qp->q_wait_ticks = get_bounded(&me.wait_ms, 2); 666206497Sluigi me.wait_hit++; 667206497Sluigi callout_reset(&sc->sc_wait, qp->q_wait_ticks, 668206497Sluigi g_rr_wait_timeout, sc); 669206497Sluigi } else 670206497Sluigi g_sched_dispatch(sc->sc_geom); 671206497Sluigi 672206497Sluigi /* Release a reference to the queue. 
*/ 673206497Sluigi g_rr_queue_put(qp); 674206497Sluigi} 675206497Sluigi 676206497Sluigistatic void 677206497Sluigig_rr_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, 678206497Sluigi struct g_consumer *cp, struct g_provider *pp) 679206497Sluigi{ 680206497Sluigi if (indent == NULL) { /* plaintext */ 681206497Sluigi sbuf_printf(sb, " units %d queues %d", 682206497Sluigi me.units, me.queues); 683206497Sluigi } 684206497Sluigi} 685206497Sluigi 686206497Sluigistatic struct g_gsched g_rr = { 687206497Sluigi .gs_name = "rr", 688206497Sluigi .gs_priv_size = sizeof(struct g_rr_queue), 689206497Sluigi .gs_init = g_rr_init, 690206497Sluigi .gs_fini = g_rr_fini, 691206497Sluigi .gs_start = g_rr_start, 692206497Sluigi .gs_done = g_rr_done, 693206497Sluigi .gs_next = g_rr_next, 694206497Sluigi .gs_dumpconf = g_rr_dumpconf, 695206497Sluigi .gs_init_class = g_rr_init_class, 696206497Sluigi .gs_fini_class = g_rr_fini_class, 697206497Sluigi}; 698206497Sluigi 699206497SluigiDECLARE_GSCHED_MODULE(rr, &g_rr); 700