/*-
 * Copyright (c) 2015 Netflix, Inc.
 *
 * Derived from gs_rr.c:
 * Copyright (c) 2009-2010 Fabio Checconi
 * Copyright (c) 2009-2010 Luigi Rizzo, Universita` di Pisa
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
29292425Simp */ 30292425Simp 31292425Simp/* 32292425Simp * $Id$ 33292425Simp * $FreeBSD$ 34292425Simp * 35292425Simp * A simple scheduler that just delays certain transactions by a certain 36292425Simp * amount. We collect all the transactions that are 'done' and put them on 37292425Simp * a queue. The queue is run through every so often and the transactions that 38292425Simp * have taken longer than the threshold delay are completed. 39292425Simp */ 40292425Simp 41292425Simp#include <sys/param.h> 42292425Simp#include <sys/systm.h> 43292425Simp#include <sys/kernel.h> 44292425Simp#include <sys/bio.h> 45292425Simp#include <sys/callout.h> 46292425Simp#include <sys/malloc.h> 47292425Simp#include <sys/module.h> 48292425Simp#include <sys/proc.h> 49292425Simp#include <sys/queue.h> 50292425Simp#include <sys/sbuf.h> 51292425Simp#include <sys/sysctl.h> 52292425Simp#include "gs_scheduler.h" 53292425Simp 54292425Simp/* Useful constants */ 55292425Simp#define BTFRAC_1US 18446744073709ULL /* 2^64 / 1000000 */ 56292425Simp 57292425Simp/* list of scheduler instances */ 58292425SimpLIST_HEAD(g_scheds, g_delay_softc); 59292425Simp 60292425Simp/* 61292425Simp * Per device descriptor, holding the Round Robin list of queues 62292425Simp * accessing the disk, a reference to the geom, and the timer. 
63292425Simp */ 64292425Simpstruct g_delay_softc { 65292425Simp struct g_geom *sc_geom; 66292425Simp 67292425Simp struct bio_queue_head sc_bioq; /* queue of pending requests */ 68292425Simp struct callout sc_wait; /* timer for completing with delays */ 69292425Simp 70292425Simp /* Statistics */ 71292425Simp int sc_in_flight; /* requests in the driver */ 72292425Simp}; 73292425Simp 74292425Simp/* 75292425Simp * parameters, config and stats 76292425Simp */ 77292425Simpstruct g_delay_params { 78292425Simp uint64_t io; 79292425Simp int bypass; /* bypass scheduling */ 80292425Simp int units; /* how many instances */ 81292425Simp int latency; /* How big a latncy are hoping for */ 82292425Simp}; 83292425Simp 84292425Simpstatic struct g_delay_params me = { 85292425Simp .bypass = 0, 86292425Simp .units = 0, 87292425Simp .latency = 0, 88292425Simp .io = 0, 89292425Simp}; 90292425Simpstruct g_delay_params *gs_delay_me = &me; 91292425Simp 92292425SimpSYSCTL_DECL(_kern_geom_sched); 93292425Simpstatic SYSCTL_NODE(_kern_geom_sched, OID_AUTO, delay, CTLFLAG_RW, 0, 94292425Simp "GEOM_SCHED DELAY stuff"); 95292425SimpSYSCTL_INT(_kern_geom_sched_delay, OID_AUTO, bypass, CTLFLAG_RD, 96292425Simp &me.bypass, 0, "Scheduler bypass"); 97292425SimpSYSCTL_INT(_kern_geom_sched_delay, OID_AUTO, units, CTLFLAG_RD, 98292425Simp &me.units, 0, "Scheduler instances"); 99292425SimpSYSCTL_INT(_kern_geom_sched_delay, OID_AUTO, latency, CTLFLAG_RW, 100292425Simp &me.latency, 0, "Minimum latency for requests, in microseconds (1/hz resolution)"); 101292425SimpSYSCTL_QUAD(_kern_geom_sched_delay, OID_AUTO, io, CTLFLAG_RW, 102292425Simp &me.io, 0, "I/Os delayed\n"); 103292425Simp 104292425Simpstatic int 105292425Simpg_delay_init_class(void *data, void *priv) 106292425Simp{ 107292425Simp return (0); 108292425Simp} 109292425Simp 110292425Simpstatic void 111292425Simpg_delay_fini_class(void *data, void *priv) 112292425Simp{ 113292425Simp} 114292425Simp 115292425Simp/* 116292425Simp * Called on a request 
arrival, timeout or completion. 117292425Simp * Try to serve a request among those queued. 118292425Simp */ 119292425Simpstatic struct bio * 120292425Simpg_delay_next(void *data, int force) 121292425Simp{ 122292425Simp struct g_delay_softc *sc = data; 123292425Simp struct bio *bp; 124292425Simp struct bintime bt; 125292425Simp 126292425Simp bp = bioq_first(&sc->sc_bioq); 127292425Simp if (bp == NULL) 128292425Simp return (NULL); 129292425Simp 130292425Simp /* 131292425Simp * If the time isn't yet ripe for this bp to be let loose, 132292425Simp * then the time isn't ripe for any of its friends either 133292425Simp * since we insert in-order. Terminate if the bio hasn't 134292425Simp * aged appropriately. Note that there's pathology here 135292425Simp * such that we may be up to one tick early in releasing 136292425Simp * this I/O. We could implement this up to a tick late too 137292425Simp * but choose not to. 138292425Simp */ 139292425Simp getbinuptime(&bt); /* BIO's bio_t0 is uptime */ 140292425Simp if (bintime_cmp(&bp->bio_t0, &bt, >)) 141292425Simp return (NULL); 142292425Simp me.io++; 143292425Simp 144292425Simp /* 145292425Simp * The bp has mellowed enough, let it through and update stats. 146292425Simp * If there's others, we'll catch them next time we get called. 147292425Simp */ 148292425Simp sc->sc_in_flight++; 149292425Simp 150292425Simp bp = bioq_takefirst(&sc->sc_bioq); 151292425Simp return (bp); 152292425Simp} 153292425Simp 154292425Simp/* 155292425Simp * Called when a real request for disk I/O arrives. 156292425Simp * Locate the queue associated with the client. 157292425Simp * If the queue is the one we are anticipating for, reset its timeout; 158292425Simp * if the queue is not in the round robin list, insert it in the list. 159292425Simp * On any error, do not queue the request and return -1, the caller 160292425Simp * will take care of this request. 
161292425Simp */ 162292425Simpstatic int 163292425Simpg_delay_start(void *data, struct bio *bp) 164292425Simp{ 165292425Simp struct g_delay_softc *sc = data; 166292425Simp 167292425Simp if (me.bypass) 168292425Simp return (-1); /* bypass the scheduler */ 169292425Simp 170292425Simp bp->bio_caller1 = sc; 171292425Simp getbinuptime(&bp->bio_t0); /* BIO's bio_t0 is uptime */ 172292425Simp bintime_addx(&bp->bio_t0, BTFRAC_1US * me.latency); 173292425Simp 174292425Simp /* 175292425Simp * Keep the I/Os ordered. Lower layers will reorder as we release them down. 176292425Simp * We rely on this in g_delay_next() so that we delay all things equally. Even 177292425Simp * if we move to multiple queues to push stuff down the stack, we'll want to 178292425Simp * insert in order and let the lower layers do whatever reordering they want. 179292425Simp */ 180292425Simp bioq_insert_tail(&sc->sc_bioq, bp); 181292425Simp 182292425Simp return (0); 183292425Simp} 184292425Simp 185292425Simpstatic void 186292425Simpg_delay_timeout(void *data) 187292425Simp{ 188292425Simp struct g_delay_softc *sc = data; 189292425Simp 190292425Simp g_sched_lock(sc->sc_geom); 191292425Simp g_sched_dispatch(sc->sc_geom); 192292425Simp g_sched_unlock(sc->sc_geom); 193292425Simp callout_reset(&sc->sc_wait, 1, g_delay_timeout, sc); 194292425Simp} 195292425Simp 196292425Simp/* 197292425Simp * Module glue: allocate descriptor, initialize its fields. 
198292425Simp */ 199292425Simpstatic void * 200292425Simpg_delay_init(struct g_geom *geom) 201292425Simp{ 202292425Simp struct g_delay_softc *sc; 203292425Simp 204292436Simp sc = malloc(sizeof *sc, M_GEOM_SCHED, M_WAITOK | M_ZERO); 205292425Simp sc->sc_geom = geom; 206292425Simp bioq_init(&sc->sc_bioq); 207292425Simp callout_init(&sc->sc_wait, CALLOUT_MPSAFE); 208292425Simp callout_reset(&sc->sc_wait, 1, g_delay_timeout, sc); 209292425Simp me.units++; 210292425Simp 211292425Simp return (sc); 212292425Simp} 213292425Simp 214292425Simp/* 215292425Simp * Module glue -- drain the callout structure, destroy the 216292425Simp * hash table and its element, and free the descriptor. 217292425Simp */ 218292425Simpstatic void 219292425Simpg_delay_fini(void *data) 220292425Simp{ 221292425Simp struct g_delay_softc *sc = data; 222292425Simp 223292425Simp /* We're force drained before getting here */ 224292425Simp 225292425Simp /* Kick out timers */ 226292425Simp callout_drain(&sc->sc_wait); 227292425Simp me.units--; 228292425Simp free(sc, M_GEOM_SCHED); 229292425Simp} 230292425Simp 231292425Simp/* 232292425Simp * Called when the request under service terminates. 233292425Simp * Start the anticipation timer if needed. 
234292425Simp */ 235292425Simpstatic void 236292425Simpg_delay_done(void *data, struct bio *bp) 237292425Simp{ 238292425Simp struct g_delay_softc *sc = data; 239292425Simp 240292425Simp sc->sc_in_flight--; 241292425Simp 242292425Simp g_sched_dispatch(sc->sc_geom); 243292425Simp} 244292425Simp 245292425Simpstatic void 246292425Simpg_delay_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, 247292425Simp struct g_consumer *cp, struct g_provider *pp) 248292425Simp{ 249292425Simp} 250292425Simp 251292425Simpstatic struct g_gsched g_delay = { 252292425Simp .gs_name = "delay", 253292425Simp .gs_priv_size = 0, 254292425Simp .gs_init = g_delay_init, 255292425Simp .gs_fini = g_delay_fini, 256292425Simp .gs_start = g_delay_start, 257292425Simp .gs_done = g_delay_done, 258292425Simp .gs_next = g_delay_next, 259292425Simp .gs_dumpconf = g_delay_dumpconf, 260292425Simp .gs_init_class = g_delay_init_class, 261292425Simp .gs_fini_class = g_delay_fini_class, 262292425Simp}; 263292425Simp 264292425SimpDECLARE_GSCHED_MODULE(delay, &g_delay); 265