1292425Simp/*-
2292425Simp * Copyright (c) 2015 Netflix, Inc.
3292425Simp *
4292425Simp * Derived from gs_rr.c:
5292425Simp * Copyright (c) 2009-2010 Fabio Checconi
6292425Simp * Copyright (c) 2009-2010 Luigi Rizzo, Universita` di Pisa
7292425Simp * All rights reserved.
8292425Simp *
9292425Simp * Redistribution and use in source and binary forms, with or without
10292425Simp * modification, are permitted provided that the following conditions
11292425Simp * are met:
12292425Simp * 1. Redistributions of source code must retain the above copyright
13292425Simp *    notice, this list of conditions and the following disclaimer.
14292425Simp * 2. Redistributions in binary form must reproduce the above copyright
15292425Simp *    notice, this list of conditions and the following disclaimer in the
16292425Simp *    documentation and/or other materials provided with the distribution.
17292425Simp *
18292425Simp * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19292425Simp * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20292425Simp * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21292425Simp * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22292425Simp * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23292425Simp * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24292425Simp * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25292425Simp * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26292425Simp * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27292425Simp * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28292425Simp * SUCH DAMAGE.
29292425Simp */
30292425Simp
31292425Simp/*
32292425Simp * $Id$
33292425Simp * $FreeBSD$
34292425Simp *
35292425Simp * A simple scheduler that just delays certain transactions by a certain
36292425Simp * amount. We collect all the transactions that are 'done' and put them on
37292425Simp * a queue. The queue is run through every so often and the transactions that
38292425Simp * have taken longer than the threshold delay are completed.
39292425Simp */
40292425Simp
41292425Simp#include <sys/param.h>
42292425Simp#include <sys/systm.h>
43292425Simp#include <sys/kernel.h>
44292425Simp#include <sys/bio.h>
45292425Simp#include <sys/callout.h>
46292425Simp#include <sys/malloc.h>
47292425Simp#include <sys/module.h>
48292425Simp#include <sys/proc.h>
49292425Simp#include <sys/queue.h>
50292425Simp#include <sys/sbuf.h>
51292425Simp#include <sys/sysctl.h>
52292425Simp#include "gs_scheduler.h"
53292425Simp
/* Useful constants */
#define BTFRAC_1US 18446744073709ULL	/* 2^64 / 1000000: 1 microsecond as a struct bintime fraction */

/*
 * List head type for scheduler instances.
 * NOTE(review): no variable of this type is declared or used in this
 * file; it looks like residue from gs_rr.c — confirm before removing.
 */
LIST_HEAD(g_scheds, g_delay_softc);
59292425Simp
/*
 * Per device descriptor: a reference to the geom, the FIFO of delayed
 * requests (kept in release-time order, see g_delay_start()), and the
 * periodic timer that completes aged requests.  Unlike the gs_rr.c
 * ancestor, there is no round-robin list of per-client queues here.
 */
struct g_delay_softc {
	struct g_geom	*sc_geom;	/* geom we schedule for */

	struct bio_queue_head sc_bioq;	/* queue of pending requests */
	struct callout	sc_wait;	/* timer for completing with delays */

	/* Statistics */
	int		sc_in_flight;	/* requests in the driver */
};
73292425Simp
/*
 * parameters, config and stats (exposed via sysctl below)
 */
struct g_delay_params {
	uint64_t io;			/* count of I/Os delayed */
	int	bypass;			/* bypass scheduling */
	int	units;			/* how many instances */
	int	latency;		/* how big a latency we are hoping for, in microseconds */
};

static struct g_delay_params me = {
	.bypass = 0,
	.units = 0,
	.latency = 0,
	.io = 0,
};
/* Non-static pointer: lets other compilation units reach the knobs. */
struct g_delay_params *gs_delay_me = &me;
91292425Simp
92292425SimpSYSCTL_DECL(_kern_geom_sched);
93292425Simpstatic SYSCTL_NODE(_kern_geom_sched, OID_AUTO, delay, CTLFLAG_RW, 0,
94292425Simp    "GEOM_SCHED DELAY stuff");
95292425SimpSYSCTL_INT(_kern_geom_sched_delay, OID_AUTO, bypass, CTLFLAG_RD,
96292425Simp    &me.bypass, 0, "Scheduler bypass");
97292425SimpSYSCTL_INT(_kern_geom_sched_delay, OID_AUTO, units, CTLFLAG_RD,
98292425Simp    &me.units, 0, "Scheduler instances");
99292425SimpSYSCTL_INT(_kern_geom_sched_delay, OID_AUTO, latency, CTLFLAG_RW,
100292425Simp    &me.latency, 0, "Minimum latency for requests, in microseconds (1/hz resolution)");
101292425SimpSYSCTL_QUAD(_kern_geom_sched_delay, OID_AUTO, io, CTLFLAG_RW,
102292425Simp    &me.io, 0, "I/Os delayed\n");
103292425Simp
/*
 * Per-class initialization hook.  This scheduler keeps no class-wide
 * state, so there is nothing to set up; always report success.
 */
static int
g_delay_init_class(void *data, void *priv)
{

	return (0);
}
109292425Simp
/*
 * Per-class teardown hook.  Nothing was allocated in
 * g_delay_init_class(), so there is nothing to release.
 */
static void
g_delay_fini_class(void *data, void *priv)
{
}
114292425Simp
115292425Simp/*
116292425Simp * Called on a request arrival, timeout or completion.
117292425Simp * Try to serve a request among those queued.
118292425Simp */
119292425Simpstatic struct bio *
120292425Simpg_delay_next(void *data, int force)
121292425Simp{
122292425Simp	struct g_delay_softc *sc = data;
123292425Simp	struct bio *bp;
124292425Simp	struct bintime bt;
125292425Simp
126292425Simp	bp = bioq_first(&sc->sc_bioq);
127292425Simp	if (bp == NULL)
128292425Simp		return (NULL);
129292425Simp
130292425Simp	/*
131292425Simp	 * If the time isn't yet ripe for this bp to be let loose,
132292425Simp	 * then the time isn't ripe for any of its friends either
133292425Simp	 * since we insert in-order. Terminate if the bio hasn't
134292425Simp	 * aged appropriately. Note that there's pathology here
135292425Simp	 * such that we may be up to one tick early in releasing
136292425Simp	 * this I/O. We could implement this up to a tick late too
137292425Simp	 * but choose not to.
138292425Simp	 */
139292425Simp	getbinuptime(&bt);	/* BIO's bio_t0 is uptime */
140292425Simp	if (bintime_cmp(&bp->bio_t0, &bt, >))
141292425Simp		return (NULL);
142292425Simp	me.io++;
143292425Simp
144292425Simp	/*
145292425Simp	 * The bp has mellowed enough, let it through and update stats.
146292425Simp	 * If there's others, we'll catch them next time we get called.
147292425Simp	 */
148292425Simp	sc->sc_in_flight++;
149292425Simp
150292425Simp	bp = bioq_takefirst(&sc->sc_bioq);
151292425Simp	return (bp);
152292425Simp}
153292425Simp
154292425Simp/*
155292425Simp * Called when a real request for disk I/O arrives.
156292425Simp * Locate the queue associated with the client.
157292425Simp * If the queue is the one we are anticipating for, reset its timeout;
158292425Simp * if the queue is not in the round robin list, insert it in the list.
159292425Simp * On any error, do not queue the request and return -1, the caller
160292425Simp * will take care of this request.
161292425Simp */
162292425Simpstatic int
163292425Simpg_delay_start(void *data, struct bio *bp)
164292425Simp{
165292425Simp	struct g_delay_softc *sc = data;
166292425Simp
167292425Simp	if (me.bypass)
168292425Simp		return (-1);	/* bypass the scheduler */
169292425Simp
170292425Simp	bp->bio_caller1 = sc;
171292425Simp	getbinuptime(&bp->bio_t0);	/* BIO's bio_t0 is uptime */
172292425Simp	bintime_addx(&bp->bio_t0, BTFRAC_1US * me.latency);
173292425Simp
174292425Simp	/*
175292425Simp	 * Keep the I/Os ordered. Lower layers will reorder as we release them down.
176292425Simp	 * We rely on this in g_delay_next() so that we delay all things equally. Even
177292425Simp	 * if we move to multiple queues to push stuff down the stack, we'll want to
178292425Simp	 * insert in order and let the lower layers do whatever reordering they want.
179292425Simp	 */
180292425Simp	bioq_insert_tail(&sc->sc_bioq, bp);
181292425Simp
182292425Simp	return (0);
183292425Simp}
184292425Simp
185292425Simpstatic void
186292425Simpg_delay_timeout(void *data)
187292425Simp{
188292425Simp	struct g_delay_softc *sc = data;
189292425Simp
190292425Simp	g_sched_lock(sc->sc_geom);
191292425Simp	g_sched_dispatch(sc->sc_geom);
192292425Simp	g_sched_unlock(sc->sc_geom);
193292425Simp	callout_reset(&sc->sc_wait, 1, g_delay_timeout, sc);
194292425Simp}
195292425Simp
/*
 * Module glue: allocate the per-geom descriptor, initialize its fields,
 * and start the self-rearming one-tick timer that releases aged bios.
 */
static void *
g_delay_init(struct g_geom *geom)
{
	struct g_delay_softc *sc;

	/* M_WAITOK guarantees the allocation succeeds; no NULL check needed. */
	sc = malloc(sizeof *sc, M_GEOM_SCHED, M_WAITOK | M_ZERO);
	sc->sc_geom = geom;
	bioq_init(&sc->sc_bioq);
	callout_init(&sc->sc_wait, CALLOUT_MPSAFE);
	/* From here on g_delay_timeout() keeps re-arming itself every tick. */
	callout_reset(&sc->sc_wait, 1, g_delay_timeout, sc);
	me.units++;

	return (sc);
}
213292425Simp
/*
 * Module glue -- stop the timer and free the descriptor.  (There is no
 * hash table in this scheduler, despite what the gs_rr.c ancestor did.)
 */
static void
g_delay_fini(void *data)
{
	struct g_delay_softc *sc = data;

	/* We're force drained before getting here */

	/*
	 * Kick out timers; callout_drain() waits for a concurrently
	 * running g_delay_timeout() to finish before returning, so the
	 * softc cannot be touched after the free below.
	 */
	callout_drain(&sc->sc_wait);
	me.units--;
	free(sc, M_GEOM_SCHED);
}
230292425Simp
/*
 * Called when the request under service terminates.  Drop the in-flight
 * count and poke the dispatcher so any newly ripe queued bios move down.
 * (No anticipation timer here, unlike the gs_rr.c this was derived from.)
 */
static void
g_delay_done(void *data, struct bio *bp)
{
	struct g_delay_softc *sc = data;

	sc->sc_in_flight--;

	g_sched_dispatch(sc->sc_geom);
}
244292425Simp
/*
 * Module glue: dumpconf hook.  This scheduler has no extra
 * configuration to report, so the body is intentionally empty.
 */
static void
g_delay_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
    struct g_consumer *cp, struct g_provider *pp)
{
}
250292425Simp
/*
 * Method table handed to the g_sched framework.
 */
static struct g_gsched g_delay = {
	.gs_name = "delay",
	.gs_priv_size = 0,		/* no per-bio private data needed */
	.gs_init = g_delay_init,	/* per-geom setup */
	.gs_fini = g_delay_fini,	/* per-geom teardown */
	.gs_start = g_delay_start,	/* request arrival */
	.gs_done = g_delay_done,	/* request completion */
	.gs_next = g_delay_next,	/* pick next request to serve */
	.gs_dumpconf = g_delay_dumpconf,
	.gs_init_class = g_delay_init_class,
	.gs_fini_class = g_delay_fini_class,
};

DECLARE_GSCHED_MODULE(delay, &g_delay);
265