gs_delay.c revision 292425
1/*-
2 * Copyright (c) 2015 Netflix, Inc.
3 *
4 * Derived from gs_rr.c:
5 * Copyright (c) 2009-2010 Fabio Checconi
6 * Copyright (c) 2009-2010 Luigi Rizzo, Universita` di Pisa
7 * All rights reserved.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 *    notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 *    notice, this list of conditions and the following disclaimer in the
16 *    documentation and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 * SUCH DAMAGE.
29 */
30
31/*
32 * $Id$
33 * $FreeBSD: head/sys/geom/sched/gs_delay.c 292425 2015-12-18 05:39:25Z imp $
34 *
35 * A simple scheduler that just delays certain transactions by a certain
36 * amount. We collect all the transactions that are 'done' and put them on
37 * a queue. The queue is run through every so often and the transactions that
38 * have taken longer than the threshold delay are completed.
39 */
40
41#include <sys/param.h>
42#include <sys/systm.h>
43#include <sys/kernel.h>
44#include <sys/bio.h>
45#include <sys/callout.h>
46#include <sys/malloc.h>
47#include <sys/module.h>
48#include <sys/proc.h>
49#include <sys/queue.h>
50#include <sys/sbuf.h>
51#include <sys/sysctl.h>
52#include "gs_scheduler.h"
53
/*
 * One microsecond expressed in 64-bit binary-fraction-of-a-second units,
 * i.e. floor(2^64 / 1000000).  Multiply by a microsecond count below one
 * second to get a value suitable for bintime_addx().
 */
#define	BTFRAC_1US	18446744073709ULL
56
/* List-head type for chaining g_delay_softc scheduler instances. */
LIST_HEAD(g_scheds, g_delay_softc);
59
60/*
61 * Per device descriptor, holding the Round Robin list of queues
62 * accessing the disk, a reference to the geom, and the timer.
63 */
64struct g_delay_softc {
65	struct g_geom	*sc_geom;
66
67	struct bio_queue_head sc_bioq;	/* queue of pending requests */
68	struct callout	sc_wait;	/* timer for completing with delays */
69
70	/* Statistics */
71	int		sc_in_flight;	/* requests in the driver */
72};
73
/*
 * Module-wide tunables and counters, exported through sysctl.
 */
struct g_delay_params {
	uint64_t io;			/* number of I/Os released after delay */
	int	bypass;			/* non-zero: do not schedule at all */
	int	units;			/* how many instances exist */
	int	latency;		/* minimum latency wanted, in microseconds */
};
83
84static struct g_delay_params me = {
85	.bypass = 0,
86	.units = 0,
87	.latency = 0,
88	.io = 0,
89};
90struct g_delay_params *gs_delay_me = &me;
91
92SYSCTL_DECL(_kern_geom_sched);
93static SYSCTL_NODE(_kern_geom_sched, OID_AUTO, delay, CTLFLAG_RW, 0,
94    "GEOM_SCHED DELAY stuff");
95SYSCTL_INT(_kern_geom_sched_delay, OID_AUTO, bypass, CTLFLAG_RD,
96    &me.bypass, 0, "Scheduler bypass");
97SYSCTL_INT(_kern_geom_sched_delay, OID_AUTO, units, CTLFLAG_RD,
98    &me.units, 0, "Scheduler instances");
99SYSCTL_INT(_kern_geom_sched_delay, OID_AUTO, latency, CTLFLAG_RW,
100    &me.latency, 0, "Minimum latency for requests, in microseconds (1/hz resolution)");
101SYSCTL_QUAD(_kern_geom_sched_delay, OID_AUTO, io, CTLFLAG_RW,
102    &me.io, 0, "I/Os delayed\n");
103
/*
 * Per-class initialization hook.  Nothing needs to be set up here, so
 * both arguments are ignored and success (0) is always returned.
 */
static int
g_delay_init_class(void *data, void *priv)
{
	(void)data;
	(void)priv;

	return (0);
}
109
/*
 * Per-class teardown hook.  There is nothing to release, so this is
 * intentionally a no-op and both arguments are ignored.
 */
static void
g_delay_fini_class(void *data, void *priv)
{
	(void)data;
	(void)priv;
}
114
115/*
116 * Called on a request arrival, timeout or completion.
117 * Try to serve a request among those queued.
118 */
119static struct bio *
120g_delay_next(void *data, int force)
121{
122	struct g_delay_softc *sc = data;
123	struct bio *bp;
124	struct bintime bt;
125
126	bp = bioq_first(&sc->sc_bioq);
127	if (bp == NULL)
128		return (NULL);
129
130	/*
131	 * If the time isn't yet ripe for this bp to be let loose,
132	 * then the time isn't ripe for any of its friends either
133	 * since we insert in-order. Terminate if the bio hasn't
134	 * aged appropriately. Note that there's pathology here
135	 * such that we may be up to one tick early in releasing
136	 * this I/O. We could implement this up to a tick late too
137	 * but choose not to.
138	 */
139	getbinuptime(&bt);	/* BIO's bio_t0 is uptime */
140	if (bintime_cmp(&bp->bio_t0, &bt, >))
141		return (NULL);
142	me.io++;
143
144	/*
145	 * The bp has mellowed enough, let it through and update stats.
146	 * If there's others, we'll catch them next time we get called.
147	 */
148	sc->sc_in_flight++;
149
150	bp = bioq_takefirst(&sc->sc_bioq);
151	return (bp);
152}
153
154/*
155 * Called when a real request for disk I/O arrives.
156 * Locate the queue associated with the client.
157 * If the queue is the one we are anticipating for, reset its timeout;
158 * if the queue is not in the round robin list, insert it in the list.
159 * On any error, do not queue the request and return -1, the caller
160 * will take care of this request.
161 */
162static int
163g_delay_start(void *data, struct bio *bp)
164{
165	struct g_delay_softc *sc = data;
166
167	if (me.bypass)
168		return (-1);	/* bypass the scheduler */
169
170	bp->bio_caller1 = sc;
171	getbinuptime(&bp->bio_t0);	/* BIO's bio_t0 is uptime */
172	bintime_addx(&bp->bio_t0, BTFRAC_1US * me.latency);
173
174	/*
175	 * Keep the I/Os ordered. Lower layers will reorder as we release them down.
176	 * We rely on this in g_delay_next() so that we delay all things equally. Even
177	 * if we move to multiple queues to push stuff down the stack, we'll want to
178	 * insert in order and let the lower layers do whatever reordering they want.
179	 */
180	bioq_insert_tail(&sc->sc_bioq, bp);
181
182	return (0);
183}
184
185static void
186g_delay_timeout(void *data)
187{
188	struct g_delay_softc *sc = data;
189
190	g_sched_lock(sc->sc_geom);
191	g_sched_dispatch(sc->sc_geom);
192	g_sched_unlock(sc->sc_geom);
193	callout_reset(&sc->sc_wait, 1, g_delay_timeout, sc);
194}
195
196/*
197 * Module glue: allocate descriptor, initialize its fields.
198 */
199static void *
200g_delay_init(struct g_geom *geom)
201{
202	struct g_delay_softc *sc;
203
204	/* XXX check whether we can sleep */
205	sc = malloc(sizeof *sc, M_GEOM_SCHED, M_NOWAIT | M_ZERO);
206	sc->sc_geom = geom;
207	bioq_init(&sc->sc_bioq);
208	callout_init(&sc->sc_wait, CALLOUT_MPSAFE);
209	callout_reset(&sc->sc_wait, 1, g_delay_timeout, sc);
210	me.units++;
211
212	return (sc);
213}
214
215/*
216 * Module glue -- drain the callout structure, destroy the
217 * hash table and its element, and free the descriptor.
218 */
219static void
220g_delay_fini(void *data)
221{
222	struct g_delay_softc *sc = data;
223
224	/* We're force drained before getting here */
225
226	/* Kick out timers */
227	callout_drain(&sc->sc_wait);
228	me.units--;
229	free(sc, M_GEOM_SCHED);
230}
231
232/*
233 * Called when the request under service terminates.
234 * Start the anticipation timer if needed.
235 */
236static void
237g_delay_done(void *data, struct bio *bp)
238{
239	struct g_delay_softc *sc = data;
240
241	sc->sc_in_flight--;
242
243	g_sched_dispatch(sc->sc_geom);
244}
245
/*
 * Configuration dump hook.  This scheduler emits nothing, so the
 * function is intentionally empty and all arguments are ignored.
 */
static void
g_delay_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
    struct g_consumer *cp, struct g_provider *pp)
{
	(void)sb;
	(void)indent;
	(void)gp;
	(void)cp;
	(void)pp;
}
251
252static struct g_gsched g_delay = {
253	.gs_name = "delay",
254	.gs_priv_size = 0,
255	.gs_init = g_delay_init,
256	.gs_fini = g_delay_fini,
257	.gs_start = g_delay_start,
258	.gs_done = g_delay_done,
259	.gs_next = g_delay_next,
260	.gs_dumpconf = g_delay_dumpconf,
261	.gs_init_class = g_delay_init_class,
262	.gs_fini_class = g_delay_fini_class,
263};
264
265DECLARE_GSCHED_MODULE(delay, &g_delay);
266