/*-
 * Copyright (c) 2009-2010 Fabio Checconi
 * Copyright (c) 2009-2010 Luigi Rizzo, Universita` di Pisa
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * $Id$
 * $FreeBSD: head/sys/geom/sched/gs_rr.c 218675 2011-02-14 08:09:02Z luigi $
 *
 * A round-robin (RR) anticipatory scheduler, with per-client queues.
 *
 * The goal of this implementation is to improve throughput compared
 * to the pure elevator algorithm, and to ensure some fairness among
 * clients.
 *
 * Requests coming from the same client are put in the same queue.
 * We use anticipation to help reduce seeks, and each queue
 * is never served continuously for more than a given amount of
 * time or data. Queues are then served in a round-robin fashion.
 *
 * Each queue can be in any of the following states:
 *     READY	immediately serve the first pending request;
 *     BUSY	one request is under service, wait for completion;
 *     IDLING	do not serve incoming requests immediately, unless
 * 		they are "eligible" as defined later.
 *
 * Scheduling decisions are made by looking at the status of all
 * queues; the first one in round-robin order is privileged.
 */
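
/*
 * As an illustration (derived from the code below), the typical
 * life cycle of a queue is:
 *
 *	READY --(dispatch last pending request,
 *		 decide to anticipate)--> BUSY
 *	BUSY --(request completes)--> IDLING, wait timer armed
 *	IDLING --(new request arrives)--> READY
 *	IDLING --(timer fires)--> queue released, next queue served
 *
 * If more requests are pending when one is dispatched, the queue
 * simply stays READY until its slice expires.
 */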

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/bio.h>
#include <sys/callout.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/sysctl.h>
#include "gs_scheduler.h"

/* Possible states of a queue. */
enum g_rr_state {
	G_QUEUE_READY = 0,	/* Ready to dispatch. */
	G_QUEUE_BUSY,		/* Waiting for a completion. */
	G_QUEUE_IDLING		/* Waiting for a new request. */
};

/* Possible queue flags. */
enum g_rr_flags {
	/* G_FLAG_COMPLETED means that the field q_slice_end is valid. */
	G_FLAG_COMPLETED = 1,	/* Completed a req. in the current budget. */
};

struct g_rr_softc;

/*
 * Queue descriptor, containing a reference count, scheduling
 * state, a queue of pending requests, and configuration parameters.
 * Queues with pending request(s) and not under service are also
 * stored in a Round Robin (RR) list.
 */
struct g_rr_queue {
	struct g_rr_softc *q_sc;	/* link to the parent */

	enum g_rr_state	q_status;
	unsigned int	q_service;	/* service received so far */
	int		q_slice_end;	/* actual slice end time, in ticks */
	enum g_rr_flags	q_flags;	/* queue flags */
	struct bio_queue_head q_bioq;

	/* Scheduling parameters */
	unsigned int	q_budget;	/* slice size in bytes */
	unsigned int	q_slice_duration; /* slice size in ticks */
	unsigned int	q_wait_ticks;	/* wait time for anticipation */

	/* Stats to drive the various heuristics. */
	struct g_savg	q_thinktime;	/* Thinktime average. */
	struct g_savg	q_seekdist;	/* Seek distance average. */

	int		q_bionum;	/* Number of requests. */

	off_t		q_lastoff;	/* Last submitted req. offset. */
	int		q_lastsub;	/* Last submitted req. time. */

	/* Expiration deadline for an empty queue. */
	int		q_expire;

	TAILQ_ENTRY(g_rr_queue) q_tailq; /* RR list link field */
};

/* List types. */
TAILQ_HEAD(g_rr_tailq, g_rr_queue);

/* List of scheduler instances. */
LIST_HEAD(g_scheds, g_rr_softc);

/* Default quantum for RR between queues. */
#define	G_RR_DEFAULT_BUDGET	0x00800000

/*
 * Per-device descriptor, holding the Round Robin list of queues
 * accessing the disk, a reference to the geom, and the timer.
 */
struct g_rr_softc {
	struct g_geom	*sc_geom;

	/*
	 * sc_active is the queue we are anticipating for.
	 * It is set only in gs_rr_next(), and possibly cleared
	 * only in gs_rr_next() or on a timeout.
	 * The active queue is never in the Round Robin list
	 * even if it has requests queued.
	 */
	struct g_rr_queue *sc_active;
	struct callout	sc_wait;	/* timer for sc_active */

	struct g_rr_tailq sc_rr_tailq;	/* the round-robin list */
	int		sc_nqueues;	/* number of queues */

	/* Statistics */
	int		sc_in_flight;	/* requests in the driver */

	LIST_ENTRY(g_rr_softc)	sc_next;
};

/* Descriptor for bounded values, min and max are constant. */
struct x_bound {
	const int	x_min;
	int		x_cur;
	const int	x_max;
};

/*
 * Parameters, config and stats.
 */
struct g_rr_params {
	int	queues;			/* total number of queues */
	int	w_anticipate;		/* anticipate writes */
	int	bypass;			/* bypass the scheduler */

	int	units;			/* how many instances */
	/* sc_head is used for debugging */
	struct g_scheds	sc_head;	/* first scheduler instance */

	struct x_bound queue_depth;	/* max parallel requests */
	struct x_bound wait_ms;		/* wait time, milliseconds */
	struct x_bound quantum_ms;	/* quantum size, milliseconds */
	struct x_bound quantum_kb;	/* quantum size, KiB (1024 bytes) */

	/* statistics */
	int	wait_hit;		/* success in anticipation */
	int	wait_miss;		/* failure in anticipation */
};

/*
 * Default parameters for the scheduler.  The quantum sizes target
 * an 80 MB/s disk; if the hardware is faster or slower, whichever
 * of the two limits (time or bytes) is reached first will take
 * effect: the clients will still be isolated, but the fairness may
 * be limited.  A complete solution would involve the on-line
 * measurement of the actual disk throughput to derive these
 * parameters.  Or we may just choose to ignore service domain
 * fairness and accept what can be achieved with time-only budgets.
 */
static struct g_rr_params me = {
	.sc_head = LIST_HEAD_INITIALIZER(&me.sc_head),
	.w_anticipate =	1,
	.queue_depth =	{ 1,	1,	50 },
	.wait_ms =	{ 1,	10,	30 },
	.quantum_ms =	{ 1,	100,	500 },
	.quantum_kb =	{ 16,	8192,	65536 },
};
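
/*
 * Sanity check on the defaults above: at the target 80 MB/s, the
 * 100 ms time quantum and the 8192 KiB byte quantum describe the
 * same amount of service (80 MB/s * 0.1 s = 8 MB), so on the target
 * disk the two limits expire together; on a faster or slower disk
 * one of them is reached first.
 */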

struct g_rr_params *gs_rr_me = &me;

SYSCTL_DECL(_kern_geom_sched);
SYSCTL_NODE(_kern_geom_sched, OID_AUTO, rr, CTLFLAG_RW, 0,
    "GEOM_SCHED ROUND ROBIN stuff");
SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, units, CTLFLAG_RD,
    &me.units, 0, "Scheduler instances");
SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, queues, CTLFLAG_RD,
    &me.queues, 0, "Total rr queues");
SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, wait_ms, CTLFLAG_RW,
    &me.wait_ms.x_cur, 0, "Wait time milliseconds");
SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, quantum_ms, CTLFLAG_RW,
    &me.quantum_ms.x_cur, 0, "Quantum size milliseconds");
SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, bypass, CTLFLAG_RW,
    &me.bypass, 0, "Bypass scheduler");
SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, w_anticipate, CTLFLAG_RW,
    &me.w_anticipate, 0, "Do anticipation on writes");
SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, quantum_kb, CTLFLAG_RW,
    &me.quantum_kb.x_cur, 0, "Quantum size Kbytes");
SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, queue_depth, CTLFLAG_RW,
    &me.queue_depth.x_cur, 0, "Maximum simultaneous requests");
SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, wait_hit, CTLFLAG_RW,
    &me.wait_hit, 0, "Hits in anticipation");
SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, wait_miss, CTLFLAG_RW,
    &me.wait_miss, 0, "Misses in anticipation");

#ifdef DEBUG_QUEUES
/* Print the status of a queue. */
static void
gs_rr_dump_q(struct g_rr_queue *qp, int index)
{
	int l = 0;
	struct bio *bp;

	TAILQ_FOREACH(bp, &(qp->q_bioq.queue), bio_queue) {
		l++;
	}
	printf("--- rr queue %d %p status %d len %d ---\n",
	    index, qp, qp->q_status, l);
}

/*
 * Dump the scheduler status when writing to this sysctl variable.
 * XXX right now we only dump the status of the last instance created,
 * not a severe issue because this is only for debugging.
 */
static int
gs_rr_sysctl_status(SYSCTL_HANDLER_ARGS)
{
	int error, val = 0;
	struct g_rr_softc *sc;

	error = sysctl_handle_int(oidp, &val, 0, req);
	if (error || !req->newptr)
		return (error);

	printf("called %s\n", __func__);

	LIST_FOREACH(sc, &me.sc_head, sc_next) {
		struct g_rr_queue *qp;
		int tot = 0;

		printf("--- sc %p active %p nqueues %d "
		    "callout %d in_flight %d ---\n",
		    sc, sc->sc_active, sc->sc_nqueues,
		    callout_active(&sc->sc_wait),
		    sc->sc_in_flight);
		/*
		 * XXX the per-instance hash table is gone (classification
		 * moved to the common code), so walk the active queue and
		 * the RR list instead.
		 */
		if (sc->sc_active != NULL)
			gs_rr_dump_q(sc->sc_active, tot++);
		TAILQ_FOREACH(qp, &sc->sc_rr_tailq, q_tailq)
			gs_rr_dump_q(qp, tot++);
	}
	return (0);
}

SYSCTL_PROC(_kern_geom_sched_rr, OID_AUTO, status,
    CTLTYPE_UINT | CTLFLAG_RW, 0, sizeof(int),
    gs_rr_sysctl_status, "I", "status");

#endif	/* DEBUG_QUEUES */

/*
 * Return the value of v clamped to its bounds; if t_min is non-zero,
 * also convert the value from milliseconds to ticks, enforcing a
 * minimum of t_min ticks.
 */
static int
get_bounded(struct x_bound *v, int t_min)
{
	int x;

	x = v->x_cur;
	if (x < v->x_min)
		x = v->x_min;
	else if (x > v->x_max)
		x = v->x_max;
	if (t_min) {
		x = x * hz / 1000;	/* convert to ticks */
		if (x < t_min)
			x = t_min;
	}
	return (x);
}
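
/*
 * Worked examples (illustrative, assuming the common hz = 1000):
 * get_bounded(&me.wait_ms, 2) with x_cur = 10 clamps 10 to [1, 30]
 * and converts it to 10 * 1000 / 1000 = 10 ticks; with hz = 100 the
 * same call would yield 1 tick, which the t_min floor raises to 2.
 * get_bounded(&me.quantum_kb, 0) performs no tick conversion at all.
 */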

/*
 * Get a reference to the queue for bp, using the generic
 * classification mechanism.
 */
static struct g_rr_queue *
g_rr_queue_get(struct g_rr_softc *sc, struct bio *bp)
{

	return (g_sched_get_class(sc->sc_geom, bp));
}

static int
g_rr_init_class(void *data, void *priv)
{
	struct g_rr_softc *sc = data;
	struct g_rr_queue *qp = priv;

	gs_bioq_init(&qp->q_bioq);

	/*
	 * Set the initial parameters for the client:
	 * slice size in bytes and ticks, and wait ticks.
	 * Right now these are constant, but we could have
	 * autoconfiguration code to adjust the values based on
	 * the actual workload.
	 */
	qp->q_budget = 1024 * get_bounded(&me.quantum_kb, 0);
	qp->q_slice_duration = get_bounded(&me.quantum_ms, 2);
	qp->q_wait_ticks = get_bounded(&me.wait_ms, 2);

	qp->q_sc = sc;		/* link to the parent */
	qp->q_sc->sc_nqueues++;
	me.queues++;

	return (0);
}

/*
 * Release a reference to the queue.
 */
static void
g_rr_queue_put(struct g_rr_queue *qp)
{

	g_sched_put_class(qp->q_sc->sc_geom, qp);
}

static void
g_rr_fini_class(void *data, void *priv)
{
	struct g_rr_queue *qp = priv;

	KASSERT(gs_bioq_first(&qp->q_bioq) == NULL,
	    ("released nonempty queue"));
	qp->q_sc->sc_nqueues--;
	me.queues--;
}

static inline int
g_rr_queue_expired(struct g_rr_queue *qp)
{

	if (qp->q_service >= qp->q_budget)
		return (1);

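	/*
	 * The check below is written as a signed difference rather
	 * than "ticks >= qp->q_slice_end" so that it still works
	 * when the tick counter wraps around.
	 */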
	if ((qp->q_flags & G_FLAG_COMPLETED) &&
	    ticks - qp->q_slice_end >= 0)
		return (1);

	return (0);
}

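/*
 * Decide whether it is worth idling the disk waiting for the next
 * request from this queue: anticipate only if the client has been
 * issuing requests quickly (short think time) and close to each
 * other on the disk (short seek distance), and, for writes, only
 * if w_anticipate is enabled.
 */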
static inline int
g_rr_should_anticipate(struct g_rr_queue *qp, struct bio *bp)
{
	int wait = get_bounded(&me.wait_ms, 2);

	if (!me.w_anticipate && (bp->bio_cmd & BIO_WRITE))
		return (0);

	if (g_savg_valid(&qp->q_thinktime) &&
	    g_savg_read(&qp->q_thinktime) > wait)
		return (0);

	if (g_savg_valid(&qp->q_seekdist) &&
	    g_savg_read(&qp->q_seekdist) > 8192)
		return (0);

	return (1);
}

/*
 * Called on a request arrival, timeout or completion.
 * Try to serve a request among those queued.
 */
static struct bio *
g_rr_next(void *data, int force)
{
	struct g_rr_softc *sc = data;
	struct g_rr_queue *qp;
	struct bio *bp, *next;
	int expired;

	qp = sc->sc_active;
	if (me.bypass == 0 && !force) {
		if (sc->sc_in_flight >= get_bounded(&me.queue_depth, 0))
			return (NULL);

		/* Try with the queue under service first. */
		if (qp != NULL && qp->q_status != G_QUEUE_READY) {
			/*
			 * Queue is anticipating, ignore request.
			 * We should check that we are not past
			 * the timeout, but in that case the timeout
			 * will fire immediately afterwards so we
			 * don't bother.
			 */
			return (NULL);
		}
	} else if (qp != NULL && qp->q_status != G_QUEUE_READY) {
		g_rr_queue_put(qp);
		sc->sc_active = qp = NULL;
	}

	/*
	 * No queue under service, look for the first in RR order.
	 * If we find one, select it as sc_active, clear its service
	 * count, and invalidate the recorded slice end time (it is
	 * set again when the first request of the slice completes).
	 */
	if (qp == NULL) {
		qp = TAILQ_FIRST(&sc->sc_rr_tailq);
		if (qp == NULL)
			return (NULL); /* no queues at all, return */
		/* otherwise select the new queue for service. */
		TAILQ_REMOVE(&sc->sc_rr_tailq, qp, q_tailq);
		sc->sc_active = qp;
		qp->q_service = 0;
		qp->q_flags &= ~G_FLAG_COMPLETED;
	}

	bp = gs_bioq_takefirst(&qp->q_bioq);	/* surely not NULL */
	qp->q_service += bp->bio_length;	/* charge the service */

	/*
	 * The request at the head of the active queue is always
	 * dispatched, and gs_rr_next() will be called again
	 * immediately.
	 * We need to prepare for what to do next:
	 *
	 * 1. have we reached the end of the (time or service) slice ?
	 *    If so, clear sc_active and possibly requeue the previous
	 *    active queue if it has more requests pending;
	 * 2. do we have more requests in sc_active ?
	 *    If yes, do not anticipate, as gs_rr_next() will run again;
	 *    if no, decide whether or not to anticipate depending
	 *    on read or writes (e.g., anticipate only on reads).
	 */
	expired = g_rr_queue_expired(qp);	/* are we expired ? */
	next = gs_bioq_first(&qp->q_bioq);	/* do we have one more ? */
	if (expired) {
		sc->sc_active = NULL;
		/* Either requeue or release reference. */
		if (next != NULL)
			TAILQ_INSERT_TAIL(&sc->sc_rr_tailq, qp, q_tailq);
		else
			g_rr_queue_put(qp);
	} else if (next != NULL) {
		qp->q_status = G_QUEUE_READY;
	} else {
		if (!force && g_rr_should_anticipate(qp, bp)) {
			/* anticipate */
			qp->q_status = G_QUEUE_BUSY;
		} else {
			/* do not anticipate, release reference */
			g_rr_queue_put(qp);
			sc->sc_active = NULL;
		}
	}
	/* If sc_active != NULL, its q_status is always correct. */

	sc->sc_in_flight++;

	return (bp);
}

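/*
 * Update the average "think time" of the queue, i.e., how long the
 * client takes to issue a new request after submitting the previous
 * one. Samples are clamped to twice the anticipation wait and only
 * collected once the queue has seen a steady stream of requests
 * (q_bionum > 7), to avoid polluting the average during startup.
 */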
static inline void
g_rr_update_thinktime(struct g_rr_queue *qp)
{
	int delta = ticks - qp->q_lastsub, wait = get_bounded(&me.wait_ms, 2);

	if (qp->q_sc->sc_active != qp)
		return;

	qp->q_lastsub = ticks;
	delta = (delta > 2 * wait) ? 2 * wait : delta;
	if (qp->q_bionum > 7)
		g_savg_add_sample(&qp->q_thinktime, delta);
}

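/*
 * Update the average seek distance, i.e., how far the start of a
 * new request is from the end of the previous one. Samples are
 * capped at 64 KiB (8192 * 8) so that a single large jump does not
 * dominate the average, and again only collected after a steady
 * stream of requests.
 */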
static inline void
g_rr_update_seekdist(struct g_rr_queue *qp, struct bio *bp)
{
	off_t dist;

	if (qp->q_lastoff > bp->bio_offset)
		dist = qp->q_lastoff - bp->bio_offset;
	else
		dist = bp->bio_offset - qp->q_lastoff;

	if (dist > (8192 * 8))
		dist = 8192 * 8;

	qp->q_lastoff = bp->bio_offset + bp->bio_length;

	if (qp->q_bionum > 7)
		g_savg_add_sample(&qp->q_seekdist, dist);
}


/*
 * Called when a real request for disk I/O arrives.
 * Locate the queue associated with the client.
 * If the queue is the one we are anticipating for, reset its timeout;
 * if the queue is not in the round robin list, insert it in the list.
 * On any error, do not queue the request and return -1; the caller
 * will take care of the request.
 */
static int
g_rr_start(void *data, struct bio *bp)
{
	struct g_rr_softc *sc = data;
	struct g_rr_queue *qp;

	if (me.bypass)
		return (-1);	/* bypass the scheduler */

	/* Get the queue for the request. */
	qp = g_rr_queue_get(sc, bp);
	if (qp == NULL)
		return (-1); /* allocation failed, tell upstream */

	if (gs_bioq_first(&qp->q_bioq) == NULL) {
		/*
		 * We are inserting into an empty queue.
		 * Reset its state if it is sc_active,
		 * otherwise insert it in the RR list.
		 */
		if (qp == sc->sc_active) {
			qp->q_status = G_QUEUE_READY;
			callout_stop(&sc->sc_wait);
		} else {
			g_sched_priv_ref(qp);
			TAILQ_INSERT_TAIL(&sc->sc_rr_tailq, qp, q_tailq);
		}
	}

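	/*
	 * Account for the request with a decaying counter: each
	 * request adds one, and an eighth of the counter leaks away,
	 * so the value settles around 8 under steady traffic. The
	 * think time and seek distance samples are only collected
	 * past that point (q_bionum > 7).
	 */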
	qp->q_bionum = 1 + qp->q_bionum - (qp->q_bionum >> 3);

	g_rr_update_thinktime(qp);
	g_rr_update_seekdist(qp, bp);

	/* Inherit the reference returned by g_rr_queue_get(). */
	bp->bio_caller1 = qp;
	gs_bioq_disksort(&qp->q_bioq, bp);

	return (0);
}


/*
 * Callout executed when a queue times out anticipating a new request.
 */
static void
g_rr_wait_timeout(void *data)
{
	struct g_rr_softc *sc = data;
	struct g_geom *geom = sc->sc_geom;

	g_sched_lock(geom);
	/*
	 * We can race with other events, so check if
	 * sc_active is still valid.
	 */
	if (sc->sc_active != NULL) {
		/* Release the reference to the queue. */
		g_rr_queue_put(sc->sc_active);
		sc->sc_active = NULL;
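		/*
		 * g_rr_done() optimistically counted a hit when it
		 * armed the timer; undo that and record the miss.
		 */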
		me.wait_hit--;
		me.wait_miss++;	/* record the miss */
	}
	g_sched_dispatch(geom);
	g_sched_unlock(geom);
}

/*
 * Module glue: allocate descriptor, initialize its fields.
 */
static void *
g_rr_init(struct g_geom *geom)
{
	struct g_rr_softc *sc;

	/* XXX check whether we can sleep */
	sc = malloc(sizeof *sc, M_GEOM_SCHED, M_NOWAIT | M_ZERO);
	if (sc == NULL)		/* M_NOWAIT allocations may fail. */
		return (NULL);
	sc->sc_geom = geom;
	TAILQ_INIT(&sc->sc_rr_tailq);
	callout_init(&sc->sc_wait, CALLOUT_MPSAFE);
	LIST_INSERT_HEAD(&me.sc_head, sc, sc_next);
	me.units++;

	return (sc);
}

/*
 * Module glue -- drain the callout structure and free the
 * descriptor; the per-client queues are reference-counted and
 * reclaimed through the generic class mechanism.
 */
static void
g_rr_fini(void *data)
{
	struct g_rr_softc *sc = data;

	callout_drain(&sc->sc_wait);
	KASSERT(sc->sc_active == NULL, ("still a queue under service"));
	KASSERT(TAILQ_EMPTY(&sc->sc_rr_tailq), ("still scheduled queues"));

	LIST_REMOVE(sc, sc_next);
	me.units--;
	free(sc, M_GEOM_SCHED);
}

/*
 * Called when the request under service terminates.
 * Start the anticipation timer if needed.
 */
static void
g_rr_done(void *data, struct bio *bp)
{
	struct g_rr_softc *sc = data;
	struct g_rr_queue *qp;

	sc->sc_in_flight--;

	qp = bp->bio_caller1;

	/*
	 * When the first request for this queue completes, update the
	 * duration and end of the slice. We do not do it when the
	 * slice starts to avoid charging to the queue the time for
	 * the first seek.
	 */
	if (!(qp->q_flags & G_FLAG_COMPLETED)) {
		qp->q_flags |= G_FLAG_COMPLETED;
		/*
		 * Recompute the slice duration, in case we want
		 * to make it adaptive. This is not used right now.
		 * XXX should we do the same for q_budget and q_wait_ticks ?
		 */
		qp->q_slice_duration = get_bounded(&me.quantum_ms, 2);
		qp->q_slice_end = ticks + qp->q_slice_duration;
	}

	if (qp == sc->sc_active && qp->q_status == G_QUEUE_BUSY) {
		/* The queue is trying anticipation, start the timer. */
		qp->q_status = G_QUEUE_IDLING;
		/* may make this adaptive */
		qp->q_wait_ticks = get_bounded(&me.wait_ms, 2);
		me.wait_hit++;
		callout_reset(&sc->sc_wait, qp->q_wait_ticks,
		    g_rr_wait_timeout, sc);
	} else
		g_sched_dispatch(sc->sc_geom);

	/* Release a reference to the queue. */
	g_rr_queue_put(qp);
}

static void
g_rr_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
    struct g_consumer *cp, struct g_provider *pp)
{
	if (indent == NULL) {	/* plaintext */
		sbuf_printf(sb, " units %d queues %d",
		    me.units, me.queues);
	}
}

static struct g_gsched g_rr = {
	.gs_name = "rr",
	.gs_priv_size = sizeof(struct g_rr_queue),
	.gs_init = g_rr_init,
	.gs_fini = g_rr_fini,
	.gs_start = g_rr_start,
	.gs_done = g_rr_done,
	.gs_next = g_rr_next,
	.gs_dumpconf = g_rr_dumpconf,
	.gs_init_class = g_rr_init_class,
	.gs_fini_class = g_rr_fini_class,
};

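/*
 * Example usage, as a sketch (see gsched(8); the exact flags and
 * provider names may differ across FreeBSD versions):
 *
 *	kldload geom_sched		# framework plus this algorithm
 *	geom sched insert -a rr ada0	# schedule requests to ada0 with RR
 *	sysctl kern.geom.sched.rr.wait_ms=5
 *	geom sched destroy ada0.sched.	# remove the scheduler again
 */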
DECLARE_GSCHED_MODULE(rr, &g_rr);
699