/*-
 * Copyright (c) 2009-2010 Fabio Checconi
 * Copyright (c) 2009-2010 Luigi Rizzo, Universita` di Pisa
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * $Id$
 * $FreeBSD: head/sys/geom/sched/gs_rr.c 217324 2011-01-12 19:54:07Z mdf $
 *
 * A round-robin (RR) anticipatory scheduler, with per-client queues.
 *
 * The goal of this implementation is to improve throughput compared
 * to the pure elevator algorithm, and to ensure some fairness among
 * clients.
 *
 * Requests coming from the same client are put in the same queue.
 * We use anticipation to help reduce seeks, and each queue
 * is never served continuously for more than a given amount of
 * time or data. Queues are then served in a round-robin fashion.
 *
 * Each queue can be in any of the following states:
 *     READY	immediately serve the first pending request;
 *     BUSY	one request is under service, wait for completion;
 *     IDLING	do not serve incoming requests immediately, unless
 * 		they are "eligible" as defined later.
 *
 * Scheduling is done by looking at the status of all queues,
 * and the first one in round-robin order is privileged.
 */
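
/*
 * Illustration (using the default parameters defined below, i.e. a
 * 100 ms / 8 MB quantum and a ~10 ms anticipation window): if clients
 * A and B both issue sequential reads, A's queue is served until it
 * has either transferred 8 MB or used its 100 ms slice; between A's
 * requests the scheduler IDLEs for up to ~10 ms instead of seeking
 * away, because A is likely to issue the next sequential request
 * shortly.  When A's slice expires, or A stops issuing "eligible"
 * requests, service moves to B's queue, and so on in RR order.
 */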

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/bio.h>
#include <sys/callout.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/sysctl.h>
#include "gs_scheduler.h"

/* possible states of the scheduler */
enum g_rr_state {
	G_QUEUE_READY = 0,	/* Ready to dispatch. */
	G_QUEUE_BUSY,		/* Waiting for a completion. */
	G_QUEUE_IDLING		/* Waiting for a new request. */
};

/* possible queue flags */
enum g_rr_flags {
	G_FLAG_COMPLETED = 1,	/* Completed a req. in the current budget. */
};

struct g_rr_softc;

/*
 * Queue descriptor, containing reference count, scheduling
 * state, a queue of pending requests, and configuration parameters.
 * Queues with pending request(s) and not under service are also
 * stored in a Round Robin (RR) list.
 */
struct g_rr_queue {
	struct g_rr_softc *q_sc;	/* link to the parent */

	enum g_rr_state	q_status;
	unsigned int	q_service;	/* service received so far */
	int		q_slice_end;	/* actual slice end in ticks */
	enum g_rr_flags	q_flags;	/* queue flags */
	struct bio_queue_head q_bioq;

	/* Scheduling parameters */
	unsigned int	q_budget;	/* slice size in bytes */
	unsigned int	q_slice_duration; /* slice size in ticks */
	unsigned int	q_wait_ticks;	/* wait time for anticipation */

	/* Stats to drive the various heuristics. */
	struct g_savg	q_thinktime;	/* Thinktime average. */
	struct g_savg	q_seekdist;	/* Seek distance average. */

	int		q_bionum;	/* Number of requests. */

	off_t		q_lastoff;	/* Last submitted req. offset. */
	int		q_lastsub;	/* Last submitted req. time. */

	/* Expiration deadline for an empty queue. */
	int		q_expire;

	TAILQ_ENTRY(g_rr_queue) q_tailq; /* RR list link field */
};

/* List types. */
TAILQ_HEAD(g_rr_tailq, g_rr_queue);

/* list of scheduler instances */
LIST_HEAD(g_scheds, g_rr_softc);

/* Default quantum for RR between queues. */
#define	G_RR_DEFAULT_BUDGET	0x00800000

/*
 * Per device descriptor, holding the Round Robin list of queues
 * accessing the disk, a reference to the geom, and the timer.
 */
struct g_rr_softc {
	struct g_geom	*sc_geom;

	/*
	 * sc_active is the queue we are anticipating for.
	 * It is set only in gs_rr_next(), and possibly cleared
	 * only in gs_rr_next() or on a timeout.
	 * The active queue is never in the Round Robin list
	 * even if it has requests queued.
	 */
	struct g_rr_queue *sc_active;
	struct callout	sc_wait;	/* timer for sc_active */

	struct g_rr_tailq sc_rr_tailq;	/* the round-robin list */
	int		sc_nqueues;	/* number of queues */

	/* Statistics */
	int		sc_in_flight;	/* requests in the driver */

	LIST_ENTRY(g_rr_softc)	sc_next;
};

/* Descriptor for bounded values, min and max are constant. */
struct x_bound {
	const int	x_min;
	int		x_cur;
	const int	x_max;
};

/*
 * parameters, config and stats
 */
struct g_rr_params {
	int	queues;			/* total number of queues */
	int	w_anticipate;		/* anticipate writes */
	int	bypass;			/* bypass the scheduler */

	int	units;			/* how many instances */
	/* sc_head is used for debugging */
	struct g_scheds	sc_head;	/* list of scheduler instances */

	struct x_bound queue_depth;	/* max parallel requests */
	struct x_bound wait_ms;		/* wait time, milliseconds */
	struct x_bound quantum_ms;	/* quantum size, milliseconds */
	struct x_bound quantum_kb;	/* quantum size, Kb (1024 bytes) */

	/* statistics */
	int	wait_hit;		/* success in anticipation */
	int	wait_miss;		/* failure in anticipation */
};

/*
 * Default parameters for the scheduler.  The quantum sizes target
 * an 80 MB/s disk; if the hw is faster or slower, whichever of the
 * two limits (time or bytes) is reached first takes effect: the
 * clients will still be isolated but the fairness may be limited.
 * A complete solution would involve the on-line measurement of the
 * actual disk throughput to derive these parameters.  Or we may just
 * choose to ignore service domain fairness and accept what can be
 * achieved with time-only budgets.
 */
static struct g_rr_params me = {
	.sc_head = LIST_HEAD_INITIALIZER(&me.sc_head),
	.w_anticipate =	1,
	.queue_depth =	{ 1,	1,	50 },
	.wait_ms =	{ 1,	10,	30 },
	.quantum_ms =	{ 1,	100,	500 },
	.quantum_kb =	{ 16,	8192,	65536 },
};
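
/*
 * Sanity check on the defaults above: at 80 MB/s a 100 ms time slice
 * moves roughly 8 MB, which matches the 8192 KB default of quantum_kb.
 * On a faster disk the byte budget runs out before the time slice;
 * on a slower disk the time slice expires first.
 */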

struct g_rr_params *gs_rr_me = &me;

SYSCTL_DECL(_kern_geom_sched);
SYSCTL_NODE(_kern_geom_sched, OID_AUTO, rr, CTLFLAG_RW, 0,
    "GEOM_SCHED ROUND ROBIN stuff");
SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, units, CTLFLAG_RD,
    &me.units, 0, "Scheduler instances");
SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, queues, CTLFLAG_RD,
    &me.queues, 0, "Total rr queues");
SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, wait_ms, CTLFLAG_RW,
    &me.wait_ms.x_cur, 0, "Wait time milliseconds");
SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, quantum_ms, CTLFLAG_RW,
    &me.quantum_ms.x_cur, 0, "Quantum size milliseconds");
SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, bypass, CTLFLAG_RW,
    &me.bypass, 0, "Bypass scheduler");
SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, w_anticipate, CTLFLAG_RW,
    &me.w_anticipate, 0, "Do anticipation on writes");
SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, quantum_kb, CTLFLAG_RW,
    &me.quantum_kb.x_cur, 0, "Quantum size Kbytes");
SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, queue_depth, CTLFLAG_RW,
    &me.queue_depth.x_cur, 0, "Maximum simultaneous requests");
SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, wait_hit, CTLFLAG_RW,
    &me.wait_hit, 0, "Hits in anticipation");
SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, wait_miss, CTLFLAG_RW,
    &me.wait_miss, 0, "Misses in anticipation");

#ifdef DEBUG_QUEUES
/* print the status of a queue */
static void
gs_rr_dump_q(struct g_rr_queue *qp, int index)
{
	int l = 0;
	struct bio *bp;

	TAILQ_FOREACH(bp, &(qp->q_bioq.queue), bio_queue) {
		l++;
	}
	printf("--- rr queue %d %p status %d len %d ---\n",
	    index, qp, qp->q_status, l);
}

/*
 * Dump the scheduler status when writing to this sysctl variable.
 * XXX only the active queue and the queues on the RR list are
 * reachable from the softc, so idle queues are not dumped.
 * Not a severe issue because this is only for debugging.
 */
static int
gs_rr_sysctl_status(SYSCTL_HANDLER_ARGS)
{
	int error, val = 0;
	struct g_rr_softc *sc;

	error = sysctl_handle_int(oidp, &val, 0, req);
	if (error || !req->newptr)
		return (error);

	printf("called %s\n", __FUNCTION__);

	LIST_FOREACH(sc, &me.sc_head, sc_next) {
		struct g_rr_queue *qp;
		int tot = 0;

		printf("--- sc %p active %p nqueues %d "
		    "callout %d in_flight %d ---\n",
		    sc, sc->sc_active, sc->sc_nqueues,
		    callout_active(&sc->sc_wait),
		    sc->sc_in_flight);
		/* Walk the active queue and the RR list. */
		if (sc->sc_active != NULL)
			gs_rr_dump_q(sc->sc_active, tot++);
		TAILQ_FOREACH(qp, &sc->sc_rr_tailq, q_tailq)
			gs_rr_dump_q(qp, tot++);
	}
	return (0);
}

SYSCTL_PROC(_kern_geom_sched_rr, OID_AUTO, status,
    CTLTYPE_UINT | CTLFLAG_RW,
    0, sizeof(int), gs_rr_sysctl_status, "I", "status");

#endif	/* DEBUG_QUEUES */

/*
 * Get a bounded value, optionally converted from milliseconds to
 * ticks with a minimum of t_min ticks.
 */
static int
get_bounded(struct x_bound *v, int t_min)
{
	int x;

	x = v->x_cur;
	if (x < v->x_min)
		x = v->x_min;
	else if (x > v->x_max)
		x = v->x_max;
	if (t_min) {
		x = x * hz / 1000;	/* convert to ticks */
		if (x < t_min)
			x = t_min;
	}
	return (x);
}
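
/*
 * Examples (assuming hz = 1000, just for illustration):
 *	get_bounded(&me.quantum_kb, 0) -> 8192	(no tick conversion)
 *	get_bounded(&me.wait_ms, 2)    -> 10	(10 ms == 10 ticks)
 * With hz = 100, 10 ms would round down to 1 tick and then be raised
 * to the 2-tick minimum requested by the caller.
 */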

/*
 * Get a reference to the queue for bp, using the generic
 * classification mechanism.
 */
static struct g_rr_queue *
g_rr_queue_get(struct g_rr_softc *sc, struct bio *bp)
{

	return (g_sched_get_class(sc->sc_geom, bp));
}

static int
g_rr_init_class(void *data, void *priv)
{
	struct g_rr_softc *sc = data;
	struct g_rr_queue *qp = priv;

	gs_bioq_init(&qp->q_bioq);

	/*
	 * Set the initial parameters for the client:
	 * slice size in bytes and ticks, and wait ticks.
	 * Right now these are constant, but we could have
	 * autoconfiguration code to adjust the values based on
	 * the actual workload.
	 */
	qp->q_budget = 1024 * get_bounded(&me.quantum_kb, 0);
	qp->q_slice_duration = get_bounded(&me.quantum_ms, 2);
	qp->q_wait_ticks = get_bounded(&me.wait_ms, 2);

	qp->q_sc = sc;		/* link to the parent */
	qp->q_sc->sc_nqueues++;
	me.queues++;

	return (0);
}
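
/*
 * With the default parameters and hz = 1000 (an assumption for the
 * example), a new queue starts with q_budget = 1024 * 8192 = 8 MB,
 * q_slice_duration = 100 ticks (100 ms) and q_wait_ticks = 10 ticks.
 */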

/*
 * Release a reference to the queue.
 */
static void
g_rr_queue_put(struct g_rr_queue *qp)
{

	g_sched_put_class(qp->q_sc->sc_geom, qp);
}

static void
g_rr_fini_class(void *data, void *priv)
{
	struct g_rr_queue *qp = priv;

	KASSERT(gs_bioq_first(&qp->q_bioq) == NULL,
	    ("released nonempty queue"));
	qp->q_sc->sc_nqueues--;
	me.queues--;
}

static inline int
g_rr_queue_expired(struct g_rr_queue *qp)
{

	if (qp->q_service >= qp->q_budget)
		return (1);

	if ((qp->q_flags & G_FLAG_COMPLETED) &&
	    ticks - qp->q_slice_end >= 0)
		return (1);

	return (0);
}
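
/*
 * A queue expires either because it consumed its byte budget, or
 * because it completed at least one request in this slice and the
 * slice deadline has passed.  The comparison is written as
 * "ticks - qp->q_slice_end >= 0" rather than "ticks >= qp->q_slice_end"
 * so that it keeps working when the tick counter wraps around.
 */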

static inline int
g_rr_should_anticipate(struct g_rr_queue *qp, struct bio *bp)
{
	int wait = get_bounded(&me.wait_ms, 2);

	if (!me.w_anticipate && (bp->bio_cmd & BIO_WRITE))
		return (0);

	if (g_savg_valid(&qp->q_thinktime) &&
	    g_savg_read(&qp->q_thinktime) > wait)
		return (0);

	if (g_savg_valid(&qp->q_seekdist) &&
	    g_savg_read(&qp->q_seekdist) > 8192)
		return (0);

	return (1);
}
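
/*
 * In other words, we only idle waiting for more I/O from this client
 * when it looks profitable: reads (or writes if w_anticipate is set),
 * an average thinktime shorter than the anticipation window, and an
 * average seek distance below 8 KB, i.e. a roughly sequential stream.
 */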

/*
 * Called on a request arrival, timeout or completion.
 * Try to serve a request among those queued.
 */
static struct bio *
g_rr_next(void *data, int force)
{
	struct g_rr_softc *sc = data;
	struct g_rr_queue *qp;
	struct bio *bp, *next;
	int expired;

	qp = sc->sc_active;
	if (me.bypass == 0 && !force) {
		if (sc->sc_in_flight >= get_bounded(&me.queue_depth, 0))
			return (NULL);

		/* Try with the queue under service first. */
		if (qp != NULL && qp->q_status != G_QUEUE_READY) {
			/*
			 * Queue is anticipating, ignore request.
			 * We should check that we are not past
			 * the timeout, but in that case the timeout
			 * will fire immediately afterwards so we
			 * don't bother.
			 */
			return (NULL);
		}
	} else if (qp != NULL && qp->q_status != G_QUEUE_READY) {
		g_rr_queue_put(qp);
		sc->sc_active = qp = NULL;
	}

	/*
	 * No queue under service, look for the first in RR order.
	 * If we find it, select it as sc_active and reset its
	 * service counter and completion flag.
	 */
	if (qp == NULL) {
		qp = TAILQ_FIRST(&sc->sc_rr_tailq);
		if (qp == NULL)
			return (NULL); /* no queues at all, return */
		/* otherwise select the new queue for service. */
		TAILQ_REMOVE(&sc->sc_rr_tailq, qp, q_tailq);
		sc->sc_active = qp;
		qp->q_service = 0;
		qp->q_flags &= ~G_FLAG_COMPLETED;
	}

	bp = gs_bioq_takefirst(&qp->q_bioq);	/* surely not NULL */
	qp->q_service += bp->bio_length;	/* charge the service */

	/*
	 * The request at the head of the active queue is always
	 * dispatched, and gs_rr_next() will be called again
	 * immediately.
	 * We need to prepare for what to do next:
	 *
	 * 1. have we reached the end of the (time or service) slice?
	 *    If so, clear sc_active and possibly requeue the previous
	 *    active queue if it has more requests pending;
	 * 2. do we have more requests in sc_active?
	 *    If yes, do not anticipate, as gs_rr_next() will run again;
	 *    if no, decide whether or not to anticipate depending
	 *    on read or writes (e.g., anticipate only on reads).
	 */
	expired = g_rr_queue_expired(qp);	/* are we expired? */
	next = gs_bioq_first(&qp->q_bioq);	/* do we have one more? */
	if (expired) {
		sc->sc_active = NULL;
		/* Either requeue or release reference. */
		if (next != NULL)
			TAILQ_INSERT_TAIL(&sc->sc_rr_tailq, qp, q_tailq);
		else
			g_rr_queue_put(qp);
	} else if (next != NULL) {
		qp->q_status = G_QUEUE_READY;
	} else {
		if (!force && g_rr_should_anticipate(qp, bp)) {
			/* anticipate */
			qp->q_status = G_QUEUE_BUSY;
		} else {
			/* do not anticipate, release reference */
			g_rr_queue_put(qp);
			sc->sc_active = NULL;
		}
	}
	/* If sc_active != NULL, its q_status is always correct. */

	sc->sc_in_flight++;

	return (bp);
}

static inline void
g_rr_update_thinktime(struct g_rr_queue *qp)
{
	int delta = ticks - qp->q_lastsub, wait = get_bounded(&me.wait_ms, 2);

	if (qp->q_sc->sc_active != qp)
		return;

	qp->q_lastsub = ticks;
	delta = (delta > 2 * wait) ? 2 * wait : delta;
	if (qp->q_bionum > 7)
		g_savg_add_sample(&qp->q_thinktime, delta);
}
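
/*
 * Example: if the previous request from this queue arrived 5 ticks
 * ago, 5 is added to the thinktime average; if it arrived 50 ticks
 * ago and the wait window is 10 ticks, the sample is clamped to 20
 * (2 * wait) so a single long pause does not dominate the average.
 * Samples are collected only for the queue currently under service
 * and only once the queue has some history (q_bionum > 7).
 */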

static inline void
g_rr_update_seekdist(struct g_rr_queue *qp, struct bio *bp)
{
	off_t dist;

	if (qp->q_lastoff > bp->bio_offset)
		dist = qp->q_lastoff - bp->bio_offset;
	else
		dist = bp->bio_offset - qp->q_lastoff;

	if (dist > (8192 * 8))
		dist = 8192 * 8;

	qp->q_lastoff = bp->bio_offset + bp->bio_length;

	if (qp->q_bionum > 7)
		g_savg_add_sample(&qp->q_seekdist, dist);
}
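
/*
 * Example: if the previous request ended at byte offset 1000000 and
 * the new one starts there, the sample is 0 (perfectly sequential);
 * if it starts at 1065536 the distance is 65536, which is already
 * the 8192 * 8 cap applied above.
 */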

/*
 * Called when a real request for disk I/O arrives.
 * Locate the queue associated with the client.
 * If the queue is the one we are anticipating for, reset its timeout;
 * if the queue is not in the round robin list, insert it in the list.
 * On any error, do not queue the request and return -1, the caller
 * will take care of this request.
 */
static int
g_rr_start(void *data, struct bio *bp)
{
	struct g_rr_softc *sc = data;
	struct g_rr_queue *qp;

	if (me.bypass)
		return (-1);	/* bypass the scheduler */

	/* Get the queue for the request. */
	qp = g_rr_queue_get(sc, bp);
	if (qp == NULL)
		return (-1); /* allocation failed, tell upstream */

	if (gs_bioq_first(&qp->q_bioq) == NULL) {
		/*
		 * We are inserting into an empty queue.
		 * Reset its state if it is sc_active,
		 * otherwise insert it in the RR list.
		 */
		if (qp == sc->sc_active) {
			qp->q_status = G_QUEUE_READY;
			callout_stop(&sc->sc_wait);
		} else {
			g_sched_priv_ref(qp);
			TAILQ_INSERT_TAIL(&sc->sc_rr_tailq, qp, q_tailq);
		}
	}

	qp->q_bionum = 1 + qp->q_bionum - (qp->q_bionum >> 3);

	g_rr_update_thinktime(qp);
	g_rr_update_seekdist(qp, bp);

	/* Inherit the reference returned by g_rr_queue_get(). */
	bp->bio_caller1 = qp;
	gs_bioq_disksort(&qp->q_bioq, bp);

	return (0);
}
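
/*
 * Note on q_bionum above: it is a saturating counter, not a plain
 * count.  Starting from zero it goes 1, 2, ... and then sticks at 8
 * (1 + 8 - 8/8), so the "q_bionum > 7" tests in the heuristics simply
 * mean "this queue has issued at least eight requests".
 */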

/*
 * Callout executed when a queue times out anticipating a new request.
 */
static void
g_rr_wait_timeout(void *data)
{
	struct g_rr_softc *sc = data;
	struct g_geom *geom = sc->sc_geom;

	g_sched_lock(geom);
	/*
	 * We can race with other events, so check if
	 * sc_active is still valid.
	 */
	if (sc->sc_active != NULL) {
		/* Release the reference to the queue. */
		g_rr_queue_put(sc->sc_active);
		sc->sc_active = NULL;
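		/*
		 * g_rr_done() optimistically counted the wait as a hit
		 * when anticipation started; the timeout turns it into
		 * a miss.
		 */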
		me.wait_hit--;
		me.wait_miss++;	/* record the miss */
	}
	g_sched_dispatch(geom);
	g_sched_unlock(geom);
}

/*
 * Module glue: allocate descriptor, initialize its fields.
 */
static void *
g_rr_init(struct g_geom *geom)
{
	struct g_rr_softc *sc;

	/* XXX check whether we can sleep */
	sc = malloc(sizeof *sc, M_GEOM_SCHED, M_NOWAIT | M_ZERO);
	if (sc == NULL)
		return (NULL);	/* let the caller handle the failure */
	sc->sc_geom = geom;
	TAILQ_INIT(&sc->sc_rr_tailq);
	callout_init(&sc->sc_wait, CALLOUT_MPSAFE);
	LIST_INSERT_HEAD(&me.sc_head, sc, sc_next);
	me.units++;

	return (sc);
}

/*
 * Module glue -- drain the callout, remove the descriptor from the
 * list of instances and free it.
 */
static void
g_rr_fini(void *data)
{
	struct g_rr_softc *sc = data;

	callout_drain(&sc->sc_wait);
	KASSERT(sc->sc_active == NULL, ("still a queue under service"));
	KASSERT(TAILQ_EMPTY(&sc->sc_rr_tailq), ("still scheduled queues"));

	LIST_REMOVE(sc, sc_next);
	me.units--;
	free(sc, M_GEOM_SCHED);
}

/*
 * Called when the request under service terminates.
 * Start the anticipation timer if needed.
 */
static void
g_rr_done(void *data, struct bio *bp)
{
	struct g_rr_softc *sc = data;
	struct g_rr_queue *qp;

	sc->sc_in_flight--;

	qp = bp->bio_caller1;
	if (qp == sc->sc_active && qp->q_status == G_QUEUE_BUSY) {
		if (!(qp->q_flags & G_FLAG_COMPLETED)) {
			qp->q_flags |= G_FLAG_COMPLETED;
			/* in case we want to make the slice adaptive */
			qp->q_slice_duration = get_bounded(&me.quantum_ms, 2);
			qp->q_slice_end = ticks + qp->q_slice_duration;
		}

		/* The queue is trying anticipation, start the timer. */
		qp->q_status = G_QUEUE_IDLING;
		/* may make this adaptive */
		qp->q_wait_ticks = get_bounded(&me.wait_ms, 2);
		me.wait_hit++;
		callout_reset(&sc->sc_wait, qp->q_wait_ticks,
		    g_rr_wait_timeout, sc);
	} else
		g_sched_dispatch(sc->sc_geom);

	/* Release a reference to the queue. */
	g_rr_queue_put(qp);
}

static void
g_rr_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
    struct g_consumer *cp, struct g_provider *pp)
{
	if (indent == NULL) {	/* plaintext */
		sbuf_printf(sb, " units %d queues %d",
		    me.units, me.queues);
	}
}

static struct g_gsched g_rr = {
	.gs_name = "rr",
	.gs_priv_size = sizeof(struct g_rr_queue),
	.gs_init = g_rr_init,
	.gs_fini = g_rr_fini,
	.gs_start = g_rr_start,
	.gs_done = g_rr_done,
	.gs_next = g_rr_next,
	.gs_dumpconf = g_rr_dumpconf,
	.gs_init_class = g_rr_init_class,
	.gs_fini_class = g_rr_fini_class,
};

DECLARE_GSCHED_MODULE(rr, &g_rr);
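
/*
 * Usage sketch (assuming the geom_sched framework and the gsched(8)
 * userland utility from the same tree; exact command syntax may vary):
 *
 *	kldload geom_sched		# the scheduling framework
 *	kldload gsched_rr		# this algorithm
 *	geom sched insert -a rr ada0	# apply "rr" to provider ada0
 */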