1/*-
2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 *
4 * Copyright (c) 2009-2010 Fabio Checconi
5 * Copyright (c) 2009-2010 Luigi Rizzo, Universita` di Pisa
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30/*
31 * $Id$
32 * $FreeBSD$
33 *
34 * Main control module for geom-based disk schedulers ('sched').
35 *
36 * USER VIEW
37 * A 'sched' node is typically inserted transparently between
38 * an existing provider pp and its original geom gp
39 *
40 *	[pp --> gp  ..]
41 *
42 * using the command "geom sched insert <provider>" and
43 * resulting in the following topology
44 *
45 *	[pp --> sched_gp --> cp]   [new_pp --> gp ... ]
46 *
47 * Deletion "geom sched destroy <provider>.sched." restores the
 * original chain. The normal "geom sched create <provider>"
49 * is also supported.
50 *
51 * INTERNALS
52 * Internally, the 'sched' uses the following data structures
53 *
54 *   geom{}         g_sched_softc{}      g_gsched{}
55 * +----------+    +---------------+   +-------------+
56 * |  softc *-|--->| sc_gsched   *-|-->|  gs_init    |
57 * |  ...     |    |               |   |  gs_fini    |
58 * |          |    | [ hash table] |   |  gs_start   |
59 * +----------+    |               |   |  ...        |
60 *                 |               |   +-------------+
61 *                 |               |
62 *                 |               |     g_*_softc{}
63 *                 |               |   +-------------+
64 *                 | sc_data     *-|-->|             |
65 *                 +---------------+   |  algorithm- |
66 *                                     |  specific   |
67 *                                     +-------------+
68 *
69 * A g_sched_softc{} is created with a "geom sched insert" call.
70 * In turn this instantiates a specific scheduling algorithm,
71 * which sets sc_gsched to point to the algorithm callbacks,
72 * and calls gs_init() to create the g_*_softc{} .
73 * The other callbacks (gs_start, gs_next, ...) are invoked
74 * as needed
75 *
76 * g_sched_softc{} is defined in g_sched.h and mostly used here;
77 * g_gsched{}, and the gs_callbacks, are documented in gs_scheduler.h;
78 * g_*_softc{} is defined/implemented by each algorithm (gs_*.c)
79 *
80 * DATA MOVING
 * When a bio is received on the provider, it goes to
 * g_sched_start(), which calls gs_start() to initially queue it;
83 * then we call g_sched_dispatch() that loops around gs_next()
84 * to select zero or more bio's to be sent downstream.
85 *
86 * g_sched_dispatch() can also be called as a result of a timeout,
87 * e.g. when doing anticipation or pacing requests.
88 *
89 * When a bio comes back, it goes to g_sched_done() which in turn
90 * calls gs_done(). The latter does any necessary housekeeping in
91 * the scheduling algorithm, and may decide to call g_sched_dispatch()
92 * to send more bio's downstream.
93 *
 * If an algorithm needs per-flow queues, these are created by
 * calling gs_init_class() and destroyed with gs_fini_class(),
 * and they are also inserted in the hash table implemented in
 * the g_sched_softc{}.
98 *
99 * If an algorithm is replaced, or a transparently-inserted node is
100 * removed with "geom sched destroy", we need to remove all references
 * to the g_*_softc{} and g_sched_softc{} from the bio's still in
 * the scheduler. g_sched_forced_dispatch() helps with this.
103 * XXX need to explain better.
104 */
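
/*
 * For orientation, this is a rough sketch of what a scheduling
 * algorithm module provides.  The callback names match those used
 * below (gs_init, gs_start, gs_next, gs_done, gs_fini); the full
 * structure and exact prototypes live in gs_scheduler.h, and all
 * "example" identifiers are placeholders rather than an existing
 * scheduler:
 *
 *	static void *
 *	example_init(struct g_geom *gp)
 *	{
 *		// allocate and return the algorithm-specific softc
 *	}
 *
 *	static struct g_gsched g_example_gsched = {
 *		.gs_name = "example",
 *		.gs_priv_size = sizeof(struct example_class),
 *		.gs_init = example_init,
 *		.gs_fini = example_fini,
 *		.gs_start = example_start,	// queue an incoming bio
 *		.gs_next = example_next,	// return the next bio to dispatch
 *		.gs_done = example_done,	// bookkeeping on completion
 *	};
 */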
105
106#include <sys/cdefs.h>
107#include <sys/param.h>
108#include <sys/systm.h>
109#include <sys/kernel.h>
110#include <sys/module.h>
111#include <sys/lock.h>
112#include <sys/mutex.h>
113#include <sys/bio.h>
114#include <sys/limits.h>
115#include <sys/hash.h>
116#include <sys/sbuf.h>
117#include <sys/sysctl.h>
118#include <sys/malloc.h>
119#include <sys/proc.h>		/* we access curthread */
120#include <geom/geom.h>
121#include "gs_scheduler.h"
122#include "g_sched.h"		/* geom hooks */
123
124/*
125 * Size of the per-geom hash table storing traffic classes.
 * We may decide to change it at a later time; it has no ABI
 * implications, as it is only used for run-time allocations.
128 */
129#define G_SCHED_HASH_SIZE	32
130
131static int g_sched_destroy(struct g_geom *gp, boolean_t force);
132static int g_sched_destroy_geom(struct gctl_req *req,
133    struct g_class *mp, struct g_geom *gp);
134static void g_sched_config(struct gctl_req *req, struct g_class *mp,
135    const char *verb);
136static struct g_geom *g_sched_taste(struct g_class *mp,
137    struct g_provider *pp, int flags __unused);
138static void g_sched_dumpconf(struct sbuf *sb, const char *indent,
139    struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp);
140static void g_sched_init(struct g_class *mp);
141static void g_sched_fini(struct g_class *mp);
142static int g_sched_ioctl(struct g_provider *pp, u_long cmd, void *data,
143    int fflag, struct thread *td);
144
145struct g_class g_sched_class = {
146	.name = G_SCHED_CLASS_NAME,
147	.version = G_VERSION,
148	.ctlreq = g_sched_config,
149	.taste = g_sched_taste,
150	.destroy_geom = g_sched_destroy_geom,
151	.init = g_sched_init,
152	.ioctl = g_sched_ioctl,
153	.fini = g_sched_fini
154};
155
156MALLOC_DEFINE(M_GEOM_SCHED, "GEOM_SCHED", "Geom schedulers data structures");
157
158/*
159 * Global variables describing the state of the geom_sched module.
160 * There is only one static instance of this structure.
161 */
162LIST_HEAD(gs_list, g_gsched);	/* type, link field */
163struct geom_sched_vars {
164	struct mtx	gs_mtx;
165	struct gs_list	gs_scheds;	/* list of algorithms */
166	u_int		gs_debug;
	u_int		gs_sched_count;	/* how many algorithms? */
168	u_int 		gs_patched;	/* g_io_request was patched */
169
170	u_int		gs_initialized;
171	u_int		gs_expire_secs;	/* expiration of hash entries */
172
173	struct bio_queue_head gs_pending;
174	u_int		gs_npending;
175
176	/* The following are for stats, usually protected by gs_mtx. */
177	u_long		gs_requests;	/* total requests */
178	u_long		gs_done;	/* total done */
179	u_int 		gs_in_flight;	/* requests in flight */
180	u_int 		gs_writes_in_flight;
181	u_int 		gs_bytes_in_flight;
182	u_int 		gs_write_bytes_in_flight;
183
184	char		gs_names[256];	/* names of schedulers */
185};
186
187static struct geom_sched_vars me = {
188	.gs_expire_secs = 10,
189};
190
191SYSCTL_DECL(_kern_geom);
192SYSCTL_NODE(_kern_geom, OID_AUTO, sched, CTLFLAG_RW, 0,
    "GEOM_SCHED configuration and statistics");
194
195SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_wb, CTLFLAG_RD,
196    &me.gs_write_bytes_in_flight, 0, "Write bytes in flight");
197
198SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_b, CTLFLAG_RD,
199    &me.gs_bytes_in_flight, 0, "Bytes in flight");
200
201SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_w, CTLFLAG_RD,
    &me.gs_writes_in_flight, 0, "Write requests in flight");
203
204SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight, CTLFLAG_RD,
205    &me.gs_in_flight, 0, "Requests in flight");
206
207SYSCTL_ULONG(_kern_geom_sched, OID_AUTO, done, CTLFLAG_RD,
208    &me.gs_done, 0, "Total done");
209
210SYSCTL_ULONG(_kern_geom_sched, OID_AUTO, requests, CTLFLAG_RD,
211    &me.gs_requests, 0, "Total requests");
212
213SYSCTL_STRING(_kern_geom_sched, OID_AUTO, algorithms, CTLFLAG_RD,
214    &me.gs_names, 0, "Algorithm names");
215
216SYSCTL_UINT(_kern_geom_sched, OID_AUTO, alg_count, CTLFLAG_RD,
217    &me.gs_sched_count, 0, "Number of algorithms");
218
219SYSCTL_UINT(_kern_geom_sched, OID_AUTO, debug, CTLFLAG_RW,
220    &me.gs_debug, 0, "Debug level");
221
222SYSCTL_UINT(_kern_geom_sched, OID_AUTO, expire_secs, CTLFLAG_RW,
223    &me.gs_expire_secs, 0, "Expire time in seconds");
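
/*
 * The counters above can be inspected from userland with sysctl(8)
 * ("sysctl kern.geom.sched") or programmatically via sysctlbyname(3),
 * e.g. (userland code, needs <sys/types.h> and <sys/sysctl.h>):
 *
 *	u_long done;
 *	size_t len = sizeof(done);
 *
 *	if (sysctlbyname("kern.geom.sched.done", &done, &len, NULL, 0) == 0)
 *		printf("%lu requests completed\n", done);
 */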
224
225/*
226 * g_sched calls the scheduler algorithms with this lock held.
227 * The locking functions are exposed so the scheduler algorithms can also
228 * protect themselves e.g. when running a callout handler.
229 */
230void
231g_sched_lock(struct g_geom *gp)
232{
233	struct g_sched_softc *sc = gp->softc;
234
235	mtx_lock(&sc->sc_mtx);
236}
237
238void
239g_sched_unlock(struct g_geom *gp)
240{
241	struct g_sched_softc *sc = gp->softc;
242
243	mtx_unlock(&sc->sc_mtx);
244}
245
246/*
247 * Support functions to handle references to the module,
 * which come from devices using this scheduler.
249 */
250static inline void
251g_gsched_ref(struct g_gsched *gsp)
252{
253
254	atomic_add_int(&gsp->gs_refs, 1);
255}
256
257static inline void
258g_gsched_unref(struct g_gsched *gsp)
259{
260
261	atomic_add_int(&gsp->gs_refs, -1);
262}
263
264/*
265 * Update the stats when this request is done.
266 */
267static void
268g_sched_update_stats(struct bio *bio)
269{
270
271	me.gs_done++;
272	me.gs_in_flight--;
273	me.gs_bytes_in_flight -= bio->bio_length;
274	if (bio->bio_cmd == BIO_WRITE) {
275		me.gs_writes_in_flight--;
276		me.gs_write_bytes_in_flight -= bio->bio_length;
277	}
278}
279
280/*
281 * Dispatch any pending request.
282 */
283static void
284g_sched_forced_dispatch(struct g_geom *gp)
285{
286	struct g_sched_softc *sc = gp->softc;
287	struct g_gsched *gsp = sc->sc_gsched;
288	struct bio *bp;
289
290	KASSERT(mtx_owned(&sc->sc_mtx),
291	    ("sc_mtx not owned during forced dispatch"));
292
293	while ((bp = gsp->gs_next(sc->sc_data, 1)) != NULL)
294		g_io_request(bp, LIST_FIRST(&gp->consumer));
295}
296
297/*
298 * The main dispatch loop, called either here after the start
299 * routine, or by scheduling algorithms when they receive a timeout
300 * or a 'done' notification.  Does not share code with the forced
301 * dispatch path, since the gs_done() callback can call us.
302 */
303void
304g_sched_dispatch(struct g_geom *gp)
305{
306	struct g_sched_softc *sc = gp->softc;
307	struct g_gsched *gsp = sc->sc_gsched;
308	struct bio *bp;
309
310	KASSERT(mtx_owned(&sc->sc_mtx), ("sc_mtx not owned during dispatch"));
311
312	if ((sc->sc_flags & G_SCHED_FLUSHING))
313		return;
314
315	while ((bp = gsp->gs_next(sc->sc_data, 0)) != NULL)
316		g_io_request(bp, LIST_FIRST(&gp->consumer));
317}
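
/*
 * To illustrate the gs_next() contract used by the two dispatch
 * routines above: with force == 0 the algorithm may hold requests
 * back (e.g. for anticipation or pacing), with force != 0 it must
 * drain everything it has queued.  A hypothetical FIFO algorithm
 * (all "example" names are placeholders) could implement it as:
 *
 *	static struct bio *
 *	example_next(void *data, int force)
 *	{
 *		struct example_softc *sc = data;
 *
 *		if (!force && example_should_wait(sc))
 *			return (NULL);	// hold back, a later call will retry
 *		return (bioq_takefirst(&sc->sc_queue));
 *	}
 */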
318
319/*
 * Recent (8.0 and above) versions of FreeBSD have support for
 * registering classifiers of disk requests. The classifier is
322 * invoked by g_io_request(), and stores the information into
323 * bp->bio_classifier1.
324 *
325 * Support for older versions, which is left here only for
326 * documentation purposes, relies on two hacks:
327 * 1. classification info is written into the bio_caller1
328 *    field of the topmost node in the bio chain. This field
329 *    is rarely used, but this module is incompatible with
330 *    those that use bio_caller1 for other purposes,
331 *    such as ZFS and gjournal;
332 * 2. g_io_request() is patched in-memory when the module is
333 *    loaded, so that the function calls a classifier as its
334 *    first thing. g_io_request() is restored when the module
335 *    is unloaded. This functionality is only supported for
336 *    x86 and amd64, other architectures need source code changes.
337 */
338
339/*
340 * Lookup the identity of the issuer of the original request.
341 * In the current implementation we use the curthread of the
342 * issuer, but different mechanisms may be implemented later
 * so we make no assumptions about the return value, which for
 * us is just an opaque identifier.
345 */
346
347static inline u_long
348g_sched_classify(struct bio *bp)
349{
350
351	/* we have classifier fields in the struct bio */
352	return ((u_long)bp->bio_classifier1);
353}
354
355/* Return the hash chain for the given key. */
356static inline struct g_hash *
357g_sched_hash(struct g_sched_softc *sc, u_long key)
358{
359
360	return (&sc->sc_hash[key & sc->sc_mask]);
361}
362
363/*
 * Helper function for the child classes, which takes
 * a geom and a bio and returns the private descriptor
 * associated with the request.  This involves fetching
367 * the classification field and [al]locating the
368 * corresponding entry in the hash table.
369 */
370void *
371g_sched_get_class(struct g_geom *gp, struct bio *bp)
372{
373	struct g_sched_softc *sc;
374	struct g_sched_class *gsc;
375	struct g_gsched *gsp;
376	struct g_hash *bucket;
377	u_long key;
378
379	sc = gp->softc;
380	key = g_sched_classify(bp);
381	bucket = g_sched_hash(sc, key);
382	LIST_FOREACH(gsc, bucket, gsc_clist) {
383		if (key == gsc->gsc_key) {
384			gsc->gsc_refs++;
385			return (gsc->gsc_priv);
386		}
387	}
388
389	gsp = sc->sc_gsched;
390	gsc = malloc(sizeof(*gsc) + gsp->gs_priv_size,
391	    M_GEOM_SCHED, M_NOWAIT | M_ZERO);
392	if (!gsc)
393		return (NULL);
394
395	if (gsp->gs_init_class(sc->sc_data, gsc->gsc_priv)) {
396		free(gsc, M_GEOM_SCHED);
397		return (NULL);
398	}
399
400	gsc->gsc_refs = 2;	/* 1 for the hash table, 1 for the caller. */
401	gsc->gsc_key = key;
402	LIST_INSERT_HEAD(bucket, gsc, gsc_clist);
403
404	gsc->gsc_expire = ticks + me.gs_expire_secs * hz;
405
406	return (gsc->gsc_priv);
407}
408
409/*
 * Release a reference to the per-client descriptor.
411 */
412void
413g_sched_put_class(struct g_geom *gp, void *priv)
414{
415	struct g_sched_class *gsc;
416	struct g_sched_softc *sc;
417
418	gsc = g_sched_priv2class(priv);
419	gsc->gsc_expire = ticks + me.gs_expire_secs * hz;
420
421	if (--gsc->gsc_refs > 0)
422		return;
423
424	sc = gp->softc;
425	sc->sc_gsched->gs_fini_class(sc->sc_data, priv);
426
427	LIST_REMOVE(gsc, gsc_clist);
428	free(gsc, M_GEOM_SCHED);
429}
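
/*
 * A sketch of how an algorithm typically pairs the two helpers above,
 * assuming it saved the geom pointer in its softc at gs_init() time
 * (all "example" names are placeholders):
 *
 *	static int
 *	example_start(void *data, struct bio *bp)
 *	{
 *		struct example_softc *sc = data;
 *		struct example_class *ec;
 *
 *		ec = g_sched_get_class(sc->sc_geom, bp);
 *		if (ec == NULL)
 *			return (-1);	// tell g_sched_start() to bypass us
 *		// ... enqueue bp on the per-flow queue in ec ...
 *		return (0);
 *	}
 *
 * with a matching g_sched_put_class(sc->sc_geom, ec) once the per-flow
 * reference is no longer needed (typically from gs_done()).
 */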
430
431static void
432g_sched_hash_fini(struct g_geom *gp, struct g_hash *hp, u_long mask,
433    struct g_gsched *gsp, void *data)
434{
435	struct g_sched_class *cp, *cp2;
436	int i;
437
438	if (!hp)
439		return;
440
441	if (data && gsp->gs_hash_unref)
442		gsp->gs_hash_unref(data);
443
444	for (i = 0; i < G_SCHED_HASH_SIZE; i++) {
445		LIST_FOREACH_SAFE(cp, &hp[i], gsc_clist, cp2)
446			g_sched_put_class(gp, cp->gsc_priv);
447	}
448
449	hashdestroy(hp, M_GEOM_SCHED, mask);
450}
451
452static struct g_hash *
453g_sched_hash_init(struct g_gsched *gsp, u_long *mask, int flags)
454{
455	struct g_hash *hash;
456
457	if (gsp->gs_priv_size == 0)
458		return (NULL);
459
460	hash = hashinit_flags(G_SCHED_HASH_SIZE, M_GEOM_SCHED, mask, flags);
461
462	return (hash);
463}
464
465static void
466g_sched_flush_classes(struct g_geom *gp)
467{
468	struct g_sched_softc *sc;
469	struct g_sched_class *cp, *cp2;
470	int i;
471
472	sc = gp->softc;
473
474	if (!sc->sc_hash || ticks - sc->sc_flush_ticks <= 0)
475		return;
476
477	for (i = 0; i < G_SCHED_HASH_SIZE; i++) {
478		LIST_FOREACH_SAFE(cp, &sc->sc_hash[i], gsc_clist, cp2) {
479			if (cp->gsc_refs == 1 && ticks - cp->gsc_expire > 0)
480				g_sched_put_class(gp, cp->gsc_priv);
481		}
482	}
483
484	sc->sc_flush_ticks = ticks + me.gs_expire_secs * hz;
485}
486
487/*
488 * Wait for the completion of any outstanding request.  To ensure
 * that this does not take forever, the caller has to make sure that
 * no new requests enter the scheduler before calling us.
491 *
492 * Must be called with the gp mutex held and topology locked.
493 */
494static int
495g_sched_wait_pending(struct g_geom *gp)
496{
497	struct g_sched_softc *sc = gp->softc;
498	int endticks = ticks + hz;
499
500	g_topology_assert();
501
502	while (sc->sc_pending && endticks - ticks >= 0)
503		msleep(gp, &sc->sc_mtx, 0, "sched_wait_pending", hz / 4);
504
505	return (sc->sc_pending ? ETIMEDOUT : 0);
506}
507
508static int
509g_sched_remove_locked(struct g_geom *gp, struct g_gsched *gsp)
510{
511	struct g_sched_softc *sc = gp->softc;
512	int error;
513
514	/* Set the flushing flag: new bios will not enter the scheduler. */
515	sc->sc_flags |= G_SCHED_FLUSHING;
516
517	g_sched_forced_dispatch(gp);
518	error = g_sched_wait_pending(gp);
519	if (error)
520		goto failed;
521
522	/* No more requests pending or in flight from the old gsp. */
523
524	g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask, gsp, sc->sc_data);
525	sc->sc_hash = NULL;
526
527	/*
528	 * Avoid deadlock here by releasing the gp mutex and reacquiring
529	 * it once done.  It should be safe, since no reconfiguration or
530	 * destruction can take place due to the geom topology lock; no
531	 * new request can use the current sc_data since we flagged the
532	 * geom as being flushed.
533	 */
534	g_sched_unlock(gp);
535	gsp->gs_fini(sc->sc_data);
536	g_sched_lock(gp);
537
538	sc->sc_gsched = NULL;
539	sc->sc_data = NULL;
540	g_gsched_unref(gsp);
541
542failed:
543	sc->sc_flags &= ~G_SCHED_FLUSHING;
544
545	return (error);
546}
547
548static int
549g_sched_remove(struct g_geom *gp, struct g_gsched *gsp)
550{
551	int error;
552
553	g_sched_lock(gp);
554	error = g_sched_remove_locked(gp, gsp); /* gsp is surely non-null */
555	g_sched_unlock(gp);
556
557	return (error);
558}
559
560/*
561 * Support function for create/taste -- locate the desired
562 * algorithm and grab a reference to it.
563 */
564static struct g_gsched *
565g_gsched_find(const char *name)
566{
567	struct g_gsched *gsp = NULL;
568
569	mtx_lock(&me.gs_mtx);
570	LIST_FOREACH(gsp, &me.gs_scheds, glist) {
571		if (strcmp(name, gsp->gs_name) == 0) {
572			g_gsched_ref(gsp);
573			break;
574		}
575	}
576	mtx_unlock(&me.gs_mtx);
577
578	return (gsp);
579}
580
581/*
582 * Rebuild the list of scheduler names.
583 * To be called with me.gs_mtx lock held.
584 */
585static void
586g_gsched_build_names(struct g_gsched *gsp)
587{
588	int pos, l;
589	struct g_gsched *cur;
590
591	pos = 0;
592	LIST_FOREACH(cur, &me.gs_scheds, glist) {
593		l = strlen(cur->gs_name);
594		if (l + pos + 1 + 1 < sizeof(me.gs_names)) {
595			if (pos != 0)
596				me.gs_names[pos++] = ' ';
597			strcpy(me.gs_names + pos, cur->gs_name);
598			pos += l;
599		}
600	}
601	me.gs_names[pos] = '\0';
602}
603
604/*
605 * Register or unregister individual scheduling algorithms.
606 */
607static int
608g_gsched_register(struct g_gsched *gsp)
609{
610	struct g_gsched *cur;
611	int error = 0;
612
613	mtx_lock(&me.gs_mtx);
614	LIST_FOREACH(cur, &me.gs_scheds, glist) {
615		if (strcmp(gsp->gs_name, cur->gs_name) == 0)
616			break;
617	}
618	if (cur != NULL) {
		G_SCHED_DEBUG(0, "A scheduler named %s already "
		    "exists.", gsp->gs_name);
621		error = EEXIST;
622	} else {
623		LIST_INSERT_HEAD(&me.gs_scheds, gsp, glist);
624		gsp->gs_refs = 1;
625		me.gs_sched_count++;
626		g_gsched_build_names(gsp);
627	}
628	mtx_unlock(&me.gs_mtx);
629
630	return (error);
631}
632
633struct g_gsched_unregparm {
634	struct g_gsched *gup_gsp;
635	int		gup_error;
636};
637
638static void
639g_gsched_unregister(void *arg, int flag)
640{
641	struct g_gsched_unregparm *parm = arg;
642	struct g_gsched *gsp = parm->gup_gsp, *cur, *tmp;
643	struct g_sched_softc *sc;
644	struct g_geom *gp, *gp_tmp;
645	int error;
646
647	parm->gup_error = 0;
648
649	g_topology_assert();
650
651	if (flag == EV_CANCEL)
652		return;
653
654	mtx_lock(&me.gs_mtx);
655
656	LIST_FOREACH_SAFE(gp, &g_sched_class.geom, geom, gp_tmp) {
657		if (gp->class != &g_sched_class)
658			continue;	/* Should not happen. */
659
660		sc = gp->softc;
661		if (sc->sc_gsched == gsp) {
662			error = g_sched_remove(gp, gsp);
663			if (error)
664				goto failed;
665		}
666	}
667
668	LIST_FOREACH_SAFE(cur, &me.gs_scheds, glist, tmp) {
669		if (cur != gsp)
670			continue;
671
672		if (gsp->gs_refs != 1) {
673			G_SCHED_DEBUG(0, "%s still in use.",
674			    gsp->gs_name);
675			parm->gup_error = EBUSY;
676		} else {
677			LIST_REMOVE(gsp, glist);
678			me.gs_sched_count--;
679			g_gsched_build_names(gsp);
680		}
681		break;
682	}
683
684	if (cur == NULL) {
685		G_SCHED_DEBUG(0, "%s not registered.", gsp->gs_name);
686		parm->gup_error = ENOENT;
687	}
688
689failed:
690	mtx_unlock(&me.gs_mtx);
691}
692
693static inline void
694g_gsched_global_init(void)
695{
696
697	if (!me.gs_initialized) {
698		G_SCHED_DEBUG(0, "Initializing global data.");
699		mtx_init(&me.gs_mtx, "gsched", NULL, MTX_DEF);
700		LIST_INIT(&me.gs_scheds);
701		bioq_init(&me.gs_pending);
702		me.gs_initialized = 1;
703	}
704}
705
706/*
707 * Module event called when a scheduling algorithm module is loaded or
708 * unloaded.
709 */
710int
711g_gsched_modevent(module_t mod, int cmd, void *arg)
712{
713	struct g_gsched *gsp = arg;
714	struct g_gsched_unregparm parm;
715	int error;
716
717	G_SCHED_DEBUG(0, "Modevent %d.", cmd);
718
719	/*
720	 * If the module is loaded at boot, the geom thread that calls
721	 * g_sched_init() might actually run after g_gsched_modevent(),
722	 * so make sure that the module is properly initialized.
723	 */
724	g_gsched_global_init();
725
726	error = EOPNOTSUPP;
727	switch (cmd) {
728	case MOD_LOAD:
729		error = g_gsched_register(gsp);
730		G_SCHED_DEBUG(0, "Loaded module %s error %d.",
731		    gsp->gs_name, error);
732		if (error == 0)
733			g_retaste(&g_sched_class);
734		break;
735
736	case MOD_UNLOAD:
737		parm.gup_gsp = gsp;
738		parm.gup_error = 0;
739
740		error = g_waitfor_event(g_gsched_unregister,
741		    &parm, M_WAITOK, NULL);
742		if (error == 0)
743			error = parm.gup_error;
744		G_SCHED_DEBUG(0, "Unloaded module %s error %d.",
745		    gsp->gs_name, error);
746		break;
747	}
748
749	return (error);
750}
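
/*
 * An algorithm module hooks g_gsched_modevent() into the standard
 * module machinery roughly as below (gs_scheduler.h may provide a
 * convenience macro for this; "example" names are placeholders):
 *
 *	static moduledata_t g_example_mod = {
 *		"g_sched_example",	// module name
 *		g_gsched_modevent,	// handler defined above
 *		&g_example_gsched,	// passed to the handler as 'arg'
 *	};
 *	DECLARE_MODULE(g_sched_example, g_example_mod,
 *	    SI_SUB_DRIVERS, SI_ORDER_MIDDLE);
 *	MODULE_DEPEND(g_sched_example, geom_sched, 0, 0, 0);
 */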
751
752#ifdef KTR
753#define	TRC_BIO_EVENT(e, bp)	g_sched_trace_bio_ ## e (bp)
754
755static inline char
756g_sched_type(struct bio *bp)
757{
758
759	if (bp->bio_cmd == BIO_READ)
760		return ('R');
761	else if (bp->bio_cmd == BIO_WRITE)
762		return ('W');
763	return ('U');
764}
765
766static inline void
767g_sched_trace_bio_START(struct bio *bp)
768{
769
770	CTR5(KTR_GSCHED, "S %lu %c %lu/%lu %lu", g_sched_classify(bp),
771	    g_sched_type(bp), bp->bio_offset / ULONG_MAX,
772	    bp->bio_offset, bp->bio_length);
773}
774
775static inline void
776g_sched_trace_bio_DONE(struct bio *bp)
777{
778
779	CTR5(KTR_GSCHED, "D %lu %c %lu/%lu %lu", g_sched_classify(bp),
780	    g_sched_type(bp), bp->bio_offset / ULONG_MAX,
781	    bp->bio_offset, bp->bio_length);
782}
783#else /* !KTR */
784#define	TRC_BIO_EVENT(e, bp)
785#endif /* !KTR */
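
/*
 * These trace points are compiled in only when the kernel is built
 * with KTR support; a configuration along the lines below (exact
 * mask spelling may vary) enables them, and the records can then be
 * inspected with ktrdump(8):
 *
 *	options KTR
 *	options KTR_COMPILE=(KTR_GSCHED)
 *	options KTR_MASK=(KTR_GSCHED)
 */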
786
787/*
788 * g_sched_done() and g_sched_start() dispatch the geom requests to
789 * the scheduling algorithm in use.
790 */
791static void
792g_sched_done(struct bio *bio)
793{
794	struct g_geom *gp = bio->bio_caller2;
795	struct g_sched_softc *sc = gp->softc;
796
797	TRC_BIO_EVENT(DONE, bio);
798
799	KASSERT(bio->bio_caller1, ("null bio_caller1 in g_sched_done"));
800
801	g_sched_lock(gp);
802
803	g_sched_update_stats(bio);
804	sc->sc_gsched->gs_done(sc->sc_data, bio);
805	if (!--sc->sc_pending)
806		wakeup(gp);
807
808	g_sched_flush_classes(gp);
809	g_sched_unlock(gp);
810
811	g_std_done(bio);
812}
813
814static void
815g_sched_start(struct bio *bp)
816{
817	struct g_geom *gp = bp->bio_to->geom;
818	struct g_sched_softc *sc = gp->softc;
819	struct bio *cbp;
820
821	TRC_BIO_EVENT(START, bp);
822	G_SCHED_LOGREQ(bp, "Request received.");
823
824	cbp = g_clone_bio(bp);
825	if (cbp == NULL) {
826		g_io_deliver(bp, ENOMEM);
827		return;
828	}
829	cbp->bio_done = g_sched_done;
830	cbp->bio_to = LIST_FIRST(&gp->provider);
831	KASSERT(cbp->bio_to != NULL, ("NULL provider"));
832
833	/* We only schedule reads and writes. */
834	if (bp->bio_cmd != BIO_READ && bp->bio_cmd != BIO_WRITE)
835		goto bypass;
836
837	G_SCHED_LOGREQ(cbp, "Sending request.");
838
839	g_sched_lock(gp);
840	/*
841	 * Call the algorithm's gs_start to queue the request in the
842	 * scheduler. If gs_start fails then pass the request down,
843	 * otherwise call g_sched_dispatch() which tries to push
844	 * one or more requests down.
845	 */
846	if (!sc->sc_gsched || (sc->sc_flags & G_SCHED_FLUSHING) ||
847	    sc->sc_gsched->gs_start(sc->sc_data, cbp)) {
848		g_sched_unlock(gp);
849		goto bypass;
850	}
851	/*
852	 * We use bio_caller1 to mark requests that are scheduled
853	 * so make sure it is not NULL.
854	 */
855	if (cbp->bio_caller1 == NULL)
856		cbp->bio_caller1 = &me;	/* anything not NULL */
857
858	cbp->bio_caller2 = gp;
859	sc->sc_pending++;
860
861	/* Update general stats. */
862	me.gs_in_flight++;
863	me.gs_requests++;
864	me.gs_bytes_in_flight += bp->bio_length;
865	if (bp->bio_cmd == BIO_WRITE) {
866		me.gs_writes_in_flight++;
867		me.gs_write_bytes_in_flight += bp->bio_length;
868	}
869	g_sched_dispatch(gp);
870	g_sched_unlock(gp);
871	return;
872
873bypass:
874	cbp->bio_done = g_std_done;
875	cbp->bio_caller1 = NULL; /* not scheduled */
876	g_io_request(cbp, LIST_FIRST(&gp->consumer));
877}
878
879/*
880 * The next few functions are the geom glue.
881 */
882static void
883g_sched_orphan(struct g_consumer *cp)
884{
885
886	g_topology_assert();
887	g_sched_destroy(cp->geom, 1);
888}
889
890static int
891g_sched_access(struct g_provider *pp, int dr, int dw, int de)
892{
893	struct g_geom *gp;
894	struct g_consumer *cp;
895	int error;
896
897	gp = pp->geom;
898	cp = LIST_FIRST(&gp->consumer);
899	error = g_access(cp, dr, dw, de);
900
901	return (error);
902}
903
904static void
905g_sched_temporary_start(struct bio *bio)
906{
907
908	mtx_lock(&me.gs_mtx);
909	me.gs_npending++;
910	bioq_disksort(&me.gs_pending, bio);
911	mtx_unlock(&me.gs_mtx);
912}
913
914static void
915g_sched_flush_pending(g_start_t *start)
916{
917	struct bio *bp;
918
919	while ((bp = bioq_takefirst(&me.gs_pending)))
920		start(bp);
921}
922
923static int
924g_insert_proxy(struct g_geom *gp, struct g_provider *newpp,
925    struct g_geom *dstgp, struct g_provider *pp, struct g_consumer *cp)
926{
927	struct g_sched_softc *sc = gp->softc;
928	g_start_t *saved_start, *flush = g_sched_start;
929	int error = 0, endticks = ticks + hz;
930
931	g_cancel_event(newpp);	/* prevent taste() */
932	/* copy private fields */
933	newpp->private = pp->private;
934	newpp->index = pp->index;
935
936	/* Queue all the early requests coming for us. */
937	me.gs_npending = 0;
938	saved_start = pp->geom->start;
939	dstgp->start = g_sched_temporary_start;
940
941	while (pp->nstart - pp->nend != me.gs_npending &&
942	    endticks - ticks >= 0)
943		tsleep(pp, PRIBIO, "-", hz/10);
944
945	if (pp->nstart - pp->nend != me.gs_npending) {
946		flush = saved_start;
947		error = ETIMEDOUT;
948		goto fail;
949	}
950
951	/* link pp to this geom */
952	LIST_REMOVE(pp, provider);
953	pp->geom = gp;
954	LIST_INSERT_HEAD(&gp->provider, pp, provider);
955
956	/*
957	 * replicate the counts from the parent in the
958	 * new provider and consumer nodes
959	 */
960	cp->acr = newpp->acr = pp->acr;
961	cp->acw = newpp->acw = pp->acw;
962	cp->ace = newpp->ace = pp->ace;
963	sc->sc_flags |= G_SCHED_PROXYING;
964
965fail:
966	dstgp->start = saved_start;
967
968	g_sched_flush_pending(flush);
969
970	return (error);
971}
972
973/*
974 * Create a geom node for the device passed as *pp.
975 * If successful, add a reference to this gsp.
976 */
977static int
978g_sched_create(struct gctl_req *req, struct g_class *mp,
979    struct g_provider *pp, struct g_gsched *gsp, int proxy)
980{
981	struct g_sched_softc *sc = NULL;
982	struct g_geom *gp, *dstgp;
983	struct g_provider *newpp = NULL;
984	struct g_consumer *cp = NULL;
985	char name[64];
986	int error;
987
988	g_topology_assert();
989
990	snprintf(name, sizeof(name), "%s%s", pp->name, G_SCHED_SUFFIX);
991	LIST_FOREACH(gp, &mp->geom, geom) {
992		if (strcmp(gp->name, name) == 0) {
993			gctl_error(req, "Geom %s already exists.",
994			    name);
995			return (EEXIST);
996		}
997	}
998
999	gp = g_new_geomf(mp, "%s", name);
1000	dstgp = proxy ? pp->geom : gp; /* where do we link the provider */
1001
1002	sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO);
1003	sc->sc_gsched = gsp;
1004	sc->sc_data = gsp->gs_init(gp);
1005	if (sc->sc_data == NULL) {
1006		error = ENOMEM;
1007		goto fail;
1008	}
1009
1010	sc->sc_hash = g_sched_hash_init(gsp, &sc->sc_mask, HASH_WAITOK);
1011
1012	/*
	 * Do not initialize the flush mechanism here; it will be
	 * initialized on the first insertion into the hash table.
1015	 */
1016
1017	mtx_init(&sc->sc_mtx, "g_sched_mtx", NULL, MTX_DEF);
1018
1019	gp->softc = sc;
1020	gp->start = g_sched_start;
1021	gp->orphan = g_sched_orphan;
1022	gp->access = g_sched_access;
1023	gp->dumpconf = g_sched_dumpconf;
1024
1025	newpp = g_new_providerf(dstgp, "%s", gp->name);
1026	newpp->mediasize = pp->mediasize;
1027	newpp->sectorsize = pp->sectorsize;
1028
1029	cp = g_new_consumer(gp);
1030	error = g_attach(cp, proxy ? newpp : pp);
1031	if (error != 0) {
1032		gctl_error(req, "Cannot attach to provider %s.",
1033		    pp->name);
1034		goto fail;
1035	}
1036
1037	g_error_provider(newpp, 0);
1038	if (proxy) {
1039		error = g_insert_proxy(gp, newpp, dstgp, pp, cp);
1040		if (error)
1041			goto fail;
1042	}
1043	G_SCHED_DEBUG(0, "Device %s created.", gp->name);
1044
1045	g_gsched_ref(gsp);
1046
1047	return (0);
1048
1049fail:
1050	if (cp != NULL) {
1051		if (cp->provider != NULL)
1052			g_detach(cp);
1053		g_destroy_consumer(cp);
1054	}
1055	if (newpp != NULL)
1056		g_destroy_provider(newpp);
1057	if (sc->sc_hash)
1058		g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask,
1059		    gsp, sc->sc_data);
1060	if (sc->sc_data)
1061		gsp->gs_fini(sc->sc_data);
1062	g_free(gp->softc);
1063	g_destroy_geom(gp);
1064
1065	return (error);
1066}
1067
1068/*
1069 * Support for dynamic switching of scheduling algorithms.
1070 * First initialize the data structures for the new algorithm,
1071 * then call g_sched_remove_locked() to flush all references
1072 * to the old one, finally link the new algorithm.
1073 */
1074static int
1075g_sched_change_algo(struct gctl_req *req, struct g_class *mp,
1076    struct g_provider *pp, struct g_gsched *gsp)
1077{
1078	struct g_sched_softc *sc;
1079	struct g_geom *gp;
1080	struct g_hash *newh;
1081	void *data;
1082	u_long mask;
1083	int error = 0;
1084
1085	gp = pp->geom;
1086	sc = gp->softc;
1087
1088	data = gsp->gs_init(gp);
1089	if (data == NULL)
1090		return (ENOMEM);
1091
1092	newh = g_sched_hash_init(gsp, &mask, HASH_WAITOK);
1093	if (gsp->gs_priv_size && !newh) {
1094		error = ENOMEM;
1095		goto fail;
1096	}
1097
1098	g_sched_lock(gp);
1099	if (sc->sc_gsched) {	/* can be NULL in some cases */
1100		error = g_sched_remove_locked(gp, sc->sc_gsched);
1101		if (error)
1102			goto fail;
1103	}
1104
1105	g_gsched_ref(gsp);
1106	sc->sc_gsched = gsp;
1107	sc->sc_data = data;
1108	sc->sc_hash = newh;
1109	sc->sc_mask = mask;
1110
1111	g_sched_unlock(gp);
1112
1113	return (0);
1114
1115fail:
1116	if (newh)
1117		g_sched_hash_fini(gp, newh, mask, gsp, data);
1118
1119	if (data)
1120		gsp->gs_fini(data);
1121
1122	g_sched_unlock(gp);
1123
1124	return (error);
1125}
1126
1127/*
1128 * Stop the request flow directed to the proxy, redirecting the new
1129 * requests to the me.gs_pending queue.
1130 */
1131static struct g_provider *
1132g_detach_proxy(struct g_geom *gp)
1133{
1134	struct g_consumer *cp;
1135	struct g_provider *pp, *newpp;
1136
1137	do {
1138		pp = LIST_FIRST(&gp->provider);
1139		if (pp == NULL)
1140			break;
1141		cp = LIST_FIRST(&gp->consumer);
1142		if (cp == NULL)
1143			break;
1144		newpp = cp->provider;
1145		if (newpp == NULL)
1146			break;
1147
1148		me.gs_npending = 0;
1149		pp->geom->start = g_sched_temporary_start;
1150
1151		return (pp);
1152	} while (0);
1153	printf("%s error detaching proxy %s\n", __FUNCTION__, gp->name);
1154
1155	return (NULL);
1156}
1157
1158static void
1159g_sched_blackhole(struct bio *bp)
1160{
1161
1162	g_io_deliver(bp, ENXIO);
1163}
1164
1165static inline void
1166g_reparent_provider(struct g_provider *pp, struct g_geom *gp,
1167    struct g_provider *newpp)
1168{
1169
1170	LIST_REMOVE(pp, provider);
1171	if (newpp) {
1172		pp->private = newpp->private;
1173		pp->index = newpp->index;
1174	}
1175	pp->geom = gp;
1176	LIST_INSERT_HEAD(&gp->provider, pp, provider);
1177}
1178
1179static inline void
1180g_unproxy_provider(struct g_provider *oldpp, struct g_provider *newpp)
1181{
1182	struct g_geom *gp = oldpp->geom;
1183
1184	g_reparent_provider(oldpp, newpp->geom, newpp);
1185
1186	/*
1187	 * Hackish: let the system destroy the old provider for us, just
1188	 * in case someone attached a consumer to it, in which case a
1189	 * direct call to g_destroy_provider() would not work.
1190	 */
1191	g_reparent_provider(newpp, gp, NULL);
1192}
1193
1194/*
1195 * Complete the proxy destruction, linking the old provider to its
1196 * original geom, and destroying the proxy provider.  Also take care
1197 * of issuing the pending requests collected in me.gs_pending (if any).
1198 */
1199static int
1200g_destroy_proxy(struct g_geom *gp, struct g_provider *oldpp)
1201{
1202	struct g_consumer *cp;
1203	struct g_provider *newpp;
1204
1205	do {
1206		cp = LIST_FIRST(&gp->consumer);
1207		if (cp == NULL)
1208			break;
1209		newpp = cp->provider;
1210		if (newpp == NULL)
1211			break;
1212
1213		/* Relink the provider to its original geom. */
1214		g_unproxy_provider(oldpp, newpp);
1215
1216		/* Detach consumer from provider, and destroy provider. */
1217		cp->acr = newpp->acr = 0;
1218		cp->acw = newpp->acw = 0;
1219		cp->ace = newpp->ace = 0;
1220		g_detach(cp);
1221
1222		/* Send the pending bios through the right start function. */
1223		g_sched_flush_pending(oldpp->geom->start);
1224
1225		return (0);
1226	} while (0);
1227	printf("%s error destroying proxy %s\n", __FUNCTION__, gp->name);
1228
1229	/* We cannot send the pending bios anywhere... */
1230	g_sched_flush_pending(g_sched_blackhole);
1231
1232	return (EINVAL);
1233}
1234
1235static int
1236g_sched_destroy(struct g_geom *gp, boolean_t force)
1237{
1238	struct g_provider *pp, *oldpp = NULL;
1239	struct g_sched_softc *sc;
1240	struct g_gsched *gsp;
1241	int error;
1242
1243	g_topology_assert();
1244	sc = gp->softc;
1245	if (sc == NULL)
1246		return (ENXIO);
1247	if (!(sc->sc_flags & G_SCHED_PROXYING)) {
1248		pp = LIST_FIRST(&gp->provider);
1249		if (pp && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
1250			const char *msg = force ?
1251				"but we force removal" : "cannot remove";
1252
1253			G_SCHED_DEBUG(!force,
1254			    "Device %s is still open (r%dw%de%d), %s.",
1255			    pp->name, pp->acr, pp->acw, pp->ace, msg);
1256			if (!force)
1257				return (EBUSY);
1258		} else {
1259			G_SCHED_DEBUG(0, "Device %s removed.", gp->name);
1260		}
1261	} else
1262		oldpp = g_detach_proxy(gp);
1263
1264	gsp = sc->sc_gsched;
1265	if (gsp) {
1266		/*
1267		 * XXX bad hack here: force a dispatch to release
1268		 * any reference to the hash table still held by
1269		 * the scheduler.
1270		 */
1271		g_sched_lock(gp);
1272		/*
		 * We are dying here, and no new requests should enter
		 * the scheduler.  This is guaranteed by the topology,
1275		 * either in case we were proxying (new bios are
1276		 * being redirected) or not (see the access check
1277		 * above).
1278		 */
1279		g_sched_forced_dispatch(gp);
1280		error = g_sched_wait_pending(gp);
1281
1282		if (error) {
1283			/*
1284			 * Not all the requests came home: this might happen
1285			 * under heavy load, or if we were waiting for any
1286			 * bio which is served in the event path (see
1287			 * geom_slice.c for an example of how this can
			 * happen).  Try to restore a working configuration
			 * before failing.
1290			 */
1291			if ((sc->sc_flags & G_SCHED_PROXYING) && oldpp) {
1292				g_sched_flush_pending(force ?
1293				    g_sched_blackhole : g_sched_start);
1294			}
1295
1296			/*
			 * In the forced destroy case there is not much
			 * we can do: we have pending bios that will call
			 * g_sched_done() eventually, and we don't want them
			 * to crash the system by using freed memory.  We tell
1301			 * the user that something went wrong, and leak some
1302			 * memory here.
1303			 * Note: the callers using force = 1 ignore the
1304			 * return value.
1305			 */
1306			if (force) {
1307				G_SCHED_DEBUG(0, "Pending requests while "
				    "destroying geom, some memory leaked.");
1309			}
1310
1311			return (error);
1312		}
1313
1314		g_sched_unlock(gp);
1315		g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask,
1316		    gsp, sc->sc_data);
1317		sc->sc_hash = NULL;
1318		gsp->gs_fini(sc->sc_data);
1319		g_gsched_unref(gsp);
1320		sc->sc_gsched = NULL;
1321	} else
1322		error = 0;
1323
1324	if ((sc->sc_flags & G_SCHED_PROXYING) && oldpp) {
1325		error = g_destroy_proxy(gp, oldpp);
1326
1327		if (error) {
1328			if (force) {
1329				G_SCHED_DEBUG(0, "Unrecoverable error while "
1330				    "destroying a proxy geom, leaking some "
				    "memory.");
1332			}
1333
1334			return (error);
1335		}
1336	}
1337
1338	mtx_destroy(&sc->sc_mtx);
1339
1340	g_free(gp->softc);
1341	gp->softc = NULL;
1342	g_wither_geom(gp, ENXIO);
1343
1344	return (error);
1345}
1346
1347static int
1348g_sched_destroy_geom(struct gctl_req *req, struct g_class *mp,
1349    struct g_geom *gp)
1350{
1351
1352	return (g_sched_destroy(gp, 0));
1353}
1354
1355/*
1356 * Functions related to the classification of requests.
1357 *
1358 * On recent FreeBSD versions (8.0 and above), we store a reference
1359 * to the issuer of a request in bp->bio_classifier1 as soon
1360 * as the bio is posted to the geom queue (and not later, because
1361 * requests are managed by the g_down thread afterwards).
1362 */
1363
1364/*
1365 * Classifier support for recent FreeBSD versions: we use
1366 * a very simple classifier, only use curthread to tag a request.
1367 * The classifier is registered at module load, and unregistered
1368 * at module unload.
1369 */
1370static int
1371g_sched_tag(void *arg, struct bio *bp)
1372{
1373
1374	bp->bio_classifier1 = curthread;
1375	return (1);
1376}
1377
1378static struct g_classifier_hook g_sched_classifier = {
1379	.func =	g_sched_tag,
1380};
1381
1382static inline void
1383g_classifier_ini(void)
1384{
1385
1386	g_register_classifier(&g_sched_classifier);
1387}
1388
1389static inline void
1390g_classifier_fini(void)
1391{
1392
1393	g_unregister_classifier(&g_sched_classifier);
1394}
1395
1396static void
1397g_sched_init(struct g_class *mp)
1398{
1399
1400	g_gsched_global_init();
1401
1402	G_SCHED_DEBUG(0, "Loading: mp = %p, g_sched_class = %p.",
1403	    mp, &g_sched_class);
1404
1405	/* Patch g_io_request to store classification info in the bio. */
1406	g_classifier_ini();
1407}
1408
1409static void
1410g_sched_fini(struct g_class *mp)
1411{
1412
1413	g_classifier_fini();
1414
1415	G_SCHED_DEBUG(0, "Unloading...");
1416
1417	KASSERT(LIST_EMPTY(&me.gs_scheds), ("still registered schedulers"));
1418	mtx_destroy(&me.gs_mtx);
1419}
1420
1421static int
1422g_sched_ioctl(struct g_provider *pp, u_long cmd, void *data, int fflag,
1423    struct thread *td)
1424{
1425	struct g_consumer *cp;
1426	struct g_geom *gp;
1427
1428	cp = LIST_FIRST(&pp->geom->consumer);
1429	if (cp == NULL)
1430		return (ENOIOCTL);
1431	gp = cp->provider->geom;
1432	if (gp->ioctl == NULL)
1433		return (ENOIOCTL);
1434	return (gp->ioctl(cp->provider, cmd, data, fflag, td));
1435}
1436
1437/*
1438 * Read the i-th argument for a request, skipping the /dev/
1439 * prefix if present.
1440 */
1441static const char *
1442g_sched_argi(struct gctl_req *req, int i)
1443{
1444	static const char *dev_prefix = "/dev/";
1445	const char *name;
1446	char param[16];
1447	int l = strlen(dev_prefix);
1448
1449	snprintf(param, sizeof(param), "arg%d", i);
1450	name = gctl_get_asciiparam(req, param);
1451	if (name == NULL)
1452		gctl_error(req, "No 'arg%d' argument", i);
1453	else if (strncmp(name, dev_prefix, l) == 0)
1454		name += l;
1455	return (name);
1456}
1457
1458/*
1459 * Fetch nargs and do appropriate checks.
1460 */
1461static int
1462g_sched_get_nargs(struct gctl_req *req)
1463{
1464	int *nargs;
1465
1466	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
1467	if (nargs == NULL) {
1468		gctl_error(req, "No 'nargs' argument");
1469		return (0);
1470	}
1471	if (*nargs <= 0)
1472		gctl_error(req, "Missing device(s).");
1473	return (*nargs);
1474}
1475
1476/*
 * Check whether we should attach this class to certain volumes when
 * they are tasted. Right now this is under the control of a kenv
1479 * variable containing the names of all devices that we care about.
1480 * Probably we should only support transparent insertion as the
1481 * preferred mode of operation.
1482 */
1483static struct g_geom *
1484g_sched_taste(struct g_class *mp, struct g_provider *pp,
1485		int flags __unused)
1486{
	struct g_gsched *gsp = NULL;	/* the algorithm we want */
1488	const char *s;			/* generic string pointer */
1489	const char *taste_names;	/* devices we like */
1490	int l;
1491
1492        g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__,
1493	    mp->name, pp->name);
1494        g_topology_assert();
1495
1496        G_SCHED_DEBUG(2, "Tasting %s.", pp->name);
1497
1498	do {
1499		/* do not taste on ourselves */
1500		if (pp->geom->class == mp)
1501                	break;
1502
1503		taste_names = kern_getenv("geom.sched.taste");
1504		if (taste_names == NULL)
1505			break;
1506
1507		l = strlen(pp->name);
1508		for (s = taste_names; *s &&
1509		    (s = strstr(s, pp->name)); s++) {
1510			/* further checks for an exact match */
			if ((s == taste_names || s[-1] == ' ') &&
			    (s[l] == '\0' || s[l] == ' '))
1513				break;
1514		}
1515		if (s == NULL)
1516			break;
		G_SCHED_DEBUG(0, "Attach device %s, match [%s].",
		    pp->name, s);
1519
		/* pick the scheduling algorithm from the kenv, default to "rr" */
1521		s = kern_getenv("geom.sched.algo");
1522		if (s == NULL)
1523			s = "rr";
1524
1525		gsp = g_gsched_find(s);	/* also get a reference */
1526		if (gsp == NULL) {
1527			G_SCHED_DEBUG(0, "Bad '%s' algorithm.", s);
1528			break;
1529		}
1530
1531		/* XXX create with 1 as last argument ? */
1532		g_sched_create(NULL, mp, pp, gsp, 0);
1533		g_gsched_unref(gsp);
1534	} while (0);
	return (NULL);
1536}
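
/*
 * For reference, the two kenv variables consumed above can be set
 * from loader.conf, e.g. (device names are only examples):
 *
 *	geom.sched.taste="ada0 ada1"
 *	geom.sched.algo="rr"
 *
 * The first lists the providers to attach to at taste time, the
 * second selects the algorithm ("rr" being the default when unset).
 */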
1537
1538static void
1539g_sched_ctl_create(struct gctl_req *req, struct g_class *mp, int proxy)
1540{
1541	struct g_provider *pp;
1542	struct g_gsched *gsp;
1543	const char *name;
1544	int i, nargs;
1545
1546	g_topology_assert();
1547
1548	name = gctl_get_asciiparam(req, "algo");
1549	if (name == NULL) {
1550		gctl_error(req, "No '%s' argument", "algo");
1551		return;
1552	}
1553
1554	gsp = g_gsched_find(name);	/* also get a reference */
1555	if (gsp == NULL) {
1556		gctl_error(req, "Bad algorithm '%s'", name);
1557		return;
1558	}
1559
1560	nargs = g_sched_get_nargs(req);
1561
1562	/*
	 * Loop over the arguments, and break on any error.
1564	 * We look for a device name, but skip the /dev/ prefix if any.
1565	 */
1566	for (i = 0; i < nargs; i++) {
1567		name = g_sched_argi(req, i);
1568		if (name == NULL)
1569			break;
1570		pp = g_provider_by_name(name);
1571		if (pp == NULL) {
1572			G_SCHED_DEBUG(1, "Provider %s is invalid.", name);
1573			gctl_error(req, "Provider %s is invalid.", name);
1574			break;
1575		}
1576		if (g_sched_create(req, mp, pp, gsp, proxy) != 0)
1577			break;
1578	}
1579
1580	g_gsched_unref(gsp);
1581}
1582
1583static void
1584g_sched_ctl_configure(struct gctl_req *req, struct g_class *mp)
1585{
1586	struct g_provider *pp;
1587	struct g_gsched *gsp;
1588	const char *name;
1589	int i, nargs;
1590
1591	g_topology_assert();
1592
1593	name = gctl_get_asciiparam(req, "algo");
1594	if (name == NULL) {
1595		gctl_error(req, "No '%s' argument", "algo");
1596		return;
1597	}
1598
1599	gsp = g_gsched_find(name);	/* also get a reference */
1600	if (gsp == NULL) {
1601		gctl_error(req, "Bad algorithm '%s'", name);
1602		return;
1603	}
1604
1605	nargs = g_sched_get_nargs(req);
1606
1607	/*
	 * Loop over the arguments, and break on any error.
1609	 * We look for a device name, but skip the /dev/ prefix if any.
1610	 */
1611	for (i = 0; i < nargs; i++) {
1612		name = g_sched_argi(req, i);
1613		if (name == NULL)
1614			break;
1615		pp = g_provider_by_name(name);
1616		if (pp == NULL || pp->geom->class != mp) {
1617			G_SCHED_DEBUG(1, "Provider %s is invalid.", name);
1618			gctl_error(req, "Provider %s is invalid.", name);
1619			break;
1620		}
1621		if (g_sched_change_algo(req, mp, pp, gsp) != 0)
1622			break;
1623	}
1624
1625	g_gsched_unref(gsp);
1626}
1627
1628static struct g_geom *
1629g_sched_find_geom(struct g_class *mp, const char *name)
1630{
1631	struct g_geom *gp;
1632
1633	LIST_FOREACH(gp, &mp->geom, geom) {
1634		if (strcmp(gp->name, name) == 0)
1635			return (gp);
1636	}
1637	return (NULL);
1638}
1639
1640static void
1641g_sched_ctl_destroy(struct gctl_req *req, struct g_class *mp)
1642{
1643	int nargs, *force, error, i;
1644	struct g_geom *gp;
1645	const char *name;
1646
1647	g_topology_assert();
1648
1649	nargs = g_sched_get_nargs(req);
1650
1651	force = gctl_get_paraml(req, "force", sizeof(*force));
1652	if (force == NULL) {
1653		gctl_error(req, "No 'force' argument");
1654		return;
1655	}
1656
1657	for (i = 0; i < nargs; i++) {
1658		name = g_sched_argi(req, i);
1659		if (name == NULL)
1660			break;
1661
1662		gp = g_sched_find_geom(mp, name);
1663		if (gp == NULL) {
1664			G_SCHED_DEBUG(1, "Device %s is invalid.", name);
1665			gctl_error(req, "Device %s is invalid.", name);
1666			break;
1667		}
1668
1669		error = g_sched_destroy(gp, *force);
1670		if (error != 0) {
1671			gctl_error(req, "Cannot destroy device %s (error=%d).",
1672			    gp->name, error);
1673			break;
1674		}
1675	}
1676}
1677
1678static void
1679g_sched_config(struct gctl_req *req, struct g_class *mp, const char *verb)
1680{
1681	uint32_t *version;
1682
1683	g_topology_assert();
1684
1685	version = gctl_get_paraml(req, "version", sizeof(*version));
1686	if (version == NULL) {
1687		gctl_error(req, "No '%s' argument.", "version");
1688		return;
1689	}
1690
1691	if (*version != G_SCHED_VERSION) {
1692		gctl_error(req, "Userland and kernel parts are "
1693		    "out of sync.");
1694		return;
1695	}
1696
1697	if (strcmp(verb, "create") == 0) {
1698		g_sched_ctl_create(req, mp, 0);
1699		return;
1700	} else if (strcmp(verb, "insert") == 0) {
1701		g_sched_ctl_create(req, mp, 1);
1702		return;
1703	} else if (strcmp(verb, "configure") == 0) {
1704		g_sched_ctl_configure(req, mp);
1705		return;
1706	} else if (strcmp(verb, "destroy") == 0) {
1707		g_sched_ctl_destroy(req, mp);
1708		return;
1709	}
1710
1711	gctl_error(req, "Unknown verb.");
1712}
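
/*
 * The verbs above are what the userland "geom sched" utility issues.
 * Following the forms quoted in the file header, a typical session
 * looks like this ("create" and "configure" additionally carry an
 * "algo" argument naming the scheduling algorithm):
 *
 *	geom sched insert <provider>
 *	geom sched destroy <provider>.sched.
 */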
1713
1714static void
1715g_sched_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
1716    struct g_consumer *cp, struct g_provider *pp)
1717{
1718	struct g_sched_softc *sc = gp->softc;
	struct g_gsched *gsp = sc->sc_gsched;

	if (indent == NULL) {	/* plaintext */
1721		sbuf_printf(sb, " algo %s", gsp ? gsp->gs_name : "--");
1722	}
1723	if (gsp != NULL && gsp->gs_dumpconf)
1724		gsp->gs_dumpconf(sb, indent, gp, cp, pp);
1725}
1726
1727DECLARE_GEOM_CLASS(g_sched_class, g_sched);
1728MODULE_VERSION(geom_sched, 0);
1729