1/*-
2 * Copyright (c) 2009-2010 Fabio Checconi
3 * Copyright (c) 2009-2010 Luigi Rizzo, Universita` di Pisa
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25 * SUCH DAMAGE.
26 */
27
28/*
29 * $Id$
30 * $FreeBSD: head/sys/geom/sched/g_sched.c 296606 2016-03-10 06:25:39Z imp $
31 *
32 * Main control module for geom-based disk schedulers ('sched').
33 *
34 * USER VIEW
35 * A 'sched' node is typically inserted transparently between
36 * an existing provider pp and its original geom gp
37 *
38 *	[pp --> gp  ..]
39 *
40 * using the command "geom sched insert <provider>" and
41 * resulting in the following topology
42 *
43 *	[pp --> sched_gp --> cp]   [new_pp --> gp ... ]
44 *
45 * Deletion "geom sched destroy <provider>.sched." restores the
46 * original chain. The normal "geom sched create <provider>"
47 * is also supported.
48 *
49 * INTERNALS
50 * Internally, the 'sched' uses the following data structures
51 *
52 *   geom{}         g_sched_softc{}      g_gsched{}
53 * +----------+    +---------------+   +-------------+
54 * |  softc *-|--->| sc_gsched   *-|-->|  gs_init    |
55 * |  ...     |    |               |   |  gs_fini    |
56 * |          |    | [ hash table] |   |  gs_start   |
57 * +----------+    |               |   |  ...        |
58 *                 |               |   +-------------+
59 *                 |               |
60 *                 |               |     g_*_softc{}
61 *                 |               |   +-------------+
62 *                 | sc_data     *-|-->|             |
63 *                 +---------------+   |  algorithm- |
64 *                                     |  specific   |
65 *                                     +-------------+
66 *
67 * A g_sched_softc{} is created with a "geom sched insert" call.
68 * In turn this instantiates a specific scheduling algorithm,
69 * which sets sc_gsched to point to the algorithm callbacks,
70 * and calls gs_init() to create the g_*_softc{} .
71 * The other callbacks (gs_start, gs_next, ...) are invoked
72 * as needed
73 *
74 * g_sched_softc{} is defined in g_sched.h and mostly used here;
75 * g_gsched{}, and the gs_callbacks, are documented in gs_scheduler.h;
76 * g_*_softc{} is defined/implemented by each algorithm (gs_*.c)
77 *
78 * DATA MOVING
79 * When a bio is received on the provider, it goes to the
80 * g_sched_start() which calls gs_start() to initially queue it;
81 * then we call g_sched_dispatch() that loops around gs_next()
82 * to select zero or more bio's to be sent downstream.
83 *
84 * g_sched_dispatch() can also be called as a result of a timeout,
85 * e.g. when doing anticipation or pacing requests.
86 *
87 * When a bio comes back, it goes to g_sched_done() which in turn
88 * calls gs_done(). The latter does any necessary housekeeping in
89 * the scheduling algorithm, and may decide to call g_sched_dispatch()
90 * to send more bio's downstream.
91 *
92 * If an algorithm needs per-flow queues, these are created
93 * calling gs_init_class() and destroyed with gs_fini_class(),
94 * and they are also inserted in the hash table implemented in
95 * the g_sched_softc{}.
96 *
97 * If an algorithm is replaced, or a transparently-inserted node is
98 * removed with "geom sched destroy", we need to remove all references
99 * to the g_*_softc{} and g_sched_softc from the bio's still in
100 * the scheduler. g_sched_forced_dispatch() helps doing this.
101 * XXX need to explain better.
102 */
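
/*
 * For orientation, a scheduling algorithm module fills in a g_gsched{}
 * descriptor roughly as sketched below.  The field names are the ones
 * used in this file; "example", struct g_example_class and the
 * g_example_* callbacks are hypothetical.  See gs_scheduler.h for the
 * authoritative definitions, and the gs_*.c sources for real instances.
 *
 *	static struct g_gsched g_example_gsched = {
 *		.gs_name	= "example",
 *		.gs_priv_size	= sizeof(struct g_example_class),
 *		.gs_init	= g_example_init,
 *		.gs_fini	= g_example_fini,
 *		.gs_start	= g_example_start,
 *		.gs_next	= g_example_next,
 *		.gs_done	= g_example_done,
 *		.gs_init_class	= g_example_init_class,
 *		.gs_fini_class	= g_example_fini_class,
 *	};
 */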
103
104#include <sys/cdefs.h>
105#include <sys/param.h>
106#include <sys/systm.h>
107#include <sys/kernel.h>
108#include <sys/module.h>
109#include <sys/lock.h>
110#include <sys/mutex.h>
111#include <sys/bio.h>
112#include <sys/limits.h>
113#include <sys/hash.h>
114#include <sys/sbuf.h>
115#include <sys/sysctl.h>
116#include <sys/malloc.h>
117#include <sys/proc.h>		/* we access curthread */
118#include <geom/geom.h>
119#include "gs_scheduler.h"
120#include "g_sched.h"		/* geom hooks */
121
122/*
123 * Size of the per-geom hash table storing traffic classes.
124 * We may decide to change it at a later time; it has no ABI
125 * implications, as it is only used for run-time allocations.
126 */
127#define G_SCHED_HASH_SIZE	32
128
129static int g_sched_destroy(struct g_geom *gp, boolean_t force);
130static int g_sched_destroy_geom(struct gctl_req *req,
131    struct g_class *mp, struct g_geom *gp);
132static void g_sched_config(struct gctl_req *req, struct g_class *mp,
133    const char *verb);
134static struct g_geom *g_sched_taste(struct g_class *mp,
135    struct g_provider *pp, int flags __unused);
136static void g_sched_dumpconf(struct sbuf *sb, const char *indent,
137    struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp);
138static void g_sched_init(struct g_class *mp);
139static void g_sched_fini(struct g_class *mp);
140static int g_sched_ioctl(struct g_provider *pp, u_long cmd, void *data,
141    int fflag, struct thread *td);
142
143struct g_class g_sched_class = {
144	.name = G_SCHED_CLASS_NAME,
145	.version = G_VERSION,
146	.ctlreq = g_sched_config,
147	.taste = g_sched_taste,
148	.destroy_geom = g_sched_destroy_geom,
149	.init = g_sched_init,
150	.ioctl = g_sched_ioctl,
151	.fini = g_sched_fini
152};
153
154MALLOC_DEFINE(M_GEOM_SCHED, "GEOM_SCHED", "Geom schedulers data structures");
155
156/*
157 * Global variables describing the state of the geom_sched module.
158 * There is only one static instance of this structure.
159 */
160LIST_HEAD(gs_list, g_gsched);	/* type, link field */
161struct geom_sched_vars {
162	struct mtx	gs_mtx;
163	struct gs_list	gs_scheds;	/* list of algorithms */
164	u_int		gs_debug;
165	u_int		gs_sched_count;	/* how many algorithms ? */
166	u_int 		gs_patched;	/* g_io_request was patched */
167
168	u_int		gs_initialized;
169	u_int		gs_expire_secs;	/* expiration of hash entries */
170
171	struct bio_queue_head gs_pending;
172	u_int		gs_npending;
173
174	/* The following are for stats, usually protected by gs_mtx. */
175	u_long		gs_requests;	/* total requests */
176	u_long		gs_done;	/* total done */
177	u_int 		gs_in_flight;	/* requests in flight */
178	u_int 		gs_writes_in_flight;
179	u_int 		gs_bytes_in_flight;
180	u_int 		gs_write_bytes_in_flight;
181
182	char		gs_names[256];	/* names of schedulers */
183};
184
185static struct geom_sched_vars me = {
186	.gs_expire_secs = 10,
187};
188
189SYSCTL_DECL(_kern_geom);
190SYSCTL_NODE(_kern_geom, OID_AUTO, sched, CTLFLAG_RW, 0,
191    "GEOM_SCHED stuff");
192
193SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_wb, CTLFLAG_RD,
194    &me.gs_write_bytes_in_flight, 0, "Write bytes in flight");
195
196SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_b, CTLFLAG_RD,
197    &me.gs_bytes_in_flight, 0, "Bytes in flight");
198
199SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_w, CTLFLAG_RD,
200    &me.gs_writes_in_flight, 0, "Write requests in flight");
201
202SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight, CTLFLAG_RD,
203    &me.gs_in_flight, 0, "Requests in flight");
204
205SYSCTL_ULONG(_kern_geom_sched, OID_AUTO, done, CTLFLAG_RD,
206    &me.gs_done, 0, "Total done");
207
208SYSCTL_ULONG(_kern_geom_sched, OID_AUTO, requests, CTLFLAG_RD,
209    &me.gs_requests, 0, "Total requests");
210
211SYSCTL_STRING(_kern_geom_sched, OID_AUTO, algorithms, CTLFLAG_RD,
212    &me.gs_names, 0, "Algorithm names");
213
214SYSCTL_UINT(_kern_geom_sched, OID_AUTO, alg_count, CTLFLAG_RD,
215    &me.gs_sched_count, 0, "Number of algorithms");
216
217SYSCTL_UINT(_kern_geom_sched, OID_AUTO, debug, CTLFLAG_RW,
218    &me.gs_debug, 0, "Debug level");
219
220SYSCTL_UINT(_kern_geom_sched, OID_AUTO, expire_secs, CTLFLAG_RW,
221    &me.gs_expire_secs, 0, "Expire time in seconds");
222
223/*
224 * g_sched calls the scheduler algorithms with this lock held.
225 * The locking functions are exposed so the scheduler algorithms can also
226 * protect themselves e.g. when running a callout handler.
227 */
228void
229g_sched_lock(struct g_geom *gp)
230{
231	struct g_sched_softc *sc = gp->softc;
232
233	mtx_lock(&sc->sc_mtx);
234}
235
236void
237g_sched_unlock(struct g_geom *gp)
238{
239	struct g_sched_softc *sc = gp->softc;
240
241	mtx_unlock(&sc->sc_mtx);
242}
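
/*
 * A minimal sketch, using hypothetical g_example_* names, of how an
 * algorithm can use these hooks from a callout (or other asynchronous)
 * handler of its own before poking the dispatch loop:
 *
 *	static void
 *	g_example_timeout(void *arg)
 *	{
 *		struct g_geom *gp = arg;
 *
 *		g_sched_lock(gp);
 *		g_sched_dispatch(gp);
 *		g_sched_unlock(gp);
 *	}
 */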
243
244/*
245 * Support functions to handle references to the module,
246 * which come from devices using this scheduler.
247 */
248static inline void
249g_gsched_ref(struct g_gsched *gsp)
250{
251
252	atomic_add_int(&gsp->gs_refs, 1);
253}
254
255static inline void
256g_gsched_unref(struct g_gsched *gsp)
257{
258
259	atomic_add_int(&gsp->gs_refs, -1);
260}
261
262/*
263 * Update the stats when this request is done.
264 */
265static void
266g_sched_update_stats(struct bio *bio)
267{
268
269	me.gs_done++;
270	me.gs_in_flight--;
271	me.gs_bytes_in_flight -= bio->bio_length;
272	if (bio->bio_cmd == BIO_WRITE) {
273		me.gs_writes_in_flight--;
274		me.gs_write_bytes_in_flight -= bio->bio_length;
275	}
276}
277
278/*
279 * Dispatch any pending request.
280 */
281static void
282g_sched_forced_dispatch(struct g_geom *gp)
283{
284	struct g_sched_softc *sc = gp->softc;
285	struct g_gsched *gsp = sc->sc_gsched;
286	struct bio *bp;
287
288	KASSERT(mtx_owned(&sc->sc_mtx),
289	    ("sc_mtx not owned during forced dispatch"));
290
291	while ((bp = gsp->gs_next(sc->sc_data, 1)) != NULL)
292		g_io_request(bp, LIST_FIRST(&gp->consumer));
293}
294
295/*
296 * The main dispatch loop, called either here after the start
297 * routine, or by scheduling algorithms when they receive a timeout
298 * or a 'done' notification.  Does not share code with the forced
299 * dispatch path, since the gs_done() callback can call us.
300 */
301void
302g_sched_dispatch(struct g_geom *gp)
303{
304	struct g_sched_softc *sc = gp->softc;
305	struct g_gsched *gsp = sc->sc_gsched;
306	struct bio *bp;
307
308	KASSERT(mtx_owned(&sc->sc_mtx), ("sc_mtx not owned during dispatch"));
309
310	if ((sc->sc_flags & G_SCHED_FLUSHING))
311		return;
312
313	while ((bp = gsp->gs_next(sc->sc_data, 0)) != NULL)
314		g_io_request(bp, LIST_FIRST(&gp->consumer));
315}
316
317/*
318 * Recent (8.0 and above) versions of FreeBSD have support for
319 * registering classifiers of disk requests. The classifier is
320 * invoked by g_io_request(), and stores the information into
321 * bp->bio_classifier1.
322 *
323 * Support for older versions, which is left here only for
324 * documentation purposes, relies on two hacks:
325 * 1. classification info is written into the bio_caller1
326 *    field of the topmost node in the bio chain. This field
327 *    is rarely used, but this module is incompatible with
328 *    those that use bio_caller1 for other purposes,
329 *    such as ZFS and gjournal;
330 * 2. g_io_request() is patched in-memory when the module is
331 *    loaded, so that the function calls a classifier as its
332 *    first thing. g_io_request() is restored when the module
333 *    is unloaded. This functionality is only supported for
334 *    x86 and amd64, other architectures need source code changes.
335 */
336
337/*
338 * Look up the identity of the issuer of the original request.
339 * In the current implementation we use the curthread of the
340 * issuer, but different mechanisms may be implemented later
341 * so we do not make assumptions about the return value, which for
342 * us is just an opaque identifier.
343 */
344
345static inline u_long
346g_sched_classify(struct bio *bp)
347{
348
349	/* we have classifier fields in the struct bio */
350	return ((u_long)bp->bio_classifier1);
351}
352
353/* Return the hash chain for the given key. */
354static inline struct g_hash *
355g_sched_hash(struct g_sched_softc *sc, u_long key)
356{
357
358	return (&sc->sc_hash[key & sc->sc_mask]);
359}
360
361/*
362 * Helper function for the child classes, which takes
363 * a geom and a bio and returns the private descriptor
364 * associated with the request.  This involves fetching
365 * the classification field and [al]locating the
366 * corresponding entry in the hash table.
367 */
368void *
369g_sched_get_class(struct g_geom *gp, struct bio *bp)
370{
371	struct g_sched_softc *sc;
372	struct g_sched_class *gsc;
373	struct g_gsched *gsp;
374	struct g_hash *bucket;
375	u_long key;
376
377	sc = gp->softc;
378	key = g_sched_classify(bp);
379	bucket = g_sched_hash(sc, key);
380	LIST_FOREACH(gsc, bucket, gsc_clist) {
381		if (key == gsc->gsc_key) {
382			gsc->gsc_refs++;
383			return (gsc->gsc_priv);
384		}
385	}
386
387	gsp = sc->sc_gsched;
388	gsc = malloc(sizeof(*gsc) + gsp->gs_priv_size,
389	    M_GEOM_SCHED, M_NOWAIT | M_ZERO);
390	if (!gsc)
391		return (NULL);
392
393	if (gsp->gs_init_class(sc->sc_data, gsc->gsc_priv)) {
394		free(gsc, M_GEOM_SCHED);
395		return (NULL);
396	}
397
398	gsc->gsc_refs = 2;	/* 1 for the hash table, 1 for the caller. */
399	gsc->gsc_key = key;
400	LIST_INSERT_HEAD(bucket, gsc, gsc_clist);
401
402	gsc->gsc_expire = ticks + me.gs_expire_secs * hz;
403
404	return (gsc->gsc_priv);
405}
406
407/*
408 * Release a reference to the per-client descriptor.
409 */
410void
411g_sched_put_class(struct g_geom *gp, void *priv)
412{
413	struct g_sched_class *gsc;
414	struct g_sched_softc *sc;
415
416	gsc = g_sched_priv2class(priv);
417	gsc->gsc_expire = ticks + me.gs_expire_secs * hz;
418
419	if (--gsc->gsc_refs > 0)
420		return;
421
422	sc = gp->softc;
423	sc->sc_gsched->gs_fini_class(sc->sc_data, priv);
424
425	LIST_REMOVE(gsc, gsc_clist);
426	free(gsc, M_GEOM_SCHED);
427}
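
/*
 * Typical (hypothetical) usage from an algorithm: gs_start() looks up
 * the per-flow queue for the issuer of the bio, and gs_done() releases
 * the reference once the request has completed.  The geom pointer is
 * the one passed to gs_init(), stashed by the algorithm in its own
 * softc; a nonzero return from gs_start() makes g_sched_start()
 * bypass the scheduler.  Roughly:
 *
 *	static int
 *	g_example_start(void *data, struct bio *bp)
 *	{
 *		struct g_example_softc *sc = data;
 *		struct g_example_class *qp;
 *
 *		qp = g_sched_get_class(sc->sc_geom, bp);
 *		if (qp == NULL)
 *			return (-1);
 *		(enqueue bp on qp)
 *		return (0);
 *	}
 *
 * with a matching g_sched_put_class(sc->sc_geom, qp) from gs_done().
 */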
428
429static void
430g_sched_hash_fini(struct g_geom *gp, struct g_hash *hp, u_long mask,
431    struct g_gsched *gsp, void *data)
432{
433	struct g_sched_class *cp, *cp2;
434	int i;
435
436	if (!hp)
437		return;
438
439	if (data && gsp->gs_hash_unref)
440		gsp->gs_hash_unref(data);
441
442	for (i = 0; i < G_SCHED_HASH_SIZE; i++) {
443		LIST_FOREACH_SAFE(cp, &hp[i], gsc_clist, cp2)
444			g_sched_put_class(gp, cp->gsc_priv);
445	}
446
447	hashdestroy(hp, M_GEOM_SCHED, mask);
448}
449
450static struct g_hash *
451g_sched_hash_init(struct g_gsched *gsp, u_long *mask, int flags)
452{
453	struct g_hash *hash;
454
455	if (gsp->gs_priv_size == 0)
456		return (NULL);
457
458	hash = hashinit_flags(G_SCHED_HASH_SIZE, M_GEOM_SCHED, mask, flags);
459
460	return (hash);
461}
462
463static void
464g_sched_flush_classes(struct g_geom *gp)
465{
466	struct g_sched_softc *sc;
467	struct g_sched_class *cp, *cp2;
468	int i;
469
470	sc = gp->softc;
471
472	if (!sc->sc_hash || ticks - sc->sc_flush_ticks <= 0)
473		return;
474
475	for (i = 0; i < G_SCHED_HASH_SIZE; i++) {
476		LIST_FOREACH_SAFE(cp, &sc->sc_hash[i], gsc_clist, cp2) {
477			if (cp->gsc_refs == 1 && ticks - cp->gsc_expire > 0)
478				g_sched_put_class(gp, cp->gsc_priv);
479		}
480	}
481
482	sc->sc_flush_ticks = ticks + me.gs_expire_secs * hz;
483}
484
485/*
486 * Wait for the completion of any outstanding request.  To ensure
487 * that this does not take forever, the caller has to make sure that
488 * no new requests enter the scheduler before calling us.
489 *
490 * Must be called with the gp mutex held and topology locked.
491 */
492static int
493g_sched_wait_pending(struct g_geom *gp)
494{
495	struct g_sched_softc *sc = gp->softc;
496	int endticks = ticks + hz;
497
498	g_topology_assert();
499
500	while (sc->sc_pending && endticks - ticks >= 0)
501		msleep(gp, &sc->sc_mtx, 0, "sched_wait_pending", hz / 4);
502
503	return (sc->sc_pending ? ETIMEDOUT : 0);
504}
505
506static int
507g_sched_remove_locked(struct g_geom *gp, struct g_gsched *gsp)
508{
509	struct g_sched_softc *sc = gp->softc;
510	int error;
511
512	/* Set the flushing flag: new bios will not enter the scheduler. */
513	sc->sc_flags |= G_SCHED_FLUSHING;
514
515	g_sched_forced_dispatch(gp);
516	error = g_sched_wait_pending(gp);
517	if (error)
518		goto failed;
519
520	/* No more requests pending or in flight from the old gsp. */
521
522	g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask, gsp, sc->sc_data);
523	sc->sc_hash = NULL;
524
525	/*
526	 * Avoid deadlock here by releasing the gp mutex and reacquiring
527	 * it once done.  It should be safe, since no reconfiguration or
528	 * destruction can take place due to the geom topology lock; no
529	 * new request can use the current sc_data since we flagged the
530	 * geom as being flushed.
531	 */
532	g_sched_unlock(gp);
533	gsp->gs_fini(sc->sc_data);
534	g_sched_lock(gp);
535
536	sc->sc_gsched = NULL;
537	sc->sc_data = NULL;
538	g_gsched_unref(gsp);
539
540failed:
541	sc->sc_flags &= ~G_SCHED_FLUSHING;
542
543	return (error);
544}
545
546static int
547g_sched_remove(struct g_geom *gp, struct g_gsched *gsp)
548{
549	int error;
550
551	g_sched_lock(gp);
552	error = g_sched_remove_locked(gp, gsp); /* gsp is surely non-null */
553	g_sched_unlock(gp);
554
555	return (error);
556}
557
558/*
559 * Support function for create/taste -- locate the desired
560 * algorithm and grab a reference to it.
561 */
562static struct g_gsched *
563g_gsched_find(const char *name)
564{
565	struct g_gsched *gsp = NULL;
566
567	mtx_lock(&me.gs_mtx);
568	LIST_FOREACH(gsp, &me.gs_scheds, glist) {
569		if (strcmp(name, gsp->gs_name) == 0) {
570			g_gsched_ref(gsp);
571			break;
572		}
573	}
574	mtx_unlock(&me.gs_mtx);
575
576	return (gsp);
577}
578
579/*
580 * Rebuild the list of scheduler names.
581 * To be called with me.gs_mtx lock held.
582 */
583static void
584g_gsched_build_names(struct g_gsched *gsp)
585{
586	int pos, l;
587	struct g_gsched *cur;
588
589	pos = 0;
590	LIST_FOREACH(cur, &me.gs_scheds, glist) {
591		l = strlen(cur->gs_name);
592		if (l + pos + 1 + 1 < sizeof(me.gs_names)) {
593			if (pos != 0)
594				me.gs_names[pos++] = ' ';
595			strcpy(me.gs_names + pos, cur->gs_name);
596			pos += l;
597		}
598	}
599	me.gs_names[pos] = '\0';
600}
601
602/*
603 * Register or unregister individual scheduling algorithms.
604 */
605static int
606g_gsched_register(struct g_gsched *gsp)
607{
608	struct g_gsched *cur;
609	int error = 0;
610
611	mtx_lock(&me.gs_mtx);
612	LIST_FOREACH(cur, &me.gs_scheds, glist) {
613		if (strcmp(gsp->gs_name, cur->gs_name) == 0)
614			break;
615	}
616	if (cur != NULL) {
617		G_SCHED_DEBUG(0, "A scheduler named %s already "
618		    "exists.", gsp->gs_name);
619		error = EEXIST;
620	} else {
621		LIST_INSERT_HEAD(&me.gs_scheds, gsp, glist);
622		gsp->gs_refs = 1;
623		me.gs_sched_count++;
624		g_gsched_build_names(gsp);
625	}
626	mtx_unlock(&me.gs_mtx);
627
628	return (error);
629}
630
631struct g_gsched_unregparm {
632	struct g_gsched *gup_gsp;
633	int		gup_error;
634};
635
636static void
637g_gsched_unregister(void *arg, int flag)
638{
639	struct g_gsched_unregparm *parm = arg;
640	struct g_gsched *gsp = parm->gup_gsp, *cur, *tmp;
641	struct g_sched_softc *sc;
642	struct g_geom *gp, *gp_tmp;
643	int error;
644
645	parm->gup_error = 0;
646
647	g_topology_assert();
648
649	if (flag == EV_CANCEL)
650		return;
651
652	mtx_lock(&me.gs_mtx);
653
654	LIST_FOREACH_SAFE(gp, &g_sched_class.geom, geom, gp_tmp) {
655		if (gp->class != &g_sched_class)
656			continue;	/* Should not happen. */
657
658		sc = gp->softc;
659		if (sc->sc_gsched == gsp) {
660			error = g_sched_remove(gp, gsp);
661			if (error)
662				goto failed;
663		}
664	}
665
666	LIST_FOREACH_SAFE(cur, &me.gs_scheds, glist, tmp) {
667		if (cur != gsp)
668			continue;
669
670		if (gsp->gs_refs != 1) {
671			G_SCHED_DEBUG(0, "%s still in use.",
672			    gsp->gs_name);
673			parm->gup_error = EBUSY;
674		} else {
675			LIST_REMOVE(gsp, glist);
676			me.gs_sched_count--;
677			g_gsched_build_names(gsp);
678		}
679		break;
680	}
681
682	if (cur == NULL) {
683		G_SCHED_DEBUG(0, "%s not registered.", gsp->gs_name);
684		parm->gup_error = ENOENT;
685	}
686
687failed:
688	mtx_unlock(&me.gs_mtx);
689}
690
691static inline void
692g_gsched_global_init(void)
693{
694
695	if (!me.gs_initialized) {
696		G_SCHED_DEBUG(0, "Initializing global data.");
697		mtx_init(&me.gs_mtx, "gsched", NULL, MTX_DEF);
698		LIST_INIT(&me.gs_scheds);
699		bioq_init(&me.gs_pending);
700		me.gs_initialized = 1;
701	}
702}
703
704/*
705 * Module event called when a scheduling algorithm module is loaded or
706 * unloaded.
707 */
708int
709g_gsched_modevent(module_t mod, int cmd, void *arg)
710{
711	struct g_gsched *gsp = arg;
712	struct g_gsched_unregparm parm;
713	int error;
714
715	G_SCHED_DEBUG(0, "Modevent %d.", cmd);
716
717	/*
718	 * If the module is loaded at boot, the geom thread that calls
719	 * g_sched_init() might actually run after g_gsched_modevent(),
720	 * so make sure that the module is properly initialized.
721	 */
722	g_gsched_global_init();
723
724	error = EOPNOTSUPP;
725	switch (cmd) {
726	case MOD_LOAD:
727		error = g_gsched_register(gsp);
728		G_SCHED_DEBUG(0, "Loaded module %s error %d.",
729		    gsp->gs_name, error);
730		if (error == 0)
731			g_retaste(&g_sched_class);
732		break;
733
734	case MOD_UNLOAD:
735		parm.gup_gsp = gsp;
736		parm.gup_error = 0;
737
738		error = g_waitfor_event(g_gsched_unregister,
739		    &parm, M_WAITOK, NULL);
740		if (error == 0)
741			error = parm.gup_error;
742		G_SCHED_DEBUG(0, "Unloaded module %s error %d.",
743		    gsp->gs_name, error);
744		break;
745	}
746
747	return (error);
748}
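
/*
 * A scheduling algorithm module hooks into the handler above through
 * the standard module machinery, passing its g_gsched{} descriptor as
 * the event argument.  An open-coded, hypothetical example follows
 * (gs_scheduler.h is expected to wrap this in a convenience macro):
 *
 *	static moduledata_t g_example_mod = {
 *		"g_sched_example",
 *		g_gsched_modevent,
 *		&g_example_gsched,
 *	};
 *	DECLARE_MODULE(g_sched_example, g_example_mod,
 *	    SI_SUB_DRIVERS, SI_ORDER_MIDDLE);
 *	MODULE_DEPEND(g_sched_example, geom_sched, 0, 0, 0);
 */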
749
750#ifdef KTR
751#define	TRC_BIO_EVENT(e, bp)	g_sched_trace_bio_ ## e (bp)
752
753static inline char
754g_sched_type(struct bio *bp)
755{
756
757	if (bp->bio_cmd == BIO_READ)
758		return ('R');
759	else if (bp->bio_cmd == BIO_WRITE)
760		return ('W');
761	return ('U');
762}
763
764static inline void
765g_sched_trace_bio_START(struct bio *bp)
766{
767
768	CTR5(KTR_GSCHED, "S %lu %c %lu/%lu %lu", g_sched_classify(bp),
769	    g_sched_type(bp), bp->bio_offset / ULONG_MAX,
770	    bp->bio_offset, bp->bio_length);
771}
772
773static inline void
774g_sched_trace_bio_DONE(struct bio *bp)
775{
776
777	CTR5(KTR_GSCHED, "D %lu %c %lu/%lu %lu", g_sched_classify(bp),
778	    g_sched_type(bp), bp->bio_offset / ULONG_MAX,
779	    bp->bio_offset, bp->bio_length);
780}
781#else /* !KTR */
782#define	TRC_BIO_EVENT(e, bp)
783#endif /* !KTR */
784
785/*
786 * g_sched_done() and g_sched_start() dispatch the geom requests to
787 * the scheduling algorithm in use.
788 */
789static void
790g_sched_done(struct bio *bio)
791{
792	struct g_geom *gp = bio->bio_caller2;
793	struct g_sched_softc *sc = gp->softc;
794
795	TRC_BIO_EVENT(DONE, bio);
796
797	KASSERT(bio->bio_caller1, ("null bio_caller1 in g_sched_done"));
798
799	g_sched_lock(gp);
800
801	g_sched_update_stats(bio);
802	sc->sc_gsched->gs_done(sc->sc_data, bio);
803	if (!--sc->sc_pending)
804		wakeup(gp);
805
806	g_sched_flush_classes(gp);
807	g_sched_unlock(gp);
808
809	g_std_done(bio);
810}
811
812static void
813g_sched_start(struct bio *bp)
814{
815	struct g_geom *gp = bp->bio_to->geom;
816	struct g_sched_softc *sc = gp->softc;
817	struct bio *cbp;
818
819	TRC_BIO_EVENT(START, bp);
820	G_SCHED_LOGREQ(bp, "Request received.");
821
822	cbp = g_clone_bio(bp);
823	if (cbp == NULL) {
824		g_io_deliver(bp, ENOMEM);
825		return;
826	}
827	cbp->bio_done = g_sched_done;
828	cbp->bio_to = LIST_FIRST(&gp->provider);
829	KASSERT(cbp->bio_to != NULL, ("NULL provider"));
830
831	/* We only schedule reads and writes. */
832	if (bp->bio_cmd != BIO_READ && bp->bio_cmd != BIO_WRITE)
833		goto bypass;
834
835	G_SCHED_LOGREQ(cbp, "Sending request.");
836
837	g_sched_lock(gp);
838	/*
839	 * Call the algorithm's gs_start to queue the request in the
840	 * scheduler. If gs_start fails then pass the request down,
841	 * otherwise call g_sched_dispatch() which tries to push
842	 * one or more requests down.
843	 */
844	if (!sc->sc_gsched || (sc->sc_flags & G_SCHED_FLUSHING) ||
845	    sc->sc_gsched->gs_start(sc->sc_data, cbp)) {
846		g_sched_unlock(gp);
847		goto bypass;
848	}
849	/*
850	 * We use bio_caller1 to mark requests that are scheduled
851	 * so make sure it is not NULL.
852	 */
853	if (cbp->bio_caller1 == NULL)
854		cbp->bio_caller1 = &me;	/* anything not NULL */
855
856	cbp->bio_caller2 = gp;
857	sc->sc_pending++;
858
859	/* Update general stats. */
860	me.gs_in_flight++;
861	me.gs_requests++;
862	me.gs_bytes_in_flight += bp->bio_length;
863	if (bp->bio_cmd == BIO_WRITE) {
864		me.gs_writes_in_flight++;
865		me.gs_write_bytes_in_flight += bp->bio_length;
866	}
867	g_sched_dispatch(gp);
868	g_sched_unlock(gp);
869	return;
870
871bypass:
872	cbp->bio_done = g_std_done;
873	cbp->bio_caller1 = NULL; /* not scheduled */
874	g_io_request(cbp, LIST_FIRST(&gp->consumer));
875}
876
877/*
878 * The next few functions are the geom glue.
879 */
880static void
881g_sched_orphan(struct g_consumer *cp)
882{
883
884	g_topology_assert();
885	g_sched_destroy(cp->geom, 1);
886}
887
888static int
889g_sched_access(struct g_provider *pp, int dr, int dw, int de)
890{
891	struct g_geom *gp;
892	struct g_consumer *cp;
893	int error;
894
895	gp = pp->geom;
896	cp = LIST_FIRST(&gp->consumer);
897	error = g_access(cp, dr, dw, de);
898
899	return (error);
900}
901
902static void
903g_sched_temporary_start(struct bio *bio)
904{
905
906	mtx_lock(&me.gs_mtx);
907	me.gs_npending++;
908	bioq_disksort(&me.gs_pending, bio);
909	mtx_unlock(&me.gs_mtx);
910}
911
912static void
913g_sched_flush_pending(g_start_t *start)
914{
915	struct bio *bp;
916
917	while ((bp = bioq_takefirst(&me.gs_pending)))
918		start(bp);
919}
920
921static int
922g_insert_proxy(struct g_geom *gp, struct g_provider *newpp,
923    struct g_geom *dstgp, struct g_provider *pp, struct g_consumer *cp)
924{
925	struct g_sched_softc *sc = gp->softc;
926	g_start_t *saved_start, *flush = g_sched_start;
927	int error = 0, endticks = ticks + hz;
928
929	g_cancel_event(newpp);	/* prevent taste() */
930	/* copy private fields */
931	newpp->private = pp->private;
932	newpp->index = pp->index;
933
934	/* Queue all the early requests coming for us. */
935	me.gs_npending = 0;
936	saved_start = pp->geom->start;
937	dstgp->start = g_sched_temporary_start;
938
939	while (pp->nstart - pp->nend != me.gs_npending &&
940	    endticks - ticks >= 0)
941		tsleep(pp, PRIBIO, "-", hz/10);
942
943	if (pp->nstart - pp->nend != me.gs_npending) {
944		flush = saved_start;
945		error = ETIMEDOUT;
946		goto fail;
947	}
948
949	/* link pp to this geom */
950	LIST_REMOVE(pp, provider);
951	pp->geom = gp;
952	LIST_INSERT_HEAD(&gp->provider, pp, provider);
953
954	/*
955	 * replicate the counts from the parent in the
956	 * new provider and consumer nodes
957	 */
958	cp->acr = newpp->acr = pp->acr;
959	cp->acw = newpp->acw = pp->acw;
960	cp->ace = newpp->ace = pp->ace;
961	sc->sc_flags |= G_SCHED_PROXYING;
962
963fail:
964	dstgp->start = saved_start;
965
966	g_sched_flush_pending(flush);
967
968	return (error);
969}
970
971/*
972 * Create a geom node for the device passed as *pp.
973 * If successful, add a reference to this gsp.
974 */
975static int
976g_sched_create(struct gctl_req *req, struct g_class *mp,
977    struct g_provider *pp, struct g_gsched *gsp, int proxy)
978{
979	struct g_sched_softc *sc = NULL;
980	struct g_geom *gp, *dstgp;
981	struct g_provider *newpp = NULL;
982	struct g_consumer *cp = NULL;
983	char name[64];
984	int error;
985
986	g_topology_assert();
987
988	snprintf(name, sizeof(name), "%s%s", pp->name, G_SCHED_SUFFIX);
989	LIST_FOREACH(gp, &mp->geom, geom) {
990		if (strcmp(gp->name, name) == 0) {
991			gctl_error(req, "Geom %s already exists.",
992			    name);
993			return (EEXIST);
994		}
995	}
996
997	gp = g_new_geomf(mp, "%s", name);
998	dstgp = proxy ? pp->geom : gp; /* where do we link the provider */
999
1000	sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO);
1001	sc->sc_gsched = gsp;
1002	sc->sc_data = gsp->gs_init(gp);
1003	if (sc->sc_data == NULL) {
1004		error = ENOMEM;
1005		goto fail;
1006	}
1007
1008	sc->sc_hash = g_sched_hash_init(gsp, &sc->sc_mask, HASH_WAITOK);
1009
1010	/*
1011	 * Do not initialize the flush mechanism; it will be initialized
1012	 * on the first insertion into the hash table.
1013	 */
1014
1015	mtx_init(&sc->sc_mtx, "g_sched_mtx", NULL, MTX_DEF);
1016
1017	gp->softc = sc;
1018	gp->start = g_sched_start;
1019	gp->orphan = g_sched_orphan;
1020	gp->access = g_sched_access;
1021	gp->dumpconf = g_sched_dumpconf;
1022
1023	newpp = g_new_providerf(dstgp, "%s", gp->name);
1024	newpp->mediasize = pp->mediasize;
1025	newpp->sectorsize = pp->sectorsize;
1026
1027	cp = g_new_consumer(gp);
1028	error = g_attach(cp, proxy ? newpp : pp);
1029	if (error != 0) {
1030		gctl_error(req, "Cannot attach to provider %s.",
1031		    pp->name);
1032		goto fail;
1033	}
1034
1035	g_error_provider(newpp, 0);
1036	if (proxy) {
1037		error = g_insert_proxy(gp, newpp, dstgp, pp, cp);
1038		if (error)
1039			goto fail;
1040	}
1041	G_SCHED_DEBUG(0, "Device %s created.", gp->name);
1042
1043	g_gsched_ref(gsp);
1044
1045	return (0);
1046
1047fail:
1048	if (cp != NULL) {
1049		if (cp->provider != NULL)
1050			g_detach(cp);
1051		g_destroy_consumer(cp);
1052	}
1053	if (newpp != NULL)
1054		g_destroy_provider(newpp);
1055	if (sc->sc_hash)
1056		g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask,
1057		    gsp, sc->sc_data);
1058	if (sc->sc_data)
1059		gsp->gs_fini(sc->sc_data);
1060	g_free(sc);		/* gp->softc may not be set yet */
1061	g_destroy_geom(gp);
1062
1063	return (error);
1064}
1065
1066/*
1067 * Support for dynamic switching of scheduling algorithms.
1068 * First initialize the data structures for the new algorithm,
1069 * then call g_sched_remove_locked() to flush all references
1070 * to the old one, and finally link the new algorithm.
1071 */
1072static int
1073g_sched_change_algo(struct gctl_req *req, struct g_class *mp,
1074    struct g_provider *pp, struct g_gsched *gsp)
1075{
1076	struct g_sched_softc *sc;
1077	struct g_geom *gp;
1078	struct g_hash *newh;
1079	void *data;
1080	u_long mask;
1081	int error = 0;
1082
1083	gp = pp->geom;
1084	sc = gp->softc;
1085
1086	data = gsp->gs_init(gp);
1087	if (data == NULL)
1088		return (ENOMEM);
1089
1090	newh = g_sched_hash_init(gsp, &mask, HASH_WAITOK);
1091	if (gsp->gs_priv_size && !newh) {
1092		error = ENOMEM;
1093		goto fail;
1094	}
1095
1096	g_sched_lock(gp);
1097	if (sc->sc_gsched) {	/* can be NULL in some cases */
1098		error = g_sched_remove_locked(gp, sc->sc_gsched);
1099		if (error)
1100			goto fail;
1101	}
1102
1103	g_gsched_ref(gsp);
1104	sc->sc_gsched = gsp;
1105	sc->sc_data = data;
1106	sc->sc_hash = newh;
1107	sc->sc_mask = mask;
1108
1109	g_sched_unlock(gp);
1110
1111	return (0);
1112
1113fail:
1114	if (newh)
1115		g_sched_hash_fini(gp, newh, mask, gsp, data);
1116
1117	if (data)
1118		gsp->gs_fini(data);
1119
1120	g_sched_unlock(gp);
1121
1122	return (error);
1123}
1124
1125/*
1126 * Stop the request flow directed to the proxy, redirecting the new
1127 * requests to the me.gs_pending queue.
1128 */
1129static struct g_provider *
1130g_detach_proxy(struct g_geom *gp)
1131{
1132	struct g_consumer *cp;
1133	struct g_provider *pp, *newpp;
1134
1135	do {
1136		pp = LIST_FIRST(&gp->provider);
1137		if (pp == NULL)
1138			break;
1139		cp = LIST_FIRST(&gp->consumer);
1140		if (cp == NULL)
1141			break;
1142		newpp = cp->provider;
1143		if (newpp == NULL)
1144			break;
1145
1146		me.gs_npending = 0;
1147		pp->geom->start = g_sched_temporary_start;
1148
1149		return (pp);
1150	} while (0);
1151	printf("%s error detaching proxy %s\n", __FUNCTION__, gp->name);
1152
1153	return (NULL);
1154}
1155
1156static void
1157g_sched_blackhole(struct bio *bp)
1158{
1159
1160	g_io_deliver(bp, ENXIO);
1161}
1162
1163static inline void
1164g_reparent_provider(struct g_provider *pp, struct g_geom *gp,
1165    struct g_provider *newpp)
1166{
1167
1168	LIST_REMOVE(pp, provider);
1169	if (newpp) {
1170		pp->private = newpp->private;
1171		pp->index = newpp->index;
1172	}
1173	pp->geom = gp;
1174	LIST_INSERT_HEAD(&gp->provider, pp, provider);
1175}
1176
1177static inline void
1178g_unproxy_provider(struct g_provider *oldpp, struct g_provider *newpp)
1179{
1180	struct g_geom *gp = oldpp->geom;
1181
1182	g_reparent_provider(oldpp, newpp->geom, newpp);
1183
1184	/*
1185	 * Hackish: let the system destroy the old provider for us, just
1186	 * in case someone attached a consumer to it, in which case a
1187	 * direct call to g_destroy_provider() would not work.
1188	 */
1189	g_reparent_provider(newpp, gp, NULL);
1190}
1191
1192/*
1193 * Complete the proxy destruction, linking the old provider to its
1194 * original geom, and destroying the proxy provider.  Also take care
1195 * of issuing the pending requests collected in me.gs_pending (if any).
1196 */
1197static int
1198g_destroy_proxy(struct g_geom *gp, struct g_provider *oldpp)
1199{
1200	struct g_consumer *cp;
1201	struct g_provider *newpp;
1202
1203	do {
1204		cp = LIST_FIRST(&gp->consumer);
1205		if (cp == NULL)
1206			break;
1207		newpp = cp->provider;
1208		if (newpp == NULL)
1209			break;
1210
1211		/* Relink the provider to its original geom. */
1212		g_unproxy_provider(oldpp, newpp);
1213
1214		/* Detach consumer from provider, and destroy provider. */
1215		cp->acr = newpp->acr = 0;
1216		cp->acw = newpp->acw = 0;
1217		cp->ace = newpp->ace = 0;
1218		g_detach(cp);
1219
1220		/* Send the pending bios through the right start function. */
1221		g_sched_flush_pending(oldpp->geom->start);
1222
1223		return (0);
1224	} while (0);
1225	printf("%s error destroying proxy %s\n", __FUNCTION__, gp->name);
1226
1227	/* We cannot send the pending bios anywhere... */
1228	g_sched_flush_pending(g_sched_blackhole);
1229
1230	return (EINVAL);
1231}
1232
1233static int
1234g_sched_destroy(struct g_geom *gp, boolean_t force)
1235{
1236	struct g_provider *pp, *oldpp = NULL;
1237	struct g_sched_softc *sc;
1238	struct g_gsched *gsp;
1239	int error;
1240
1241	g_topology_assert();
1242	sc = gp->softc;
1243	if (sc == NULL)
1244		return (ENXIO);
1245	if (!(sc->sc_flags & G_SCHED_PROXYING)) {
1246		pp = LIST_FIRST(&gp->provider);
1247		if (pp && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
1248			const char *msg = force ?
1249				"but we force removal" : "cannot remove";
1250
1251			G_SCHED_DEBUG(!force,
1252			    "Device %s is still open (r%dw%de%d), %s.",
1253			    pp->name, pp->acr, pp->acw, pp->ace, msg);
1254			if (!force)
1255				return (EBUSY);
1256		} else {
1257			G_SCHED_DEBUG(0, "Device %s removed.", gp->name);
1258		}
1259	} else
1260		oldpp = g_detach_proxy(gp);
1261
1262	gsp = sc->sc_gsched;
1263	if (gsp) {
1264		/*
1265		 * XXX bad hack here: force a dispatch to release
1266		 * any reference to the hash table still held by
1267		 * the scheduler.
1268		 */
1269		g_sched_lock(gp);
1270		/*
1271		 * We are dying here; no new requests should enter
1272		 * the scheduler.  This is guaranteed by the topology,
1273		 * either in case we were proxying (new bios are
1274		 * being redirected) or not (see the access check
1275		 * above).
1276		 */
1277		g_sched_forced_dispatch(gp);
1278		error = g_sched_wait_pending(gp);
1279
1280		if (error) {
1281			/*
1282			 * Not all the requests came home: this might happen
1283			 * under heavy load, or if we were waiting for any
1284			 * bio which is served in the event path (see
1285			 * geom_slice.c for an example of how this can
1286			 * happen).  Try to restore a working configuration
1287			 * if we can fail.
1288			 * before failing, if we can.
1289			if ((sc->sc_flags & G_SCHED_PROXYING) && oldpp) {
1290				g_sched_flush_pending(force ?
1291				    g_sched_blackhole : g_sched_start);
1292			}
1293
1294			/*
1295			 * In the forced destroy case there is not much
1296			 * we can do; we have pending bios that will call
1297			 * g_sched_done() somehow, and we don't want them
1298			 * to crash the system using freed memory.  We tell
1299			 * the user that something went wrong, and leak some
1300			 * memory here.
1301			 * Note: the callers using force = 1 ignore the
1302			 * return value.
1303			 */
1304			if (force) {
1305				G_SCHED_DEBUG(0, "Pending requests while "
1306				    "destroying geom, some memory leaked.");
1307			}
1308
1309			return (error);
1310		}
1311
1312		g_sched_unlock(gp);
1313		g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask,
1314		    gsp, sc->sc_data);
1315		sc->sc_hash = NULL;
1316		gsp->gs_fini(sc->sc_data);
1317		g_gsched_unref(gsp);
1318		sc->sc_gsched = NULL;
1319	}
1320
1321	if ((sc->sc_flags & G_SCHED_PROXYING) && oldpp) {
1322		error = g_destroy_proxy(gp, oldpp);
1323
1324		if (error) {
1325			if (force) {
1326				G_SCHED_DEBUG(0, "Unrecoverable error while "
1327				    "destroying a proxy geom, leaking some "
1328				    "memory.");
1329			}
1330
1331			return (error);
1332		}
1333	}
1334
1335	mtx_destroy(&sc->sc_mtx);
1336
1337	g_free(gp->softc);
1338	gp->softc = NULL;
1339	g_wither_geom(gp, ENXIO);
1340
1341	return (error);
1342}
1343
1344static int
1345g_sched_destroy_geom(struct gctl_req *req, struct g_class *mp,
1346    struct g_geom *gp)
1347{
1348
1349	return (g_sched_destroy(gp, 0));
1350}
1351
1352/*
1353 * Functions related to the classification of requests.
1354 *
1355 * On recent FreeBSD versions (8.0 and above), we store a reference
1356 * to the issuer of a request in bp->bio_classifier1 as soon
1357 * as the bio is posted to the geom queue (and not later, because
1358 * requests are managed by the g_down thread afterwards).
1359 */
1360
1361/*
1362 * Classifier support for recent FreeBSD versions: we use
1363 * a very simple classifier that only uses curthread to tag a request.
1364 * The classifier is registered at module load, and unregistered
1365 * at module unload.
1366 */
1367static int
1368g_sched_tag(void *arg, struct bio *bp)
1369{
1370
1371	bp->bio_classifier1 = curthread;
1372	return (1);
1373}
1374
1375static struct g_classifier_hook g_sched_classifier = {
1376	.func =	g_sched_tag,
1377};
1378
1379static inline void
1380g_classifier_ini(void)
1381{
1382
1383	g_register_classifier(&g_sched_classifier);
1384}
1385
1386static inline void
1387g_classifier_fini(void)
1388{
1389
1390	g_unregister_classifier(&g_sched_classifier);
1391}
1392
1393static void
1394g_sched_init(struct g_class *mp)
1395{
1396
1397	g_gsched_global_init();
1398
1399	G_SCHED_DEBUG(0, "Loading: mp = %p, g_sched_class = %p.",
1400	    mp, &g_sched_class);
1401
1402	/* Patch g_io_request to store classification info in the bio. */
1403	g_classifier_ini();
1404}
1405
1406static void
1407g_sched_fini(struct g_class *mp)
1408{
1409
1410	g_classifier_fini();
1411
1412	G_SCHED_DEBUG(0, "Unloading...");
1413
1414	KASSERT(LIST_EMPTY(&me.gs_scheds), ("still registered schedulers"));
1415	mtx_destroy(&me.gs_mtx);
1416}
1417
1418static int
1419g_sched_ioctl(struct g_provider *pp, u_long cmd, void *data, int fflag,
1420    struct thread *td)
1421{
1422	struct g_consumer *cp;
1423	struct g_geom *gp;
1424
1425	cp = LIST_FIRST(&pp->geom->consumer);
1426	if (cp == NULL)
1427		return (ENOIOCTL);
1428	gp = cp->provider->geom;
1429	if (gp->ioctl == NULL)
1430		return (ENOIOCTL);
1431	return (gp->ioctl(cp->provider, cmd, data, fflag, td));
1432}
1433
1434/*
1435 * Read the i-th argument for a request, skipping the /dev/
1436 * prefix if present.
1437 */
1438static const char *
1439g_sched_argi(struct gctl_req *req, int i)
1440{
1441	static const char *dev_prefix = "/dev/";
1442	const char *name;
1443	char param[16];
1444	int l = strlen(dev_prefix);
1445
1446	snprintf(param, sizeof(param), "arg%d", i);
1447	name = gctl_get_asciiparam(req, param);
1448	if (name == NULL)
1449		gctl_error(req, "No 'arg%d' argument", i);
1450	else if (strncmp(name, dev_prefix, l) == 0)
1451		name += l;
1452	return (name);
1453}
1454
1455/*
1456 * Fetch nargs and do appropriate checks.
1457 */
1458static int
1459g_sched_get_nargs(struct gctl_req *req)
1460{
1461	int *nargs;
1462
1463	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
1464	if (nargs == NULL) {
1465		gctl_error(req, "No 'nargs' argument");
1466		return (0);
1467	}
1468	if (*nargs <= 0)
1469		gctl_error(req, "Missing device(s).");
1470	return (*nargs);
1471}
1472
1473/*
1474 * Check whether we should add the class on certain volumes when
1475 * this geom is created. Right now this is under control of a kenv
1476 * variable containing the names of all devices that we care about.
1477 * Probably we should only support transparent insertion as the
1478 * preferred mode of operation.
1479 */
1480static struct g_geom *
1481g_sched_taste(struct g_class *mp, struct g_provider *pp,
1482		int flags __unused)
1483{
1484	struct g_gsched *gsp = NULL;	/* the algorithm we want */
1485	const char *s;			/* generic string pointer */
1486	const char *taste_names;	/* devices we like */
1487	int l;
1488
1489	g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__,
1490	    mp->name, pp->name);
1491	g_topology_assert();
1492
1493	G_SCHED_DEBUG(2, "Tasting %s.", pp->name);
1494
1495	do {
1496		/* do not taste on ourselves */
1497		if (pp->geom->class == mp)
1498                	break;
1499
1500		taste_names = kern_getenv("geom.sched.taste");
1501		if (taste_names == NULL)
1502			break;
1503
1504		l = strlen(pp->name);
1505		for (s = taste_names; *s &&
1506		    (s = strstr(s, pp->name)); s++) {
1507			/* further checks for an exact match */
1508			if ( (s == taste_names || s[-1] == ' ') &&
1509			     (s[l] == '\0' || s[l] == ' ') )
1510				break;
1511		}
1512		if (s == NULL)
1513			break;
1514		G_SCHED_DEBUG(0, "Attach device %s match [%s]\n",
1515		    pp->name, s);
1516
1517		/* look up the provider name in the list */
1518		s = kern_getenv("geom.sched.algo");
1519		if (s == NULL)
1520			s = "rr";
1521
1522		gsp = g_gsched_find(s);	/* also get a reference */
1523		if (gsp == NULL) {
1524			G_SCHED_DEBUG(0, "Bad '%s' algorithm.", s);
1525			break;
1526		}
1527
1528		/* XXX create with 1 as last argument ? */
1529		g_sched_create(NULL, mp, pp, gsp, 0);
1530		g_gsched_unref(gsp);
1531	} while (0);
1532	return (NULL);
1533}
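
/*
 * For example, to have the "rr" algorithm attached to ada0 and ada1 at
 * boot, one could set, e.g. in loader.conf (the device names here are
 * only illustrative):
 *
 *	geom.sched.taste="ada0 ada1"
 *	geom.sched.algo="rr"
 */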
1534
1535static void
1536g_sched_ctl_create(struct gctl_req *req, struct g_class *mp, int proxy)
1537{
1538	struct g_provider *pp;
1539	struct g_gsched *gsp;
1540	const char *name;
1541	int i, nargs;
1542
1543	g_topology_assert();
1544
1545	name = gctl_get_asciiparam(req, "algo");
1546	if (name == NULL) {
1547		gctl_error(req, "No '%s' argument", "algo");
1548		return;
1549	}
1550
1551	gsp = g_gsched_find(name);	/* also get a reference */
1552	if (gsp == NULL) {
1553		gctl_error(req, "Bad algorithm '%s'", name);
1554		return;
1555	}
1556
1557	nargs = g_sched_get_nargs(req);
1558
1559	/*
1560	 * Iterate over the arguments, and break on any error.
1561	 * We look for a device name, but skip the /dev/ prefix if any.
1562	 */
1563	for (i = 0; i < nargs; i++) {
1564		name = g_sched_argi(req, i);
1565		if (name == NULL)
1566			break;
1567		pp = g_provider_by_name(name);
1568		if (pp == NULL) {
1569			G_SCHED_DEBUG(1, "Provider %s is invalid.", name);
1570			gctl_error(req, "Provider %s is invalid.", name);
1571			break;
1572		}
1573		if (g_sched_create(req, mp, pp, gsp, proxy) != 0)
1574			break;
1575	}
1576
1577	g_gsched_unref(gsp);
1578}
1579
1580static void
1581g_sched_ctl_configure(struct gctl_req *req, struct g_class *mp)
1582{
1583	struct g_provider *pp;
1584	struct g_gsched *gsp;
1585	const char *name;
1586	int i, nargs;
1587
1588	g_topology_assert();
1589
1590	name = gctl_get_asciiparam(req, "algo");
1591	if (name == NULL) {
1592		gctl_error(req, "No '%s' argument", "algo");
1593		return;
1594	}
1595
1596	gsp = g_gsched_find(name);	/* also get a reference */
1597	if (gsp == NULL) {
1598		gctl_error(req, "Bad algorithm '%s'", name);
1599		return;
1600	}
1601
1602	nargs = g_sched_get_nargs(req);
1603
1604	/*
1605	 * Iterate over the arguments, and break on any error.
1606	 * We look for a device name, but skip the /dev/ prefix if any.
1607	 */
1608	for (i = 0; i < nargs; i++) {
1609		name = g_sched_argi(req, i);
1610		if (name == NULL)
1611			break;
1612		pp = g_provider_by_name(name);
1613		if (pp == NULL || pp->geom->class != mp) {
1614			G_SCHED_DEBUG(1, "Provider %s is invalid.", name);
1615			gctl_error(req, "Provider %s is invalid.", name);
1616			break;
1617		}
1618		if (g_sched_change_algo(req, mp, pp, gsp) != 0)
1619			break;
1620	}
1621
1622	g_gsched_unref(gsp);
1623}
1624
1625static struct g_geom *
1626g_sched_find_geom(struct g_class *mp, const char *name)
1627{
1628	struct g_geom *gp;
1629
1630	LIST_FOREACH(gp, &mp->geom, geom) {
1631		if (strcmp(gp->name, name) == 0)
1632			return (gp);
1633	}
1634	return (NULL);
1635}
1636
1637static void
1638g_sched_ctl_destroy(struct gctl_req *req, struct g_class *mp)
1639{
1640	int nargs, *force, error, i;
1641	struct g_geom *gp;
1642	const char *name;
1643
1644	g_topology_assert();
1645
1646	nargs = g_sched_get_nargs(req);
1647
1648	force = gctl_get_paraml(req, "force", sizeof(*force));
1649	if (force == NULL) {
1650		gctl_error(req, "No 'force' argument");
1651		return;
1652	}
1653
1654	for (i = 0; i < nargs; i++) {
1655		name = g_sched_argi(req, i);
1656		if (name == NULL)
1657			break;
1658
1659		gp = g_sched_find_geom(mp, name);
1660		if (gp == NULL) {
1661			G_SCHED_DEBUG(1, "Device %s is invalid.", name);
1662			gctl_error(req, "Device %s is invalid.", name);
1663			break;
1664		}
1665
1666		error = g_sched_destroy(gp, *force);
1667		if (error != 0) {
1668			gctl_error(req, "Cannot destroy device %s (error=%d).",
1669			    gp->name, error);
1670			break;
1671		}
1672	}
1673}
1674
1675static void
1676g_sched_config(struct gctl_req *req, struct g_class *mp, const char *verb)
1677{
1678	uint32_t *version;
1679
1680	g_topology_assert();
1681
1682	version = gctl_get_paraml(req, "version", sizeof(*version));
1683	if (version == NULL) {
1684		gctl_error(req, "No '%s' argument.", "version");
1685		return;
1686	}
1687
1688	if (*version != G_SCHED_VERSION) {
1689		gctl_error(req, "Userland and kernel parts are "
1690		    "out of sync.");
1691		return;
1692	}
1693
1694	if (strcmp(verb, "create") == 0) {
1695		g_sched_ctl_create(req, mp, 0);
1696		return;
1697	} else if (strcmp(verb, "insert") == 0) {
1698		g_sched_ctl_create(req, mp, 1);
1699		return;
1700	} else if (strcmp(verb, "configure") == 0) {
1701		g_sched_ctl_configure(req, mp);
1702		return;
1703	} else if (strcmp(verb, "destroy") == 0) {
1704		g_sched_ctl_destroy(req, mp);
1705		return;
1706	}
1707
1708	gctl_error(req, "Unknown verb.");
1709}
1710
1711static void
1712g_sched_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
1713    struct g_consumer *cp, struct g_provider *pp)
1714{
1715	struct g_sched_softc *sc = gp->softc;
1716	struct g_gsched *gsp = sc->sc_gsched;
1717	if (indent == NULL) {	/* plaintext */
1718		sbuf_printf(sb, " algo %s", gsp ? gsp->gs_name : "--");
1719	}
1720	if (gsp != NULL && gsp->gs_dumpconf)
1721		gsp->gs_dumpconf(sb, indent, gp, cp, pp);
1722}
1723
1724DECLARE_GEOM_CLASS(g_sched_class, g_sched);
1725MODULE_VERSION(geom_sched, 0);
1726