1/*-
2 * Copyright (c) 2009-2010 Fabio Checconi
3 * Copyright (c) 2009-2010 Luigi Rizzo, Universita` di Pisa
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25 * SUCH DAMAGE.
26 */
27
28/*
29 * $Id$
30 * $FreeBSD: head/sys/geom/sched/g_sched.c 210795 2010-08-03 11:21:17Z ae $
31 *
32 * Main control module for geom-based disk schedulers ('sched').
33 *
34 * USER VIEW
35 * A 'sched' node is typically inserted transparently between
36 * an existing provider pp and its original geom gp
37 *
38 *	[pp --> gp  ..]
39 *
40 * using the command "geom sched insert <provider>" and
41 * resulting in the following topology
42 *
43 *	[pp --> sched_gp --> cp]   [new_pp --> gp ... ]
44 *
 * Deletion with "geom sched destroy <provider>.sched." restores the
 * original chain. The normal "geom sched create <provider>"
 * is also supported.
48 *
49 * INTERNALS
50 * Internally, the 'sched' uses the following data structures
51 *
52 *   geom{}         g_sched_softc{}      g_gsched{}
53 * +----------+    +---------------+   +-------------+
54 * |  softc *-|--->| sc_gsched   *-|-->|  gs_init    |
55 * |  ...     |    |               |   |  gs_fini    |
56 * |          |    | [ hash table] |   |  gs_start   |
57 * +----------+    |               |   |  ...        |
58 *                 |               |   +-------------+
59 *                 |               |
60 *                 |               |     g_*_softc{}
61 *                 |               |   +-------------+
62 *                 | sc_data     *-|-->|             |
63 *                 +---------------+   |  algorithm- |
64 *                                     |  specific   |
65 *                                     +-------------+
66 *
67 * A g_sched_softc{} is created with a "geom sched insert" call.
68 * In turn this instantiates a specific scheduling algorithm,
69 * which sets sc_gsched to point to the algorithm callbacks,
70 * and calls gs_init() to create the g_*_softc{} .
71 * The other callbacks (gs_start, gs_next, ...) are invoked
 * as needed.
73 *
74 * g_sched_softc{} is defined in g_sched.h and mostly used here;
75 * g_gsched{}, and the gs_callbacks, are documented in gs_scheduler.h;
76 * g_*_softc{} is defined/implemented by each algorithm (gs_*.c)
77 *
78 * DATA MOVING
79 * When a bio is received on the provider, it goes to the
80 * g_sched_start() which calls gs_start() to initially queue it;
81 * then we call g_sched_dispatch() that loops around gs_next()
82 * to select zero or more bio's to be sent downstream.
83 *
84 * g_sched_dispatch() can also be called as a result of a timeout,
85 * e.g. when doing anticipation or pacing requests.
86 *
87 * When a bio comes back, it goes to g_sched_done() which in turn
88 * calls gs_done(). The latter does any necessary housekeeping in
89 * the scheduling algorithm, and may decide to call g_sched_dispatch()
90 * to send more bio's downstream.
91 *
92 * If an algorithm needs per-flow queues, these are created
93 * calling gs_init_class() and destroyed with gs_fini_class(),
94 * and they are also inserted in the hash table implemented in
 * the g_sched_softc{}.
96 *
97 * If an algorithm is replaced, or a transparently-inserted node is
98 * removed with "geom sched destroy", we need to remove all references
 * to the g_*_softc{} and g_sched_softc{} from the bio's still in
 * the scheduler.  g_sched_forced_dispatch() helps to do this.
101 * XXX need to explain better.
102 */
103
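/*
 * To make the plumbing above concrete, here is a rough sketch of how a
 * scheduling algorithm module plugs into this framework.  The callback
 * names match the ones used in this file, but the authoritative
 * definition of g_gsched{} is in gs_scheduler.h, and every identifier
 * containing "example" is purely illustrative:
 *
 *	static struct g_gsched g_example = {
 *		.gs_name	= "example",
 *		.gs_priv_size	= sizeof(struct g_example_class),
 *		.gs_init	= g_example_init,
 *		.gs_fini	= g_example_fini,
 *		.gs_start	= g_example_start,
 *		.gs_next	= g_example_next,
 *		.gs_done	= g_example_done,
 *		.gs_init_class	= g_example_init_class,
 *		.gs_fini_class	= g_example_fini_class,
 *	};
 *
 *	static moduledata_t g_example_mod = {
 *		"g_sched_example", g_gsched_modevent, &g_example
 *	};
 *	DECLARE_MODULE(g_sched_example, g_example_mod,
 *	    SI_SUB_DRIVERS, SI_ORDER_MIDDLE);
 *
 * Actual algorithm modules (e.g. gs_rr.c) may use a convenience macro
 * from gs_scheduler.h instead of spelling out the moduledata_t by hand.
 */
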
104#include <sys/cdefs.h>
105#include <sys/param.h>
106#include <sys/systm.h>
107#include <sys/kernel.h>
108#include <sys/module.h>
109#include <sys/lock.h>
110#include <sys/mutex.h>
111#include <sys/bio.h>
112#include <sys/limits.h>
113#include <sys/hash.h>
114#include <sys/sysctl.h>
115#include <sys/malloc.h>
116#include <sys/proc.h>		/* we access curthread */
117#include <geom/geom.h>
118#include "gs_scheduler.h"
119#include "g_sched.h"		/* geom hooks */
120
/*
 * Size of the per-geom hash table storing traffic classes.
 * We may decide to change it at a later time; it has no ABI
 * implications, as it is only used for run-time allocations.
 */
126#define G_SCHED_HASH_SIZE	32
127
128static int g_sched_destroy(struct g_geom *gp, boolean_t force);
129static int g_sched_destroy_geom(struct gctl_req *req,
130    struct g_class *mp, struct g_geom *gp);
131static void g_sched_config(struct gctl_req *req, struct g_class *mp,
132    const char *verb);
133static struct g_geom *g_sched_taste(struct g_class *mp,
134    struct g_provider *pp, int flags __unused);
135static void g_sched_dumpconf(struct sbuf *sb, const char *indent,
136    struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp);
137static void g_sched_init(struct g_class *mp);
138static void g_sched_fini(struct g_class *mp);
139static int g_sched_ioctl(struct g_provider *pp, u_long cmd, void *data,
140    int fflag, struct thread *td);
141
142struct g_class g_sched_class = {
143	.name = G_SCHED_CLASS_NAME,
144	.version = G_VERSION,
145	.ctlreq = g_sched_config,
146	.taste = g_sched_taste,
147	.destroy_geom = g_sched_destroy_geom,
148	.init = g_sched_init,
149	.ioctl = g_sched_ioctl,
150	.fini = g_sched_fini
151};
152
153MALLOC_DEFINE(M_GEOM_SCHED, "GEOM_SCHED", "Geom schedulers data structures");
154
155/*
156 * Global variables describing the state of the geom_sched module.
157 * There is only one static instance of this structure.
158 */
159LIST_HEAD(gs_list, g_gsched);	/* type, link field */
160struct geom_sched_vars {
161	struct mtx	gs_mtx;
162	struct gs_list	gs_scheds;	/* list of algorithms */
163	u_int		gs_debug;
164	u_int		gs_sched_count;	/* how many algorithms ? */
165	u_int 		gs_patched;	/* g_io_request was patched */
166
167	u_int		gs_initialized;
168	u_int		gs_expire_secs;	/* expiration of hash entries */
169
170	struct bio_queue_head gs_pending;
171	u_int		gs_npending;
172
173	/* The following are for stats, usually protected by gs_mtx. */
174	u_long		gs_requests;	/* total requests */
175	u_long		gs_done;	/* total done */
176	u_int 		gs_in_flight;	/* requests in flight */
177	u_int 		gs_writes_in_flight;
178	u_int 		gs_bytes_in_flight;
179	u_int 		gs_write_bytes_in_flight;
180
181	char		gs_names[256];	/* names of schedulers */
182};
183
184static struct geom_sched_vars me = {
185	.gs_expire_secs = 10,
186};
187
188SYSCTL_DECL(_kern_geom);
189SYSCTL_NODE(_kern_geom, OID_AUTO, sched, CTLFLAG_RW, 0,
190    "GEOM_SCHED stuff");
191
SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_wb, CTLFLAG_RD,
    &me.gs_write_bytes_in_flight, 0, "Write bytes in flight");

SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_b, CTLFLAG_RD,
    &me.gs_bytes_in_flight, 0, "Bytes in flight");

SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_w, CTLFLAG_RD,
    &me.gs_writes_in_flight, 0, "Write requests in flight");
200
201SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight, CTLFLAG_RD,
202    &me.gs_in_flight, 0, "Requests in flight");
203
204SYSCTL_ULONG(_kern_geom_sched, OID_AUTO, done, CTLFLAG_RD,
205    &me.gs_done, 0, "Total done");
206
207SYSCTL_ULONG(_kern_geom_sched, OID_AUTO, requests, CTLFLAG_RD,
208    &me.gs_requests, 0, "Total requests");
209
210SYSCTL_STRING(_kern_geom_sched, OID_AUTO, algorithms, CTLFLAG_RD,
211    &me.gs_names, 0, "Algorithm names");
212
213SYSCTL_UINT(_kern_geom_sched, OID_AUTO, alg_count, CTLFLAG_RD,
214    &me.gs_sched_count, 0, "Number of algorithms");
215
216SYSCTL_UINT(_kern_geom_sched, OID_AUTO, debug, CTLFLAG_RW,
217    &me.gs_debug, 0, "Debug level");
218
219SYSCTL_UINT(_kern_geom_sched, OID_AUTO, expire_secs, CTLFLAG_RW,
220    &me.gs_expire_secs, 0, "Expire time in seconds");
221
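/*
 * The counters and knobs above show up under the kern.geom.sched
 * sysctl tree, so the module can be inspected from userland, e.g.
 * (output shown is illustrative only):
 *
 *	# sysctl kern.geom.sched.algorithms kern.geom.sched.in_flight
 *	kern.geom.sched.algorithms: rr
 *	kern.geom.sched.in_flight: 0
 */
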
/*
 * g_sched calls the scheduler algorithms with this lock held.
 * The locking functions are exposed so the scheduler algorithms can
 * also protect themselves, e.g. when running a callout handler; a
 * sketch of such a handler follows this comment.
 */
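
/*
 * A minimal sketch of such a callout handler on the algorithm side.
 * The handler name and the assumption that the callout argument is the
 * geom are illustrative only, not requirements of the framework:
 *
 *	static void
 *	g_example_timeout(void *arg)
 *	{
 *		struct g_geom *gp = arg;
 *
 *		g_sched_lock(gp);
 *		g_sched_dispatch(gp);	(push out any queued requests)
 *		g_sched_unlock(gp);
 *	}
 */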
227void
228g_sched_lock(struct g_geom *gp)
229{
230	struct g_sched_softc *sc = gp->softc;
231
232	mtx_lock(&sc->sc_mtx);
233}
234
235void
236g_sched_unlock(struct g_geom *gp)
237{
238	struct g_sched_softc *sc = gp->softc;
239
240	mtx_unlock(&sc->sc_mtx);
241}
242
243/*
244 * Support functions to handle references to the module,
245 * which are coming from devices using this scheduler.
246 */
247static inline void
248g_gsched_ref(struct g_gsched *gsp)
249{
250
251	atomic_add_int(&gsp->gs_refs, 1);
252}
253
254static inline void
255g_gsched_unref(struct g_gsched *gsp)
256{
257
258	atomic_add_int(&gsp->gs_refs, -1);
259}
260
261/*
262 * Update the stats when this request is done.
263 */
264static void
265g_sched_update_stats(struct bio *bio)
266{
267
268	me.gs_done++;
269	me.gs_in_flight--;
270	me.gs_bytes_in_flight -= bio->bio_length;
271	if (bio->bio_cmd & BIO_WRITE) {
272		me.gs_writes_in_flight--;
273		me.gs_write_bytes_in_flight -= bio->bio_length;
274	}
275}
276
277/*
278 * Dispatch any pending request.
279 */
280static void
281g_sched_forced_dispatch(struct g_geom *gp)
282{
283	struct g_sched_softc *sc = gp->softc;
284	struct g_gsched *gsp = sc->sc_gsched;
285	struct bio *bp;
286
287	KASSERT(mtx_owned(&sc->sc_mtx),
288	    ("sc_mtx not owned during forced dispatch"));
289
290	while ((bp = gsp->gs_next(sc->sc_data, 1)) != NULL)
291		g_io_request(bp, LIST_FIRST(&gp->consumer));
292}
293
294/*
295 * The main dispatch loop, called either here after the start
296 * routine, or by scheduling algorithms when they receive a timeout
297 * or a 'done' notification.  Does not share code with the forced
298 * dispatch path, since the gs_done() callback can call us.
299 */
300void
301g_sched_dispatch(struct g_geom *gp)
302{
303	struct g_sched_softc *sc = gp->softc;
304	struct g_gsched *gsp = sc->sc_gsched;
305	struct bio *bp;
306
307	KASSERT(mtx_owned(&sc->sc_mtx), ("sc_mtx not owned during dispatch"));
308
309	if ((sc->sc_flags & G_SCHED_FLUSHING))
310		return;
311
312	while ((bp = gsp->gs_next(sc->sc_data, 0)) != NULL)
313		g_io_request(bp, LIST_FIRST(&gp->consumer));
314}
315
316/*
 * Recent (8.0 and above) versions of FreeBSD have support for
 * registering classifiers of disk requests. The classifier is
319 * invoked by g_io_request(), and stores the information into
320 * bp->bio_classifier1.
321 *
322 * Support for older versions, which is left here only for
323 * documentation purposes, relies on two hacks:
324 * 1. classification info is written into the bio_caller1
325 *    field of the topmost node in the bio chain. This field
326 *    is rarely used, but this module is incompatible with
327 *    those that use bio_caller1 for other purposes,
328 *    such as ZFS and gjournal;
329 * 2. g_io_request() is patched in-memory when the module is
330 *    loaded, so that the function calls a classifier as its
331 *    first thing. g_io_request() is restored when the module
332 *    is unloaded. This functionality is only supported for
333 *    x86 and amd64, other architectures need source code changes.
334 */
335
336/*
337 * Lookup the identity of the issuer of the original request.
338 * In the current implementation we use the curthread of the
339 * issuer, but different mechanisms may be implemented later
340 * so we do not make assumptions on the return value which for
341 * us is just an opaque identifier.
342 */
343
344static inline u_long
345g_sched_classify(struct bio *bp)
346{
347
348#if __FreeBSD_version > 800098
349	/* we have classifier fields in the struct bio */
350#define HAVE_BIO_CLASSIFIER
351	return ((u_long)bp->bio_classifier1);
352#else
353#warning old version!!!
354	while (bp->bio_parent != NULL)
355		bp = bp->bio_parent;
356
357	return ((u_long)bp->bio_caller1);
358#endif
359}
360
361/* Return the hash chain for the given key. */
362static inline struct g_hash *
363g_sched_hash(struct g_sched_softc *sc, u_long key)
364{
365
366	return (&sc->sc_hash[key & sc->sc_mask]);
367}
368
369/*
370 * Helper function for the children classes, which takes
371 * a geom and a bio and returns the private descriptor
372 * associated to the request.  This involves fetching
373 * the classification field and [al]locating the
374 * corresponding entry in the hash table.
375 */
376void *
377g_sched_get_class(struct g_geom *gp, struct bio *bp)
378{
379	struct g_sched_softc *sc;
380	struct g_sched_class *gsc;
381	struct g_gsched *gsp;
382	struct g_hash *bucket;
383	u_long key;
384
385	sc = gp->softc;
386	key = g_sched_classify(bp);
387	bucket = g_sched_hash(sc, key);
388	LIST_FOREACH(gsc, bucket, gsc_clist) {
389		if (key == gsc->gsc_key) {
390			gsc->gsc_refs++;
391			return (gsc->gsc_priv);
392		}
393	}
394
395	gsp = sc->sc_gsched;
396	gsc = malloc(sizeof(*gsc) + gsp->gs_priv_size,
397	    M_GEOM_SCHED, M_NOWAIT | M_ZERO);
398	if (!gsc)
399		return (NULL);
400
401	if (gsp->gs_init_class(sc->sc_data, gsc->gsc_priv)) {
402		free(gsc, M_GEOM_SCHED);
403		return (NULL);
404	}
405
406	gsc->gsc_refs = 2;	/* 1 for the hash table, 1 for the caller. */
407	gsc->gsc_key = key;
408	LIST_INSERT_HEAD(bucket, gsc, gsc_clist);
409
410	gsc->gsc_expire = ticks + me.gs_expire_secs * hz;
411
412	return (gsc->gsc_priv);
413}
414
/*
 * Release a reference to the per-client descriptor.
 */
418void
419g_sched_put_class(struct g_geom *gp, void *priv)
420{
421	struct g_sched_class *gsc;
422	struct g_sched_softc *sc;
423
424	gsc = g_sched_priv2class(priv);
425	gsc->gsc_expire = ticks + me.gs_expire_secs * hz;
426
427	if (--gsc->gsc_refs > 0)
428		return;
429
430	sc = gp->softc;
431	sc->sc_gsched->gs_fini_class(sc->sc_data, priv);
432
433	LIST_REMOVE(gsc, gsc_clist);
434	free(gsc, M_GEOM_SCHED);
435}
436
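/*
 * Sketch of how an algorithm typically uses the two helpers above from
 * its own callbacks.  It assumes the algorithm saved the geom pointer
 * it received in gs_init() inside its private softc; all "example"
 * identifiers are illustrative:
 *
 *	static int
 *	g_example_start(void *data, struct bio *bp)
 *	{
 *		struct g_example_softc *sc = data;
 *		struct g_example_class *gec;
 *
 *		gec = g_sched_get_class(sc->es_geom, bp);
 *		if (gec == NULL)
 *			return (-1);
 *		... enqueue bp on the per-flow queue in gec ...
 *		return (0);
 *	}
 *
 * A non-zero return value makes g_sched_start() hand the request
 * straight to the device, bypassing the scheduler.  The matching
 * g_sched_put_class(sc->es_geom, gec) is typically called from the
 * algorithm's gs_done() callback once the request completes.
 */
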
437static void
438g_sched_hash_fini(struct g_geom *gp, struct g_hash *hp, u_long mask,
439    struct g_gsched *gsp, void *data)
440{
441	struct g_sched_class *cp, *cp2;
442	int i;
443
444	if (!hp)
445		return;
446
447	if (data && gsp->gs_hash_unref)
448		gsp->gs_hash_unref(data);
449
450	for (i = 0; i < G_SCHED_HASH_SIZE; i++) {
451		LIST_FOREACH_SAFE(cp, &hp[i], gsc_clist, cp2)
452			g_sched_put_class(gp, cp->gsc_priv);
453	}
454
455	hashdestroy(hp, M_GEOM_SCHED, mask);
456}
457
458static struct g_hash *
459g_sched_hash_init(struct g_gsched *gsp, u_long *mask, int flags)
460{
461	struct g_hash *hash;
462
463	if (gsp->gs_priv_size == 0)
464		return (NULL);
465
466	hash = hashinit_flags(G_SCHED_HASH_SIZE, M_GEOM_SCHED, mask, flags);
467
468	return (hash);
469}
470
471static void
472g_sched_flush_classes(struct g_geom *gp)
473{
474	struct g_sched_softc *sc;
475	struct g_sched_class *cp, *cp2;
476	int i;
477
478	sc = gp->softc;
479
480	if (!sc->sc_hash || ticks - sc->sc_flush_ticks <= 0)
481		return;
482
483	for (i = 0; i < G_SCHED_HASH_SIZE; i++) {
484		LIST_FOREACH_SAFE(cp, &sc->sc_hash[i], gsc_clist, cp2) {
485			if (cp->gsc_refs == 1 && ticks - cp->gsc_expire > 0)
486				g_sched_put_class(gp, cp->gsc_priv);
487		}
488	}
489
490	sc->sc_flush_ticks = ticks + me.gs_expire_secs * hz;
491}
492
/*
 * Wait for the completion of any outstanding request.  To ensure
 * that this does not take forever, the caller has to make sure that
 * no new requests enter the scheduler before calling us.
 *
 * Must be called with the gp mutex held and the topology locked.
 */
500static int
501g_sched_wait_pending(struct g_geom *gp)
502{
503	struct g_sched_softc *sc = gp->softc;
504	int endticks = ticks + hz;
505
506	g_topology_assert();
507
508	while (sc->sc_pending && endticks - ticks >= 0)
509		msleep(gp, &sc->sc_mtx, 0, "sched_wait_pending", hz / 4);
510
511	return (sc->sc_pending ? ETIMEDOUT : 0);
512}
513
514static int
515g_sched_remove_locked(struct g_geom *gp, struct g_gsched *gsp)
516{
517	struct g_sched_softc *sc = gp->softc;
518	int error;
519
520	/* Set the flushing flag: new bios will not enter the scheduler. */
521	sc->sc_flags |= G_SCHED_FLUSHING;
522
523	g_sched_forced_dispatch(gp);
524	error = g_sched_wait_pending(gp);
525	if (error)
526		goto failed;
527
528	/* No more requests pending or in flight from the old gsp. */
529
530	g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask, gsp, sc->sc_data);
531	sc->sc_hash = NULL;
532
533	/*
534	 * Avoid deadlock here by releasing the gp mutex and reacquiring
535	 * it once done.  It should be safe, since no reconfiguration or
536	 * destruction can take place due to the geom topology lock; no
537	 * new request can use the current sc_data since we flagged the
538	 * geom as being flushed.
539	 */
540	g_sched_unlock(gp);
541	gsp->gs_fini(sc->sc_data);
542	g_sched_lock(gp);
543
544	sc->sc_gsched = NULL;
545	sc->sc_data = NULL;
546	g_gsched_unref(gsp);
547
548failed:
549	sc->sc_flags &= ~G_SCHED_FLUSHING;
550
551	return (error);
552}
553
554static int
555g_sched_remove(struct g_geom *gp, struct g_gsched *gsp)
556{
557	int error;
558
559	g_sched_lock(gp);
560	error = g_sched_remove_locked(gp, gsp); /* gsp is surely non-null */
561	g_sched_unlock(gp);
562
563	return (error);
564}
565
566/*
567 * Support function for create/taste -- locate the desired
568 * algorithm and grab a reference to it.
569 */
570static struct g_gsched *
571g_gsched_find(const char *name)
572{
573	struct g_gsched *gsp = NULL;
574
575	mtx_lock(&me.gs_mtx);
576	LIST_FOREACH(gsp, &me.gs_scheds, glist) {
577		if (strcmp(name, gsp->gs_name) == 0) {
578			g_gsched_ref(gsp);
579			break;
580		}
581	}
582	mtx_unlock(&me.gs_mtx);
583
584	return (gsp);
585}
586
587/*
588 * Rebuild the list of scheduler names.
589 * To be called with me.gs_mtx lock held.
590 */
591static void
592g_gsched_build_names(struct g_gsched *gsp)
593{
594	int pos, l;
595	struct g_gsched *cur;
596
597	pos = 0;
598	LIST_FOREACH(cur, &me.gs_scheds, glist) {
599		l = strlen(cur->gs_name);
600		if (l + pos + 1 + 1 < sizeof(me.gs_names)) {
601			if (pos != 0)
602				me.gs_names[pos++] = ' ';
603			strcpy(me.gs_names + pos, cur->gs_name);
604			pos += l;
605		}
606	}
607	me.gs_names[pos] = '\0';
608}
609
610/*
611 * Register or unregister individual scheduling algorithms.
612 */
613static int
614g_gsched_register(struct g_gsched *gsp)
615{
616	struct g_gsched *cur;
617	int error = 0;
618
619	mtx_lock(&me.gs_mtx);
620	LIST_FOREACH(cur, &me.gs_scheds, glist) {
621		if (strcmp(gsp->gs_name, cur->gs_name) == 0)
622			break;
623	}
624	if (cur != NULL) {
		G_SCHED_DEBUG(0, "A scheduler named %s already "
		    "exists.", gsp->gs_name);
627		error = EEXIST;
628	} else {
629		LIST_INSERT_HEAD(&me.gs_scheds, gsp, glist);
630		gsp->gs_refs = 1;
631		me.gs_sched_count++;
632		g_gsched_build_names(gsp);
633	}
634	mtx_unlock(&me.gs_mtx);
635
636	return (error);
637}
638
639struct g_gsched_unregparm {
640	struct g_gsched *gup_gsp;
641	int		gup_error;
642};
643
644static void
645g_gsched_unregister(void *arg, int flag)
646{
647	struct g_gsched_unregparm *parm = arg;
648	struct g_gsched *gsp = parm->gup_gsp, *cur, *tmp;
649	struct g_sched_softc *sc;
650	struct g_geom *gp, *gp_tmp;
651	int error;
652
653	parm->gup_error = 0;
654
655	g_topology_assert();
656
657	if (flag == EV_CANCEL)
658		return;
659
660	mtx_lock(&me.gs_mtx);
661
662	LIST_FOREACH_SAFE(gp, &g_sched_class.geom, geom, gp_tmp) {
663		if (gp->class != &g_sched_class)
664			continue;	/* Should not happen. */
665
666		sc = gp->softc;
667		if (sc->sc_gsched == gsp) {
668			error = g_sched_remove(gp, gsp);
669			if (error)
670				goto failed;
671		}
672	}
673
674	LIST_FOREACH_SAFE(cur, &me.gs_scheds, glist, tmp) {
675		if (cur != gsp)
676			continue;
677
678		if (gsp->gs_refs != 1) {
679			G_SCHED_DEBUG(0, "%s still in use.",
680			    gsp->gs_name);
681			parm->gup_error = EBUSY;
682		} else {
683			LIST_REMOVE(gsp, glist);
684			me.gs_sched_count--;
685			g_gsched_build_names(gsp);
686		}
687		break;
688	}
689
690	if (cur == NULL) {
691		G_SCHED_DEBUG(0, "%s not registered.", gsp->gs_name);
692		parm->gup_error = ENOENT;
693	}
694
695failed:
696	mtx_unlock(&me.gs_mtx);
697}
698
699static inline void
700g_gsched_global_init(void)
701{
702
703	if (!me.gs_initialized) {
704		G_SCHED_DEBUG(0, "Initializing global data.");
705		mtx_init(&me.gs_mtx, "gsched", NULL, MTX_DEF);
706		LIST_INIT(&me.gs_scheds);
707		gs_bioq_init(&me.gs_pending);
708		me.gs_initialized = 1;
709	}
710}
711
712/*
713 * Module event called when a scheduling algorithm module is loaded or
714 * unloaded.
715 */
716int
717g_gsched_modevent(module_t mod, int cmd, void *arg)
718{
719	struct g_gsched *gsp = arg;
720	struct g_gsched_unregparm parm;
721	int error;
722
723	G_SCHED_DEBUG(0, "Modevent %d.", cmd);
724
725	/*
726	 * If the module is loaded at boot, the geom thread that calls
727	 * g_sched_init() might actually run after g_gsched_modevent(),
728	 * so make sure that the module is properly initialized.
729	 */
730	g_gsched_global_init();
731
732	error = EOPNOTSUPP;
733	switch (cmd) {
734	case MOD_LOAD:
735		error = g_gsched_register(gsp);
736		G_SCHED_DEBUG(0, "Loaded module %s error %d.",
737		    gsp->gs_name, error);
738		if (error == 0)
739			g_retaste(&g_sched_class);
740		break;
741
742	case MOD_UNLOAD:
743		parm.gup_gsp = gsp;
744		parm.gup_error = 0;
745
746		error = g_waitfor_event(g_gsched_unregister,
747		    &parm, M_WAITOK, NULL);
748		if (error == 0)
749			error = parm.gup_error;
750		G_SCHED_DEBUG(0, "Unloaded module %s error %d.",
751		    gsp->gs_name, error);
752		break;
	}
754
755	return (error);
756}
757
758#ifdef KTR
759#define	TRC_BIO_EVENT(e, bp)	g_sched_trace_bio_ ## e (bp)
760
761static inline char
762g_sched_type(struct bio *bp)
763{
764
765	if (0 != (bp->bio_cmd & BIO_READ))
766		return ('R');
767	else if (0 != (bp->bio_cmd & BIO_WRITE))
768		return ('W');
769	return ('U');
770}
771
772static inline void
773g_sched_trace_bio_START(struct bio *bp)
774{
775
776	CTR5(KTR_GSCHED, "S %lu %c %lu/%lu %lu", g_sched_classify(bp),
777	    g_sched_type(bp), bp->bio_offset / ULONG_MAX,
778	    bp->bio_offset, bp->bio_length);
779}
780
781static inline void
782g_sched_trace_bio_DONE(struct bio *bp)
783{
784
785	CTR5(KTR_GSCHED, "D %lu %c %lu/%lu %lu", g_sched_classify(bp),
786	    g_sched_type(bp), bp->bio_offset / ULONG_MAX,
787	    bp->bio_offset, bp->bio_length);
788}
789#else /* !KTR */
790#define	TRC_BIO_EVENT(e, bp)
791#endif /* !KTR */
792
793/*
794 * g_sched_done() and g_sched_start() dispatch the geom requests to
795 * the scheduling algorithm in use.
796 */
797static void
798g_sched_done(struct bio *bio)
799{
800	struct g_geom *gp = bio->bio_caller2;
801	struct g_sched_softc *sc = gp->softc;
802
803	TRC_BIO_EVENT(DONE, bio);
804
805	KASSERT(bio->bio_caller1, ("null bio_caller1 in g_sched_done"));
806
807	g_sched_lock(gp);
808
809	g_sched_update_stats(bio);
810	sc->sc_gsched->gs_done(sc->sc_data, bio);
811	if (!--sc->sc_pending)
812		wakeup(gp);
813
814	g_sched_flush_classes(gp);
815	g_sched_unlock(gp);
816
817	g_std_done(bio);
818}
819
820static void
821g_sched_start(struct bio *bp)
822{
823	struct g_geom *gp = bp->bio_to->geom;
824	struct g_sched_softc *sc = gp->softc;
825	struct bio *cbp;
826
827	TRC_BIO_EVENT(START, bp);
828	G_SCHED_LOGREQ(bp, "Request received.");
829
830	cbp = g_clone_bio(bp);
831	if (cbp == NULL) {
832		g_io_deliver(bp, ENOMEM);
833		return;
834	}
835	cbp->bio_done = g_sched_done;
836	cbp->bio_to = LIST_FIRST(&gp->provider);
837	KASSERT(cbp->bio_to != NULL, ("NULL provider"));
838
839	/* We only schedule reads and writes. */
840	if (0 == (bp->bio_cmd & (BIO_READ | BIO_WRITE)))
841		goto bypass;
842
843	G_SCHED_LOGREQ(cbp, "Sending request.");
844
845	g_sched_lock(gp);
846	/*
847	 * Call the algorithm's gs_start to queue the request in the
848	 * scheduler. If gs_start fails then pass the request down,
849	 * otherwise call g_sched_dispatch() which tries to push
850	 * one or more requests down.
851	 */
852	if (!sc->sc_gsched || (sc->sc_flags & G_SCHED_FLUSHING) ||
853	    sc->sc_gsched->gs_start(sc->sc_data, cbp)) {
854		g_sched_unlock(gp);
855		goto bypass;
856	}
857	/*
858	 * We use bio_caller1 to mark requests that are scheduled
859	 * so make sure it is not NULL.
860	 */
861	if (cbp->bio_caller1 == NULL)
862		cbp->bio_caller1 = &me;	/* anything not NULL */
863
864	cbp->bio_caller2 = gp;
865	sc->sc_pending++;
866
867	/* Update general stats. */
868	me.gs_in_flight++;
869	me.gs_requests++;
870	me.gs_bytes_in_flight += bp->bio_length;
871	if (bp->bio_cmd & BIO_WRITE) {
872		me.gs_writes_in_flight++;
873		me.gs_write_bytes_in_flight += bp->bio_length;
874	}
875	g_sched_dispatch(gp);
876	g_sched_unlock(gp);
877	return;
878
879bypass:
880	cbp->bio_done = g_std_done;
881	cbp->bio_caller1 = NULL; /* not scheduled */
882	g_io_request(cbp, LIST_FIRST(&gp->consumer));
883}
884
885/*
886 * The next few functions are the geom glue.
887 */
888static void
889g_sched_orphan(struct g_consumer *cp)
890{
891
892	g_topology_assert();
893	g_sched_destroy(cp->geom, 1);
894}
895
896static int
897g_sched_access(struct g_provider *pp, int dr, int dw, int de)
898{
899	struct g_geom *gp;
900	struct g_consumer *cp;
901	int error;
902
903	gp = pp->geom;
904	cp = LIST_FIRST(&gp->consumer);
905	error = g_access(cp, dr, dw, de);
906
907	return (error);
908}
909
910static void
911g_sched_temporary_start(struct bio *bio)
912{
913
914	mtx_lock(&me.gs_mtx);
915	me.gs_npending++;
916	gs_bioq_disksort(&me.gs_pending, bio);
917	mtx_unlock(&me.gs_mtx);
918}
919
920static void
921g_sched_flush_pending(g_start_t *start)
922{
923	struct bio *bp;
924
925	while ((bp = gs_bioq_takefirst(&me.gs_pending)))
926		start(bp);
927}
928
929static int
930g_insert_proxy(struct g_geom *gp, struct g_provider *newpp,
931    struct g_geom *dstgp, struct g_provider *pp, struct g_consumer *cp)
932{
933	struct g_sched_softc *sc = gp->softc;
934	g_start_t *saved_start, *flush = g_sched_start;
935	int error = 0, endticks = ticks + hz;
936
937	g_cancel_event(newpp);	/* prevent taste() */
938	/* copy private fields */
939	newpp->private = pp->private;
940	newpp->index = pp->index;
941
942	/* Queue all the early requests coming for us. */
943	me.gs_npending = 0;
944	saved_start = pp->geom->start;
945	dstgp->start = g_sched_temporary_start;
946
947	while (pp->nstart - pp->nend != me.gs_npending &&
948	    endticks - ticks >= 0)
949		tsleep(pp, PRIBIO, "-", hz/10);
950
951	if (pp->nstart - pp->nend != me.gs_npending) {
952		flush = saved_start;
953		error = ETIMEDOUT;
954		goto fail;
955	}
956
957	/* link pp to this geom */
958	LIST_REMOVE(pp, provider);
959	pp->geom = gp;
960	LIST_INSERT_HEAD(&gp->provider, pp, provider);
961
962	/*
963	 * replicate the counts from the parent in the
964	 * new provider and consumer nodes
965	 */
966	cp->acr = newpp->acr = pp->acr;
967	cp->acw = newpp->acw = pp->acw;
968	cp->ace = newpp->ace = pp->ace;
969	sc->sc_flags |= G_SCHED_PROXYING;
970
971fail:
972	dstgp->start = saved_start;
973
974	g_sched_flush_pending(flush);
975
976	return (error);
977}
978
979/*
980 * Create a geom node for the device passed as *pp.
981 * If successful, add a reference to this gsp.
982 */
983static int
984g_sched_create(struct gctl_req *req, struct g_class *mp,
985    struct g_provider *pp, struct g_gsched *gsp, int proxy)
986{
987	struct g_sched_softc *sc = NULL;
988	struct g_geom *gp, *dstgp;
989	struct g_provider *newpp = NULL;
990	struct g_consumer *cp = NULL;
991	char name[64];
992	int error;
993
994	g_topology_assert();
995
996	snprintf(name, sizeof(name), "%s%s", pp->name, G_SCHED_SUFFIX);
997	LIST_FOREACH(gp, &mp->geom, geom) {
998		if (strcmp(gp->name, name) == 0) {
999			gctl_error(req, "Geom %s already exists.",
1000			    name);
1001			return (EEXIST);
1002		}
1003	}
1004
1005	gp = g_new_geomf(mp, name);
1006	dstgp = proxy ? pp->geom : gp; /* where do we link the provider */
1007	if (gp == NULL) {
1008		gctl_error(req, "Cannot create geom %s.", name);
1009		error = ENOMEM;
1010		goto fail;
1011	}
1012
1013	sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO);
1014	sc->sc_gsched = gsp;
1015	sc->sc_data = gsp->gs_init(gp);
1016	if (sc->sc_data == NULL) {
1017		error = ENOMEM;
1018		goto fail;
1019	}
1020
1021	sc->sc_hash = g_sched_hash_init(gsp, &sc->sc_mask, HASH_WAITOK);
1022
	/*
	 * Do not initialize the flush mechanism here; it will be
	 * initialized on the first insertion into the hash table.
	 */
1027
1028	mtx_init(&sc->sc_mtx, "g_sched_mtx", NULL, MTX_DEF);
1029
1030	gp->softc = sc;
1031	gp->start = g_sched_start;
1032	gp->orphan = g_sched_orphan;
1033	gp->access = g_sched_access;
1034	gp->dumpconf = g_sched_dumpconf;
1035
1036	newpp = g_new_providerf(dstgp, gp->name);
1037	if (newpp == NULL) {
1038		gctl_error(req, "Cannot create provider %s.", name);
1039		error = ENOMEM;
1040		goto fail;
1041	}
1042
1043	newpp->mediasize = pp->mediasize;
1044	newpp->sectorsize = pp->sectorsize;
1045
1046	cp = g_new_consumer(gp);
1047	if (cp == NULL) {
1048		gctl_error(req, "Cannot create consumer for %s.",
1049		    gp->name);
1050		error = ENOMEM;
1051		goto fail;
1052	}
1053
1054	error = g_attach(cp, proxy ? newpp : pp);
1055	if (error != 0) {
1056		gctl_error(req, "Cannot attach to provider %s.",
1057		    pp->name);
1058		goto fail;
1059	}
1060
1061	g_error_provider(newpp, 0);
1062	if (proxy) {
1063		error = g_insert_proxy(gp, newpp, dstgp, pp, cp);
1064		if (error)
1065			goto fail;
1066	}
1067	G_SCHED_DEBUG(0, "Device %s created.", gp->name);
1068
1069	g_gsched_ref(gsp);
1070
1071	return (0);
1072
1073fail:
1074	if (cp != NULL) {
1075		if (cp->provider != NULL)
1076			g_detach(cp);
1077		g_destroy_consumer(cp);
1078	}
1079
1080	if (newpp != NULL)
1081		g_destroy_provider(newpp);
1082
1083	if (sc && sc->sc_hash) {
1084		g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask,
1085		    gsp, sc->sc_data);
1086	}
1087
1088	if (sc && sc->sc_data)
1089		gsp->gs_fini(sc->sc_data);
1090
1091	if (gp != NULL) {
1092		if (gp->softc != NULL)
1093			g_free(gp->softc);
1094		g_destroy_geom(gp);
1095	}
1096
1097	return (error);
1098}
1099
1100/*
1101 * Support for dynamic switching of scheduling algorithms.
1102 * First initialize the data structures for the new algorithm,
1103 * then call g_sched_remove_locked() to flush all references
1104 * to the old one, finally link the new algorithm.
1105 */
1106static int
1107g_sched_change_algo(struct gctl_req *req, struct g_class *mp,
1108    struct g_provider *pp, struct g_gsched *gsp)
1109{
1110	struct g_sched_softc *sc;
1111	struct g_geom *gp;
1112	struct g_hash *newh;
1113	void *data;
1114	u_long mask;
1115	int error = 0;
1116
1117	gp = pp->geom;
1118	sc = gp->softc;
1119
1120	data = gsp->gs_init(gp);
1121	if (data == NULL)
1122		return (ENOMEM);
1123
1124	newh = g_sched_hash_init(gsp, &mask, HASH_WAITOK);
1125	if (gsp->gs_priv_size && !newh) {
1126		error = ENOMEM;
1127		goto fail;
1128	}
1129
1130	g_sched_lock(gp);
1131	if (sc->sc_gsched) {	/* can be NULL in some cases */
1132		error = g_sched_remove_locked(gp, sc->sc_gsched);
1133		if (error)
1134			goto fail;
1135	}
1136
1137	g_gsched_ref(gsp);
1138	sc->sc_gsched = gsp;
1139	sc->sc_data = data;
1140	sc->sc_hash = newh;
1141	sc->sc_mask = mask;
1142
1143	g_sched_unlock(gp);
1144
1145	return (0);
1146
1147fail:
1148	if (newh)
1149		g_sched_hash_fini(gp, newh, mask, gsp, data);
1150
1151	if (data)
1152		gsp->gs_fini(data);
1153
1154	g_sched_unlock(gp);
1155
1156	return (error);
1157}
1158
1159/*
1160 * Stop the request flow directed to the proxy, redirecting the new
1161 * requests to the me.gs_pending queue.
1162 */
1163static struct g_provider *
1164g_detach_proxy(struct g_geom *gp)
1165{
1166	struct g_consumer *cp;
1167	struct g_provider *pp, *newpp;
1168
1169	do {
1170		pp = LIST_FIRST(&gp->provider);
1171		if (pp == NULL)
1172			break;
1173		cp = LIST_FIRST(&gp->consumer);
1174		if (cp == NULL)
1175			break;
1176		newpp = cp->provider;
1177		if (newpp == NULL)
1178			break;
1179
1180		me.gs_npending = 0;
1181		pp->geom->start = g_sched_temporary_start;
1182
1183		return (pp);
1184	} while (0);
	printf("%s error detaching proxy %s\n", __func__, gp->name);
1186
1187	return (NULL);
1188}
1189
1190static void
1191g_sched_blackhole(struct bio *bp)
1192{
1193
1194	g_io_deliver(bp, ENXIO);
1195}
1196
1197static inline void
1198g_reparent_provider(struct g_provider *pp, struct g_geom *gp,
1199    struct g_provider *newpp)
1200{
1201
1202	LIST_REMOVE(pp, provider);
1203	if (newpp) {
1204		pp->private = newpp->private;
1205		pp->index = newpp->index;
1206	}
1207	pp->geom = gp;
1208	LIST_INSERT_HEAD(&gp->provider, pp, provider);
1209}
1210
1211static inline void
1212g_unproxy_provider(struct g_provider *oldpp, struct g_provider *newpp)
1213{
1214	struct g_geom *gp = oldpp->geom;
1215
1216	g_reparent_provider(oldpp, newpp->geom, newpp);
1217
1218	/*
1219	 * Hackish: let the system destroy the old provider for us, just
1220	 * in case someone attached a consumer to it, in which case a
1221	 * direct call to g_destroy_provider() would not work.
1222	 */
1223	g_reparent_provider(newpp, gp, NULL);
1224}
1225
1226/*
1227 * Complete the proxy destruction, linking the old provider to its
1228 * original geom, and destroying the proxy provider.  Also take care
1229 * of issuing the pending requests collected in me.gs_pending (if any).
1230 */
1231static int
1232g_destroy_proxy(struct g_geom *gp, struct g_provider *oldpp)
1233{
1234	struct g_consumer *cp;
1235	struct g_provider *newpp;
1236
1237	do {
1238		cp = LIST_FIRST(&gp->consumer);
1239		if (cp == NULL)
1240			break;
1241		newpp = cp->provider;
1242		if (newpp == NULL)
1243			break;
1244
1245		/* Relink the provider to its original geom. */
1246		g_unproxy_provider(oldpp, newpp);
1247
1248		/* Detach consumer from provider, and destroy provider. */
1249		cp->acr = newpp->acr = 0;
1250		cp->acw = newpp->acw = 0;
1251		cp->ace = newpp->ace = 0;
1252		g_detach(cp);
1253
1254		/* Send the pending bios through the right start function. */
1255		g_sched_flush_pending(oldpp->geom->start);
1256
1257		return (0);
1258	} while (0);
	printf("%s error destroying proxy %s\n", __func__, gp->name);
1260
1261	/* We cannot send the pending bios anywhere... */
1262	g_sched_flush_pending(g_sched_blackhole);
1263
1264	return (EINVAL);
1265}
1266
1267static int
1268g_sched_destroy(struct g_geom *gp, boolean_t force)
1269{
1270	struct g_provider *pp, *oldpp = NULL;
1271	struct g_sched_softc *sc;
1272	struct g_gsched *gsp;
1273	int error;
1274
1275	g_topology_assert();
1276	sc = gp->softc;
1277	if (sc == NULL)
1278		return (ENXIO);
1279	if (!(sc->sc_flags & G_SCHED_PROXYING)) {
1280		pp = LIST_FIRST(&gp->provider);
1281		if (pp && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
1282			const char *msg = force ?
1283				"but we force removal" : "cannot remove";
1284
1285			G_SCHED_DEBUG(!force,
1286			    "Device %s is still open (r%dw%de%d), %s.",
1287			    pp->name, pp->acr, pp->acw, pp->ace, msg);
1288			if (!force)
1289				return (EBUSY);
1290		} else {
1291			G_SCHED_DEBUG(0, "Device %s removed.", gp->name);
1292		}
1293	} else
1294		oldpp = g_detach_proxy(gp);
1295
1296	gsp = sc->sc_gsched;
1297	if (gsp) {
1298		/*
1299		 * XXX bad hack here: force a dispatch to release
1300		 * any reference to the hash table still held by
1301		 * the scheduler.
1302		 */
1303		g_sched_lock(gp);
		/*
		 * We are dying here, so no new requests should enter
		 * the scheduler.  This is guaranteed by the topology,
		 * whether we were proxying (new bios are being
		 * redirected) or not (see the access check above).
		 */
1311		g_sched_forced_dispatch(gp);
1312		error = g_sched_wait_pending(gp);
1313
1314		if (error) {
			/*
			 * Not all the requests came home: this might happen
			 * under heavy load, or if we were waiting for any
			 * bio which is served in the event path (see
			 * geom_slice.c for an example of how this can
			 * happen).  Try to restore a working configuration
			 * before failing.
			 */
1323			if ((sc->sc_flags & G_SCHED_PROXYING) && oldpp) {
1324				g_sched_flush_pending(force ?
1325				    g_sched_blackhole : g_sched_start);
1326			}
1327
1328			/*
1329			 * In the forced destroy case there is not so much
1330			 * we can do, we have pending bios that will call
1331			 * g_sched_done() somehow, and we don't want them
1332			 * to crash the system using freed memory.  We tell
1333			 * the user that something went wrong, and leak some
1334			 * memory here.
1335			 * Note: the callers using force = 1 ignore the
1336			 * return value.
1337			 */
1338			if (force) {
				G_SCHED_DEBUG(0, "Pending requests while "
				    "destroying geom, some memory leaked.");
1341			}
1342
			/* Drop the lock taken above before bailing out. */
			g_sched_unlock(gp);
			return (error);
1344		}
1345
1346		g_sched_unlock(gp);
1347		g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask,
1348		    gsp, sc->sc_data);
1349		sc->sc_hash = NULL;
1350		gsp->gs_fini(sc->sc_data);
1351		g_gsched_unref(gsp);
1352		sc->sc_gsched = NULL;
1353	}
1354
1355	if ((sc->sc_flags & G_SCHED_PROXYING) && oldpp) {
1356		error = g_destroy_proxy(gp, oldpp);
1357
1358		if (error) {
1359			if (force) {
				G_SCHED_DEBUG(0, "Unrecoverable error while "
				    "destroying a proxy geom, leaking some "
				    "memory.");
1363			}
1364
1365			return (error);
1366		}
1367	}
1368
1369	mtx_destroy(&sc->sc_mtx);
1370
1371	g_free(gp->softc);
1372	gp->softc = NULL;
1373	g_wither_geom(gp, ENXIO);
1374
1375	return (error);
1376}
1377
1378static int
1379g_sched_destroy_geom(struct gctl_req *req, struct g_class *mp,
1380    struct g_geom *gp)
1381{
1382
1383	return (g_sched_destroy(gp, 0));
1384}
1385
1386/*
1387 * Functions related to the classification of requests.
1388 *
1389 * On recent FreeBSD versions (8.0 and above), we store a reference
1390 * to the issuer of a request in bp->bio_classifier1 as soon
1391 * as the bio is posted to the geom queue (and not later, because
1392 * requests are managed by the g_down thread afterwards).
1393 *
1394 * On older versions of the system (but this code is not used
1395 * in any existing release), we [ab]use the caller1 field in the
1396 * root element of the bio tree to store the classification info.
1397 * The marking is done at the beginning of g_io_request()
1398 * and only if we find that the field is NULL.
1399 *
1400 * To avoid rebuilding the kernel, this module will patch the
1401 * initial part of g_io_request() so it jumps to some hand-coded
1402 * assembly that does the marking and then executes the original
1403 * body of g_io_request().
1404 *
1405 * fake_ioreq[] is architecture-specific machine code
1406 * that implements the above. CODE_SIZE, STORE_SIZE etc.
1407 * are constants used in the patching routine. Look at the
1408 * code in g_ioreq_patch() for the details.
1409 */
1410
1411#ifndef HAVE_BIO_CLASSIFIER
1412/*
1413 * Support for old FreeBSD versions
1414 */
1415#if defined(__i386__)
1416#define	CODE_SIZE	29
1417#define	STORE_SIZE	5
1418#define	EPILOGUE	5
1419#define	SIZE		(CODE_SIZE + STORE_SIZE + EPILOGUE)
1420
1421static u_char fake_ioreq[SIZE] = {
1422	0x8b, 0x44, 0x24, 0x04,		/* mov bp, %eax */
1423	/* 1: */
1424	0x89, 0xc2,			/* mov %eax, %edx # edx = bp */
1425	0x8b, 0x40, 0x64,		/* mov bp->bio_parent, %eax */
1426	0x85, 0xc0,			/* test %eax, %eax */
1427	0x75, 0xf7,			/* jne 1b */
1428	0x8b, 0x42, 0x30,		/* mov bp->bp_caller1, %eax */
1429	0x85, 0xc0,			/* test %eax, %eax */
1430	0x75, 0x09,			/* jne 2f */
1431	0x64, 0xa1, 0x00, 0x00,		/* mov %fs:0, %eax */
1432	0x00, 0x00,
1433	0x89, 0x42, 0x30,		/* mov %eax, bp->bio_caller1 */
1434	/* 2: */
1435        0x55, 0x89, 0xe5, 0x57, 0x56,
1436	0xe9, 0x00, 0x00, 0x00, 0x00,	/* jmp back... */
1437};
1438#elif defined(__amd64)
1439#define	CODE_SIZE	38
1440#define	STORE_SIZE	6
1441#define	EPILOGUE	5
1442#define	SIZE		(CODE_SIZE + STORE_SIZE + EPILOGUE)
1443
1444static u_char fake_ioreq[SIZE] = {
1445	0x48, 0x89, 0xf8,		/* mov bp, %rax */
1446	/* 1: */
1447	0x48, 0x89, 0xc2,		/* mov %rax, %rdx # rdx = bp */
1448	0x48, 0x8b, 0x82, 0xa8,		/* mov bp->bio_parent, %rax */
1449	0x00, 0x00, 0x00,
1450	0x48, 0x85, 0xc0,		/* test %rax, %rax */
1451	0x75, 0xf1,			/* jne 1b */
1452	0x48, 0x83, 0x7a, 0x58,		/* cmp $0, bp->bp_caller1 */
1453	0x00,
1454	0x75, 0x0d,			/* jne 2f */
1455	0x65, 0x48, 0x8b, 0x04,		/* mov %gs:0, %rax */
1456	0x25, 0x00, 0x00, 0x00,
1457	0x00,
1458	0x48, 0x89, 0x42, 0x58,		/* mov %rax, bp->bio_caller1 */
1459	/* 2: */
1460	0x55, 0x48, 0x89, 0xe5, 0x41, 0x56,
1461	0xe9, 0x00, 0x00, 0x00, 0x00,	/* jmp back... */
1462};
1463#else /* neither x86 nor amd64 */
1464static void
1465g_new_io_request(struct bio *bp, struct g_consumer *cp)
1466{
1467	struct bio *top = bp;
1468
1469        /*
1470         * bio classification: if bio_caller1 is available in the
1471         * root of the 'struct bio' tree, store there the thread id
1472         * of the thread that originated the request.
1473         * More sophisticated classification schemes can be used.
1474         */
1475	while (top->bio_parent)
1476		top = top->bio_parent;
1477
1478	if (top->bio_caller1 == NULL)
1479		top->bio_caller1 = curthread;
1480}
1481
1482#error please add the code above in g_new_io_request() to the beginning of \
1483	/sys/geom/geom_io.c::g_io_request(), and remove this line.
1484#endif /* end of arch-specific code */
1485
1486static int
1487g_ioreq_patch(void)
1488{
1489	u_char *original;
1490	u_long ofs;
1491	int found;
1492
1493	if (me.gs_patched)
1494		return (-1);
1495
1496	original = (u_char *)g_io_request;
1497
1498	found = !bcmp(original, fake_ioreq + CODE_SIZE, STORE_SIZE);
1499	if (!found)
1500		return (-1);
1501
1502	/* Jump back to the original + STORE_SIZE. */
1503	ofs = (original + STORE_SIZE) - (fake_ioreq + SIZE);
1504	bcopy(&ofs, fake_ioreq + CODE_SIZE + STORE_SIZE + 1, 4);
1505
1506	/* Patch the original address with a jump to the trampoline. */
1507	*original = 0xe9;     /* jump opcode */
1508	ofs = fake_ioreq - (original + 5);
1509	bcopy(&ofs, original + 1, 4);
1510
1511	me.gs_patched = 1;
1512
1513	return (0);
1514}
1515
1516/*
1517 * Restore the original code, this is easy.
1518 */
1519static void
1520g_ioreq_restore(void)
1521{
1522	u_char *original;
1523
1524	if (me.gs_patched) {
1525		original = (u_char *)g_io_request;
1526		bcopy(fake_ioreq + CODE_SIZE, original, STORE_SIZE);
1527		me.gs_patched = 0;
1528	}
1529}
1530
1531static inline void
1532g_classifier_ini(void)
1533{
1534
1535	g_ioreq_patch();
1536}
1537
1538static inline void
1539g_classifier_fini(void)
1540{
1541
1542	g_ioreq_restore();
1543}
1544
1545/*--- end of support code for older FreeBSD versions */
1546
1547#else /* HAVE_BIO_CLASSIFIER */
1548
1549/*
1550 * Classifier support for recent FreeBSD versions: we use
1551 * a very simple classifier, only use curthread to tag a request.
1552 * The classifier is registered at module load, and unregistered
1553 * at module unload.
1554 */
1555static int
1556g_sched_tag(void *arg, struct bio *bp)
1557{
1558
1559	bp->bio_classifier1 = curthread;
1560	return (1);
1561}
1562
1563static struct g_classifier_hook g_sched_classifier = {
1564	.func =	g_sched_tag,
1565};
1566
1567static inline void
1568g_classifier_ini(void)
1569{
1570
1571	g_register_classifier(&g_sched_classifier);
1572}
1573
1574static inline void
1575g_classifier_fini(void)
1576{
1577
1578	g_unregister_classifier(&g_sched_classifier);
1579}
1580#endif /* HAVE_BIO_CLASSIFIER */
1581
1582static void
1583g_sched_init(struct g_class *mp)
1584{
1585
1586	g_gsched_global_init();
1587
1588	G_SCHED_DEBUG(0, "Loading: mp = %p, g_sched_class = %p.",
1589	    mp, &g_sched_class);
1590
1591	/* Patch g_io_request to store classification info in the bio. */
1592	g_classifier_ini();
1593}
1594
1595static void
1596g_sched_fini(struct g_class *mp)
1597{
1598
1599	g_classifier_fini();
1600
1601	G_SCHED_DEBUG(0, "Unloading...");
1602
1603	KASSERT(LIST_EMPTY(&me.gs_scheds), ("still registered schedulers"));
1604	mtx_destroy(&me.gs_mtx);
1605}
1606
1607static int
1608g_sched_ioctl(struct g_provider *pp, u_long cmd, void *data, int fflag,
1609    struct thread *td)
1610{
1611	struct g_consumer *cp;
1612	struct g_geom *gp;
1613
1614	cp = LIST_FIRST(&pp->geom->consumer);
1615	if (cp == NULL)
1616		return (ENOIOCTL);
1617	gp = cp->provider->geom;
1618	if (gp->ioctl == NULL)
1619		return (ENOIOCTL);
1620	return (gp->ioctl(cp->provider, cmd, data, fflag, td));
1621}
1622
1623/*
1624 * Read the i-th argument for a request, skipping the /dev/
1625 * prefix if present.
1626 */
1627static const char *
1628g_sched_argi(struct gctl_req *req, int i)
1629{
1630	static const char *dev_prefix = "/dev/";
1631	const char *name;
1632	char param[16];
1633	int l = strlen(dev_prefix);
1634
1635	snprintf(param, sizeof(param), "arg%d", i);
1636	name = gctl_get_asciiparam(req, param);
1637	if (name == NULL)
1638		gctl_error(req, "No 'arg%d' argument", i);
1639	else if (strncmp(name, dev_prefix, l) == 0)
1640		name += l;
1641	return (name);
1642}
1643
1644/*
1645 * Fetch nargs and do appropriate checks.
1646 */
1647static int
1648g_sched_get_nargs(struct gctl_req *req)
1649{
1650	int *nargs;
1651
1652	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
1653	if (nargs == NULL) {
1654		gctl_error(req, "No 'nargs' argument");
1655		return (0);
1656	}
1657	if (*nargs <= 0)
1658		gctl_error(req, "Missing device(s).");
1659	return (*nargs);
1660}
1661
/*
 * Check whether we should attach this class to certain volumes when
 * the geom is created.  Right now this is under the control of a kenv
 * variable containing the names of all devices that we care about;
 * see the example after this function.  Probably we should only
 * support transparent insertion as the preferred mode of operation.
 */
1669static struct g_geom *
1670g_sched_taste(struct g_class *mp, struct g_provider *pp,
1671		int flags __unused)
1672{
	struct g_gsched *gsp = NULL;	/* the algorithm we want */
1674	const char *s;			/* generic string pointer */
1675	const char *taste_names;	/* devices we like */
1676	int l;
1677
1678        g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__,
1679	    mp->name, pp->name);
1680        g_topology_assert();
1681
1682        G_SCHED_DEBUG(2, "Tasting %s.", pp->name);
1683
1684	do {
1685		/* do not taste on ourselves */
1686		if (pp->geom->class == mp)
1687                	break;
1688
1689		taste_names = getenv("geom.sched.taste");
1690		if (taste_names == NULL)
1691			break;
1692
1693		l = strlen(pp->name);
1694		for (s = taste_names; *s &&
1695		    (s = strstr(s, pp->name)); s++) {
1696			/* further checks for an exact match */
1697			if ( (s == taste_names || s[-1] == ' ') &&
1698			     (s[l] == '\0' || s[l] == ' ') )
1699				break;
1700		}
1701		if (s == NULL)
1702			break;
1703		G_SCHED_DEBUG(0, "Attach device %s match [%s]\n",
1704		    pp->name, s);
1705
1706		/* look up the provider name in the list */
1707		s = getenv("geom.sched.algo");
1708		if (s == NULL)
1709			s = "rr";
1710
1711		gsp = g_gsched_find(s);	/* also get a reference */
1712		if (gsp == NULL) {
1713			G_SCHED_DEBUG(0, "Bad '%s' algorithm.", s);
1714			break;
1715		}
1716
1717		/* XXX create with 1 as last argument ? */
1718		g_sched_create(NULL, mp, pp, gsp, 0);
1719		g_gsched_unref(gsp);
1720	} while (0);
	return (NULL);
1722}
1723
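/*
 * Example of the tunables driving the taste path above, e.g. set from
 * /boot/loader.conf (the device names are illustrative):
 *
 *	geom.sched.taste="ada0 ada1"
 *	geom.sched.algo="rr"
 *
 * In most setups the transparent "geom sched insert <provider>" command
 * described at the top of this file is the preferred way to attach a
 * scheduler, and neither tunable is needed.
 */
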
1724static void
1725g_sched_ctl_create(struct gctl_req *req, struct g_class *mp, int proxy)
1726{
1727	struct g_provider *pp;
1728	struct g_gsched *gsp;
1729	const char *name;
1730	int i, nargs;
1731
1732	g_topology_assert();
1733
1734	name = gctl_get_asciiparam(req, "algo");
1735	if (name == NULL) {
1736		gctl_error(req, "No '%s' argument", "algo");
1737		return;
1738	}
1739
1740	gsp = g_gsched_find(name);	/* also get a reference */
1741	if (gsp == NULL) {
1742		gctl_error(req, "Bad algorithm '%s'", name);
1743		return;
1744	}
1745
1746	nargs = g_sched_get_nargs(req);
1747
1748	/*
1749	 * Run on the arguments, and break on any error.
1750	 * We look for a device name, but skip the /dev/ prefix if any.
1751	 */
1752	for (i = 0; i < nargs; i++) {
1753		name = g_sched_argi(req, i);
1754		if (name == NULL)
1755			break;
1756		pp = g_provider_by_name(name);
1757		if (pp == NULL) {
1758			G_SCHED_DEBUG(1, "Provider %s is invalid.", name);
1759			gctl_error(req, "Provider %s is invalid.", name);
1760			break;
1761		}
1762		if (g_sched_create(req, mp, pp, gsp, proxy) != 0)
1763			break;
1764	}
1765
1766	g_gsched_unref(gsp);
1767}
1768
1769static void
1770g_sched_ctl_configure(struct gctl_req *req, struct g_class *mp)
1771{
1772	struct g_provider *pp;
1773	struct g_gsched *gsp;
1774	const char *name;
1775	int i, nargs;
1776
1777	g_topology_assert();
1778
1779	name = gctl_get_asciiparam(req, "algo");
1780	if (name == NULL) {
1781		gctl_error(req, "No '%s' argument", "algo");
1782		return;
1783	}
1784
1785	gsp = g_gsched_find(name);	/* also get a reference */
1786	if (gsp == NULL) {
1787		gctl_error(req, "Bad algorithm '%s'", name);
1788		return;
1789	}
1790
1791	nargs = g_sched_get_nargs(req);
1792
1793	/*
1794	 * Run on the arguments, and break on any error.
1795	 * We look for a device name, but skip the /dev/ prefix if any.
1796	 */
1797	for (i = 0; i < nargs; i++) {
1798		name = g_sched_argi(req, i);
1799		if (name == NULL)
1800			break;
1801		pp = g_provider_by_name(name);
1802		if (pp == NULL || pp->geom->class != mp) {
1803			G_SCHED_DEBUG(1, "Provider %s is invalid.", name);
1804			gctl_error(req, "Provider %s is invalid.", name);
1805			break;
1806		}
1807		if (g_sched_change_algo(req, mp, pp, gsp) != 0)
1808			break;
1809	}
1810
1811	g_gsched_unref(gsp);
1812}
1813
1814static struct g_geom *
1815g_sched_find_geom(struct g_class *mp, const char *name)
1816{
1817	struct g_geom *gp;
1818
1819	LIST_FOREACH(gp, &mp->geom, geom) {
1820		if (strcmp(gp->name, name) == 0)
1821			return (gp);
1822	}
1823	return (NULL);
1824}
1825
1826static void
1827g_sched_ctl_destroy(struct gctl_req *req, struct g_class *mp)
1828{
1829	int nargs, *force, error, i;
1830	struct g_geom *gp;
1831	const char *name;
1832
1833	g_topology_assert();
1834
1835	nargs = g_sched_get_nargs(req);
1836
1837	force = gctl_get_paraml(req, "force", sizeof(*force));
1838	if (force == NULL) {
1839		gctl_error(req, "No 'force' argument");
1840		return;
1841	}
1842
1843	for (i = 0; i < nargs; i++) {
1844		name = g_sched_argi(req, i);
1845		if (name == NULL)
1846			break;
1847
1848		gp = g_sched_find_geom(mp, name);
1849		if (gp == NULL) {
1850			G_SCHED_DEBUG(1, "Device %s is invalid.", name);
1851			gctl_error(req, "Device %s is invalid.", name);
1852			break;
1853		}
1854
1855		error = g_sched_destroy(gp, *force);
1856		if (error != 0) {
1857			gctl_error(req, "Cannot destroy device %s (error=%d).",
1858			    gp->name, error);
1859			break;
1860		}
1861	}
1862}
1863
1864static void
1865g_sched_config(struct gctl_req *req, struct g_class *mp, const char *verb)
1866{
1867	uint32_t *version;
1868
1869	g_topology_assert();
1870
1871	version = gctl_get_paraml(req, "version", sizeof(*version));
1872	if (version == NULL) {
1873		gctl_error(req, "No '%s' argument.", "version");
1874		return;
1875	}
1876
1877	if (*version != G_SCHED_VERSION) {
1878		gctl_error(req, "Userland and kernel parts are "
1879		    "out of sync.");
1880		return;
1881	}
1882
1883	if (strcmp(verb, "create") == 0) {
1884		g_sched_ctl_create(req, mp, 0);
1885		return;
1886	} else if (strcmp(verb, "insert") == 0) {
1887		g_sched_ctl_create(req, mp, 1);
1888		return;
1889	} else if (strcmp(verb, "configure") == 0) {
1890		g_sched_ctl_configure(req, mp);
1891		return;
1892	} else if (strcmp(verb, "destroy") == 0) {
1893		g_sched_ctl_destroy(req, mp);
1894		return;
1895	}
1896
1897	gctl_error(req, "Unknown verb.");
1898}
1899
1900static void
1901g_sched_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
1902    struct g_consumer *cp, struct g_provider *pp)
1903{
1904	struct g_sched_softc *sc = gp->softc;
	struct g_gsched *gsp = sc->sc_gsched;

	if (indent == NULL) {	/* plaintext */
1907		sbuf_printf(sb, " algo %s", gsp ? gsp->gs_name : "--");
1908	}
1909	if (gsp != NULL && gsp->gs_dumpconf)
1910		gsp->gs_dumpconf(sb, indent, gp, cp, pp);
1911}
1912
1913DECLARE_GEOM_CLASS(g_sched_class, g_sched);
1914MODULE_VERSION(geom_sched, 0);
1915