1/*-
2 * Copyright (c) 2009-2010 Fabio Checconi
3 * Copyright (c) 2009-2010 Luigi Rizzo, Universita` di Pisa
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25 * SUCH DAMAGE.
26 */
27
28/*
29 * $Id$
30 * $FreeBSD: head/sys/geom/sched/g_sched.c 206552 2010-04-13 09:56:17Z luigi $
31 *
32 * Main control module for geom-based disk schedulers ('sched').
33 *
34 * USER VIEW
35 * A 'sched' node is typically inserted transparently between
36 * an existing provider pp and its original geom gp
37 *
38 *	[pp --> gp  ..]
39 *
40 * using the command "geom sched insert <provider>" and
41 * resulting in the following topology
42 *
43 *	[pp --> sched_gp --> cp]   [new_pp --> gp ... ]
44 *
 45 * Removal with "geom sched destroy <provider>.sched" restores the
 46 * original chain. The normal "geom sched create <provider>"
 47 * is also supported.
48 *
49 * INTERNALS
50 * Internally, the 'sched' uses the following data structures
51 *
52 *   geom{}         g_sched_softc{}      g_gsched{}
53 * +----------+    +---------------+   +-------------+
54 * |  softc *-|--->| sc_gsched   *-|-->|  gs_init    |
55 * |  ...     |    |               |   |  gs_fini    |
56 * |          |    | [ hash table] |   |  gs_start   |
57 * +----------+    |               |   |  ...        |
58 *                 |               |   +-------------+
59 *                 |               |
60 *                 |               |     g_*_softc{}
61 *                 |               |   +-------------+
62 *                 | sc_data     *-|-->|             |
63 *                 +---------------+   |  algorithm- |
64 *                                     |  specific   |
65 *                                     +-------------+
66 *
67 * A g_sched_softc{} is created with a "geom sched insert" call.
68 * In turn this instantiates a specific scheduling algorithm,
69 * which sets sc_gsched to point to the algorithm callbacks,
70 * and calls gs_init() to create the g_*_softc{} .
71 * The other callbacks (gs_start, gs_next, ...) are invoked
72 * as needed
73 *
74 * g_sched_softc{} is defined in g_sched.h and mostly used here;
75 * g_gsched{}, and the gs_callbacks, are documented in gs_scheduler.h;
76 * g_*_softc{} is defined/implemented by each algorithm (gs_*.c)
77 *
78 * DATA MOVING
 79 * When a bio is received on the provider, it goes to
 80 * g_sched_start(), which calls gs_start() to initially queue it;
81 * then we call g_sched_dispatch() that loops around gs_next()
82 * to select zero or more bio's to be sent downstream.
83 *
84 * g_sched_dispatch() can also be called as a result of a timeout,
85 * e.g. when doing anticipation or pacing requests.
86 *
87 * When a bio comes back, it goes to g_sched_done() which in turn
88 * calls gs_done(). The latter does any necessary housekeeping in
89 * the scheduling algorithm, and may decide to call g_sched_dispatch()
90 * to send more bio's downstream.
91 *
92 * If an algorithm needs per-flow queues, these are created
93 * calling gs_init_class() and destroyed with gs_fini_class(),
94 * and they are also inserted in the hash table implemented in
95 * the g_sched_softc{}
96 *
97 * If an algorithm is replaced, or a transparently-inserted node is
98 * removed with "geom sched destroy", we need to remove all references
 99 * to the g_*_softc{} and g_sched_softc{} from the bio's still in
 100 * the scheduler. g_sched_forced_dispatch() helps do this.
101 * XXX need to explain better.
102 */
103
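/*
 * The sketch below is illustrative only and is not compiled: a
 * trivial FIFO algorithm written against the callback usage visible
 * in this file (gs_init/gs_start/gs_next/gs_done/gs_fini).  The
 * authoritative prototypes are in gs_scheduler.h; the g_example_*
 * names and the example_softc layout are invented for the example.
 */
#if 0
struct example_softc {
	struct bio_queue_head e_queue;	/* single FIFO queue */
};

static void *
g_example_init(struct g_geom *gp)
{
	struct example_softc *sc;

	sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO);
	gs_bioq_init(&sc->e_queue);
	return (sc);		/* stored as sc_data in g_sched_softc{} */
}

static void
g_example_fini(void *data)
{

	g_free(data);
}

static int
g_example_start(void *data, struct bio *bp)
{
	struct example_softc *sc = data;

	gs_bioq_disksort(&sc->e_queue, bp);
	return (0);	/* 0 = queued; nonzero makes g_sched bypass the bio */
}

static struct bio *
g_example_next(void *data, int force)
{
	struct example_softc *sc = data;

	/* Returning NULL ends the loop in g_sched_dispatch(). */
	return (gs_bioq_takefirst(&sc->e_queue));
}

static void
g_example_done(void *data, struct bio *bp)
{

	/* A FIFO has no state to update; real algorithms do work here. */
}

static struct g_gsched g_example_gsched = {
	.gs_name = "example",
	.gs_init = g_example_init,
	.gs_fini = g_example_fini,
	.gs_start = g_example_start,
	.gs_next = g_example_next,
	.gs_done = g_example_done,
};
#endif
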
104#include <sys/cdefs.h>
105#include <sys/param.h>
106#include <sys/systm.h>
107#include <sys/kernel.h>
108#include <sys/module.h>
109#include <sys/lock.h>
110#include <sys/mutex.h>
111#include <sys/bio.h>
112#include <sys/limits.h>
113#include <sys/hash.h>
114#include <sys/sysctl.h>
115#include <sys/malloc.h>
116#include <sys/proc.h>		/* we access curthread */
117#include <geom/geom.h>
118#include "gs_scheduler.h"
119#include "g_sched.h"		/* geom hooks */
120
121/*
122 * Size of the per-geom hash table storing traffic classes.
123 * We may decide to change it at a later time; it has no ABI
124 * implications as it is only used for run-time allocations.
125 */
126#define G_SCHED_HASH_SIZE	32
127
128static int g_sched_destroy(struct g_geom *gp, boolean_t force);
129static int g_sched_destroy_geom(struct gctl_req *req,
130    struct g_class *mp, struct g_geom *gp);
131static void g_sched_config(struct gctl_req *req, struct g_class *mp,
132    const char *verb);
133static struct g_geom *g_sched_taste(struct g_class *mp,
134    struct g_provider *pp, int flags __unused);
135static void g_sched_dumpconf(struct sbuf *sb, const char *indent,
136    struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp);
137static void g_sched_init(struct g_class *mp);
138static void g_sched_fini(struct g_class *mp);
139
140struct g_class g_sched_class = {
141	.name = G_SCHED_CLASS_NAME,
142	.version = G_VERSION,
143	.ctlreq = g_sched_config,
144	.taste = g_sched_taste,
145	.destroy_geom = g_sched_destroy_geom,
146	.init = g_sched_init,
147	.fini = g_sched_fini
148};
149
150MALLOC_DEFINE(M_GEOM_SCHED, "GEOM_SCHED", "Geom schedulers data structures");
151
152/*
153 * Global variables describing the state of the geom_sched module.
154 * There is only one static instance of this structure.
155 */
156LIST_HEAD(gs_list, g_gsched);	/* type, link field */
157struct geom_sched_vars {
158	struct mtx	gs_mtx;
159	struct gs_list	gs_scheds;	/* list of algorithms */
160	u_int		gs_debug;
161	u_int		gs_sched_count;	/* how many algorithms ? */
162	u_int 		gs_patched;	/* g_io_request was patched */
163
164	u_int		gs_initialized;
165	u_int		gs_expire_secs;	/* expiration of hash entries */
166
167	struct bio_queue_head gs_pending;
168	u_int		gs_npending;
169
170	/* The following are for stats, usually protected by gs_mtx. */
171	u_long		gs_requests;	/* total requests */
172	u_long		gs_done;	/* total done */
173	u_int 		gs_in_flight;	/* requests in flight */
174	u_int 		gs_writes_in_flight;
175	u_int 		gs_bytes_in_flight;
176	u_int 		gs_write_bytes_in_flight;
177
178	char		gs_names[256];	/* names of schedulers */
179};
180
181static struct geom_sched_vars me = {
182	.gs_expire_secs = 10,
183};
184
185SYSCTL_DECL(_kern_geom);
186SYSCTL_NODE(_kern_geom, OID_AUTO, sched, CTLFLAG_RW, 0,
187    "GEOM_SCHED configuration and statistics");
188
189SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_wb, CTLFLAG_RD,
190    &me.gs_write_bytes_in_flight, 0, "Write bytes in flight");
191
192SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_b, CTLFLAG_RD,
193    &me.gs_bytes_in_flight, 0, "Bytes in flight");
194
195SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_w, CTLFLAG_RD,
196    &me.gs_writes_in_flight, 0, "Write Requests in flight");
197
198SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight, CTLFLAG_RD,
199    &me.gs_in_flight, 0, "Requests in flight");
200
201SYSCTL_ULONG(_kern_geom_sched, OID_AUTO, done, CTLFLAG_RD,
202    &me.gs_done, 0, "Total done");
203
204SYSCTL_ULONG(_kern_geom_sched, OID_AUTO, requests, CTLFLAG_RD,
205    &me.gs_requests, 0, "Total requests");
206
207SYSCTL_STRING(_kern_geom_sched, OID_AUTO, algorithms, CTLFLAG_RD,
208    &me.gs_names, 0, "Algorithm names");
209
210SYSCTL_UINT(_kern_geom_sched, OID_AUTO, alg_count, CTLFLAG_RD,
211    &me.gs_sched_count, 0, "Number of algorithms");
212
213SYSCTL_UINT(_kern_geom_sched, OID_AUTO, debug, CTLFLAG_RW,
214    &me.gs_debug, 0, "Debug level");
215
216SYSCTL_UINT(_kern_geom_sched, OID_AUTO, expire_secs, CTLFLAG_RW,
217    &me.gs_expire_secs, 0, "Expire time in seconds");
218
219/*
220 * g_sched calls the scheduler algorithms with this lock held.
221 * The locking functions are exposed so the scheduler algorithms can also
222 * protect themselves e.g. when running a callout handler.
223 */
224void
225g_sched_lock(struct g_geom *gp)
226{
227	struct g_sched_softc *sc = gp->softc;
228
229	mtx_lock(&sc->sc_mtx);
230}
231
232void
233g_sched_unlock(struct g_geom *gp)
234{
235	struct g_sched_softc *sc = gp->softc;
236
237	mtx_unlock(&sc->sc_mtx);
238}
239
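/*
 * Illustrative only, not compiled: how an algorithm's timeout
 * handler (e.g. for anticipation or pacing) is expected to combine
 * the exported lock with g_sched_dispatch(), which asserts that
 * sc_mtx is held.  The callout setup and the g_example_timeout name
 * are assumptions made for this sketch.
 */
#if 0
static void
g_example_timeout(void *arg)
{
	struct g_geom *gp = arg;	/* stashed by the algorithm */

	g_sched_lock(gp);
	g_sched_dispatch(gp);	/* push any releasable bio downstream */
	g_sched_unlock(gp);
}
#endif
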
240/*
241 * Support functions to handle references to the module,
242 * which are coming from devices using this scheduler.
243 */
244static inline void
245g_gsched_ref(struct g_gsched *gsp)
246{
247
248	atomic_add_int(&gsp->gs_refs, 1);
249}
250
251static inline void
252g_gsched_unref(struct g_gsched *gsp)
253{
254
255	atomic_add_int(&gsp->gs_refs, -1);
256}
257
258/*
259 * Update the stats when this request is done.
260 */
261static void
262g_sched_update_stats(struct bio *bio)
263{
264
265	me.gs_done++;
266	me.gs_in_flight--;
267	me.gs_bytes_in_flight -= bio->bio_length;
268	if (bio->bio_cmd & BIO_WRITE) {
269		me.gs_writes_in_flight--;
270		me.gs_write_bytes_in_flight -= bio->bio_length;
271	}
272}
273
274/*
275 * Dispatch any pending request.
276 */
277static void
278g_sched_forced_dispatch(struct g_geom *gp)
279{
280	struct g_sched_softc *sc = gp->softc;
281	struct g_gsched *gsp = sc->sc_gsched;
282	struct bio *bp;
283
284	KASSERT(mtx_owned(&sc->sc_mtx),
285	    ("sc_mtx not owned during forced dispatch"));
286
287	while ((bp = gsp->gs_next(sc->sc_data, 1)) != NULL)
288		g_io_request(bp, LIST_FIRST(&gp->consumer));
289}
290
291/*
292 * The main dispatch loop, called either here after the start
293 * routine, or by scheduling algorithms when they receive a timeout
294 * or a 'done' notification.  Does not share code with the forced
295 * dispatch path, since the gs_done() callback can call us.
296 */
297void
298g_sched_dispatch(struct g_geom *gp)
299{
300	struct g_sched_softc *sc = gp->softc;
301	struct g_gsched *gsp = sc->sc_gsched;
302	struct bio *bp;
303
304	KASSERT(mtx_owned(&sc->sc_mtx), ("sc_mtx not owned during dispatch"));
305
306	if ((sc->sc_flags & G_SCHED_FLUSHING))
307		return;
308
309	while ((bp = gsp->gs_next(sc->sc_data, 0)) != NULL)
310		g_io_request(bp, LIST_FIRST(&gp->consumer));
311}
312
313/*
314 * Recent (8.0 and above) versions of FreeBSD have support to
315 * register classifiers of disk requests. The classifier is
316 * invoked by g_io_request(), and stores the information into
317 * bp->bio_classifier1.
318 *
319 * Support for older versions, which is left here only for
320 * documentation purposes, relies on two hacks:
321 * 1. classification info is written into the bio_caller1
322 *    field of the topmost node in the bio chain. This field
323 *    is rarely used, but this module is incompatible with
324 *    those that use bio_caller1 for other purposes,
325 *    such as ZFS and gjournal;
326 * 2. g_io_request() is patched in-memory when the module is
327 *    loaded, so that the function calls a classifier as its
328 *    first thing. g_io_request() is restored when the module
329 *    is unloaded. This functionality is only supported for
330 *    x86 and amd64, other architectures need source code changes.
331 */
332
333/*
334 * Lookup the identity of the issuer of the original request.
335 * In the current implementation we use the curthread of the
336 * issuer, but different mechanisms may be implemented later
337 * so we do not make assumptions on the return value which for
338 * us is just an opaque identifier.
339 */
340
341static inline u_long
342g_sched_classify(struct bio *bp)
343{
344
345#if __FreeBSD_version > 800098
346	/* we have classifier fields in the struct bio */
347#define HAVE_BIO_CLASSIFIER
348	return ((u_long)bp->bio_classifier1);
349#else
350#warning old version!!!
351	while (bp->bio_parent != NULL)
352		bp = bp->bio_parent;
353
354	return ((u_long)bp->bio_caller1);
355#endif
356}
357
358/* Return the hash chain for the given key. */
359static inline struct g_hash *
360g_sched_hash(struct g_sched_softc *sc, u_long key)
361{
362
363	return (&sc->sc_hash[key & sc->sc_mask]);
364}
365
366/*
367 * Helper function for the children classes, which takes
368 * a geom and a bio and returns the private descriptor
369 * associated with the request.  This involves fetching
370 * the classification field and [al]locating the
371 * corresponding entry in the hash table.
372 */
373void *
374g_sched_get_class(struct g_geom *gp, struct bio *bp)
375{
376	struct g_sched_softc *sc;
377	struct g_sched_class *gsc;
378	struct g_gsched *gsp;
379	struct g_hash *bucket;
380	u_long key;
381
382	sc = gp->softc;
383	key = g_sched_classify(bp);
384	bucket = g_sched_hash(sc, key);
385	LIST_FOREACH(gsc, bucket, gsc_clist) {
386		if (key == gsc->gsc_key) {
387			gsc->gsc_refs++;
388			return (gsc->gsc_priv);
389		}
390	}
391
392	gsp = sc->sc_gsched;
393	gsc = malloc(sizeof(*gsc) + gsp->gs_priv_size,
394	    M_GEOM_SCHED, M_NOWAIT | M_ZERO);
395	if (!gsc)
396		return (NULL);
397
398	if (gsp->gs_init_class(sc->sc_data, gsc->gsc_priv)) {
399		free(gsc, M_GEOM_SCHED);
400		return (NULL);
401	}
402
403	gsc->gsc_refs = 2;	/* 1 for the hash table, 1 for the caller. */
404	gsc->gsc_key = key;
405	LIST_INSERT_HEAD(bucket, gsc, gsc_clist);
406
407	gsc->gsc_expire = ticks + me.gs_expire_secs * hz;
408
409	return (gsc->gsc_priv);
410}
411
412/*
413 * Release a reference to the per-client descriptor.
414 */
415void
416g_sched_put_class(struct g_geom *gp, void *priv)
417{
418	struct g_sched_class *gsc;
419	struct g_sched_softc *sc;
420
421	gsc = g_sched_priv2class(priv);
422	gsc->gsc_expire = ticks + me.gs_expire_secs * hz;
423
424	if (--gsc->gsc_refs > 0)
425		return;
426
427	sc = gp->softc;
428	sc->sc_gsched->gs_fini_class(sc->sc_data, priv);
429
430	LIST_REMOVE(gsc, gsc_clist);
431	free(gsc, M_GEOM_SCHED);
432}
433
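/*
 * Illustrative only, not compiled: how an algorithm is expected to
 * pair g_sched_get_class()/g_sched_put_class() around a request,
 * stashing the class pointer in bio_caller1 (which g_sched_start()
 * preserves when non-NULL).  The example_* types, the e_geom field
 * saved at gs_init() time and the per-flow queue are assumptions.
 */
#if 0
struct example_flow {
	struct bio_queue_head ef_queue;	/* per-flow queue, gs_priv_size bytes */
};

struct example_cl_softc {
	struct g_geom *e_geom;		/* saved by the algorithm's gs_init() */
};

static int
g_example_cl_start(void *data, struct bio *bp)
{
	struct example_cl_softc *sc = data;
	struct example_flow *ef;

	ef = g_sched_get_class(sc->e_geom, bp);	/* reference held until put */
	if (ef == NULL)
		return (-1);	/* nonzero: let g_sched pass the bio down */

	bp->bio_caller1 = ef;	/* remembered for the done callback */
	gs_bioq_disksort(&ef->ef_queue, bp);
	return (0);
}

static void
g_example_cl_done(void *data, struct bio *bp)
{
	struct example_cl_softc *sc = data;

	/* Drop the reference taken in the start callback. */
	g_sched_put_class(sc->e_geom, bp->bio_caller1);
}
#endif
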
434static void
435g_sched_hash_fini(struct g_geom *gp, struct g_hash *hp, u_long mask,
436    struct g_gsched *gsp, void *data)
437{
438	struct g_sched_class *cp, *cp2;
439	int i;
440
441	if (!hp)
442		return;
443
444	if (data && gsp->gs_hash_unref)
445		gsp->gs_hash_unref(data);
446
447	for (i = 0; i < G_SCHED_HASH_SIZE; i++) {
448		LIST_FOREACH_SAFE(cp, &hp[i], gsc_clist, cp2)
449			g_sched_put_class(gp, cp->gsc_priv);
450	}
451
452	hashdestroy(hp, M_GEOM_SCHED, mask);
453}
454
455static struct g_hash *
456g_sched_hash_init(struct g_gsched *gsp, u_long *mask, int flags)
457{
458	struct g_hash *hash;
459
460	if (gsp->gs_priv_size == 0)
461		return (NULL);
462
463	hash = hashinit_flags(G_SCHED_HASH_SIZE, M_GEOM_SCHED, mask, flags);
464
465	return (hash);
466}
467
468static void
469g_sched_flush_classes(struct g_geom *gp)
470{
471	struct g_sched_softc *sc;
472	struct g_sched_class *cp, *cp2;
473	int i;
474
475	sc = gp->softc;
476
477	if (!sc->sc_hash || ticks - sc->sc_flush_ticks <= 0)
478		return;
479
480	for (i = 0; i < G_SCHED_HASH_SIZE; i++) {
481		LIST_FOREACH_SAFE(cp, &sc->sc_hash[i], gsc_clist, cp2) {
482			if (cp->gsc_refs == 1 && ticks - cp->gsc_expire > 0)
483				g_sched_put_class(gp, cp->gsc_priv);
484		}
485	}
486
487	sc->sc_flush_ticks = ticks + me.gs_expire_secs * hz;
488}
489
490/*
491 * Wait for the completion of any outstanding request.  To ensure
492 * that this does not take forever, the caller has to make sure that
493 * no new requests enter the scheduler before calling us.
494 *
495 * Must be called with the gp mutex held and topology locked.
496 */
497static int
498g_sched_wait_pending(struct g_geom *gp)
499{
500	struct g_sched_softc *sc = gp->softc;
501	int endticks = ticks + hz;
502
503	g_topology_assert();
504
505	while (sc->sc_pending && endticks - ticks >= 0)
506		msleep(gp, &sc->sc_mtx, 0, "sched_wait_pending", hz / 4);
507
508	return (sc->sc_pending ? ETIMEDOUT : 0);
509}
510
511static int
512g_sched_remove_locked(struct g_geom *gp, struct g_gsched *gsp)
513{
514	struct g_sched_softc *sc = gp->softc;
515	int error;
516
517	/* Set the flushing flag: new bios will not enter the scheduler. */
518	sc->sc_flags |= G_SCHED_FLUSHING;
519
520	g_sched_forced_dispatch(gp);
521	error = g_sched_wait_pending(gp);
522	if (error)
523		goto failed;
524
525	/* No more requests pending or in flight from the old gsp. */
526
527	g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask, gsp, sc->sc_data);
528	sc->sc_hash = NULL;
529
530	/*
531	 * Avoid deadlock here by releasing the gp mutex and reacquiring
532	 * it once done.  It should be safe, since no reconfiguration or
533	 * destruction can take place due to the geom topology lock; no
534	 * new request can use the current sc_data since we flagged the
535	 * geom as being flushed.
536	 */
537	g_sched_unlock(gp);
538	gsp->gs_fini(sc->sc_data);
539	g_sched_lock(gp);
540
541	sc->sc_gsched = NULL;
542	sc->sc_data = NULL;
543	g_gsched_unref(gsp);
544
545failed:
546	sc->sc_flags &= ~G_SCHED_FLUSHING;
547
548	return (error);
549}
550
551static int
552g_sched_remove(struct g_geom *gp, struct g_gsched *gsp)
553{
554	int error;
555
556	g_sched_lock(gp);
557	error = g_sched_remove_locked(gp, gsp); /* gsp is surely non-null */
558	g_sched_unlock(gp);
559
560	return (error);
561}
562
563/*
564 * Support function for create/taste -- locate the desired
565 * algorithm and grab a reference to it.
566 */
567static struct g_gsched *
568g_gsched_find(const char *name)
569{
570	struct g_gsched *gsp = NULL;
571
572	mtx_lock(&me.gs_mtx);
573	LIST_FOREACH(gsp, &me.gs_scheds, glist) {
574		if (strcmp(name, gsp->gs_name) == 0) {
575			g_gsched_ref(gsp);
576			break;
577		}
578	}
579	mtx_unlock(&me.gs_mtx);
580
581	return (gsp);
582}
583
584/*
585 * Rebuild the list of scheduler names.
586 * To be called with me.gs_mtx lock held.
587 */
588static void
589g_gsched_build_names(struct g_gsched *gsp)
590{
591	int pos, l;
592	struct g_gsched *cur;
593
594	pos = 0;
595	LIST_FOREACH(cur, &me.gs_scheds, glist) {
596		l = strlen(cur->gs_name);
597		if (l + pos + 1 + 1 < sizeof(me.gs_names)) {
598			if (pos != 0)
599				me.gs_names[pos++] = ' ';
600			strcpy(me.gs_names + pos, cur->gs_name);
601			pos += l;
602		}
603	}
604	me.gs_names[pos] = '\0';
605}
606
607/*
608 * Register or unregister individual scheduling algorithms.
609 */
610static int
611g_gsched_register(struct g_gsched *gsp)
612{
613	struct g_gsched *cur;
614	int error = 0;
615
616	mtx_lock(&me.gs_mtx);
617	LIST_FOREACH(cur, &me.gs_scheds, glist) {
618		if (strcmp(gsp->gs_name, cur->gs_name) == 0)
619			break;
620	}
621	if (cur != NULL) {
622		G_SCHED_DEBUG(0, "A scheduler named %s already "
623		    "exists.", gsp->gs_name);
624		error = EEXIST;
625	} else {
626		LIST_INSERT_HEAD(&me.gs_scheds, gsp, glist);
627		gsp->gs_refs = 1;
628		me.gs_sched_count++;
629		g_gsched_build_names(gsp);
630	}
631	mtx_unlock(&me.gs_mtx);
632
633	return (error);
634}
635
636struct g_gsched_unregparm {
637	struct g_gsched *gup_gsp;
638	int		gup_error;
639};
640
641static void
642g_gsched_unregister(void *arg, int flag)
643{
644	struct g_gsched_unregparm *parm = arg;
645	struct g_gsched *gsp = parm->gup_gsp, *cur, *tmp;
646	struct g_sched_softc *sc;
647	struct g_geom *gp, *gp_tmp;
648	int error;
649
650	parm->gup_error = 0;
651
652	g_topology_assert();
653
654	if (flag == EV_CANCEL)
655		return;
656
657	mtx_lock(&me.gs_mtx);
658
659	LIST_FOREACH_SAFE(gp, &g_sched_class.geom, geom, gp_tmp) {
660		if (gp->class != &g_sched_class)
661			continue;	/* Should not happen. */
662
663		sc = gp->softc;
664		if (sc->sc_gsched == gsp) {
665			error = g_sched_remove(gp, gsp);
666			if (error)
667				goto failed;
668		}
669	}
670
671	LIST_FOREACH_SAFE(cur, &me.gs_scheds, glist, tmp) {
672		if (cur != gsp)
673			continue;
674
675		if (gsp->gs_refs != 1) {
676			G_SCHED_DEBUG(0, "%s still in use.",
677			    gsp->gs_name);
678			parm->gup_error = EBUSY;
679		} else {
680			LIST_REMOVE(gsp, glist);
681			me.gs_sched_count--;
682			g_gsched_build_names(gsp);
683		}
684		break;
685	}
686
687	if (cur == NULL) {
688		G_SCHED_DEBUG(0, "%s not registered.", gsp->gs_name);
689		parm->gup_error = ENOENT;
690	}
691
692failed:
693	mtx_unlock(&me.gs_mtx);
694}
695
696static inline void
697g_gsched_global_init(void)
698{
699
700	if (!me.gs_initialized) {
701		G_SCHED_DEBUG(0, "Initializing global data.");
702		mtx_init(&me.gs_mtx, "gsched", NULL, MTX_DEF);
703		LIST_INIT(&me.gs_scheds);
704		gs_bioq_init(&me.gs_pending);
705		me.gs_initialized = 1;
706	}
707}
708
709/*
710 * Module event called when a scheduling algorithm module is loaded or
711 * unloaded.
712 */
713int
714g_gsched_modevent(module_t mod, int cmd, void *arg)
715{
716	struct g_gsched *gsp = arg;
717	struct g_gsched_unregparm parm;
718	int error;
719
720	G_SCHED_DEBUG(0, "Modevent %d.", cmd);
721
722	/*
723	 * If the module is loaded at boot, the geom thread that calls
724	 * g_sched_init() might actually run after g_gsched_modevent(),
725	 * so make sure that the module is properly initialized.
726	 */
727	g_gsched_global_init();
728
729	error = EOPNOTSUPP;
730	switch (cmd) {
731	case MOD_LOAD:
732		error = g_gsched_register(gsp);
733		G_SCHED_DEBUG(0, "Loaded module %s error %d.",
734		    gsp->gs_name, error);
735		if (error == 0)
736			g_retaste(&g_sched_class);
737		break;
738
739	case MOD_UNLOAD:
740		parm.gup_gsp = gsp;
741		parm.gup_error = 0;
742
743		error = g_waitfor_event(g_gsched_unregister,
744		    &parm, M_WAITOK, NULL);
745		if (error == 0)
746			error = parm.gup_error;
747		G_SCHED_DEBUG(0, "Unloaded module %s error %d.",
748		    gsp->gs_name, error);
749		break;
750	}
751
752	return (error);
753}
754
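/*
 * Illustrative only, not compiled: the module glue an algorithm is
 * expected to provide so that g_gsched_modevent() above receives its
 * g_gsched descriptor as 'arg'.  Standard kld macros are used; the
 * gsched_example and g_example_gsched names are hypothetical.
 */
#if 0
static moduledata_t g_example_mod = {
	"gsched_example",	/* module name */
	g_gsched_modevent,	/* event handler defined above */
	&g_example_gsched,	/* passed to the handler as 'arg' */
};

DECLARE_MODULE(gsched_example, g_example_mod, SI_SUB_DRIVERS, SI_ORDER_MIDDLE);
MODULE_VERSION(gsched_example, 0);
MODULE_DEPEND(gsched_example, geom_sched, 0, 0, 0);
#endif
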
755#ifdef KTR
756#define	TRC_BIO_EVENT(e, bp)	g_sched_trace_bio_ ## e (bp)
757
758static inline char
759g_sched_type(struct bio *bp)
760{
761
762	if (0 != (bp->bio_cmd & BIO_READ))
763		return ('R');
764	else if (0 != (bp->bio_cmd & BIO_WRITE))
765		return ('W');
766	return ('U');
767}
768
769static inline void
770g_sched_trace_bio_START(struct bio *bp)
771{
772
773	CTR5(KTR_GSCHED, "S %lu %c %lu/%lu %lu", g_sched_classify(bp),
774	    g_sched_type(bp), bp->bio_offset / ULONG_MAX,
775	    bp->bio_offset, bp->bio_length);
776}
777
778static inline void
779g_sched_trace_bio_DONE(struct bio *bp)
780{
781
782	CTR5(KTR_GSCHED, "D %lu %c %lu/%lu %lu", g_sched_classify(bp),
783	    g_sched_type(bp), bp->bio_offset / ULONG_MAX,
784	    bp->bio_offset, bp->bio_length);
785}
786#else /* !KTR */
787#define	TRC_BIO_EVENT(e, bp)
788#endif /* !KTR */
789
790/*
791 * g_sched_done() and g_sched_start() dispatch the geom requests to
792 * the scheduling algorithm in use.
793 */
794static void
795g_sched_done(struct bio *bio)
796{
797	struct g_geom *gp = bio->bio_caller2;
798	struct g_sched_softc *sc = gp->softc;
799
800	TRC_BIO_EVENT(DONE, bio);
801
802	KASSERT(bio->bio_caller1, ("null bio_caller1 in g_sched_done"));
803
804	g_sched_lock(gp);
805
806	g_sched_update_stats(bio);
807	sc->sc_gsched->gs_done(sc->sc_data, bio);
808	if (!--sc->sc_pending)
809		wakeup(gp);
810
811	g_sched_flush_classes(gp);
812	g_sched_unlock(gp);
813
814	g_std_done(bio);
815}
816
817static void
818g_sched_start(struct bio *bp)
819{
820	struct g_geom *gp = bp->bio_to->geom;
821	struct g_sched_softc *sc = gp->softc;
822	struct bio *cbp;
823
824	TRC_BIO_EVENT(START, bp);
825	G_SCHED_LOGREQ(bp, "Request received.");
826
827	cbp = g_clone_bio(bp);
828	if (cbp == NULL) {
829		g_io_deliver(bp, ENOMEM);
830		return;
831	}
832	cbp->bio_done = g_sched_done;
833	cbp->bio_to = LIST_FIRST(&gp->provider);
834	KASSERT(cbp->bio_to != NULL, ("NULL provider"));
835
836	/* We only schedule reads and writes. */
837	if (0 == (bp->bio_cmd & (BIO_READ | BIO_WRITE)))
838		goto bypass;
839
840	G_SCHED_LOGREQ(cbp, "Sending request.");
841
842	g_sched_lock(gp);
843	/*
844	 * Call the algorithm's gs_start to queue the request in the
845	 * scheduler. If gs_start fails then pass the request down,
846	 * otherwise call g_sched_dispatch() which tries to push
847	 * one or more requests down.
848	 */
849	if (!sc->sc_gsched || (sc->sc_flags & G_SCHED_FLUSHING) ||
850	    sc->sc_gsched->gs_start(sc->sc_data, cbp)) {
851		g_sched_unlock(gp);
852		goto bypass;
853	}
854	/*
855	 * We use bio_caller1 to mark requests that are scheduled
856	 * so make sure it is not NULL.
857	 */
858	if (cbp->bio_caller1 == NULL)
859		cbp->bio_caller1 = &me;	/* anything not NULL */
860
861	cbp->bio_caller2 = gp;
862	sc->sc_pending++;
863
864	/* Update general stats. */
865	me.gs_in_flight++;
866	me.gs_requests++;
867	me.gs_bytes_in_flight += bp->bio_length;
868	if (bp->bio_cmd & BIO_WRITE) {
869		me.gs_writes_in_flight++;
870		me.gs_write_bytes_in_flight += bp->bio_length;
871	}
872	g_sched_dispatch(gp);
873	g_sched_unlock(gp);
874	return;
875
876bypass:
877	cbp->bio_done = g_std_done;
878	cbp->bio_caller1 = NULL; /* not scheduled */
879	g_io_request(cbp, LIST_FIRST(&gp->consumer));
880}
881
882/*
883 * The next few functions are the geom glue.
884 */
885static void
886g_sched_orphan(struct g_consumer *cp)
887{
888
889	g_topology_assert();
890	g_sched_destroy(cp->geom, 1);
891}
892
893static int
894g_sched_access(struct g_provider *pp, int dr, int dw, int de)
895{
896	struct g_geom *gp;
897	struct g_consumer *cp;
898	int error;
899
900	gp = pp->geom;
901	cp = LIST_FIRST(&gp->consumer);
902	error = g_access(cp, dr, dw, de);
903
904	return (error);
905}
906
907static void
908g_sched_temporary_start(struct bio *bio)
909{
910
911	mtx_lock(&me.gs_mtx);
912	me.gs_npending++;
913	gs_bioq_disksort(&me.gs_pending, bio);
914	mtx_unlock(&me.gs_mtx);
915}
916
917static void
918g_sched_flush_pending(g_start_t *start)
919{
920	struct bio *bp;
921
922	while ((bp = gs_bioq_takefirst(&me.gs_pending)))
923		start(bp);
924}
925
926static int
927g_insert_proxy(struct g_geom *gp, struct g_provider *newpp,
928    struct g_geom *dstgp, struct g_provider *pp, struct g_consumer *cp)
929{
930	struct g_sched_softc *sc = gp->softc;
931	g_start_t *saved_start, *flush = g_sched_start;
932	int error = 0, endticks = ticks + hz;
933
934	g_cancel_event(newpp);	/* prevent taste() */
935	/* copy private fields */
936	newpp->private = pp->private;
937	newpp->index = pp->index;
938
939	/* Queue all the early requests coming for us. */
940	me.gs_npending = 0;
941	saved_start = pp->geom->start;
942	dstgp->start = g_sched_temporary_start;
943
944	while (pp->nstart - pp->nend != me.gs_npending &&
945	    endticks - ticks >= 0)
946		tsleep(pp, PRIBIO, "-", hz/10);
947
948	if (pp->nstart - pp->nend != me.gs_npending) {
949		flush = saved_start;
950		error = ETIMEDOUT;
951		goto fail;
952	}
953
954	/* link pp to this geom */
955	LIST_REMOVE(pp, provider);
956	pp->geom = gp;
957	LIST_INSERT_HEAD(&gp->provider, pp, provider);
958
959	/*
960	 * replicate the counts from the parent in the
961	 * new provider and consumer nodes
962	 */
963	cp->acr = newpp->acr = pp->acr;
964	cp->acw = newpp->acw = pp->acw;
965	cp->ace = newpp->ace = pp->ace;
966	sc->sc_flags |= G_SCHED_PROXYING;
967
968fail:
969	dstgp->start = saved_start;
970
971	g_sched_flush_pending(flush);
972
973	return (error);
974}
975
976/*
977 * Create a geom node for the device passed as *pp.
978 * If successful, add a reference to this gsp.
979 */
980static int
981g_sched_create(struct gctl_req *req, struct g_class *mp,
982    struct g_provider *pp, struct g_gsched *gsp, int proxy)
983{
984	struct g_sched_softc *sc = NULL;
985	struct g_geom *gp, *dstgp;
986	struct g_provider *newpp = NULL;
987	struct g_consumer *cp = NULL;
988	char name[64];
989	int error;
990
991	g_topology_assert();
992
993	snprintf(name, sizeof(name), "%s%s", pp->name, G_SCHED_SUFFIX);
994	LIST_FOREACH(gp, &mp->geom, geom) {
995		if (strcmp(gp->name, name) == 0) {
996			gctl_error(req, "Geom %s already exists.",
997			    name);
998			return (EEXIST);
999		}
1000	}
1001
1002	gp = g_new_geomf(mp, name);
1003	dstgp = proxy ? pp->geom : gp; /* where do we link the provider */
1004	if (gp == NULL) {
1005		gctl_error(req, "Cannot create geom %s.", name);
1006		error = ENOMEM;
1007		goto fail;
1008	}
1009
1010	sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO);
1011	sc->sc_gsched = gsp;
1012	sc->sc_data = gsp->gs_init(gp);
1013	if (sc->sc_data == NULL) {
1014		error = ENOMEM;
1015		goto fail;
1016	}
1017
1018	sc->sc_hash = g_sched_hash_init(gsp, &sc->sc_mask, HASH_WAITOK);
1019
1020	/*
1021	 * Do not initialize the flush mechanism here; it is initialized
1022	 * on the first insertion into the hash table.
1023	 */
1024
1025	mtx_init(&sc->sc_mtx, "g_sched_mtx", NULL, MTX_DEF);
1026
1027	gp->softc = sc;
1028	gp->start = g_sched_start;
1029	gp->orphan = g_sched_orphan;
1030	gp->access = g_sched_access;
1031	gp->dumpconf = g_sched_dumpconf;
1032
1033	newpp = g_new_providerf(dstgp, gp->name);
1034	if (newpp == NULL) {
1035		gctl_error(req, "Cannot create provider %s.", name);
1036		error = ENOMEM;
1037		goto fail;
1038	}
1039
1040	newpp->mediasize = pp->mediasize;
1041	newpp->sectorsize = pp->sectorsize;
1042
1043	cp = g_new_consumer(gp);
1044	if (cp == NULL) {
1045		gctl_error(req, "Cannot create consumer for %s.",
1046		    gp->name);
1047		error = ENOMEM;
1048		goto fail;
1049	}
1050
1051	error = g_attach(cp, proxy ? newpp : pp);
1052	if (error != 0) {
1053		gctl_error(req, "Cannot attach to provider %s.",
1054		    pp->name);
1055		goto fail;
1056	}
1057
1058	g_error_provider(newpp, 0);
1059	if (proxy) {
1060		error = g_insert_proxy(gp, newpp, dstgp, pp, cp);
1061		if (error)
1062			goto fail;
1063	}
1064	G_SCHED_DEBUG(0, "Device %s created.", gp->name);
1065
1066	g_gsched_ref(gsp);
1067
1068	return (0);
1069
1070fail:
1071	if (cp != NULL) {
1072		if (cp->provider != NULL)
1073			g_detach(cp);
1074		g_destroy_consumer(cp);
1075	}
1076
1077	if (newpp != NULL)
1078		g_destroy_provider(newpp);
1079
1080	if (sc && sc->sc_hash) {
1081		g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask,
1082		    gsp, sc->sc_data);
1083	}
1084
1085	if (sc && sc->sc_data)
1086		gsp->gs_fini(sc->sc_data);
1087
1088	if (gp != NULL) {
1089		if (gp->softc != NULL)
1090			g_free(gp->softc);
1091		g_destroy_geom(gp);
1092	}
1093
1094	return (error);
1095}
1096
1097/*
1098 * Support for dynamic switching of scheduling algorithms.
1099 * First initialize the data structures for the new algorithm,
1100 * then call g_sched_remove_locked() to flush all references
1101 * to the old one, finally link the new algorithm.
1102 */
1103static int
1104g_sched_change_algo(struct gctl_req *req, struct g_class *mp,
1105    struct g_provider *pp, struct g_gsched *gsp)
1106{
1107	struct g_sched_softc *sc;
1108	struct g_geom *gp;
1109	struct g_hash *newh;
1110	void *data;
1111	u_long mask;
1112	int error = 0;
1113
1114	gp = pp->geom;
1115	sc = gp->softc;
1116
1117	data = gsp->gs_init(gp);
1118	if (data == NULL)
1119		return (ENOMEM);
1120
1121	newh = g_sched_hash_init(gsp, &mask, HASH_WAITOK);
1122	if (gsp->gs_priv_size && !newh) {
1123		error = ENOMEM;
1124		goto fail;
1125	}
1126
1127	g_sched_lock(gp);
1128	if (sc->sc_gsched) {	/* can be NULL in some cases */
1129		error = g_sched_remove_locked(gp, sc->sc_gsched);
1130		if (error)
1131			goto fail;
1132	}
1133
1134	g_gsched_ref(gsp);
1135	sc->sc_gsched = gsp;
1136	sc->sc_data = data;
1137	sc->sc_hash = newh;
1138	sc->sc_mask = mask;
1139
1140	g_sched_unlock(gp);
1141
1142	return (0);
1143
1144fail:
1145	if (newh)
1146		g_sched_hash_fini(gp, newh, mask, gsp, data);
1147
1148	if (data)
1149		gsp->gs_fini(data);
1150
1151	g_sched_unlock(gp);
1152
1153	return (error);
1154}
1155
1156/*
1157 * Stop the request flow directed to the proxy, redirecting the new
1158 * requests to the me.gs_pending queue.
1159 */
1160static struct g_provider *
1161g_detach_proxy(struct g_geom *gp)
1162{
1163	struct g_consumer *cp;
1164	struct g_provider *pp, *newpp;
1165
1166	do {
1167		pp = LIST_FIRST(&gp->provider);
1168		if (pp == NULL)
1169			break;
1170		cp = LIST_FIRST(&gp->consumer);
1171		if (cp == NULL)
1172			break;
1173		newpp = cp->provider;
1174		if (newpp == NULL)
1175			break;
1176
1177		me.gs_npending = 0;
1178		pp->geom->start = g_sched_temporary_start;
1179
1180		return (pp);
1181	} while (0);
1182	printf("%s error detaching proxy %s\n", __FUNCTION__, gp->name);
1183
1184	return (NULL);
1185}
1186
1187static void
1188g_sched_blackhole(struct bio *bp)
1189{
1190
1191	g_io_deliver(bp, ENXIO);
1192}
1193
1194static inline void
1195g_reparent_provider(struct g_provider *pp, struct g_geom *gp,
1196    struct g_provider *newpp)
1197{
1198
1199	LIST_REMOVE(pp, provider);
1200	if (newpp) {
1201		pp->private = newpp->private;
1202		pp->index = newpp->index;
1203	}
1204	pp->geom = gp;
1205	LIST_INSERT_HEAD(&gp->provider, pp, provider);
1206}
1207
1208static inline void
1209g_unproxy_provider(struct g_provider *oldpp, struct g_provider *newpp)
1210{
1211	struct g_geom *gp = oldpp->geom;
1212
1213	g_reparent_provider(oldpp, newpp->geom, newpp);
1214
1215	/*
1216	 * Hackish: let the system destroy the old provider for us, just
1217	 * in case someone attached a consumer to it, in which case a
1218	 * direct call to g_destroy_provider() would not work.
1219	 */
1220	g_reparent_provider(newpp, gp, NULL);
1221}
1222
1223/*
1224 * Complete the proxy destruction, linking the old provider to its
1225 * original geom, and destroying the proxy provider.  Also take care
1226 * of issuing the pending requests collected in me.gs_pending (if any).
1227 */
1228static int
1229g_destroy_proxy(struct g_geom *gp, struct g_provider *oldpp)
1230{
1231	struct g_consumer *cp;
1232	struct g_provider *newpp;
1233
1234	do {
1235		cp = LIST_FIRST(&gp->consumer);
1236		if (cp == NULL)
1237			break;
1238		newpp = cp->provider;
1239		if (newpp == NULL)
1240			break;
1241
1242		/* Relink the provider to its original geom. */
1243		g_unproxy_provider(oldpp, newpp);
1244
1245		/* Detach consumer from provider, and destroy provider. */
1246		cp->acr = newpp->acr = 0;
1247		cp->acw = newpp->acw = 0;
1248		cp->ace = newpp->ace = 0;
1249		g_detach(cp);
1250
1251		/* Send the pending bios through the right start function. */
1252		g_sched_flush_pending(oldpp->geom->start);
1253
1254		return (0);
1255	} while (0);
1256	printf("%s error destroying proxy %s\n", __FUNCTION__, gp->name);
1257
1258	/* We cannot send the pending bios anywhere... */
1259	g_sched_flush_pending(g_sched_blackhole);
1260
1261	return (EINVAL);
1262}
1263
1264static int
1265g_sched_destroy(struct g_geom *gp, boolean_t force)
1266{
1267	struct g_provider *pp, *oldpp = NULL;
1268	struct g_sched_softc *sc;
1269	struct g_gsched *gsp;
1270	int error;
1271
1272	g_topology_assert();
1273	sc = gp->softc;
1274	if (sc == NULL)
1275		return (ENXIO);
1276	if (!(sc->sc_flags & G_SCHED_PROXYING)) {
1277		pp = LIST_FIRST(&gp->provider);
1278		if (pp && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
1279			const char *msg = force ?
1280				"but we force removal" : "cannot remove";
1281
1282			G_SCHED_DEBUG(!force,
1283			    "Device %s is still open (r%dw%de%d), %s.",
1284			    pp->name, pp->acr, pp->acw, pp->ace, msg);
1285			if (!force)
1286				return (EBUSY);
1287		} else {
1288			G_SCHED_DEBUG(0, "Device %s removed.", gp->name);
1289		}
1290	} else
1291		oldpp = g_detach_proxy(gp);
1292
1293	gsp = sc->sc_gsched;
1294	if (gsp) {
1295		/*
1296		 * XXX bad hack here: force a dispatch to release
1297		 * any reference to the hash table still held by
1298		 * the scheduler.
1299		 */
1300		g_sched_lock(gp);
1301		/*
1302		 * We are dying here, no new requests should enter
1303		 * the scheduler.  This is guaranteed by the topology,
1304		 * either in case we were proxying (new bios are
1305		 * being redirected) or not (see the access check
1306		 * above).
1307		 */
1308		g_sched_forced_dispatch(gp);
1309		error = g_sched_wait_pending(gp);
1310
1311		if (error) {
1312			/*
1313			 * Not all the requests came home: this might happen
1314			 * under heavy load, or if we were waiting for any
1315			 * bio which is served in the event path (see
1316			 * geom_slice.c for an example of how this can
1317			 * happen).  Try to restore a working configuration
1318			 * if we can fail.
1319			 * if we are allowed to fail.
1320			if ((sc->sc_flags & G_SCHED_PROXYING) && oldpp) {
1321				g_sched_flush_pending(force ?
1322				    g_sched_blackhole : g_sched_start);
1323			}
1324
1325			/*
1326			 * In the forced destroy case there is not so much
1327			 * we can do, we have pending bios that will call
1328			 * g_sched_done() somehow, and we don't want them
1329			 * to crash the system using freed memory.  We tell
1330			 * the user that something went wrong, and leak some
1331			 * memory here.
1332			 * Note: the callers using force = 1 ignore the
1333			 * return value.
1334			 */
1335			if (force) {
1336				G_SCHED_DEBUG(0, "Pending requests while "
1337				    "destroying geom, some memory leaked.");
1338			}
1339
1340			return (error);
1341		}
1342
1343		g_sched_unlock(gp);
1344		g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask,
1345		    gsp, sc->sc_data);
1346		sc->sc_hash = NULL;
1347		gsp->gs_fini(sc->sc_data);
1348		g_gsched_unref(gsp);
1349		sc->sc_gsched = NULL;
1350	}
1351
1352	if ((sc->sc_flags & G_SCHED_PROXYING) && oldpp) {
1353		error = g_destroy_proxy(gp, oldpp);
1354
1355		if (error) {
1356			if (force) {
1357				G_SCHED_DEBUG(0, "Unrecoverable error while "
1358				    "destroying a proxy geom, leaking some "
1359				    "memory.");
1360			}
1361
1362			return (error);
1363		}
1364	}
1365
1366	mtx_destroy(&sc->sc_mtx);
1367
1368	g_free(gp->softc);
1369	gp->softc = NULL;
1370	g_wither_geom(gp, ENXIO);
1371
1372	return (error);
1373}
1374
1375static int
1376g_sched_destroy_geom(struct gctl_req *req, struct g_class *mp,
1377    struct g_geom *gp)
1378{
1379
1380	return (g_sched_destroy(gp, 0));
1381}
1382
1383/*
1384 * Functions related to the classification of requests.
1385 *
1386 * On recent FreeBSD versions (8.0 and above), we store a reference
1387 * to the issuer of a request in bp->bio_classifier1 as soon
1388 * as the bio is posted to the geom queue (and not later, because
1389 * requests are managed by the g_down thread afterwards).
1390 *
1391 * On older versions of the system (but this code is not used
1392 * in any existing release), we [ab]use the caller1 field in the
1393 * root element of the bio tree to store the classification info.
1394 * The marking is done at the beginning of g_io_request()
1395 * and only if we find that the field is NULL.
1396 *
1397 * To avoid rebuilding the kernel, this module will patch the
1398 * initial part of g_io_request() so it jumps to some hand-coded
1399 * assembly that does the marking and then executes the original
1400 * body of g_io_request().
1401 *
1402 * fake_ioreq[] is architecture-specific machine code
1403 * that implements the above. CODE_SIZE, STORE_SIZE etc.
1404 * are constants used in the patching routine. Look at the
1405 * code in g_ioreq_patch() for the details.
1406 */
1407
1408#ifndef HAVE_BIO_CLASSIFIER
1409/*
1410 * Support for old FreeBSD versions
1411 */
1412#if defined(__i386__)
1413#define	CODE_SIZE	29
1414#define	STORE_SIZE	5
1415#define	EPILOGUE	5
1416#define	SIZE		(CODE_SIZE + STORE_SIZE + EPILOGUE)
1417
1418static u_char fake_ioreq[SIZE] = {
1419	0x8b, 0x44, 0x24, 0x04,		/* mov bp, %eax */
1420	/* 1: */
1421	0x89, 0xc2,			/* mov %eax, %edx # edx = bp */
1422	0x8b, 0x40, 0x64,		/* mov bp->bio_parent, %eax */
1423	0x85, 0xc0,			/* test %eax, %eax */
1424	0x75, 0xf7,			/* jne 1b */
1425	0x8b, 0x42, 0x30,		/* mov bp->bp_caller1, %eax */
1426	0x85, 0xc0,			/* test %eax, %eax */
1427	0x75, 0x09,			/* jne 2f */
1428	0x64, 0xa1, 0x00, 0x00,		/* mov %fs:0, %eax */
1429	0x00, 0x00,
1430	0x89, 0x42, 0x30,		/* mov %eax, bp->bio_caller1 */
1431	/* 2: */
1432        0x55, 0x89, 0xe5, 0x57, 0x56,
1433	0xe9, 0x00, 0x00, 0x00, 0x00,	/* jmp back... */
1434};
1435#elif defined(__amd64)
1436#define	CODE_SIZE	38
1437#define	STORE_SIZE	6
1438#define	EPILOGUE	5
1439#define	SIZE		(CODE_SIZE + STORE_SIZE + EPILOGUE)
1440
1441static u_char fake_ioreq[SIZE] = {
1442	0x48, 0x89, 0xf8,		/* mov bp, %rax */
1443	/* 1: */
1444	0x48, 0x89, 0xc2,		/* mov %rax, %rdx # rdx = bp */
1445	0x48, 0x8b, 0x82, 0xa8,		/* mov bp->bio_parent, %rax */
1446	0x00, 0x00, 0x00,
1447	0x48, 0x85, 0xc0,		/* test %rax, %rax */
1448	0x75, 0xf1,			/* jne 1b */
1449	0x48, 0x83, 0x7a, 0x58,		/* cmp $0, bp->bp_caller1 */
1450	0x00,
1451	0x75, 0x0d,			/* jne 2f */
1452	0x65, 0x48, 0x8b, 0x04,		/* mov %gs:0, %rax */
1453	0x25, 0x00, 0x00, 0x00,
1454	0x00,
1455	0x48, 0x89, 0x42, 0x58,		/* mov %rax, bp->bio_caller1 */
1456	/* 2: */
1457	0x55, 0x48, 0x89, 0xe5, 0x41, 0x56,
1458	0xe9, 0x00, 0x00, 0x00, 0x00,	/* jmp back... */
1459};
1460#else /* neither x86 nor amd64 */
1461static void
1462g_new_io_request(struct bio *bp, struct g_consumer *cp)
1463{
1464	struct bio *top = bp;
1465
1466	/*
1467	 * bio classification: if bio_caller1 is available in the
1468	 * root of the 'struct bio' tree, store there the thread id
1469	 * of the thread that originated the request.
1470	 * More sophisticated classification schemes can be used.
1471	 */
1472	while (top->bio_parent)
1473		top = top->bio_parent;
1474
1475	if (top->bio_caller1 == NULL)
1476		top->bio_caller1 = curthread;
1477}
1478
1479#error please add the code above in g_new_io_request() to the beginning of \
1480	/sys/geom/geom_io.c::g_io_request(), and remove this line.
1481#endif /* end of arch-specific code */
1482
1483static int
1484g_ioreq_patch(void)
1485{
1486	u_char *original;
1487	u_long ofs;
1488	int found;
1489
1490	if (me.gs_patched)
1491		return (-1);
1492
1493	original = (u_char *)g_io_request;
1494
1495	found = !bcmp(original, fake_ioreq + CODE_SIZE, STORE_SIZE);
1496	if (!found)
1497		return (-1);
1498
1499	/* Jump back to the original + STORE_SIZE. */
1500	ofs = (original + STORE_SIZE) - (fake_ioreq + SIZE);
1501	bcopy(&ofs, fake_ioreq + CODE_SIZE + STORE_SIZE + 1, 4);
1502
1503	/* Patch the original address with a jump to the trampoline. */
1504	*original = 0xe9;     /* jump opcode */
1505	ofs = fake_ioreq - (original + 5);
1506	bcopy(&ofs, original + 1, 4);
1507
1508	me.gs_patched = 1;
1509
1510	return (0);
1511}
1512
1513/*
1514 * Restore the original code, this is easy.
1515 */
1516static void
1517g_ioreq_restore(void)
1518{
1519	u_char *original;
1520
1521	if (me.gs_patched) {
1522		original = (u_char *)g_io_request;
1523		bcopy(fake_ioreq + CODE_SIZE, original, STORE_SIZE);
1524		me.gs_patched = 0;
1525	}
1526}
1527
1528static inline void
1529g_classifier_ini(void)
1530{
1531
1532	g_ioreq_patch();
1533}
1534
1535static inline void
1536g_classifier_fini(void)
1537{
1538
1539	g_ioreq_restore();
1540}
1541
1542/*--- end of support code for older FreeBSD versions */
1543
1544#else /* HAVE_BIO_CLASSIFIER */
1545
1546/*
1547 * Classifier support for recent FreeBSD versions: we use
1548 * a very simple classifier, only use curthread to tag a request.
1549 * The classifier is registered at module load, and unregistered
1550 * at module unload.
1551 */
1552static int
1553g_sched_tag(void *arg, struct bio *bp)
1554{
1555
1556	bp->bio_classifier1 = curthread;
1557	return (1);
1558}
1559
1560static struct g_classifier_hook g_sched_classifier = {
1561	.func =	g_sched_tag,
1562};
1563
1564static inline void
1565g_classifier_ini(void)
1566{
1567
1568	g_register_classifier(&g_sched_classifier);
1569}
1570
1571static inline void
1572g_classifier_fini(void)
1573{
1574
1575	g_unregister_classifier(&g_sched_classifier);
1576}
1577#endif /* HAVE_BIO_CLASSIFIER */
1578
1579static void
1580g_sched_init(struct g_class *mp)
1581{
1582
1583	g_gsched_global_init();
1584
1585	G_SCHED_DEBUG(0, "Loading: mp = %p, g_sched_class = %p.",
1586	    mp, &g_sched_class);
1587
1588	/* Patch g_io_request to store classification info in the bio. */
1589	g_classifier_ini();
1590}
1591
1592static void
1593g_sched_fini(struct g_class *mp)
1594{
1595
1596	g_classifier_fini();
1597
1598	G_SCHED_DEBUG(0, "Unloading...");
1599
1600	KASSERT(LIST_EMPTY(&me.gs_scheds), ("still registered schedulers"));
1601	mtx_destroy(&me.gs_mtx);
1602}
1603
1604/*
1605 * Read the i-th argument for a request, skipping the /dev/
1606 * prefix if present.
1607 */
1608static const char *
1609g_sched_argi(struct gctl_req *req, int i)
1610{
1611	static const char *dev_prefix = "/dev/";
1612	const char *name;
1613	char param[16];
1614	int l = strlen(dev_prefix);
1615
1616	snprintf(param, sizeof(param), "arg%d", i);
1617	name = gctl_get_asciiparam(req, param);
1618	if (name == NULL)
1619		gctl_error(req, "No 'arg%d' argument", i);
1620	else if (strncmp(name, dev_prefix, l) == 0)
1621		name += l;
1622	return (name);
1623}
1624
1625/*
1626 * Fetch nargs and do appropriate checks.
1627 */
1628static int
1629g_sched_get_nargs(struct gctl_req *req)
1630{
1631	int *nargs;
1632
1633	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
1634	if (nargs == NULL) {
1635		gctl_error(req, "No 'nargs' argument");
1636		return (0);
1637	}
1638	if (*nargs <= 0)
1639		gctl_error(req, "Missing device(s).");
1640	return (*nargs);
1641}
1642
1643/*
1644 * Check whether we should add the class on certain volumes when
1645 * this geom is created. Right now this is under control of a kenv
1646 * variable containing the names of all devices that we care about.
1647 * Probably we should only support transparent insertion as the
1648 * preferred mode of operation.
1649 */
1650static struct g_geom *
1651g_sched_taste(struct g_class *mp, struct g_provider *pp,
1652		int flags __unused)
1653{
1654	struct g_gsched *gsp = NULL;	/* the algorithm we want */
1655	const char *s;			/* generic string pointer */
1656	const char *taste_names;	/* devices we like */
1657	int l;
1658
1659	g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__,
1660	    mp->name, pp->name);
1661	g_topology_assert();
1662
1663	G_SCHED_DEBUG(2, "Tasting %s.", pp->name);
1664
1665	do {
1666		/* do not taste on ourselves */
1667		if (pp->geom->class == mp)
1668			break;
1669
1670		taste_names = getenv("geom.sched.taste");
1671		if (taste_names == NULL)
1672			break;
1673
1674		l = strlen(pp->name);
1675		for (s = taste_names; *s &&
1676		    (s = strstr(s, pp->name)); s++) {
1677			/* further checks for an exact match */
1678			if ((s == taste_names || s[-1] == ' ') &&
1679			    (s[l] == '\0' || s[l] == ' '))
1680				break;
1681		}
1682		if (s == NULL)
1683			break;
1684		G_SCHED_DEBUG(0, "Attach device %s match [%s].",
1685		    pp->name, s);
1686
1687		/* look up the provider name in the list */
1688		s = getenv("geom.sched.algo");
1689		if (s == NULL)
1690			s = "rr";
1691
1692		gsp = g_gsched_find(s);	/* also get a reference */
1693		if (gsp == NULL) {
1694			G_SCHED_DEBUG(0, "Bad '%s' algorithm.", s);
1695			break;
1696		}
1697
1698		/* XXX create with 1 as last argument ? */
1699		g_sched_create(NULL, mp, pp, gsp, 0);
1700		g_gsched_unref(gsp);
1701	} while (0);
1702	return (NULL);
1703}
1704
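/*
 * Example, illustrative only: the kenv tunables consumed by the taste
 * path above can be set from /boot/loader.conf, e.g.
 *
 *	geom.sched.taste="ada0 ada1"	(device names are made up)
 *	geom.sched.algo="rr"
 *
 * The kenv names are exactly the strings passed to getenv() above.
 */
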
1705static void
1706g_sched_ctl_create(struct gctl_req *req, struct g_class *mp, int proxy)
1707{
1708	struct g_provider *pp;
1709	struct g_gsched *gsp;
1710	const char *name;
1711	int i, nargs;
1712
1713	g_topology_assert();
1714
1715	name = gctl_get_asciiparam(req, "algo");
1716	if (name == NULL) {
1717		gctl_error(req, "No '%s' argument", "algo");
1718		return;
1719	}
1720
1721	gsp = g_gsched_find(name);	/* also get a reference */
1722	if (gsp == NULL) {
1723		gctl_error(req, "Bad algorithm '%s'", name);
1724		return;
1725	}
1726
1727	nargs = g_sched_get_nargs(req);
1728
1729	/*
1730	 * Run on the arguments, and break on any error.
1731	 * Iterate over the arguments, and break on any error.
1732	 */
1733	for (i = 0; i < nargs; i++) {
1734		name = g_sched_argi(req, i);
1735		if (name == NULL)
1736			break;
1737		pp = g_provider_by_name(name);
1738		if (pp == NULL) {
1739			G_SCHED_DEBUG(1, "Provider %s is invalid.", name);
1740			gctl_error(req, "Provider %s is invalid.", name);
1741			break;
1742		}
1743		if (g_sched_create(req, mp, pp, gsp, proxy) != 0)
1744			break;
1745	}
1746
1747	g_gsched_unref(gsp);
1748}
1749
1750static void
1751g_sched_ctl_configure(struct gctl_req *req, struct g_class *mp)
1752{
1753	struct g_provider *pp;
1754	struct g_gsched *gsp;
1755	const char *name;
1756	int i, nargs;
1757
1758	g_topology_assert();
1759
1760	name = gctl_get_asciiparam(req, "algo");
1761	if (name == NULL) {
1762		gctl_error(req, "No '%s' argument", "algo");
1763		return;
1764	}
1765
1766	gsp = g_gsched_find(name);	/* also get a reference */
1767	if (gsp == NULL) {
1768		gctl_error(req, "Bad algorithm '%s'", name);
1769		return;
1770	}
1771
1772	nargs = g_sched_get_nargs(req);
1773
1774	/*
1775	 * Iterate over the arguments, and break on any error.
1776	 * We look for a device name, but skip the /dev/ prefix if any.
1777	 */
1778	for (i = 0; i < nargs; i++) {
1779		name = g_sched_argi(req, i);
1780		if (name == NULL)
1781			break;
1782		pp = g_provider_by_name(name);
1783		if (pp == NULL || pp->geom->class != mp) {
1784			G_SCHED_DEBUG(1, "Provider %s is invalid.", name);
1785			gctl_error(req, "Provider %s is invalid.", name);
1786			break;
1787		}
1788		if (g_sched_change_algo(req, mp, pp, gsp) != 0)
1789			break;
1790	}
1791
1792	g_gsched_unref(gsp);
1793}
1794
1795static struct g_geom *
1796g_sched_find_geom(struct g_class *mp, const char *name)
1797{
1798	struct g_geom *gp;
1799
1800	LIST_FOREACH(gp, &mp->geom, geom) {
1801		if (strcmp(gp->name, name) == 0)
1802			return (gp);
1803	}
1804	return (NULL);
1805}
1806
1807static void
1808g_sched_ctl_destroy(struct gctl_req *req, struct g_class *mp)
1809{
1810	int nargs, *force, error, i;
1811	struct g_geom *gp;
1812	const char *name;
1813
1814	g_topology_assert();
1815
1816	nargs = g_sched_get_nargs(req);
1817
1818	force = gctl_get_paraml(req, "force", sizeof(*force));
1819	if (force == NULL) {
1820		gctl_error(req, "No 'force' argument");
1821		return;
1822	}
1823
1824	for (i = 0; i < nargs; i++) {
1825		name = g_sched_argi(req, i);
1826		if (name == NULL)
1827			break;
1828
1829		gp = g_sched_find_geom(mp, name);
1830		if (gp == NULL) {
1831			G_SCHED_DEBUG(1, "Device %s is invalid.", name);
1832			gctl_error(req, "Device %s is invalid.", name);
1833			break;
1834		}
1835
1836		error = g_sched_destroy(gp, *force);
1837		if (error != 0) {
1838			gctl_error(req, "Cannot destroy device %s (error=%d).",
1839			    gp->name, error);
1840			break;
1841		}
1842	}
1843}
1844
1845static void
1846g_sched_config(struct gctl_req *req, struct g_class *mp, const char *verb)
1847{
1848	uint32_t *version;
1849
1850	g_topology_assert();
1851
1852	version = gctl_get_paraml(req, "version", sizeof(*version));
1853	if (version == NULL) {
1854		gctl_error(req, "No '%s' argument.", "version");
1855		return;
1856	}
1857
1858	if (*version != G_SCHED_VERSION) {
1859		gctl_error(req, "Userland and kernel parts are "
1860		    "out of sync.");
1861		return;
1862	}
1863
1864	if (strcmp(verb, "create") == 0) {
1865		g_sched_ctl_create(req, mp, 0);
1866		return;
1867	} else if (strcmp(verb, "insert") == 0) {
1868		g_sched_ctl_create(req, mp, 1);
1869		return;
1870	} else if (strcmp(verb, "configure") == 0) {
1871		g_sched_ctl_configure(req, mp);
1872		return;
1873	} else if (strcmp(verb, "destroy") == 0) {
1874		g_sched_ctl_destroy(req, mp);
1875		return;
1876	}
1877
1878	gctl_error(req, "Unknown verb.");
1879}
1880
1881static void
1882g_sched_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
1883    struct g_consumer *cp, struct g_provider *pp)
1884{
1885	struct g_sched_softc *sc = gp->softc;
1886	struct g_gsched *gsp = sc->sc_gsched;
1887	if (indent == NULL) {	/* plaintext */
1888		sbuf_printf(sb, " algo %s", gsp ? gsp->gs_name : "--");
1889	}
1890	if (gsp != NULL && gsp->gs_dumpconf)
1891		gsp->gs_dumpconf(sb, indent, gp, cp, pp);
1892}
1893
1894DECLARE_GEOM_CLASS(g_sched_class, g_sched);
1895MODULE_VERSION(geom_sched, 0);
1896