g_sched.c revision 206552
/*-
 * Copyright (c) 2009-2010 Fabio Checconi
 * Copyright (c) 2009-2010 Luigi Rizzo, Universita` di Pisa
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * $Id$
 * $FreeBSD: head/sys/geom/sched/g_sched.c 206552 2010-04-13 09:56:17Z luigi $
 *
 * Main control module for geom-based disk schedulers ('sched').
 *
 * USER VIEW
 * A 'sched' node is typically inserted transparently between
 * an existing provider pp and its original geom gp,
 *
 *	[pp --> gp ..]
 *
 * using the command "geom sched insert <provider>" and
 * resulting in the following topology:
 *
 *	[pp --> sched_gp --> cp]   [new_pp --> gp ... ]
 *
 * The command "geom sched destroy <provider>.sched." removes the node
 * and restores the original chain. The normal "geom sched create
 * <provider>" is also supported.
 *
 * INTERNALS
 * Internally, the 'sched' node uses the following data structures:
 *
 *  geom{}          g_sched_softc{}      g_gsched{}
 * +----------+    +---------------+   +-------------+
 * | softc  *-|--->| sc_gsched   *-|-->|  gs_init    |
 * |  ...     |    |               |   |  gs_fini    |
 * |          |    | [ hash table] |   |  gs_start   |
 * +----------+    |               |   |  ...        |
 *                 |               |   +-------------+
 *                 |               |
 *                 |               |     g_*_softc{}
 *                 |               |   +-------------+
 *                 | sc_data     *-|-->|             |
 *                 +---------------+   | algorithm-  |
 *                                     | specific    |
 *                                     +-------------+
 *
 * A g_sched_softc{} is created with a "geom sched insert" call.
 * In turn this instantiates a specific scheduling algorithm,
 * which sets sc_gsched to point to the algorithm callbacks,
 * and calls gs_init() to create the g_*_softc{}.
 * The other callbacks (gs_start, gs_next, ...) are invoked
 * as needed.
 *
 * g_sched_softc{} is defined in g_sched.h and mostly used here;
 * g_gsched{}, and the gs_callbacks, are documented in gs_scheduler.h;
 * g_*_softc{} is defined/implemented by each algorithm (gs_*.c).
 *
 * DATA MOVING
 * When a bio is received on the provider, it goes to
 * g_sched_start(), which calls gs_start() to initially queue it;
 * then we call g_sched_dispatch(), which loops around gs_next()
 * to select zero or more bio's to be sent downstream.
 *
 * g_sched_dispatch() can also be called as a result of a timeout, e.g.
when doing anticipation or pacing requests. 86 * 87 * When a bio comes back, it goes to g_sched_done() which in turn 88 * calls gs_done(). The latter does any necessary housekeeping in 89 * the scheduling algorithm, and may decide to call g_sched_dispatch() 90 * to send more bio's downstream. 91 * 92 * If an algorithm needs per-flow queues, these are created 93 * calling gs_init_class() and destroyed with gs_fini_class(), 94 * and they are also inserted in the hash table implemented in 95 * the g_sched_softc{} 96 * 97 * If an algorithm is replaced, or a transparently-inserted node is 98 * removed with "geom sched destroy", we need to remove all references 99 * to the g_*_softc{} and g_sched_softc from the bio's still in 100 * the scheduler. g_sched_forced_dispatch() helps doing this. 101 * XXX need to explain better. 102 */ 103 104#include <sys/cdefs.h> 105#include <sys/param.h> 106#include <sys/systm.h> 107#include <sys/kernel.h> 108#include <sys/module.h> 109#include <sys/lock.h> 110#include <sys/mutex.h> 111#include <sys/bio.h> 112#include <sys/limits.h> 113#include <sys/hash.h> 114#include <sys/sysctl.h> 115#include <sys/malloc.h> 116#include <sys/proc.h> /* we access curthread */ 117#include <geom/geom.h> 118#include "gs_scheduler.h" 119#include "g_sched.h" /* geom hooks */ 120 121/* 122 * Size of the per-geom hash table storing traffic classes. 123 * We may decide to change it at a later time, it has no ABI 124 * implications as it is only used for run-time allocations. 125 */ 126#define G_SCHED_HASH_SIZE 32 127 128static int g_sched_destroy(struct g_geom *gp, boolean_t force); 129static int g_sched_destroy_geom(struct gctl_req *req, 130 struct g_class *mp, struct g_geom *gp); 131static void g_sched_config(struct gctl_req *req, struct g_class *mp, 132 const char *verb); 133static struct g_geom *g_sched_taste(struct g_class *mp, 134 struct g_provider *pp, int flags __unused); 135static void g_sched_dumpconf(struct sbuf *sb, const char *indent, 136 struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp); 137static void g_sched_init(struct g_class *mp); 138static void g_sched_fini(struct g_class *mp); 139 140struct g_class g_sched_class = { 141 .name = G_SCHED_CLASS_NAME, 142 .version = G_VERSION, 143 .ctlreq = g_sched_config, 144 .taste = g_sched_taste, 145 .destroy_geom = g_sched_destroy_geom, 146 .init = g_sched_init, 147 .fini = g_sched_fini 148}; 149 150MALLOC_DEFINE(M_GEOM_SCHED, "GEOM_SCHED", "Geom schedulers data structures"); 151 152/* 153 * Global variables describing the state of the geom_sched module. 154 * There is only one static instance of this structure. 155 */ 156LIST_HEAD(gs_list, g_gsched); /* type, link field */ 157struct geom_sched_vars { 158 struct mtx gs_mtx; 159 struct gs_list gs_scheds; /* list of algorithms */ 160 u_int gs_debug; 161 u_int gs_sched_count; /* how many algorithms ? */ 162 u_int gs_patched; /* g_io_request was patched */ 163 164 u_int gs_initialized; 165 u_int gs_expire_secs; /* expiration of hash entries */ 166 167 struct bio_queue_head gs_pending; 168 u_int gs_npending; 169 170 /* The following are for stats, usually protected by gs_mtx. 
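 * They are updated in g_sched_start() and g_sched_update_stats(), and are
 * exported read-only through the kern.geom.sched sysctl tree defined below.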
*/ 171 u_long gs_requests; /* total requests */ 172 u_long gs_done; /* total done */ 173 u_int gs_in_flight; /* requests in flight */ 174 u_int gs_writes_in_flight; 175 u_int gs_bytes_in_flight; 176 u_int gs_write_bytes_in_flight; 177 178 char gs_names[256]; /* names of schedulers */ 179}; 180 181static struct geom_sched_vars me = { 182 .gs_expire_secs = 10, 183}; 184 185SYSCTL_DECL(_kern_geom); 186SYSCTL_NODE(_kern_geom, OID_AUTO, sched, CTLFLAG_RW, 0, 187 "GEOM_SCHED stuff"); 188 189SYSCTL_INT(_kern_geom_sched, OID_AUTO, in_flight_wb, CTLFLAG_RD, 190 &me.gs_write_bytes_in_flight, 0, "Write bytes in flight"); 191 192SYSCTL_INT(_kern_geom_sched, OID_AUTO, in_flight_b, CTLFLAG_RD, 193 &me.gs_bytes_in_flight, 0, "Bytes in flight"); 194 195SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_w, CTLFLAG_RD, 196 &me.gs_writes_in_flight, 0, "Write Requests in flight"); 197 198SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight, CTLFLAG_RD, 199 &me.gs_in_flight, 0, "Requests in flight"); 200 201SYSCTL_ULONG(_kern_geom_sched, OID_AUTO, done, CTLFLAG_RD, 202 &me.gs_done, 0, "Total done"); 203 204SYSCTL_ULONG(_kern_geom_sched, OID_AUTO, requests, CTLFLAG_RD, 205 &me.gs_requests, 0, "Total requests"); 206 207SYSCTL_STRING(_kern_geom_sched, OID_AUTO, algorithms, CTLFLAG_RD, 208 &me.gs_names, 0, "Algorithm names"); 209 210SYSCTL_UINT(_kern_geom_sched, OID_AUTO, alg_count, CTLFLAG_RD, 211 &me.gs_sched_count, 0, "Number of algorithms"); 212 213SYSCTL_UINT(_kern_geom_sched, OID_AUTO, debug, CTLFLAG_RW, 214 &me.gs_debug, 0, "Debug level"); 215 216SYSCTL_UINT(_kern_geom_sched, OID_AUTO, expire_secs, CTLFLAG_RW, 217 &me.gs_expire_secs, 0, "Expire time in seconds"); 218 219/* 220 * g_sched calls the scheduler algorithms with this lock held. 221 * The locking functions are exposed so the scheduler algorithms can also 222 * protect themselves e.g. when running a callout handler. 223 */ 224void 225g_sched_lock(struct g_geom *gp) 226{ 227 struct g_sched_softc *sc = gp->softc; 228 229 mtx_lock(&sc->sc_mtx); 230} 231 232void 233g_sched_unlock(struct g_geom *gp) 234{ 235 struct g_sched_softc *sc = gp->softc; 236 237 mtx_unlock(&sc->sc_mtx); 238} 239 240/* 241 * Support functions to handle references to the module, 242 * which are coming from devices using this scheduler. 243 */ 244static inline void 245g_gsched_ref(struct g_gsched *gsp) 246{ 247 248 atomic_add_int(&gsp->gs_refs, 1); 249} 250 251static inline void 252g_gsched_unref(struct g_gsched *gsp) 253{ 254 255 atomic_add_int(&gsp->gs_refs, -1); 256} 257 258/* 259 * Update the stats when this request is done. 260 */ 261static void 262g_sched_update_stats(struct bio *bio) 263{ 264 265 me.gs_done++; 266 me.gs_in_flight--; 267 me.gs_bytes_in_flight -= bio->bio_length; 268 if (bio->bio_cmd & BIO_WRITE) { 269 me.gs_writes_in_flight--; 270 me.gs_write_bytes_in_flight -= bio->bio_length; 271 } 272} 273 274/* 275 * Dispatch any pending request. 276 */ 277static void 278g_sched_forced_dispatch(struct g_geom *gp) 279{ 280 struct g_sched_softc *sc = gp->softc; 281 struct g_gsched *gsp = sc->sc_gsched; 282 struct bio *bp; 283 284 KASSERT(mtx_owned(&sc->sc_mtx), 285 ("sc_mtx not owned during forced dispatch")); 286 287 while ((bp = gsp->gs_next(sc->sc_data, 1)) != NULL) 288 g_io_request(bp, LIST_FIRST(&gp->consumer)); 289} 290 291/* 292 * The main dispatch loop, called either here after the start 293 * routine, or by scheduling algorithms when they receive a timeout 294 * or a 'done' notification. 
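 * The caller must hold the per-geom sc_mtx (see the KASSERT below).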
Does not share code with the forced 295 * dispatch path, since the gs_done() callback can call us. 296 */ 297void 298g_sched_dispatch(struct g_geom *gp) 299{ 300 struct g_sched_softc *sc = gp->softc; 301 struct g_gsched *gsp = sc->sc_gsched; 302 struct bio *bp; 303 304 KASSERT(mtx_owned(&sc->sc_mtx), ("sc_mtx not owned during dispatch")); 305 306 if ((sc->sc_flags & G_SCHED_FLUSHING)) 307 return; 308 309 while ((bp = gsp->gs_next(sc->sc_data, 0)) != NULL) 310 g_io_request(bp, LIST_FIRST(&gp->consumer)); 311} 312 313/* 314 * Recent (8.0 and above) versions of FreeBSD have support to 315 * register classifiers of disk requests. The classifier is 316 * invoked by g_io_request(), and stores the information into 317 * bp->bio_classifier1. 318 * 319 * Support for older versions, which is left here only for 320 * documentation purposes, relies on two hacks: 321 * 1. classification info is written into the bio_caller1 322 * field of the topmost node in the bio chain. This field 323 * is rarely used, but this module is incompatible with 324 * those that use bio_caller1 for other purposes, 325 * such as ZFS and gjournal; 326 * 2. g_io_request() is patched in-memory when the module is 327 * loaded, so that the function calls a classifier as its 328 * first thing. g_io_request() is restored when the module 329 * is unloaded. This functionality is only supported for 330 * x86 and amd64, other architectures need source code changes. 331 */ 332 333/* 334 * Lookup the identity of the issuer of the original request. 335 * In the current implementation we use the curthread of the 336 * issuer, but different mechanisms may be implemented later 337 * so we do not make assumptions on the return value which for 338 * us is just an opaque identifier. 339 */ 340 341static inline u_long 342g_sched_classify(struct bio *bp) 343{ 344 345#if __FreeBSD_version > 800098 346 /* we have classifier fields in the struct bio */ 347#define HAVE_BIO_CLASSIFIER 348 return ((u_long)bp->bio_classifier1); 349#else 350#warning old version!!! 351 while (bp->bio_parent != NULL) 352 bp = bp->bio_parent; 353 354 return ((u_long)bp->bio_caller1); 355#endif 356} 357 358/* Return the hash chain for the given key. */ 359static inline struct g_hash * 360g_sched_hash(struct g_sched_softc *sc, u_long key) 361{ 362 363 return (&sc->sc_hash[key & sc->sc_mask]); 364} 365 366/* 367 * Helper function for the children classes, which takes 368 * a geom and a bio and returns the private descriptor 369 * associated to the request. This involves fetching 370 * the classification field and [al]locating the 371 * corresponding entry in the hash table. 372 */ 373void * 374g_sched_get_class(struct g_geom *gp, struct bio *bp) 375{ 376 struct g_sched_softc *sc; 377 struct g_sched_class *gsc; 378 struct g_gsched *gsp; 379 struct g_hash *bucket; 380 u_long key; 381 382 sc = gp->softc; 383 key = g_sched_classify(bp); 384 bucket = g_sched_hash(sc, key); 385 LIST_FOREACH(gsc, bucket, gsc_clist) { 386 if (key == gsc->gsc_key) { 387 gsc->gsc_refs++; 388 return (gsc->gsc_priv); 389 } 390 } 391 392 gsp = sc->sc_gsched; 393 gsc = malloc(sizeof(*gsc) + gsp->gs_priv_size, 394 M_GEOM_SCHED, M_NOWAIT | M_ZERO); 395 if (!gsc) 396 return (NULL); 397 398 if (gsp->gs_init_class(sc->sc_data, gsc->gsc_priv)) { 399 free(gsc, M_GEOM_SCHED); 400 return (NULL); 401 } 402 403 gsc->gsc_refs = 2; /* 1 for the hash table, 1 for the caller. 
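 * The caller's reference is released later with g_sched_put_class().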
*/ 404 gsc->gsc_key = key; 405 LIST_INSERT_HEAD(bucket, gsc, gsc_clist); 406 407 gsc->gsc_expire = ticks + me.gs_expire_secs * hz; 408 409 return (gsc->gsc_priv); 410} 411 412/* 413 * Release a reference to the per-client descriptor, 414 */ 415void 416g_sched_put_class(struct g_geom *gp, void *priv) 417{ 418 struct g_sched_class *gsc; 419 struct g_sched_softc *sc; 420 421 gsc = g_sched_priv2class(priv); 422 gsc->gsc_expire = ticks + me.gs_expire_secs * hz; 423 424 if (--gsc->gsc_refs > 0) 425 return; 426 427 sc = gp->softc; 428 sc->sc_gsched->gs_fini_class(sc->sc_data, priv); 429 430 LIST_REMOVE(gsc, gsc_clist); 431 free(gsc, M_GEOM_SCHED); 432} 433 434static void 435g_sched_hash_fini(struct g_geom *gp, struct g_hash *hp, u_long mask, 436 struct g_gsched *gsp, void *data) 437{ 438 struct g_sched_class *cp, *cp2; 439 int i; 440 441 if (!hp) 442 return; 443 444 if (data && gsp->gs_hash_unref) 445 gsp->gs_hash_unref(data); 446 447 for (i = 0; i < G_SCHED_HASH_SIZE; i++) { 448 LIST_FOREACH_SAFE(cp, &hp[i], gsc_clist, cp2) 449 g_sched_put_class(gp, cp->gsc_priv); 450 } 451 452 hashdestroy(hp, M_GEOM_SCHED, mask); 453} 454 455static struct g_hash * 456g_sched_hash_init(struct g_gsched *gsp, u_long *mask, int flags) 457{ 458 struct g_hash *hash; 459 460 if (gsp->gs_priv_size == 0) 461 return (NULL); 462 463 hash = hashinit_flags(G_SCHED_HASH_SIZE, M_GEOM_SCHED, mask, flags); 464 465 return (hash); 466} 467 468static void 469g_sched_flush_classes(struct g_geom *gp) 470{ 471 struct g_sched_softc *sc; 472 struct g_sched_class *cp, *cp2; 473 int i; 474 475 sc = gp->softc; 476 477 if (!sc->sc_hash || ticks - sc->sc_flush_ticks <= 0) 478 return; 479 480 for (i = 0; i < G_SCHED_HASH_SIZE; i++) { 481 LIST_FOREACH_SAFE(cp, &sc->sc_hash[i], gsc_clist, cp2) { 482 if (cp->gsc_refs == 1 && ticks - cp->gsc_expire > 0) 483 g_sched_put_class(gp, cp->gsc_priv); 484 } 485 } 486 487 sc->sc_flush_ticks = ticks + me.gs_expire_secs * hz; 488} 489 490/* 491 * Wait for the completion of any outstanding request. To ensure 492 * that this does not take forever the caller has to make sure that 493 * no new request enter the scehduler before calling us. 494 * 495 * Must be called with the gp mutex held and topology locked. 496 */ 497static int 498g_sched_wait_pending(struct g_geom *gp) 499{ 500 struct g_sched_softc *sc = gp->softc; 501 int endticks = ticks + hz; 502 503 g_topology_assert(); 504 505 while (sc->sc_pending && endticks - ticks >= 0) 506 msleep(gp, &sc->sc_mtx, 0, "sched_wait_pending", hz / 4); 507 508 return (sc->sc_pending ? ETIMEDOUT : 0); 509} 510 511static int 512g_sched_remove_locked(struct g_geom *gp, struct g_gsched *gsp) 513{ 514 struct g_sched_softc *sc = gp->softc; 515 int error; 516 517 /* Set the flushing flag: new bios will not enter the scheduler. */ 518 sc->sc_flags |= G_SCHED_FLUSHING; 519 520 g_sched_forced_dispatch(gp); 521 error = g_sched_wait_pending(gp); 522 if (error) 523 goto failed; 524 525 /* No more requests pending or in flight from the old gsp. */ 526 527 g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask, gsp, sc->sc_data); 528 sc->sc_hash = NULL; 529 530 /* 531 * Avoid deadlock here by releasing the gp mutex and reacquiring 532 * it once done. It should be safe, since no reconfiguration or 533 * destruction can take place due to the geom topology lock; no 534 * new request can use the current sc_data since we flagged the 535 * geom as being flushed. 
536 */ 537 g_sched_unlock(gp); 538 gsp->gs_fini(sc->sc_data); 539 g_sched_lock(gp); 540 541 sc->sc_gsched = NULL; 542 sc->sc_data = NULL; 543 g_gsched_unref(gsp); 544 545failed: 546 sc->sc_flags &= ~G_SCHED_FLUSHING; 547 548 return (error); 549} 550 551static int 552g_sched_remove(struct g_geom *gp, struct g_gsched *gsp) 553{ 554 int error; 555 556 g_sched_lock(gp); 557 error = g_sched_remove_locked(gp, gsp); /* gsp is surely non-null */ 558 g_sched_unlock(gp); 559 560 return (error); 561} 562 563/* 564 * Support function for create/taste -- locate the desired 565 * algorithm and grab a reference to it. 566 */ 567static struct g_gsched * 568g_gsched_find(const char *name) 569{ 570 struct g_gsched *gsp = NULL; 571 572 mtx_lock(&me.gs_mtx); 573 LIST_FOREACH(gsp, &me.gs_scheds, glist) { 574 if (strcmp(name, gsp->gs_name) == 0) { 575 g_gsched_ref(gsp); 576 break; 577 } 578 } 579 mtx_unlock(&me.gs_mtx); 580 581 return (gsp); 582} 583 584/* 585 * Rebuild the list of scheduler names. 586 * To be called with me.gs_mtx lock held. 587 */ 588static void 589g_gsched_build_names(struct g_gsched *gsp) 590{ 591 int pos, l; 592 struct g_gsched *cur; 593 594 pos = 0; 595 LIST_FOREACH(cur, &me.gs_scheds, glist) { 596 l = strlen(cur->gs_name); 597 if (l + pos + 1 + 1 < sizeof(me.gs_names)) { 598 if (pos != 0) 599 me.gs_names[pos++] = ' '; 600 strcpy(me.gs_names + pos, cur->gs_name); 601 pos += l; 602 } 603 } 604 me.gs_names[pos] = '\0'; 605} 606 607/* 608 * Register or unregister individual scheduling algorithms. 609 */ 610static int 611g_gsched_register(struct g_gsched *gsp) 612{ 613 struct g_gsched *cur; 614 int error = 0; 615 616 mtx_lock(&me.gs_mtx); 617 LIST_FOREACH(cur, &me.gs_scheds, glist) { 618 if (strcmp(gsp->gs_name, cur->gs_name) == 0) 619 break; 620 } 621 if (cur != NULL) { 622 G_SCHED_DEBUG(0, "A scheduler named %s already" 623 "exists.", gsp->gs_name); 624 error = EEXIST; 625 } else { 626 LIST_INSERT_HEAD(&me.gs_scheds, gsp, glist); 627 gsp->gs_refs = 1; 628 me.gs_sched_count++; 629 g_gsched_build_names(gsp); 630 } 631 mtx_unlock(&me.gs_mtx); 632 633 return (error); 634} 635 636struct g_gsched_unregparm { 637 struct g_gsched *gup_gsp; 638 int gup_error; 639}; 640 641static void 642g_gsched_unregister(void *arg, int flag) 643{ 644 struct g_gsched_unregparm *parm = arg; 645 struct g_gsched *gsp = parm->gup_gsp, *cur, *tmp; 646 struct g_sched_softc *sc; 647 struct g_geom *gp, *gp_tmp; 648 int error; 649 650 parm->gup_error = 0; 651 652 g_topology_assert(); 653 654 if (flag == EV_CANCEL) 655 return; 656 657 mtx_lock(&me.gs_mtx); 658 659 LIST_FOREACH_SAFE(gp, &g_sched_class.geom, geom, gp_tmp) { 660 if (gp->class != &g_sched_class) 661 continue; /* Should not happen. 
*/ 662 663 sc = gp->softc; 664 if (sc->sc_gsched == gsp) { 665 error = g_sched_remove(gp, gsp); 666 if (error) 667 goto failed; 668 } 669 } 670 671 LIST_FOREACH_SAFE(cur, &me.gs_scheds, glist, tmp) { 672 if (cur != gsp) 673 continue; 674 675 if (gsp->gs_refs != 1) { 676 G_SCHED_DEBUG(0, "%s still in use.", 677 gsp->gs_name); 678 parm->gup_error = EBUSY; 679 } else { 680 LIST_REMOVE(gsp, glist); 681 me.gs_sched_count--; 682 g_gsched_build_names(gsp); 683 } 684 break; 685 } 686 687 if (cur == NULL) { 688 G_SCHED_DEBUG(0, "%s not registered.", gsp->gs_name); 689 parm->gup_error = ENOENT; 690 } 691 692failed: 693 mtx_unlock(&me.gs_mtx); 694} 695 696static inline void 697g_gsched_global_init(void) 698{ 699 700 if (!me.gs_initialized) { 701 G_SCHED_DEBUG(0, "Initializing global data."); 702 mtx_init(&me.gs_mtx, "gsched", NULL, MTX_DEF); 703 LIST_INIT(&me.gs_scheds); 704 gs_bioq_init(&me.gs_pending); 705 me.gs_initialized = 1; 706 } 707} 708 709/* 710 * Module event called when a scheduling algorithm module is loaded or 711 * unloaded. 712 */ 713int 714g_gsched_modevent(module_t mod, int cmd, void *arg) 715{ 716 struct g_gsched *gsp = arg; 717 struct g_gsched_unregparm parm; 718 int error; 719 720 G_SCHED_DEBUG(0, "Modevent %d.", cmd); 721 722 /* 723 * If the module is loaded at boot, the geom thread that calls 724 * g_sched_init() might actually run after g_gsched_modevent(), 725 * so make sure that the module is properly initialized. 726 */ 727 g_gsched_global_init(); 728 729 error = EOPNOTSUPP; 730 switch (cmd) { 731 case MOD_LOAD: 732 error = g_gsched_register(gsp); 733 G_SCHED_DEBUG(0, "Loaded module %s error %d.", 734 gsp->gs_name, error); 735 if (error == 0) 736 g_retaste(&g_sched_class); 737 break; 738 739 case MOD_UNLOAD: 740 parm.gup_gsp = gsp; 741 parm.gup_error = 0; 742 743 error = g_waitfor_event(g_gsched_unregister, 744 &parm, M_WAITOK, NULL); 745 if (error == 0) 746 error = parm.gup_error; 747 G_SCHED_DEBUG(0, "Unloaded module %s error %d.", 748 gsp->gs_name, error); 749 break; 750 }; 751 752 return (error); 753} 754 755#ifdef KTR 756#define TRC_BIO_EVENT(e, bp) g_sched_trace_bio_ ## e (bp) 757 758static inline char 759g_sched_type(struct bio *bp) 760{ 761 762 if (0 != (bp->bio_cmd & BIO_READ)) 763 return ('R'); 764 else if (0 != (bp->bio_cmd & BIO_WRITE)) 765 return ('W'); 766 return ('U'); 767} 768 769static inline void 770g_sched_trace_bio_START(struct bio *bp) 771{ 772 773 CTR5(KTR_GSCHED, "S %lu %c %lu/%lu %lu", g_sched_classify(bp), 774 g_sched_type(bp), bp->bio_offset / ULONG_MAX, 775 bp->bio_offset, bp->bio_length); 776} 777 778static inline void 779g_sched_trace_bio_DONE(struct bio *bp) 780{ 781 782 CTR5(KTR_GSCHED, "D %lu %c %lu/%lu %lu", g_sched_classify(bp), 783 g_sched_type(bp), bp->bio_offset / ULONG_MAX, 784 bp->bio_offset, bp->bio_length); 785} 786#else /* !KTR */ 787#define TRC_BIO_EVENT(e, bp) 788#endif /* !KTR */ 789 790/* 791 * g_sched_done() and g_sched_start() dispatch the geom requests to 792 * the scheduling algorithm in use. 
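 * A rough sketch of the two paths (see the code below for the details):
 *
 *	g_sched_start(bp)                    g_sched_done(bio)
 *	    cbp = g_clone_bio(bp);               g_sched_update_stats(bio);
 *	    gs_start(sc->sc_data, cbp);          gs_done(sc->sc_data, bio);
 *	    g_sched_dispatch(gp);                g_std_done(bio);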
793 */ 794static void 795g_sched_done(struct bio *bio) 796{ 797 struct g_geom *gp = bio->bio_caller2; 798 struct g_sched_softc *sc = gp->softc; 799 800 TRC_BIO_EVENT(DONE, bio); 801 802 KASSERT(bio->bio_caller1, ("null bio_caller1 in g_sched_done")); 803 804 g_sched_lock(gp); 805 806 g_sched_update_stats(bio); 807 sc->sc_gsched->gs_done(sc->sc_data, bio); 808 if (!--sc->sc_pending) 809 wakeup(gp); 810 811 g_sched_flush_classes(gp); 812 g_sched_unlock(gp); 813 814 g_std_done(bio); 815} 816 817static void 818g_sched_start(struct bio *bp) 819{ 820 struct g_geom *gp = bp->bio_to->geom; 821 struct g_sched_softc *sc = gp->softc; 822 struct bio *cbp; 823 824 TRC_BIO_EVENT(START, bp); 825 G_SCHED_LOGREQ(bp, "Request received."); 826 827 cbp = g_clone_bio(bp); 828 if (cbp == NULL) { 829 g_io_deliver(bp, ENOMEM); 830 return; 831 } 832 cbp->bio_done = g_sched_done; 833 cbp->bio_to = LIST_FIRST(&gp->provider); 834 KASSERT(cbp->bio_to != NULL, ("NULL provider")); 835 836 /* We only schedule reads and writes. */ 837 if (0 == (bp->bio_cmd & (BIO_READ | BIO_WRITE))) 838 goto bypass; 839 840 G_SCHED_LOGREQ(cbp, "Sending request."); 841 842 g_sched_lock(gp); 843 /* 844 * Call the algorithm's gs_start to queue the request in the 845 * scheduler. If gs_start fails then pass the request down, 846 * otherwise call g_sched_dispatch() which tries to push 847 * one or more requests down. 848 */ 849 if (!sc->sc_gsched || (sc->sc_flags & G_SCHED_FLUSHING) || 850 sc->sc_gsched->gs_start(sc->sc_data, cbp)) { 851 g_sched_unlock(gp); 852 goto bypass; 853 } 854 /* 855 * We use bio_caller1 to mark requests that are scheduled 856 * so make sure it is not NULL. 857 */ 858 if (cbp->bio_caller1 == NULL) 859 cbp->bio_caller1 = &me; /* anything not NULL */ 860 861 cbp->bio_caller2 = gp; 862 sc->sc_pending++; 863 864 /* Update general stats. */ 865 me.gs_in_flight++; 866 me.gs_requests++; 867 me.gs_bytes_in_flight += bp->bio_length; 868 if (bp->bio_cmd & BIO_WRITE) { 869 me.gs_writes_in_flight++; 870 me.gs_write_bytes_in_flight += bp->bio_length; 871 } 872 g_sched_dispatch(gp); 873 g_sched_unlock(gp); 874 return; 875 876bypass: 877 cbp->bio_done = g_std_done; 878 cbp->bio_caller1 = NULL; /* not scheduled */ 879 g_io_request(cbp, LIST_FIRST(&gp->consumer)); 880} 881 882/* 883 * The next few functions are the geom glue. 
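 * (the orphan and access methods, plus the helpers used to proxy an
 * existing provider for the transparent "geom sched insert" case).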
884 */ 885static void 886g_sched_orphan(struct g_consumer *cp) 887{ 888 889 g_topology_assert(); 890 g_sched_destroy(cp->geom, 1); 891} 892 893static int 894g_sched_access(struct g_provider *pp, int dr, int dw, int de) 895{ 896 struct g_geom *gp; 897 struct g_consumer *cp; 898 int error; 899 900 gp = pp->geom; 901 cp = LIST_FIRST(&gp->consumer); 902 error = g_access(cp, dr, dw, de); 903 904 return (error); 905} 906 907static void 908g_sched_temporary_start(struct bio *bio) 909{ 910 911 mtx_lock(&me.gs_mtx); 912 me.gs_npending++; 913 gs_bioq_disksort(&me.gs_pending, bio); 914 mtx_unlock(&me.gs_mtx); 915} 916 917static void 918g_sched_flush_pending(g_start_t *start) 919{ 920 struct bio *bp; 921 922 while ((bp = gs_bioq_takefirst(&me.gs_pending))) 923 start(bp); 924} 925 926static int 927g_insert_proxy(struct g_geom *gp, struct g_provider *newpp, 928 struct g_geom *dstgp, struct g_provider *pp, struct g_consumer *cp) 929{ 930 struct g_sched_softc *sc = gp->softc; 931 g_start_t *saved_start, *flush = g_sched_start; 932 int error = 0, endticks = ticks + hz; 933 934 g_cancel_event(newpp); /* prevent taste() */ 935 /* copy private fields */ 936 newpp->private = pp->private; 937 newpp->index = pp->index; 938 939 /* Queue all the early requests coming for us. */ 940 me.gs_npending = 0; 941 saved_start = pp->geom->start; 942 dstgp->start = g_sched_temporary_start; 943 944 while (pp->nstart - pp->nend != me.gs_npending && 945 endticks - ticks >= 0) 946 tsleep(pp, PRIBIO, "-", hz/10); 947 948 if (pp->nstart - pp->nend != me.gs_npending) { 949 flush = saved_start; 950 error = ETIMEDOUT; 951 goto fail; 952 } 953 954 /* link pp to this geom */ 955 LIST_REMOVE(pp, provider); 956 pp->geom = gp; 957 LIST_INSERT_HEAD(&gp->provider, pp, provider); 958 959 /* 960 * replicate the counts from the parent in the 961 * new provider and consumer nodes 962 */ 963 cp->acr = newpp->acr = pp->acr; 964 cp->acw = newpp->acw = pp->acw; 965 cp->ace = newpp->ace = pp->ace; 966 sc->sc_flags |= G_SCHED_PROXYING; 967 968fail: 969 dstgp->start = saved_start; 970 971 g_sched_flush_pending(flush); 972 973 return (error); 974} 975 976/* 977 * Create a geom node for the device passed as *pp. 978 * If successful, add a reference to this gsp. 979 */ 980static int 981g_sched_create(struct gctl_req *req, struct g_class *mp, 982 struct g_provider *pp, struct g_gsched *gsp, int proxy) 983{ 984 struct g_sched_softc *sc = NULL; 985 struct g_geom *gp, *dstgp; 986 struct g_provider *newpp = NULL; 987 struct g_consumer *cp = NULL; 988 char name[64]; 989 int error; 990 991 g_topology_assert(); 992 993 snprintf(name, sizeof(name), "%s%s", pp->name, G_SCHED_SUFFIX); 994 LIST_FOREACH(gp, &mp->geom, geom) { 995 if (strcmp(gp->name, name) == 0) { 996 gctl_error(req, "Geom %s already exists.", 997 name); 998 return (EEXIST); 999 } 1000 } 1001 1002 gp = g_new_geomf(mp, name); 1003 dstgp = proxy ? pp->geom : gp; /* where do we link the provider */ 1004 if (gp == NULL) { 1005 gctl_error(req, "Cannot create geom %s.", name); 1006 error = ENOMEM; 1007 goto fail; 1008 } 1009 1010 sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO); 1011 sc->sc_gsched = gsp; 1012 sc->sc_data = gsp->gs_init(gp); 1013 if (sc->sc_data == NULL) { 1014 error = ENOMEM; 1015 goto fail; 1016 } 1017 1018 sc->sc_hash = g_sched_hash_init(gsp, &sc->sc_mask, HASH_WAITOK); 1019 1020 /* 1021 * Do not initialize the flush mechanism, will be initialized 1022 * on the first insertion on the hash table. 
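 * Idle classes are reclaimed later by g_sched_flush_classes(), once their
 * gsc_expire deadline has passed and only the hash table reference is left.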
1023 */ 1024 1025 mtx_init(&sc->sc_mtx, "g_sched_mtx", NULL, MTX_DEF); 1026 1027 gp->softc = sc; 1028 gp->start = g_sched_start; 1029 gp->orphan = g_sched_orphan; 1030 gp->access = g_sched_access; 1031 gp->dumpconf = g_sched_dumpconf; 1032 1033 newpp = g_new_providerf(dstgp, gp->name); 1034 if (newpp == NULL) { 1035 gctl_error(req, "Cannot create provider %s.", name); 1036 error = ENOMEM; 1037 goto fail; 1038 } 1039 1040 newpp->mediasize = pp->mediasize; 1041 newpp->sectorsize = pp->sectorsize; 1042 1043 cp = g_new_consumer(gp); 1044 if (cp == NULL) { 1045 gctl_error(req, "Cannot create consumer for %s.", 1046 gp->name); 1047 error = ENOMEM; 1048 goto fail; 1049 } 1050 1051 error = g_attach(cp, proxy ? newpp : pp); 1052 if (error != 0) { 1053 gctl_error(req, "Cannot attach to provider %s.", 1054 pp->name); 1055 goto fail; 1056 } 1057 1058 g_error_provider(newpp, 0); 1059 if (proxy) { 1060 error = g_insert_proxy(gp, newpp, dstgp, pp, cp); 1061 if (error) 1062 goto fail; 1063 } 1064 G_SCHED_DEBUG(0, "Device %s created.", gp->name); 1065 1066 g_gsched_ref(gsp); 1067 1068 return (0); 1069 1070fail: 1071 if (cp != NULL) { 1072 if (cp->provider != NULL) 1073 g_detach(cp); 1074 g_destroy_consumer(cp); 1075 } 1076 1077 if (newpp != NULL) 1078 g_destroy_provider(newpp); 1079 1080 if (sc && sc->sc_hash) { 1081 g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask, 1082 gsp, sc->sc_data); 1083 } 1084 1085 if (sc && sc->sc_data) 1086 gsp->gs_fini(sc->sc_data); 1087 1088 if (gp != NULL) { 1089 if (gp->softc != NULL) 1090 g_free(gp->softc); 1091 g_destroy_geom(gp); 1092 } 1093 1094 return (error); 1095} 1096 1097/* 1098 * Support for dynamic switching of scheduling algorithms. 1099 * First initialize the data structures for the new algorithm, 1100 * then call g_sched_remove_locked() to flush all references 1101 * to the old one, finally link the new algorithm. 1102 */ 1103static int 1104g_sched_change_algo(struct gctl_req *req, struct g_class *mp, 1105 struct g_provider *pp, struct g_gsched *gsp) 1106{ 1107 struct g_sched_softc *sc; 1108 struct g_geom *gp; 1109 struct g_hash *newh; 1110 void *data; 1111 u_long mask; 1112 int error = 0; 1113 1114 gp = pp->geom; 1115 sc = gp->softc; 1116 1117 data = gsp->gs_init(gp); 1118 if (data == NULL) 1119 return (ENOMEM); 1120 1121 newh = g_sched_hash_init(gsp, &mask, HASH_WAITOK); 1122 if (gsp->gs_priv_size && !newh) { 1123 error = ENOMEM; 1124 goto fail; 1125 } 1126 1127 g_sched_lock(gp); 1128 if (sc->sc_gsched) { /* can be NULL in some cases */ 1129 error = g_sched_remove_locked(gp, sc->sc_gsched); 1130 if (error) 1131 goto fail; 1132 } 1133 1134 g_gsched_ref(gsp); 1135 sc->sc_gsched = gsp; 1136 sc->sc_data = data; 1137 sc->sc_hash = newh; 1138 sc->sc_mask = mask; 1139 1140 g_sched_unlock(gp); 1141 1142 return (0); 1143 1144fail: 1145 if (newh) 1146 g_sched_hash_fini(gp, newh, mask, gsp, data); 1147 1148 if (data) 1149 gsp->gs_fini(data); 1150 1151 g_sched_unlock(gp); 1152 1153 return (error); 1154} 1155 1156/* 1157 * Stop the request flow directed to the proxy, redirecting the new 1158 * requests to the me.gs_pending queue. 
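 * The bios collected here are reissued by g_sched_flush_pending(), either
 * through the original start routine or through g_sched_blackhole() when
 * the original configuration cannot be restored.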
1159 */ 1160static struct g_provider * 1161g_detach_proxy(struct g_geom *gp) 1162{ 1163 struct g_consumer *cp; 1164 struct g_provider *pp, *newpp; 1165 1166 do { 1167 pp = LIST_FIRST(&gp->provider); 1168 if (pp == NULL) 1169 break; 1170 cp = LIST_FIRST(&gp->consumer); 1171 if (cp == NULL) 1172 break; 1173 newpp = cp->provider; 1174 if (newpp == NULL) 1175 break; 1176 1177 me.gs_npending = 0; 1178 pp->geom->start = g_sched_temporary_start; 1179 1180 return (pp); 1181 } while (0); 1182 printf("%s error detaching proxy %s\n", __FUNCTION__, gp->name); 1183 1184 return (NULL); 1185} 1186 1187static void 1188g_sched_blackhole(struct bio *bp) 1189{ 1190 1191 g_io_deliver(bp, ENXIO); 1192} 1193 1194static inline void 1195g_reparent_provider(struct g_provider *pp, struct g_geom *gp, 1196 struct g_provider *newpp) 1197{ 1198 1199 LIST_REMOVE(pp, provider); 1200 if (newpp) { 1201 pp->private = newpp->private; 1202 pp->index = newpp->index; 1203 } 1204 pp->geom = gp; 1205 LIST_INSERT_HEAD(&gp->provider, pp, provider); 1206} 1207 1208static inline void 1209g_unproxy_provider(struct g_provider *oldpp, struct g_provider *newpp) 1210{ 1211 struct g_geom *gp = oldpp->geom; 1212 1213 g_reparent_provider(oldpp, newpp->geom, newpp); 1214 1215 /* 1216 * Hackish: let the system destroy the old provider for us, just 1217 * in case someone attached a consumer to it, in which case a 1218 * direct call to g_destroy_provider() would not work. 1219 */ 1220 g_reparent_provider(newpp, gp, NULL); 1221} 1222 1223/* 1224 * Complete the proxy destruction, linking the old provider to its 1225 * original geom, and destroying the proxy provider. Also take care 1226 * of issuing the pending requests collected in me.gs_pending (if any). 1227 */ 1228static int 1229g_destroy_proxy(struct g_geom *gp, struct g_provider *oldpp) 1230{ 1231 struct g_consumer *cp; 1232 struct g_provider *newpp; 1233 1234 do { 1235 cp = LIST_FIRST(&gp->consumer); 1236 if (cp == NULL) 1237 break; 1238 newpp = cp->provider; 1239 if (newpp == NULL) 1240 break; 1241 1242 /* Relink the provider to its original geom. */ 1243 g_unproxy_provider(oldpp, newpp); 1244 1245 /* Detach consumer from provider, and destroy provider. */ 1246 cp->acr = newpp->acr = 0; 1247 cp->acw = newpp->acw = 0; 1248 cp->ace = newpp->ace = 0; 1249 g_detach(cp); 1250 1251 /* Send the pending bios through the right start function. */ 1252 g_sched_flush_pending(oldpp->geom->start); 1253 1254 return (0); 1255 } while (0); 1256 printf("%s error destroying proxy %s\n", __FUNCTION__, gp->name); 1257 1258 /* We cannot send the pending bios anywhere... */ 1259 g_sched_flush_pending(g_sched_blackhole); 1260 1261 return (EINVAL); 1262} 1263 1264static int 1265g_sched_destroy(struct g_geom *gp, boolean_t force) 1266{ 1267 struct g_provider *pp, *oldpp = NULL; 1268 struct g_sched_softc *sc; 1269 struct g_gsched *gsp; 1270 int error; 1271 1272 g_topology_assert(); 1273 sc = gp->softc; 1274 if (sc == NULL) 1275 return (ENXIO); 1276 if (!(sc->sc_flags & G_SCHED_PROXYING)) { 1277 pp = LIST_FIRST(&gp->provider); 1278 if (pp && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) { 1279 const char *msg = force ? 
1280 "but we force removal" : "cannot remove"; 1281 1282 G_SCHED_DEBUG(!force, 1283 "Device %s is still open (r%dw%de%d), %s.", 1284 pp->name, pp->acr, pp->acw, pp->ace, msg); 1285 if (!force) 1286 return (EBUSY); 1287 } else { 1288 G_SCHED_DEBUG(0, "Device %s removed.", gp->name); 1289 } 1290 } else 1291 oldpp = g_detach_proxy(gp); 1292 1293 gsp = sc->sc_gsched; 1294 if (gsp) { 1295 /* 1296 * XXX bad hack here: force a dispatch to release 1297 * any reference to the hash table still held by 1298 * the scheduler. 1299 */ 1300 g_sched_lock(gp); 1301 /* 1302 * We are dying here, no new requests should enter 1303 * the scheduler. This is granted by the topolgy, 1304 * either in case we were proxying (new bios are 1305 * being redirected) or not (see the access check 1306 * above). 1307 */ 1308 g_sched_forced_dispatch(gp); 1309 error = g_sched_wait_pending(gp); 1310 1311 if (error) { 1312 /* 1313 * Not all the requests came home: this might happen 1314 * under heavy load, or if we were waiting for any 1315 * bio which is served in the event path (see 1316 * geom_slice.c for an example of how this can 1317 * happen). Try to restore a working configuration 1318 * if we can fail. 1319 */ 1320 if ((sc->sc_flags & G_SCHED_PROXYING) && oldpp) { 1321 g_sched_flush_pending(force ? 1322 g_sched_blackhole : g_sched_start); 1323 } 1324 1325 /* 1326 * In the forced destroy case there is not so much 1327 * we can do, we have pending bios that will call 1328 * g_sched_done() somehow, and we don't want them 1329 * to crash the system using freed memory. We tell 1330 * the user that something went wrong, and leak some 1331 * memory here. 1332 * Note: the callers using force = 1 ignore the 1333 * return value. 1334 */ 1335 if (force) { 1336 G_SCHED_DEBUG(0, "Pending requests while " 1337 " destroying geom, some memory leaked."); 1338 } 1339 1340 return (error); 1341 } 1342 1343 g_sched_unlock(gp); 1344 g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask, 1345 gsp, sc->sc_data); 1346 sc->sc_hash = NULL; 1347 gsp->gs_fini(sc->sc_data); 1348 g_gsched_unref(gsp); 1349 sc->sc_gsched = NULL; 1350 } 1351 1352 if ((sc->sc_flags & G_SCHED_PROXYING) && oldpp) { 1353 error = g_destroy_proxy(gp, oldpp); 1354 1355 if (error) { 1356 if (force) { 1357 G_SCHED_DEBUG(0, "Unrecoverable error while " 1358 "destroying a proxy geom, leaking some " 1359 " memory."); 1360 } 1361 1362 return (error); 1363 } 1364 } 1365 1366 mtx_destroy(&sc->sc_mtx); 1367 1368 g_free(gp->softc); 1369 gp->softc = NULL; 1370 g_wither_geom(gp, ENXIO); 1371 1372 return (error); 1373} 1374 1375static int 1376g_sched_destroy_geom(struct gctl_req *req, struct g_class *mp, 1377 struct g_geom *gp) 1378{ 1379 1380 return (g_sched_destroy(gp, 0)); 1381} 1382 1383/* 1384 * Functions related to the classification of requests. 1385 * 1386 * On recent FreeBSD versions (8.0 and above), we store a reference 1387 * to the issuer of a request in bp->bio_classifier1 as soon 1388 * as the bio is posted to the geom queue (and not later, because 1389 * requests are managed by the g_down thread afterwards). 1390 * 1391 * On older versions of the system (but this code is not used 1392 * in any existing release), we [ab]use the caller1 field in the 1393 * root element of the bio tree to store the classification info. 1394 * The marking is done at the beginning of g_io_request() 1395 * and only if we find that the field is NULL. 
1396 * 1397 * To avoid rebuilding the kernel, this module will patch the 1398 * initial part of g_io_request() so it jumps to some hand-coded 1399 * assembly that does the marking and then executes the original 1400 * body of g_io_request(). 1401 * 1402 * fake_ioreq[] is architecture-specific machine code 1403 * that implements the above. CODE_SIZE, STORE_SIZE etc. 1404 * are constants used in the patching routine. Look at the 1405 * code in g_ioreq_patch() for the details. 1406 */ 1407 1408#ifndef HAVE_BIO_CLASSIFIER 1409/* 1410 * Support for old FreeBSD versions 1411 */ 1412#if defined(__i386__) 1413#define CODE_SIZE 29 1414#define STORE_SIZE 5 1415#define EPILOGUE 5 1416#define SIZE (CODE_SIZE + STORE_SIZE + EPILOGUE) 1417 1418static u_char fake_ioreq[SIZE] = { 1419 0x8b, 0x44, 0x24, 0x04, /* mov bp, %eax */ 1420 /* 1: */ 1421 0x89, 0xc2, /* mov %eax, %edx # edx = bp */ 1422 0x8b, 0x40, 0x64, /* mov bp->bio_parent, %eax */ 1423 0x85, 0xc0, /* test %eax, %eax */ 1424 0x75, 0xf7, /* jne 1b */ 1425 0x8b, 0x42, 0x30, /* mov bp->bp_caller1, %eax */ 1426 0x85, 0xc0, /* test %eax, %eax */ 1427 0x75, 0x09, /* jne 2f */ 1428 0x64, 0xa1, 0x00, 0x00, /* mov %fs:0, %eax */ 1429 0x00, 0x00, 1430 0x89, 0x42, 0x30, /* mov %eax, bp->bio_caller1 */ 1431 /* 2: */ 1432 0x55, 0x89, 0xe5, 0x57, 0x56, 1433 0xe9, 0x00, 0x00, 0x00, 0x00, /* jmp back... */ 1434}; 1435#elif defined(__amd64) 1436#define CODE_SIZE 38 1437#define STORE_SIZE 6 1438#define EPILOGUE 5 1439#define SIZE (CODE_SIZE + STORE_SIZE + EPILOGUE) 1440 1441static u_char fake_ioreq[SIZE] = { 1442 0x48, 0x89, 0xf8, /* mov bp, %rax */ 1443 /* 1: */ 1444 0x48, 0x89, 0xc2, /* mov %rax, %rdx # rdx = bp */ 1445 0x48, 0x8b, 0x82, 0xa8, /* mov bp->bio_parent, %rax */ 1446 0x00, 0x00, 0x00, 1447 0x48, 0x85, 0xc0, /* test %rax, %rax */ 1448 0x75, 0xf1, /* jne 1b */ 1449 0x48, 0x83, 0x7a, 0x58, /* cmp $0, bp->bp_caller1 */ 1450 0x00, 1451 0x75, 0x0d, /* jne 2f */ 1452 0x65, 0x48, 0x8b, 0x04, /* mov %gs:0, %rax */ 1453 0x25, 0x00, 0x00, 0x00, 1454 0x00, 1455 0x48, 0x89, 0x42, 0x58, /* mov %rax, bp->bio_caller1 */ 1456 /* 2: */ 1457 0x55, 0x48, 0x89, 0xe5, 0x41, 0x56, 1458 0xe9, 0x00, 0x00, 0x00, 0x00, /* jmp back... */ 1459}; 1460#else /* neither x86 nor amd64 */ 1461static void 1462g_new_io_request(struct bio *bp, struct g_consumer *cp) 1463{ 1464 struct bio *top = bp; 1465 1466 /* 1467 * bio classification: if bio_caller1 is available in the 1468 * root of the 'struct bio' tree, store there the thread id 1469 * of the thread that originated the request. 1470 * More sophisticated classification schemes can be used. 1471 */ 1472 while (top->bio_parent) 1473 top = top->bio_parent; 1474 1475 if (top->bio_caller1 == NULL) 1476 top->bio_caller1 = curthread; 1477} 1478 1479#error please add the code above in g_new_io_request() to the beginning of \ 1480 /sys/geom/geom_io.c::g_io_request(), and remove this line. 1481#endif /* end of arch-specific code */ 1482 1483static int 1484g_ioreq_patch(void) 1485{ 1486 u_char *original; 1487 u_long ofs; 1488 int found; 1489 1490 if (me.gs_patched) 1491 return (-1); 1492 1493 original = (u_char *)g_io_request; 1494 1495 found = !bcmp(original, fake_ioreq + CODE_SIZE, STORE_SIZE); 1496 if (!found) 1497 return (-1); 1498 1499 /* Jump back to the original + STORE_SIZE. */ 1500 ofs = (original + STORE_SIZE) - (fake_ioreq + SIZE); 1501 bcopy(&ofs, fake_ioreq + CODE_SIZE + STORE_SIZE + 1, 4); 1502 1503 /* Patch the original address with a jump to the trampoline. 
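 * The patch is a 5-byte relative jump: opcode 0xe9 followed by the 32-bit
 * displacement from the end of the jump to fake_ioreq.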
*/ 1504 *original = 0xe9; /* jump opcode */ 1505 ofs = fake_ioreq - (original + 5); 1506 bcopy(&ofs, original + 1, 4); 1507 1508 me.gs_patched = 1; 1509 1510 return (0); 1511} 1512 1513/* 1514 * Restore the original code, this is easy. 1515 */ 1516static void 1517g_ioreq_restore(void) 1518{ 1519 u_char *original; 1520 1521 if (me.gs_patched) { 1522 original = (u_char *)g_io_request; 1523 bcopy(fake_ioreq + CODE_SIZE, original, STORE_SIZE); 1524 me.gs_patched = 0; 1525 } 1526} 1527 1528static inline void 1529g_classifier_ini(void) 1530{ 1531 1532 g_ioreq_patch(); 1533} 1534 1535static inline void 1536g_classifier_fini(void) 1537{ 1538 1539 g_ioreq_restore(); 1540} 1541 1542/*--- end of support code for older FreeBSD versions */ 1543 1544#else /* HAVE_BIO_CLASSIFIER */ 1545 1546/* 1547 * Classifier support for recent FreeBSD versions: we use 1548 * a very simple classifier, only use curthread to tag a request. 1549 * The classifier is registered at module load, and unregistered 1550 * at module unload. 1551 */ 1552static int 1553g_sched_tag(void *arg, struct bio *bp) 1554{ 1555 1556 bp->bio_classifier1 = curthread; 1557 return (1); 1558} 1559 1560static struct g_classifier_hook g_sched_classifier = { 1561 .func = g_sched_tag, 1562}; 1563 1564static inline void 1565g_classifier_ini(void) 1566{ 1567 1568 g_register_classifier(&g_sched_classifier); 1569} 1570 1571static inline void 1572g_classifier_fini(void) 1573{ 1574 1575 g_unregister_classifier(&g_sched_classifier); 1576} 1577#endif /* HAVE_BIO_CLASSIFIER */ 1578 1579static void 1580g_sched_init(struct g_class *mp) 1581{ 1582 1583 g_gsched_global_init(); 1584 1585 G_SCHED_DEBUG(0, "Loading: mp = %p, g_sched_class = %p.", 1586 mp, &g_sched_class); 1587 1588 /* Patch g_io_request to store classification info in the bio. */ 1589 g_classifier_ini(); 1590} 1591 1592static void 1593g_sched_fini(struct g_class *mp) 1594{ 1595 1596 g_classifier_fini(); 1597 1598 G_SCHED_DEBUG(0, "Unloading..."); 1599 1600 KASSERT(LIST_EMPTY(&me.gs_scheds), ("still registered schedulers")); 1601 mtx_destroy(&me.gs_mtx); 1602} 1603 1604/* 1605 * Read the i-th argument for a request, skipping the /dev/ 1606 * prefix if present. 1607 */ 1608static const char * 1609g_sched_argi(struct gctl_req *req, int i) 1610{ 1611 static const char *dev_prefix = "/dev/"; 1612 const char *name; 1613 char param[16]; 1614 int l = strlen(dev_prefix); 1615 1616 snprintf(param, sizeof(param), "arg%d", i); 1617 name = gctl_get_asciiparam(req, param); 1618 if (name == NULL) 1619 gctl_error(req, "No 'arg%d' argument", i); 1620 else if (strncmp(name, dev_prefix, l) == 0) 1621 name += l; 1622 return (name); 1623} 1624 1625/* 1626 * Fetch nargs and do appropriate checks. 1627 */ 1628static int 1629g_sched_get_nargs(struct gctl_req *req) 1630{ 1631 int *nargs; 1632 1633 nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); 1634 if (nargs == NULL) { 1635 gctl_error(req, "No 'nargs' argument"); 1636 return (0); 1637 } 1638 if (*nargs <= 0) 1639 gctl_error(req, "Missing device(s)."); 1640 return (*nargs); 1641} 1642 1643/* 1644 * Check whether we should add the class on certain volumes when 1645 * this geom is created. Right now this is under control of a kenv 1646 * variable containing the names of all devices that we care about. 1647 * Probably we should only support transparent insertion as the 1648 * preferred mode of operation. 
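 * As an illustration (the device names are made up), a setting along these
 * lines in /boot/loader.conf would attach the default 'rr' algorithm to the
 * matching providers when they are tasted:
 *
 *	geom.sched.taste="ada0 ada1"
 *	geom.sched.algo="rr"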
1649 */ 1650static struct g_geom * 1651g_sched_taste(struct g_class *mp, struct g_provider *pp, 1652 int flags __unused) 1653{ 1654 struct g_gsched *gsp = NULL; /* the . algorithm we want */ 1655 const char *s; /* generic string pointer */ 1656 const char *taste_names; /* devices we like */ 1657 int l; 1658 1659 g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, 1660 mp->name, pp->name); 1661 g_topology_assert(); 1662 1663 G_SCHED_DEBUG(2, "Tasting %s.", pp->name); 1664 1665 do { 1666 /* do not taste on ourselves */ 1667 if (pp->geom->class == mp) 1668 break; 1669 1670 taste_names = getenv("geom.sched.taste"); 1671 if (taste_names == NULL) 1672 break; 1673 1674 l = strlen(pp->name); 1675 for (s = taste_names; *s && 1676 (s = strstr(s, pp->name)); s++) { 1677 /* further checks for an exact match */ 1678 if ( (s == taste_names || s[-1] == ' ') && 1679 (s[l] == '\0' || s[l] == ' ') ) 1680 break; 1681 } 1682 if (s == NULL) 1683 break; 1684 G_SCHED_DEBUG(0, "Attach device %s match [%s]\n", 1685 pp->name, s); 1686 1687 /* look up the provider name in the list */ 1688 s = getenv("geom.sched.algo"); 1689 if (s == NULL) 1690 s = "rr"; 1691 1692 gsp = g_gsched_find(s); /* also get a reference */ 1693 if (gsp == NULL) { 1694 G_SCHED_DEBUG(0, "Bad '%s' algorithm.", s); 1695 break; 1696 } 1697 1698 /* XXX create with 1 as last argument ? */ 1699 g_sched_create(NULL, mp, pp, gsp, 0); 1700 g_gsched_unref(gsp); 1701 } while (0); 1702 return NULL; 1703} 1704 1705static void 1706g_sched_ctl_create(struct gctl_req *req, struct g_class *mp, int proxy) 1707{ 1708 struct g_provider *pp; 1709 struct g_gsched *gsp; 1710 const char *name; 1711 int i, nargs; 1712 1713 g_topology_assert(); 1714 1715 name = gctl_get_asciiparam(req, "algo"); 1716 if (name == NULL) { 1717 gctl_error(req, "No '%s' argument", "algo"); 1718 return; 1719 } 1720 1721 gsp = g_gsched_find(name); /* also get a reference */ 1722 if (gsp == NULL) { 1723 gctl_error(req, "Bad algorithm '%s'", name); 1724 return; 1725 } 1726 1727 nargs = g_sched_get_nargs(req); 1728 1729 /* 1730 * Run on the arguments, and break on any error. 1731 * We look for a device name, but skip the /dev/ prefix if any. 1732 */ 1733 for (i = 0; i < nargs; i++) { 1734 name = g_sched_argi(req, i); 1735 if (name == NULL) 1736 break; 1737 pp = g_provider_by_name(name); 1738 if (pp == NULL) { 1739 G_SCHED_DEBUG(1, "Provider %s is invalid.", name); 1740 gctl_error(req, "Provider %s is invalid.", name); 1741 break; 1742 } 1743 if (g_sched_create(req, mp, pp, gsp, proxy) != 0) 1744 break; 1745 } 1746 1747 g_gsched_unref(gsp); 1748} 1749 1750static void 1751g_sched_ctl_configure(struct gctl_req *req, struct g_class *mp) 1752{ 1753 struct g_provider *pp; 1754 struct g_gsched *gsp; 1755 const char *name; 1756 int i, nargs; 1757 1758 g_topology_assert(); 1759 1760 name = gctl_get_asciiparam(req, "algo"); 1761 if (name == NULL) { 1762 gctl_error(req, "No '%s' argument", "algo"); 1763 return; 1764 } 1765 1766 gsp = g_gsched_find(name); /* also get a reference */ 1767 if (gsp == NULL) { 1768 gctl_error(req, "Bad algorithm '%s'", name); 1769 return; 1770 } 1771 1772 nargs = g_sched_get_nargs(req); 1773 1774 /* 1775 * Run on the arguments, and break on any error. 1776 * We look for a device name, but skip the /dev/ prefix if any. 
 */
	for (i = 0; i < nargs; i++) {
		name = g_sched_argi(req, i);
		if (name == NULL)
			break;
		pp = g_provider_by_name(name);
		if (pp == NULL || pp->geom->class != mp) {
			G_SCHED_DEBUG(1, "Provider %s is invalid.", name);
			gctl_error(req, "Provider %s is invalid.", name);
			break;
		}
		if (g_sched_change_algo(req, mp, pp, gsp) != 0)
			break;
	}

	g_gsched_unref(gsp);
}

static struct g_geom *
g_sched_find_geom(struct g_class *mp, const char *name)
{
	struct g_geom *gp;

	LIST_FOREACH(gp, &mp->geom, geom) {
		if (strcmp(gp->name, name) == 0)
			return (gp);
	}
	return (NULL);
}

static void
g_sched_ctl_destroy(struct gctl_req *req, struct g_class *mp)
{
	int nargs, *force, error, i;
	struct g_geom *gp;
	const char *name;

	g_topology_assert();

	nargs = g_sched_get_nargs(req);

	force = gctl_get_paraml(req, "force", sizeof(*force));
	if (force == NULL) {
		gctl_error(req, "No 'force' argument");
		return;
	}

	for (i = 0; i < nargs; i++) {
		name = g_sched_argi(req, i);
		if (name == NULL)
			break;

		gp = g_sched_find_geom(mp, name);
		if (gp == NULL) {
			G_SCHED_DEBUG(1, "Device %s is invalid.", name);
			gctl_error(req, "Device %s is invalid.", name);
			break;
		}

		error = g_sched_destroy(gp, *force);
		if (error != 0) {
			gctl_error(req, "Cannot destroy device %s (error=%d).",
			    gp->name, error);
			break;
		}
	}
}

static void
g_sched_config(struct gctl_req *req, struct g_class *mp, const char *verb)
{
	uint32_t *version;

	g_topology_assert();

	version = gctl_get_paraml(req, "version", sizeof(*version));
	if (version == NULL) {
		gctl_error(req, "No '%s' argument.", "version");
		return;
	}

	if (*version != G_SCHED_VERSION) {
		gctl_error(req, "Userland and kernel parts are "
		    "out of sync.");
		return;
	}

	if (strcmp(verb, "create") == 0) {
		g_sched_ctl_create(req, mp, 0);
		return;
	} else if (strcmp(verb, "insert") == 0) {
		g_sched_ctl_create(req, mp, 1);
		return;
	} else if (strcmp(verb, "configure") == 0) {
		g_sched_ctl_configure(req, mp);
		return;
	} else if (strcmp(verb, "destroy") == 0) {
		g_sched_ctl_destroy(req, mp);
		return;
	}

	gctl_error(req, "Unknown verb.");
}

static void
g_sched_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
    struct g_consumer *cp, struct g_provider *pp)
{
	struct g_sched_softc *sc = gp->softc;
	struct g_gsched *gsp = sc->sc_gsched;

	if (indent == NULL) {	/* plaintext */
		sbuf_printf(sb, " algo %s", gsp ? gsp->gs_name : "--");
	}
	/* sc_gsched may be NULL, e.g. while an algorithm switch is pending. */
	if (gsp != NULL && gsp->gs_dumpconf)
		gsp->gs_dumpconf(sb, indent, gp, cp, pp);
}

DECLARE_GEOM_CLASS(g_sched_class, g_sched);
MODULE_VERSION(geom_sched, 0);