g_sched.c revision 206551
1/*- 2 * Copyright (c) 2009-2010 Fabio Checconi, Luigi Rizzo 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 */ 26 27/* 28 * $Id$ 29 * $FreeBSD: head/sys/geom/sched/g_sched.c 206551 2010-04-13 09:53:08Z luigi $ 30 * 31 * Main control module for geom-based disk schedulers ('sched'). 32 * 33 * USER VIEW 34 * A 'sched' node is typically inserted transparently between 35 * an existing provider pp and its original geom gp 36 * 37 * [pp --> gp ..] 38 * 39 * using the command "geom sched insert <provider>" and 40 * resulting in the following topology 41 * 42 * [pp --> sched_gp --> cp] [new_pp --> gp ... ] 43 * 44 * Deletion "geom sched destroy <provider>.sched." restores the 45 * original chain. The normal "geom sched create <provide>" 46 * is also supported. 47 * 48 * INTERNALS 49 * Internally, the 'sched' uses the following data structures 50 * 51 * geom{} g_sched_softc{} g_gsched{} 52 * +----------+ +---------------+ +-------------+ 53 * | softc *-|--->| sc_gsched *-|-->| gs_init | 54 * | ... | | | | gs_fini | 55 * | | | [ hash table] | | gs_start | 56 * +----------+ | | | ... | 57 * | | +-------------+ 58 * | | 59 * | | g_*_softc{} 60 * | | +-------------+ 61 * | sc_data *-|-->| | 62 * +---------------+ | algorithm- | 63 * | specific | 64 * +-------------+ 65 * 66 * A g_sched_softc{} is created with a "geom sched insert" call. 67 * In turn this instantiates a specific scheduling algorithm, 68 * which sets sc_gsched to point to the algorithm callbacks, 69 * and calls gs_init() to create the g_*_softc{} . 70 * The other callbacks (gs_start, gs_next, ...) are invoked 71 * as needed 72 * 73 * g_sched_softc{} is defined in g_sched.h and mostly used here; 74 * g_gsched{}, and the gs_callbacks, are documented in gs_scheduler.h; 75 * g_*_softc{} is defined/implemented by each algorithm (gs_*.c) 76 * 77 * DATA MOVING 78 * When a bio is received on the provider, it goes to the 79 * g_sched_start() which calls gs_start() to initially queue it; 80 * then we call g_sched_dispatch() that loops around gs_next() 81 * to select zero or more bio's to be sent downstream. 82 * 83 * g_sched_dispatch() can also be called as a result of a timeout, 84 * e.g. when doing anticipation or pacing requests. 85 * 86 * When a bio comes back, it goes to g_sched_done() which in turn 87 * calls gs_done(). The latter does any necessary housekeeping in 88 * the scheduling algorithm, and may decide to call g_sched_dispatch() 89 * to send more bio's downstream. 90 * 91 * If an algorithm needs per-flow queues, these are created 92 * calling gs_init_class() and destroyed with gs_fini_class(), 93 * and they are also inserted in the hash table implemented in 94 * the g_sched_softc{} 95 * 96 * If an algorithm is replaced, or a transparently-inserted node is 97 * removed with "geom sched destroy", we need to remove all references 98 * to the g_*_softc{} and g_sched_softc from the bio's still in 99 * the scheduler. g_sched_forced_dispatch() helps doing this. 100 * XXX need to explain better. 101 */ 102 103#include <sys/cdefs.h> 104#include <sys/param.h> 105#include <sys/systm.h> 106#include <sys/kernel.h> 107#include <sys/module.h> 108#include <sys/lock.h> 109#include <sys/mutex.h> 110#include <sys/bio.h> 111#include <sys/limits.h> 112#include <sys/hash.h> 113#include <sys/sysctl.h> 114#include <sys/malloc.h> 115#include <sys/proc.h> /* we access curthread */ 116#include <geom/geom.h> 117#include "gs_scheduler.h" 118#include "g_sched.h" /* geom hooks */ 119 120/* 121 * Size of the per-geom hash table storing traffic classes. 122 * We may decide to change it at a later time, it has no ABI 123 * implications as it is only used for run-time allocations. 124 */ 125#define G_SCHED_HASH_SIZE 32 126 127static int g_sched_destroy(struct g_geom *gp, boolean_t force); 128static int g_sched_destroy_geom(struct gctl_req *req, 129 struct g_class *mp, struct g_geom *gp); 130static void g_sched_config(struct gctl_req *req, struct g_class *mp, 131 const char *verb); 132static struct g_geom *g_sched_taste(struct g_class *mp, 133 struct g_provider *pp, int flags __unused); 134static void g_sched_dumpconf(struct sbuf *sb, const char *indent, 135 struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp); 136static void g_sched_init(struct g_class *mp); 137static void g_sched_fini(struct g_class *mp); 138 139struct g_class g_sched_class = { 140 .name = G_SCHED_CLASS_NAME, 141 .version = G_VERSION, 142 .ctlreq = g_sched_config, 143 .taste = g_sched_taste, 144 .destroy_geom = g_sched_destroy_geom, 145 .init = g_sched_init, 146 .fini = g_sched_fini 147}; 148 149MALLOC_DEFINE(M_GEOM_SCHED, "GEOM_SCHED", "Geom schedulers data structures"); 150 151/* 152 * Global variables describing the state of the geom_sched module. 153 * There is only one static instance of this structure. 154 */ 155LIST_HEAD(gs_list, g_gsched); /* type, link field */ 156struct geom_sched_vars { 157 struct mtx gs_mtx; 158 struct gs_list gs_scheds; /* list of algorithms */ 159 u_int gs_debug; 160 u_int gs_sched_count; /* how many algorithms ? */ 161 u_int gs_patched; /* g_io_request was patched */ 162 163 u_int gs_initialized; 164 u_int gs_expire_secs; /* expiration of hash entries */ 165 166 struct bio_queue_head gs_pending; 167 u_int gs_npending; 168 169 /* The following are for stats, usually protected by gs_mtx. */ 170 u_long gs_requests; /* total requests */ 171 u_long gs_done; /* total done */ 172 u_int gs_in_flight; /* requests in flight */ 173 u_int gs_writes_in_flight; 174 u_int gs_bytes_in_flight; 175 u_int gs_write_bytes_in_flight; 176 177 char gs_names[256]; /* names of schedulers */ 178}; 179 180static struct geom_sched_vars me = { 181 .gs_expire_secs = 10, 182}; 183 184SYSCTL_DECL(_kern_geom); 185SYSCTL_NODE(_kern_geom, OID_AUTO, sched, CTLFLAG_RW, 0, 186 "GEOM_SCHED stuff"); 187 188SYSCTL_INT(_kern_geom_sched, OID_AUTO, in_flight_wb, CTLFLAG_RD, 189 &me.gs_write_bytes_in_flight, 0, "Write bytes in flight"); 190 191SYSCTL_INT(_kern_geom_sched, OID_AUTO, in_flight_b, CTLFLAG_RD, 192 &me.gs_bytes_in_flight, 0, "Bytes in flight"); 193 194SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_w, CTLFLAG_RD, 195 &me.gs_writes_in_flight, 0, "Write Requests in flight"); 196 197SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight, CTLFLAG_RD, 198 &me.gs_in_flight, 0, "Requests in flight"); 199 200SYSCTL_ULONG(_kern_geom_sched, OID_AUTO, done, CTLFLAG_RD, 201 &me.gs_done, 0, "Total done"); 202 203SYSCTL_ULONG(_kern_geom_sched, OID_AUTO, requests, CTLFLAG_RD, 204 &me.gs_requests, 0, "Total requests"); 205 206SYSCTL_STRING(_kern_geom_sched, OID_AUTO, algorithms, CTLFLAG_RD, 207 &me.gs_names, 0, "Algorithm names"); 208 209SYSCTL_UINT(_kern_geom_sched, OID_AUTO, alg_count, CTLFLAG_RD, 210 &me.gs_sched_count, 0, "Number of algorithms"); 211 212SYSCTL_UINT(_kern_geom_sched, OID_AUTO, debug, CTLFLAG_RW, 213 &me.gs_debug, 0, "Debug level"); 214 215SYSCTL_UINT(_kern_geom_sched, OID_AUTO, expire_secs, CTLFLAG_RW, 216 &me.gs_expire_secs, 0, "Expire time in seconds"); 217 218/* 219 * g_sched calls the scheduler algorithms with this lock held. 220 * The locking functions are exposed so the scheduler algorithms can also 221 * protect themselves e.g. when running a callout handler. 222 */ 223void 224g_sched_lock(struct g_geom *gp) 225{ 226 struct g_sched_softc *sc = gp->softc; 227 228 mtx_lock(&sc->sc_mtx); 229} 230 231void 232g_sched_unlock(struct g_geom *gp) 233{ 234 struct g_sched_softc *sc = gp->softc; 235 236 mtx_unlock(&sc->sc_mtx); 237} 238 239/* 240 * Support functions to handle references to the module, 241 * which are coming from devices using this scheduler. 242 */ 243static inline void 244g_gsched_ref(struct g_gsched *gsp) 245{ 246 247 atomic_add_int(&gsp->gs_refs, 1); 248} 249 250static inline void 251g_gsched_unref(struct g_gsched *gsp) 252{ 253 254 atomic_add_int(&gsp->gs_refs, -1); 255} 256 257/* 258 * Update the stats when this request is done. 259 */ 260static void 261g_sched_update_stats(struct bio *bio) 262{ 263 264 me.gs_done++; 265 me.gs_in_flight--; 266 me.gs_bytes_in_flight -= bio->bio_length; 267 if (bio->bio_cmd & BIO_WRITE) { 268 me.gs_writes_in_flight--; 269 me.gs_write_bytes_in_flight -= bio->bio_length; 270 } 271} 272 273/* 274 * Dispatch any pending request. 275 */ 276static void 277g_sched_forced_dispatch(struct g_geom *gp) 278{ 279 struct g_sched_softc *sc = gp->softc; 280 struct g_gsched *gsp = sc->sc_gsched; 281 struct bio *bp; 282 283 KASSERT(mtx_owned(&sc->sc_mtx), 284 ("sc_mtx not owned during forced dispatch")); 285 286 while ((bp = gsp->gs_next(sc->sc_data, 1)) != NULL) 287 g_io_request(bp, LIST_FIRST(&gp->consumer)); 288} 289 290/* 291 * The main dispatch loop, called either here after the start 292 * routine, or by scheduling algorithms when they receive a timeout 293 * or a 'done' notification. Does not share code with the forced 294 * dispatch path, since the gs_done() callback can call us. 295 */ 296void 297g_sched_dispatch(struct g_geom *gp) 298{ 299 struct g_sched_softc *sc = gp->softc; 300 struct g_gsched *gsp = sc->sc_gsched; 301 struct bio *bp; 302 303 KASSERT(mtx_owned(&sc->sc_mtx), ("sc_mtx not owned during dispatch")); 304 305 if ((sc->sc_flags & G_SCHED_FLUSHING)) 306 return; 307 308 while ((bp = gsp->gs_next(sc->sc_data, 0)) != NULL) 309 g_io_request(bp, LIST_FIRST(&gp->consumer)); 310} 311 312/* 313 * Recent (8.0 and above) versions of FreeBSD have support to 314 * register classifiers of disk requests. The classifier is 315 * invoked by g_io_request(), and stores the information into 316 * bp->bio_classifier1. 317 * 318 * Support for older versions, which is left here only for 319 * documentation purposes, relies on two hacks: 320 * 1. classification info is written into the bio_caller1 321 * field of the topmost node in the bio chain. This field 322 * is rarely used, but this module is incompatible with 323 * those that use bio_caller1 for other purposes, 324 * such as ZFS and gjournal; 325 * 2. g_io_request() is patched in-memory when the module is 326 * loaded, so that the function calls a classifier as its 327 * first thing. g_io_request() is restored when the module 328 * is unloaded. This functionality is only supported for 329 * x86 and amd64, other architectures need source code changes. 330 */ 331 332/* 333 * Lookup the identity of the issuer of the original request. 334 * In the current implementation we use the curthread of the 335 * issuer, but different mechanisms may be implemented later 336 * so we do not make assumptions on the return value which for 337 * us is just an opaque identifier. 338 */ 339 340static inline u_long 341g_sched_classify(struct bio *bp) 342{ 343 344#if __FreeBSD_version > 800098 345 /* we have classifier fields in the struct bio */ 346#define HAVE_BIO_CLASSIFIER 347 return ((u_long)bp->bio_classifier1); 348#else 349#warning old version!!! 350 while (bp->bio_parent != NULL) 351 bp = bp->bio_parent; 352 353 return ((u_long)bp->bio_caller1); 354#endif 355} 356 357/* Return the hash chain for the given key. */ 358static inline struct g_hash * 359g_sched_hash(struct g_sched_softc *sc, u_long key) 360{ 361 362 return (&sc->sc_hash[key & sc->sc_mask]); 363} 364 365/* 366 * Helper function for the children classes, which takes 367 * a geom and a bio and returns the private descriptor 368 * associated to the request. This involves fetching 369 * the classification field and [al]locating the 370 * corresponding entry in the hash table. 371 */ 372void * 373g_sched_get_class(struct g_geom *gp, struct bio *bp) 374{ 375 struct g_sched_softc *sc; 376 struct g_sched_class *gsc; 377 struct g_gsched *gsp; 378 struct g_hash *bucket; 379 u_long key; 380 381 sc = gp->softc; 382 key = g_sched_classify(bp); 383 bucket = g_sched_hash(sc, key); 384 LIST_FOREACH(gsc, bucket, gsc_clist) { 385 if (key == gsc->gsc_key) { 386 gsc->gsc_refs++; 387 return (gsc->gsc_priv); 388 } 389 } 390 391 gsp = sc->sc_gsched; 392 gsc = malloc(sizeof(*gsc) + gsp->gs_priv_size, 393 M_GEOM_SCHED, M_NOWAIT | M_ZERO); 394 if (!gsc) 395 return (NULL); 396 397 if (gsp->gs_init_class(sc->sc_data, gsc->gsc_priv)) { 398 free(gsc, M_GEOM_SCHED); 399 return (NULL); 400 } 401 402 gsc->gsc_refs = 2; /* 1 for the hash table, 1 for the caller. */ 403 gsc->gsc_key = key; 404 LIST_INSERT_HEAD(bucket, gsc, gsc_clist); 405 406 gsc->gsc_expire = ticks + me.gs_expire_secs * hz; 407 408 return (gsc->gsc_priv); 409} 410 411/* 412 * Release a reference to the per-client descriptor, 413 */ 414void 415g_sched_put_class(struct g_geom *gp, void *priv) 416{ 417 struct g_sched_class *gsc; 418 struct g_sched_softc *sc; 419 420 gsc = g_sched_priv2class(priv); 421 gsc->gsc_expire = ticks + me.gs_expire_secs * hz; 422 423 if (--gsc->gsc_refs > 0) 424 return; 425 426 sc = gp->softc; 427 sc->sc_gsched->gs_fini_class(sc->sc_data, priv); 428 429 LIST_REMOVE(gsc, gsc_clist); 430 free(gsc, M_GEOM_SCHED); 431} 432 433static void 434g_sched_hash_fini(struct g_geom *gp, struct g_hash *hp, u_long mask, 435 struct g_gsched *gsp, void *data) 436{ 437 struct g_sched_class *cp, *cp2; 438 int i; 439 440 if (!hp) 441 return; 442 443 if (data && gsp->gs_hash_unref) 444 gsp->gs_hash_unref(data); 445 446 for (i = 0; i < G_SCHED_HASH_SIZE; i++) { 447 LIST_FOREACH_SAFE(cp, &hp[i], gsc_clist, cp2) 448 g_sched_put_class(gp, cp->gsc_priv); 449 } 450 451 hashdestroy(hp, M_GEOM_SCHED, mask); 452} 453 454static struct g_hash * 455g_sched_hash_init(struct g_gsched *gsp, u_long *mask, int flags) 456{ 457 struct g_hash *hash; 458 459 if (gsp->gs_priv_size == 0) 460 return (NULL); 461 462 hash = hashinit_flags(G_SCHED_HASH_SIZE, M_GEOM_SCHED, mask, flags); 463 464 return (hash); 465} 466 467static void 468g_sched_flush_classes(struct g_geom *gp) 469{ 470 struct g_sched_softc *sc; 471 struct g_sched_class *cp, *cp2; 472 int i; 473 474 sc = gp->softc; 475 476 if (!sc->sc_hash || ticks - sc->sc_flush_ticks <= 0) 477 return; 478 479 for (i = 0; i < G_SCHED_HASH_SIZE; i++) { 480 LIST_FOREACH_SAFE(cp, &sc->sc_hash[i], gsc_clist, cp2) { 481 if (cp->gsc_refs == 1 && ticks - cp->gsc_expire > 0) 482 g_sched_put_class(gp, cp->gsc_priv); 483 } 484 } 485 486 sc->sc_flush_ticks = ticks + me.gs_expire_secs * hz; 487} 488 489/* 490 * Wait for the completion of any outstanding request. To ensure 491 * that this does not take forever the caller has to make sure that 492 * no new request enter the scehduler before calling us. 493 * 494 * Must be called with the gp mutex held and topology locked. 495 */ 496static int 497g_sched_wait_pending(struct g_geom *gp) 498{ 499 struct g_sched_softc *sc = gp->softc; 500 int endticks = ticks + hz; 501 502 g_topology_assert(); 503 504 while (sc->sc_pending && endticks - ticks >= 0) 505 msleep(gp, &sc->sc_mtx, 0, "sched_wait_pending", hz / 4); 506 507 return (sc->sc_pending ? ETIMEDOUT : 0); 508} 509 510static int 511g_sched_remove_locked(struct g_geom *gp, struct g_gsched *gsp) 512{ 513 struct g_sched_softc *sc = gp->softc; 514 int error; 515 516 /* Set the flushing flag: new bios will not enter the scheduler. */ 517 sc->sc_flags |= G_SCHED_FLUSHING; 518 519 g_sched_forced_dispatch(gp); 520 error = g_sched_wait_pending(gp); 521 if (error) 522 goto failed; 523 524 /* No more requests pending or in flight from the old gsp. */ 525 526 g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask, gsp, sc->sc_data); 527 sc->sc_hash = NULL; 528 529 /* 530 * Avoid deadlock here by releasing the gp mutex and reacquiring 531 * it once done. It should be safe, since no reconfiguration or 532 * destruction can take place due to the geom topology lock; no 533 * new request can use the current sc_data since we flagged the 534 * geom as being flushed. 535 */ 536 g_sched_unlock(gp); 537 gsp->gs_fini(sc->sc_data); 538 g_sched_lock(gp); 539 540 sc->sc_gsched = NULL; 541 sc->sc_data = NULL; 542 g_gsched_unref(gsp); 543 544failed: 545 sc->sc_flags &= ~G_SCHED_FLUSHING; 546 547 return (error); 548} 549 550static int 551g_sched_remove(struct g_geom *gp, struct g_gsched *gsp) 552{ 553 int error; 554 555 g_sched_lock(gp); 556 error = g_sched_remove_locked(gp, gsp); /* gsp is surely non-null */ 557 g_sched_unlock(gp); 558 559 return (error); 560} 561 562/* 563 * Support function for create/taste -- locate the desired 564 * algorithm and grab a reference to it. 565 */ 566static struct g_gsched * 567g_gsched_find(const char *name) 568{ 569 struct g_gsched *gsp = NULL; 570 571 mtx_lock(&me.gs_mtx); 572 LIST_FOREACH(gsp, &me.gs_scheds, glist) { 573 if (strcmp(name, gsp->gs_name) == 0) { 574 g_gsched_ref(gsp); 575 break; 576 } 577 } 578 mtx_unlock(&me.gs_mtx); 579 580 return (gsp); 581} 582 583/* 584 * Rebuild the list of scheduler names. 585 * To be called with me.gs_mtx lock held. 586 */ 587static void 588g_gsched_build_names(struct g_gsched *gsp) 589{ 590 int pos, l; 591 struct g_gsched *cur; 592 593 pos = 0; 594 LIST_FOREACH(cur, &me.gs_scheds, glist) { 595 l = strlen(cur->gs_name); 596 if (l + pos + 1 + 1 < sizeof(me.gs_names)) { 597 if (pos != 0) 598 me.gs_names[pos++] = ' '; 599 strcpy(me.gs_names + pos, cur->gs_name); 600 pos += l; 601 } 602 } 603 me.gs_names[pos] = '\0'; 604} 605 606/* 607 * Register or unregister individual scheduling algorithms. 608 */ 609static int 610g_gsched_register(struct g_gsched *gsp) 611{ 612 struct g_gsched *cur; 613 int error = 0; 614 615 mtx_lock(&me.gs_mtx); 616 LIST_FOREACH(cur, &me.gs_scheds, glist) { 617 if (strcmp(gsp->gs_name, cur->gs_name) == 0) 618 break; 619 } 620 if (cur != NULL) { 621 G_SCHED_DEBUG(0, "A scheduler named %s already" 622 "exists.", gsp->gs_name); 623 error = EEXIST; 624 } else { 625 LIST_INSERT_HEAD(&me.gs_scheds, gsp, glist); 626 gsp->gs_refs = 1; 627 me.gs_sched_count++; 628 g_gsched_build_names(gsp); 629 } 630 mtx_unlock(&me.gs_mtx); 631 632 return (error); 633} 634 635struct g_gsched_unregparm { 636 struct g_gsched *gup_gsp; 637 int gup_error; 638}; 639 640static void 641g_gsched_unregister(void *arg, int flag) 642{ 643 struct g_gsched_unregparm *parm = arg; 644 struct g_gsched *gsp = parm->gup_gsp, *cur, *tmp; 645 struct g_sched_softc *sc; 646 struct g_geom *gp, *gp_tmp; 647 int error; 648 649 parm->gup_error = 0; 650 651 g_topology_assert(); 652 653 if (flag == EV_CANCEL) 654 return; 655 656 mtx_lock(&me.gs_mtx); 657 658 LIST_FOREACH_SAFE(gp, &g_sched_class.geom, geom, gp_tmp) { 659 if (gp->class != &g_sched_class) 660 continue; /* Should not happen. */ 661 662 sc = gp->softc; 663 if (sc->sc_gsched == gsp) { 664 error = g_sched_remove(gp, gsp); 665 if (error) 666 goto failed; 667 } 668 } 669 670 LIST_FOREACH_SAFE(cur, &me.gs_scheds, glist, tmp) { 671 if (cur != gsp) 672 continue; 673 674 if (gsp->gs_refs != 1) { 675 G_SCHED_DEBUG(0, "%s still in use.", 676 gsp->gs_name); 677 parm->gup_error = EBUSY; 678 } else { 679 LIST_REMOVE(gsp, glist); 680 me.gs_sched_count--; 681 g_gsched_build_names(gsp); 682 } 683 break; 684 } 685 686 if (cur == NULL) { 687 G_SCHED_DEBUG(0, "%s not registered.", gsp->gs_name); 688 parm->gup_error = ENOENT; 689 } 690 691failed: 692 mtx_unlock(&me.gs_mtx); 693} 694 695static inline void 696g_gsched_global_init(void) 697{ 698 699 if (!me.gs_initialized) { 700 G_SCHED_DEBUG(0, "Initializing global data."); 701 mtx_init(&me.gs_mtx, "gsched", NULL, MTX_DEF); 702 LIST_INIT(&me.gs_scheds); 703 gs_bioq_init(&me.gs_pending); 704 me.gs_initialized = 1; 705 } 706} 707 708/* 709 * Module event called when a scheduling algorithm module is loaded or 710 * unloaded. 711 */ 712int 713g_gsched_modevent(module_t mod, int cmd, void *arg) 714{ 715 struct g_gsched *gsp = arg; 716 struct g_gsched_unregparm parm; 717 int error; 718 719 G_SCHED_DEBUG(0, "Modevent %d.", cmd); 720 721 /* 722 * If the module is loaded at boot, the geom thread that calls 723 * g_sched_init() might actually run after g_gsched_modevent(), 724 * so make sure that the module is properly initialized. 725 */ 726 g_gsched_global_init(); 727 728 error = EOPNOTSUPP; 729 switch (cmd) { 730 case MOD_LOAD: 731 error = g_gsched_register(gsp); 732 G_SCHED_DEBUG(0, "Loaded module %s error %d.", 733 gsp->gs_name, error); 734 if (error == 0) 735 g_retaste(&g_sched_class); 736 break; 737 738 case MOD_UNLOAD: 739 parm.gup_gsp = gsp; 740 parm.gup_error = 0; 741 742 error = g_waitfor_event(g_gsched_unregister, 743 &parm, M_WAITOK, NULL); 744 if (error == 0) 745 error = parm.gup_error; 746 G_SCHED_DEBUG(0, "Unloaded module %s error %d.", 747 gsp->gs_name, error); 748 break; 749 }; 750 751 return (error); 752} 753 754#ifdef KTR 755#define TRC_BIO_EVENT(e, bp) g_sched_trace_bio_ ## e (bp) 756 757static inline char 758g_sched_type(struct bio *bp) 759{ 760 761 if (0 != (bp->bio_cmd & BIO_READ)) 762 return ('R'); 763 else if (0 != (bp->bio_cmd & BIO_WRITE)) 764 return ('W'); 765 return ('U'); 766} 767 768static inline void 769g_sched_trace_bio_START(struct bio *bp) 770{ 771 772 CTR5(KTR_GSCHED, "S %lu %c %lu/%lu %lu", g_sched_classify(bp), 773 g_sched_type(bp), bp->bio_offset / ULONG_MAX, 774 bp->bio_offset, bp->bio_length); 775} 776 777static inline void 778g_sched_trace_bio_DONE(struct bio *bp) 779{ 780 781 CTR5(KTR_GSCHED, "D %lu %c %lu/%lu %lu", g_sched_classify(bp), 782 g_sched_type(bp), bp->bio_offset / ULONG_MAX, 783 bp->bio_offset, bp->bio_length); 784} 785#else /* !KTR */ 786#define TRC_BIO_EVENT(e, bp) 787#endif /* !KTR */ 788 789/* 790 * g_sched_done() and g_sched_start() dispatch the geom requests to 791 * the scheduling algorithm in use. 792 */ 793static void 794g_sched_done(struct bio *bio) 795{ 796 struct g_geom *gp = bio->bio_caller2; 797 struct g_sched_softc *sc = gp->softc; 798 799 TRC_BIO_EVENT(DONE, bio); 800 801 KASSERT(bio->bio_caller1, ("null bio_caller1 in g_sched_done")); 802 803 g_sched_lock(gp); 804 805 g_sched_update_stats(bio); 806 sc->sc_gsched->gs_done(sc->sc_data, bio); 807 if (!--sc->sc_pending) 808 wakeup(gp); 809 810 g_sched_flush_classes(gp); 811 g_sched_unlock(gp); 812 813 g_std_done(bio); 814} 815 816static void 817g_sched_start(struct bio *bp) 818{ 819 struct g_geom *gp = bp->bio_to->geom; 820 struct g_sched_softc *sc = gp->softc; 821 struct bio *cbp; 822 823 TRC_BIO_EVENT(START, bp); 824 G_SCHED_LOGREQ(bp, "Request received."); 825 826 cbp = g_clone_bio(bp); 827 if (cbp == NULL) { 828 g_io_deliver(bp, ENOMEM); 829 return; 830 } 831 cbp->bio_done = g_sched_done; 832 cbp->bio_to = LIST_FIRST(&gp->provider); 833 KASSERT(cbp->bio_to != NULL, ("NULL provider")); 834 835 /* We only schedule reads and writes. */ 836 if (0 == (bp->bio_cmd & (BIO_READ | BIO_WRITE))) 837 goto bypass; 838 839 G_SCHED_LOGREQ(cbp, "Sending request."); 840 841 g_sched_lock(gp); 842 /* 843 * Call the algorithm's gs_start to queue the request in the 844 * scheduler. If gs_start fails then pass the request down, 845 * otherwise call g_sched_dispatch() which tries to push 846 * one or more requests down. 847 */ 848 if (!sc->sc_gsched || (sc->sc_flags & G_SCHED_FLUSHING) || 849 sc->sc_gsched->gs_start(sc->sc_data, cbp)) { 850 g_sched_unlock(gp); 851 goto bypass; 852 } 853 /* 854 * We use bio_caller1 to mark requests that are scheduled 855 * so make sure it is not NULL. 856 */ 857 if (cbp->bio_caller1 == NULL) 858 cbp->bio_caller1 = &me; /* anything not NULL */ 859 860 cbp->bio_caller2 = gp; 861 sc->sc_pending++; 862 863 /* Update general stats. */ 864 me.gs_in_flight++; 865 me.gs_requests++; 866 me.gs_bytes_in_flight += bp->bio_length; 867 if (bp->bio_cmd & BIO_WRITE) { 868 me.gs_writes_in_flight++; 869 me.gs_write_bytes_in_flight += bp->bio_length; 870 } 871 g_sched_dispatch(gp); 872 g_sched_unlock(gp); 873 return; 874 875bypass: 876 cbp->bio_done = g_std_done; 877 cbp->bio_caller1 = NULL; /* not scheduled */ 878 g_io_request(cbp, LIST_FIRST(&gp->consumer)); 879} 880 881/* 882 * The next few functions are the geom glue. 883 */ 884static void 885g_sched_orphan(struct g_consumer *cp) 886{ 887 888 g_topology_assert(); 889 g_sched_destroy(cp->geom, 1); 890} 891 892static int 893g_sched_access(struct g_provider *pp, int dr, int dw, int de) 894{ 895 struct g_geom *gp; 896 struct g_consumer *cp; 897 int error; 898 899 gp = pp->geom; 900 cp = LIST_FIRST(&gp->consumer); 901 error = g_access(cp, dr, dw, de); 902 903 return (error); 904} 905 906static void 907g_sched_temporary_start(struct bio *bio) 908{ 909 910 mtx_lock(&me.gs_mtx); 911 me.gs_npending++; 912 gs_bioq_disksort(&me.gs_pending, bio); 913 mtx_unlock(&me.gs_mtx); 914} 915 916static void 917g_sched_flush_pending(g_start_t *start) 918{ 919 struct bio *bp; 920 921 while ((bp = gs_bioq_takefirst(&me.gs_pending))) 922 start(bp); 923} 924 925static int 926g_insert_proxy(struct g_geom *gp, struct g_provider *newpp, 927 struct g_geom *dstgp, struct g_provider *pp, struct g_consumer *cp) 928{ 929 struct g_sched_softc *sc = gp->softc; 930 g_start_t *saved_start, *flush = g_sched_start; 931 int error = 0, endticks = ticks + hz; 932 933 g_cancel_event(newpp); /* prevent taste() */ 934 /* copy private fields */ 935 newpp->private = pp->private; 936 newpp->index = pp->index; 937 938 /* Queue all the early requests coming for us. */ 939 me.gs_npending = 0; 940 saved_start = pp->geom->start; 941 dstgp->start = g_sched_temporary_start; 942 943 while (pp->nstart - pp->nend != me.gs_npending && 944 endticks - ticks >= 0) 945 tsleep(pp, PRIBIO, "-", hz/10); 946 947 if (pp->nstart - pp->nend != me.gs_npending) { 948 flush = saved_start; 949 error = ETIMEDOUT; 950 goto fail; 951 } 952 953 /* link pp to this geom */ 954 LIST_REMOVE(pp, provider); 955 pp->geom = gp; 956 LIST_INSERT_HEAD(&gp->provider, pp, provider); 957 958 /* 959 * replicate the counts from the parent in the 960 * new provider and consumer nodes 961 */ 962 cp->acr = newpp->acr = pp->acr; 963 cp->acw = newpp->acw = pp->acw; 964 cp->ace = newpp->ace = pp->ace; 965 sc->sc_flags |= G_SCHED_PROXYING; 966 967fail: 968 dstgp->start = saved_start; 969 970 g_sched_flush_pending(flush); 971 972 return (error); 973} 974 975/* 976 * Create a geom node for the device passed as *pp. 977 * If successful, add a reference to this gsp. 978 */ 979static int 980g_sched_create(struct gctl_req *req, struct g_class *mp, 981 struct g_provider *pp, struct g_gsched *gsp, int proxy) 982{ 983 struct g_sched_softc *sc = NULL; 984 struct g_geom *gp, *dstgp; 985 struct g_provider *newpp = NULL; 986 struct g_consumer *cp = NULL; 987 char name[64]; 988 int error; 989 990 g_topology_assert(); 991 992 snprintf(name, sizeof(name), "%s%s", pp->name, G_SCHED_SUFFIX); 993 LIST_FOREACH(gp, &mp->geom, geom) { 994 if (strcmp(gp->name, name) == 0) { 995 gctl_error(req, "Geom %s already exists.", 996 name); 997 return (EEXIST); 998 } 999 } 1000 1001 gp = g_new_geomf(mp, name); 1002 dstgp = proxy ? pp->geom : gp; /* where do we link the provider */ 1003 if (gp == NULL) { 1004 gctl_error(req, "Cannot create geom %s.", name); 1005 error = ENOMEM; 1006 goto fail; 1007 } 1008 1009 sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO); 1010 sc->sc_gsched = gsp; 1011 sc->sc_data = gsp->gs_init(gp); 1012 if (sc->sc_data == NULL) { 1013 error = ENOMEM; 1014 goto fail; 1015 } 1016 1017 sc->sc_hash = g_sched_hash_init(gsp, &sc->sc_mask, HASH_WAITOK); 1018 1019 /* 1020 * Do not initialize the flush mechanism, will be initialized 1021 * on the first insertion on the hash table. 1022 */ 1023 1024 mtx_init(&sc->sc_mtx, "g_sched_mtx", NULL, MTX_DEF); 1025 1026 gp->softc = sc; 1027 gp->start = g_sched_start; 1028 gp->orphan = g_sched_orphan; 1029 gp->access = g_sched_access; 1030 gp->dumpconf = g_sched_dumpconf; 1031 1032 newpp = g_new_providerf(dstgp, gp->name); 1033 if (newpp == NULL) { 1034 gctl_error(req, "Cannot create provider %s.", name); 1035 error = ENOMEM; 1036 goto fail; 1037 } 1038 1039 newpp->mediasize = pp->mediasize; 1040 newpp->sectorsize = pp->sectorsize; 1041 1042 cp = g_new_consumer(gp); 1043 if (cp == NULL) { 1044 gctl_error(req, "Cannot create consumer for %s.", 1045 gp->name); 1046 error = ENOMEM; 1047 goto fail; 1048 } 1049 1050 error = g_attach(cp, proxy ? newpp : pp); 1051 if (error != 0) { 1052 gctl_error(req, "Cannot attach to provider %s.", 1053 pp->name); 1054 goto fail; 1055 } 1056 1057 g_error_provider(newpp, 0); 1058 if (proxy) { 1059 error = g_insert_proxy(gp, newpp, dstgp, pp, cp); 1060 if (error) 1061 goto fail; 1062 } 1063 G_SCHED_DEBUG(0, "Device %s created.", gp->name); 1064 1065 g_gsched_ref(gsp); 1066 1067 return (0); 1068 1069fail: 1070 if (cp != NULL) { 1071 if (cp->provider != NULL) 1072 g_detach(cp); 1073 g_destroy_consumer(cp); 1074 } 1075 1076 if (newpp != NULL) 1077 g_destroy_provider(newpp); 1078 1079 if (sc && sc->sc_hash) { 1080 g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask, 1081 gsp, sc->sc_data); 1082 } 1083 1084 if (sc && sc->sc_data) 1085 gsp->gs_fini(sc->sc_data); 1086 1087 if (gp != NULL) { 1088 if (gp->softc != NULL) 1089 g_free(gp->softc); 1090 g_destroy_geom(gp); 1091 } 1092 1093 return (error); 1094} 1095 1096/* 1097 * Support for dynamic switching of scheduling algorithms. 1098 * First initialize the data structures for the new algorithm, 1099 * then call g_sched_remove_locked() to flush all references 1100 * to the old one, finally link the new algorithm. 1101 */ 1102static int 1103g_sched_change_algo(struct gctl_req *req, struct g_class *mp, 1104 struct g_provider *pp, struct g_gsched *gsp) 1105{ 1106 struct g_sched_softc *sc; 1107 struct g_geom *gp; 1108 struct g_hash *newh; 1109 void *data; 1110 u_long mask; 1111 int error = 0; 1112 1113 gp = pp->geom; 1114 sc = gp->softc; 1115 1116 data = gsp->gs_init(gp); 1117 if (data == NULL) 1118 return (ENOMEM); 1119 1120 newh = g_sched_hash_init(gsp, &mask, HASH_WAITOK); 1121 if (gsp->gs_priv_size && !newh) { 1122 error = ENOMEM; 1123 goto fail; 1124 } 1125 1126 g_sched_lock(gp); 1127 if (sc->sc_gsched) { /* can be NULL in some cases */ 1128 error = g_sched_remove_locked(gp, sc->sc_gsched); 1129 if (error) 1130 goto fail; 1131 } 1132 1133 g_gsched_ref(gsp); 1134 sc->sc_gsched = gsp; 1135 sc->sc_data = data; 1136 sc->sc_hash = newh; 1137 sc->sc_mask = mask; 1138 1139 g_sched_unlock(gp); 1140 1141 return (0); 1142 1143fail: 1144 if (newh) 1145 g_sched_hash_fini(gp, newh, mask, gsp, data); 1146 1147 if (data) 1148 gsp->gs_fini(data); 1149 1150 g_sched_unlock(gp); 1151 1152 return (error); 1153} 1154 1155/* 1156 * Stop the request flow directed to the proxy, redirecting the new 1157 * requests to the me.gs_pending queue. 1158 */ 1159static struct g_provider * 1160g_detach_proxy(struct g_geom *gp) 1161{ 1162 struct g_consumer *cp; 1163 struct g_provider *pp, *newpp; 1164 1165 do { 1166 pp = LIST_FIRST(&gp->provider); 1167 if (pp == NULL) 1168 break; 1169 cp = LIST_FIRST(&gp->consumer); 1170 if (cp == NULL) 1171 break; 1172 newpp = cp->provider; 1173 if (newpp == NULL) 1174 break; 1175 1176 me.gs_npending = 0; 1177 pp->geom->start = g_sched_temporary_start; 1178 1179 return (pp); 1180 } while (0); 1181 printf("%s error detaching proxy %s\n", __FUNCTION__, gp->name); 1182 1183 return (NULL); 1184} 1185 1186static void 1187g_sched_blackhole(struct bio *bp) 1188{ 1189 1190 g_io_deliver(bp, ENXIO); 1191} 1192 1193static inline void 1194g_reparent_provider(struct g_provider *pp, struct g_geom *gp, 1195 struct g_provider *newpp) 1196{ 1197 1198 LIST_REMOVE(pp, provider); 1199 if (newpp) { 1200 pp->private = newpp->private; 1201 pp->index = newpp->index; 1202 } 1203 pp->geom = gp; 1204 LIST_INSERT_HEAD(&gp->provider, pp, provider); 1205} 1206 1207static inline void 1208g_unproxy_provider(struct g_provider *oldpp, struct g_provider *newpp) 1209{ 1210 struct g_geom *gp = oldpp->geom; 1211 1212 g_reparent_provider(oldpp, newpp->geom, newpp); 1213 1214 /* 1215 * Hackish: let the system destroy the old provider for us, just 1216 * in case someone attached a consumer to it, in which case a 1217 * direct call to g_destroy_provider() would not work. 1218 */ 1219 g_reparent_provider(newpp, gp, NULL); 1220} 1221 1222/* 1223 * Complete the proxy destruction, linking the old provider to its 1224 * original geom, and destroying the proxy provider. Also take care 1225 * of issuing the pending requests collected in me.gs_pending (if any). 1226 */ 1227static int 1228g_destroy_proxy(struct g_geom *gp, struct g_provider *oldpp) 1229{ 1230 struct g_consumer *cp; 1231 struct g_provider *newpp; 1232 1233 do { 1234 cp = LIST_FIRST(&gp->consumer); 1235 if (cp == NULL) 1236 break; 1237 newpp = cp->provider; 1238 if (newpp == NULL) 1239 break; 1240 1241 /* Relink the provider to its original geom. */ 1242 g_unproxy_provider(oldpp, newpp); 1243 1244 /* Detach consumer from provider, and destroy provider. */ 1245 cp->acr = newpp->acr = 0; 1246 cp->acw = newpp->acw = 0; 1247 cp->ace = newpp->ace = 0; 1248 g_detach(cp); 1249 1250 /* Send the pending bios through the right start function. */ 1251 g_sched_flush_pending(oldpp->geom->start); 1252 1253 return (0); 1254 } while (0); 1255 printf("%s error destroying proxy %s\n", __FUNCTION__, gp->name); 1256 1257 /* We cannot send the pending bios anywhere... */ 1258 g_sched_flush_pending(g_sched_blackhole); 1259 1260 return (EINVAL); 1261} 1262 1263static int 1264g_sched_destroy(struct g_geom *gp, boolean_t force) 1265{ 1266 struct g_provider *pp, *oldpp = NULL; 1267 struct g_sched_softc *sc; 1268 struct g_gsched *gsp; 1269 int error; 1270 1271 g_topology_assert(); 1272 sc = gp->softc; 1273 if (sc == NULL) 1274 return (ENXIO); 1275 if (!(sc->sc_flags & G_SCHED_PROXYING)) { 1276 pp = LIST_FIRST(&gp->provider); 1277 if (pp && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) { 1278 const char *msg = force ? 1279 "but we force removal" : "cannot remove"; 1280 1281 G_SCHED_DEBUG(!force, 1282 "Device %s is still open (r%dw%de%d), %s.", 1283 pp->name, pp->acr, pp->acw, pp->ace, msg); 1284 if (!force) 1285 return (EBUSY); 1286 } else { 1287 G_SCHED_DEBUG(0, "Device %s removed.", gp->name); 1288 } 1289 } else 1290 oldpp = g_detach_proxy(gp); 1291 1292 gsp = sc->sc_gsched; 1293 if (gsp) { 1294 /* 1295 * XXX bad hack here: force a dispatch to release 1296 * any reference to the hash table still held by 1297 * the scheduler. 1298 */ 1299 g_sched_lock(gp); 1300 /* 1301 * We are dying here, no new requests should enter 1302 * the scheduler. This is granted by the topolgy, 1303 * either in case we were proxying (new bios are 1304 * being redirected) or not (see the access check 1305 * above). 1306 */ 1307 g_sched_forced_dispatch(gp); 1308 error = g_sched_wait_pending(gp); 1309 1310 if (error) { 1311 /* 1312 * Not all the requests came home: this might happen 1313 * under heavy load, or if we were waiting for any 1314 * bio which is served in the event path (see 1315 * geom_slice.c for an example of how this can 1316 * happen). Try to restore a working configuration 1317 * if we can fail. 1318 */ 1319 if ((sc->sc_flags & G_SCHED_PROXYING) && oldpp) { 1320 g_sched_flush_pending(force ? 1321 g_sched_blackhole : g_sched_start); 1322 } 1323 1324 /* 1325 * In the forced destroy case there is not so much 1326 * we can do, we have pending bios that will call 1327 * g_sched_done() somehow, and we don't want them 1328 * to crash the system using freed memory. We tell 1329 * the user that something went wrong, and leak some 1330 * memory here. 1331 * Note: the callers using force = 1 ignore the 1332 * return value. 1333 */ 1334 if (force) { 1335 G_SCHED_DEBUG(0, "Pending requests while " 1336 " destroying geom, some memory leaked."); 1337 } 1338 1339 return (error); 1340 } 1341 1342 g_sched_unlock(gp); 1343 g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask, 1344 gsp, sc->sc_data); 1345 sc->sc_hash = NULL; 1346 gsp->gs_fini(sc->sc_data); 1347 g_gsched_unref(gsp); 1348 sc->sc_gsched = NULL; 1349 } 1350 1351 if ((sc->sc_flags & G_SCHED_PROXYING) && oldpp) { 1352 error = g_destroy_proxy(gp, oldpp); 1353 1354 if (error) { 1355 if (force) { 1356 G_SCHED_DEBUG(0, "Unrecoverable error while " 1357 "destroying a proxy geom, leaking some " 1358 " memory."); 1359 } 1360 1361 return (error); 1362 } 1363 } 1364 1365 mtx_destroy(&sc->sc_mtx); 1366 1367 g_free(gp->softc); 1368 gp->softc = NULL; 1369 g_wither_geom(gp, ENXIO); 1370 1371 return (error); 1372} 1373 1374static int 1375g_sched_destroy_geom(struct gctl_req *req, struct g_class *mp, 1376 struct g_geom *gp) 1377{ 1378 1379 return (g_sched_destroy(gp, 0)); 1380} 1381 1382/* 1383 * Functions related to the classification of requests. 1384 * 1385 * On recent FreeBSD versions (8.0 and above), we store a reference 1386 * to the issuer of a request in bp->bio_classifier1 as soon 1387 * as the bio is posted to the geom queue (and not later, because 1388 * requests are managed by the g_down thread afterwards). 1389 * 1390 * On older versions of the system (but this code is not used 1391 * in any existing release), we [ab]use the caller1 field in the 1392 * root element of the bio tree to store the classification info. 1393 * The marking is done at the beginning of g_io_request() 1394 * and only if we find that the field is NULL. 1395 * 1396 * To avoid rebuilding the kernel, this module will patch the 1397 * initial part of g_io_request() so it jumps to some hand-coded 1398 * assembly that does the marking and then executes the original 1399 * body of g_io_request(). 1400 * 1401 * fake_ioreq[] is architecture-specific machine code 1402 * that implements the above. CODE_SIZE, STORE_SIZE etc. 1403 * are constants used in the patching routine. Look at the 1404 * code in g_ioreq_patch() for the details. 1405 */ 1406 1407#ifndef HAVE_BIO_CLASSIFIER 1408/* 1409 * Support for old FreeBSD versions 1410 */ 1411#if defined(__i386__) 1412#define CODE_SIZE 29 1413#define STORE_SIZE 5 1414#define EPILOGUE 5 1415#define SIZE (CODE_SIZE + STORE_SIZE + EPILOGUE) 1416 1417static u_char fake_ioreq[SIZE] = { 1418 0x8b, 0x44, 0x24, 0x04, /* mov bp, %eax */ 1419 /* 1: */ 1420 0x89, 0xc2, /* mov %eax, %edx # edx = bp */ 1421 0x8b, 0x40, 0x64, /* mov bp->bio_parent, %eax */ 1422 0x85, 0xc0, /* test %eax, %eax */ 1423 0x75, 0xf7, /* jne 1b */ 1424 0x8b, 0x42, 0x30, /* mov bp->bp_caller1, %eax */ 1425 0x85, 0xc0, /* test %eax, %eax */ 1426 0x75, 0x09, /* jne 2f */ 1427 0x64, 0xa1, 0x00, 0x00, /* mov %fs:0, %eax */ 1428 0x00, 0x00, 1429 0x89, 0x42, 0x30, /* mov %eax, bp->bio_caller1 */ 1430 /* 2: */ 1431 0x55, 0x89, 0xe5, 0x57, 0x56, 1432 0xe9, 0x00, 0x00, 0x00, 0x00, /* jmp back... */ 1433}; 1434#elif defined(__amd64) 1435#define CODE_SIZE 38 1436#define STORE_SIZE 6 1437#define EPILOGUE 5 1438#define SIZE (CODE_SIZE + STORE_SIZE + EPILOGUE) 1439 1440static u_char fake_ioreq[SIZE] = { 1441 0x48, 0x89, 0xf8, /* mov bp, %rax */ 1442 /* 1: */ 1443 0x48, 0x89, 0xc2, /* mov %rax, %rdx # rdx = bp */ 1444 0x48, 0x8b, 0x82, 0xa8, /* mov bp->bio_parent, %rax */ 1445 0x00, 0x00, 0x00, 1446 0x48, 0x85, 0xc0, /* test %rax, %rax */ 1447 0x75, 0xf1, /* jne 1b */ 1448 0x48, 0x83, 0x7a, 0x58, /* cmp $0, bp->bp_caller1 */ 1449 0x00, 1450 0x75, 0x0d, /* jne 2f */ 1451 0x65, 0x48, 0x8b, 0x04, /* mov %gs:0, %rax */ 1452 0x25, 0x00, 0x00, 0x00, 1453 0x00, 1454 0x48, 0x89, 0x42, 0x58, /* mov %rax, bp->bio_caller1 */ 1455 /* 2: */ 1456 0x55, 0x48, 0x89, 0xe5, 0x41, 0x56, 1457 0xe9, 0x00, 0x00, 0x00, 0x00, /* jmp back... */ 1458}; 1459#else /* neither x86 nor amd64 */ 1460static void 1461g_new_io_request(struct bio *bp, struct g_consumer *cp) 1462{ 1463 struct bio *top = bp; 1464 1465 /* 1466 * bio classification: if bio_caller1 is available in the 1467 * root of the 'struct bio' tree, store there the thread id 1468 * of the thread that originated the request. 1469 * More sophisticated classification schemes can be used. 1470 */ 1471 while (top->bio_parent) 1472 top = top->bio_parent; 1473 1474 if (top->bio_caller1 == NULL) 1475 top->bio_caller1 = curthread; 1476} 1477 1478#error please add the code above in g_new_io_request() to the beginning of \ 1479 /sys/geom/geom_io.c::g_io_request(), and remove this line. 1480#endif /* end of arch-specific code */ 1481 1482static int 1483g_ioreq_patch(void) 1484{ 1485 u_char *original; 1486 u_long ofs; 1487 int found; 1488 1489 if (me.gs_patched) 1490 return (-1); 1491 1492 original = (u_char *)g_io_request; 1493 1494 found = !bcmp(original, fake_ioreq + CODE_SIZE, STORE_SIZE); 1495 if (!found) 1496 return (-1); 1497 1498 /* Jump back to the original + STORE_SIZE. */ 1499 ofs = (original + STORE_SIZE) - (fake_ioreq + SIZE); 1500 bcopy(&ofs, fake_ioreq + CODE_SIZE + STORE_SIZE + 1, 4); 1501 1502 /* Patch the original address with a jump to the trampoline. */ 1503 *original = 0xe9; /* jump opcode */ 1504 ofs = fake_ioreq - (original + 5); 1505 bcopy(&ofs, original + 1, 4); 1506 1507 me.gs_patched = 1; 1508 1509 return (0); 1510} 1511 1512/* 1513 * Restore the original code, this is easy. 1514 */ 1515static void 1516g_ioreq_restore(void) 1517{ 1518 u_char *original; 1519 1520 if (me.gs_patched) { 1521 original = (u_char *)g_io_request; 1522 bcopy(fake_ioreq + CODE_SIZE, original, STORE_SIZE); 1523 me.gs_patched = 0; 1524 } 1525} 1526 1527static inline void 1528g_classifier_ini(void) 1529{ 1530 1531 g_ioreq_patch(); 1532} 1533 1534static inline void 1535g_classifier_fini(void) 1536{ 1537 1538 g_ioreq_restore(); 1539} 1540 1541/*--- end of support code for older FreeBSD versions */ 1542 1543#else /* HAVE_BIO_CLASSIFIER */ 1544 1545/* 1546 * Classifier support for recent FreeBSD versions: we use 1547 * a very simple classifier, only use curthread to tag a request. 1548 * The classifier is registered at module load, and unregistered 1549 * at module unload. 1550 */ 1551static int 1552g_sched_tag(void *arg, struct bio *bp) 1553{ 1554 1555 bp->bio_classifier1 = curthread; 1556 return (1); 1557} 1558 1559static struct g_classifier_hook g_sched_classifier = { 1560 .func = g_sched_tag, 1561}; 1562 1563static inline void 1564g_classifier_ini(void) 1565{ 1566 1567 g_register_classifier(&g_sched_classifier); 1568} 1569 1570static inline void 1571g_classifier_fini(void) 1572{ 1573 1574 g_unregister_classifier(&g_sched_classifier); 1575} 1576#endif /* HAVE_BIO_CLASSIFIER */ 1577 1578static void 1579g_sched_init(struct g_class *mp) 1580{ 1581 1582 g_gsched_global_init(); 1583 1584 G_SCHED_DEBUG(0, "Loading: mp = %p, g_sched_class = %p.", 1585 mp, &g_sched_class); 1586 1587 /* Patch g_io_request to store classification info in the bio. */ 1588 g_classifier_ini(); 1589} 1590 1591static void 1592g_sched_fini(struct g_class *mp) 1593{ 1594 1595 g_classifier_fini(); 1596 1597 G_SCHED_DEBUG(0, "Unloading..."); 1598 1599 KASSERT(LIST_EMPTY(&me.gs_scheds), ("still registered schedulers")); 1600 mtx_destroy(&me.gs_mtx); 1601} 1602 1603/* 1604 * Read the i-th argument for a request, skipping the /dev/ 1605 * prefix if present. 1606 */ 1607static const char * 1608g_sched_argi(struct gctl_req *req, int i) 1609{ 1610 static const char *dev_prefix = "/dev/"; 1611 const char *name; 1612 char param[16]; 1613 int l = strlen(dev_prefix); 1614 1615 snprintf(param, sizeof(param), "arg%d", i); 1616 name = gctl_get_asciiparam(req, param); 1617 if (name == NULL) 1618 gctl_error(req, "No 'arg%d' argument", i); 1619 else if (strncmp(name, dev_prefix, l) == 0) 1620 name += l; 1621 return (name); 1622} 1623 1624/* 1625 * Fetch nargs and do appropriate checks. 1626 */ 1627static int 1628g_sched_get_nargs(struct gctl_req *req) 1629{ 1630 int *nargs; 1631 1632 nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); 1633 if (nargs == NULL) { 1634 gctl_error(req, "No 'nargs' argument"); 1635 return (0); 1636 } 1637 if (*nargs <= 0) 1638 gctl_error(req, "Missing device(s)."); 1639 return (*nargs); 1640} 1641 1642/* 1643 * Check whether we should add the class on certain volumes when 1644 * this geom is created. Right now this is under control of a kenv 1645 * variable containing the names of all devices that we care about. 1646 * Probably we should only support transparent insertion as the 1647 * preferred mode of operation. 1648 */ 1649static struct g_geom * 1650g_sched_taste(struct g_class *mp, struct g_provider *pp, 1651 int flags __unused) 1652{ 1653 struct g_gsched *gsp = NULL; /* the . algorithm we want */ 1654 const char *s; /* generic string pointer */ 1655 const char *taste_names; /* devices we like */ 1656 int l; 1657 1658 g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, 1659 mp->name, pp->name); 1660 g_topology_assert(); 1661 1662 G_SCHED_DEBUG(2, "Tasting %s.", pp->name); 1663 1664 do { 1665 /* do not taste on ourselves */ 1666 if (pp->geom->class == mp) 1667 break; 1668 1669 taste_names = getenv("geom.sched.taste"); 1670 if (taste_names == NULL) 1671 break; 1672 1673 l = strlen(pp->name); 1674 for (s = taste_names; *s && 1675 (s = strstr(s, pp->name)); s++) { 1676 /* further checks for an exact match */ 1677 if ( (s == taste_names || s[-1] == ' ') && 1678 (s[l] == '\0' || s[l] == ' ') ) 1679 break; 1680 } 1681 if (s == NULL) 1682 break; 1683 G_SCHED_DEBUG(0, "Attach device %s match [%s]\n", 1684 pp->name, s); 1685 1686 /* look up the provider name in the list */ 1687 s = getenv("geom.sched.algo"); 1688 if (s == NULL) 1689 s = "rr"; 1690 1691 gsp = g_gsched_find(s); /* also get a reference */ 1692 if (gsp == NULL) { 1693 G_SCHED_DEBUG(0, "Bad '%s' algorithm.", s); 1694 break; 1695 } 1696 1697 /* XXX create with 1 as last argument ? */ 1698 g_sched_create(NULL, mp, pp, gsp, 0); 1699 g_gsched_unref(gsp); 1700 } while (0); 1701 return NULL; 1702} 1703 1704static void 1705g_sched_ctl_create(struct gctl_req *req, struct g_class *mp, int proxy) 1706{ 1707 struct g_provider *pp; 1708 struct g_gsched *gsp; 1709 const char *name; 1710 int i, nargs; 1711 1712 g_topology_assert(); 1713 1714 name = gctl_get_asciiparam(req, "algo"); 1715 if (name == NULL) { 1716 gctl_error(req, "No '%s' argument", "algo"); 1717 return; 1718 } 1719 1720 gsp = g_gsched_find(name); /* also get a reference */ 1721 if (gsp == NULL) { 1722 gctl_error(req, "Bad algorithm '%s'", name); 1723 return; 1724 } 1725 1726 nargs = g_sched_get_nargs(req); 1727 1728 /* 1729 * Run on the arguments, and break on any error. 1730 * We look for a device name, but skip the /dev/ prefix if any. 1731 */ 1732 for (i = 0; i < nargs; i++) { 1733 name = g_sched_argi(req, i); 1734 if (name == NULL) 1735 break; 1736 pp = g_provider_by_name(name); 1737 if (pp == NULL) { 1738 G_SCHED_DEBUG(1, "Provider %s is invalid.", name); 1739 gctl_error(req, "Provider %s is invalid.", name); 1740 break; 1741 } 1742 if (g_sched_create(req, mp, pp, gsp, proxy) != 0) 1743 break; 1744 } 1745 1746 g_gsched_unref(gsp); 1747} 1748 1749static void 1750g_sched_ctl_configure(struct gctl_req *req, struct g_class *mp) 1751{ 1752 struct g_provider *pp; 1753 struct g_gsched *gsp; 1754 const char *name; 1755 int i, nargs; 1756 1757 g_topology_assert(); 1758 1759 name = gctl_get_asciiparam(req, "algo"); 1760 if (name == NULL) { 1761 gctl_error(req, "No '%s' argument", "algo"); 1762 return; 1763 } 1764 1765 gsp = g_gsched_find(name); /* also get a reference */ 1766 if (gsp == NULL) { 1767 gctl_error(req, "Bad algorithm '%s'", name); 1768 return; 1769 } 1770 1771 nargs = g_sched_get_nargs(req); 1772 1773 /* 1774 * Run on the arguments, and break on any error. 1775 * We look for a device name, but skip the /dev/ prefix if any. 1776 */ 1777 for (i = 0; i < nargs; i++) { 1778 name = g_sched_argi(req, i); 1779 if (name == NULL) 1780 break; 1781 pp = g_provider_by_name(name); 1782 if (pp == NULL || pp->geom->class != mp) { 1783 G_SCHED_DEBUG(1, "Provider %s is invalid.", name); 1784 gctl_error(req, "Provider %s is invalid.", name); 1785 break; 1786 } 1787 if (g_sched_change_algo(req, mp, pp, gsp) != 0) 1788 break; 1789 } 1790 1791 g_gsched_unref(gsp); 1792} 1793 1794static struct g_geom * 1795g_sched_find_geom(struct g_class *mp, const char *name) 1796{ 1797 struct g_geom *gp; 1798 1799 LIST_FOREACH(gp, &mp->geom, geom) { 1800 if (strcmp(gp->name, name) == 0) 1801 return (gp); 1802 } 1803 return (NULL); 1804} 1805 1806static void 1807g_sched_ctl_destroy(struct gctl_req *req, struct g_class *mp) 1808{ 1809 int nargs, *force, error, i; 1810 struct g_geom *gp; 1811 const char *name; 1812 1813 g_topology_assert(); 1814 1815 nargs = g_sched_get_nargs(req); 1816 1817 force = gctl_get_paraml(req, "force", sizeof(*force)); 1818 if (force == NULL) { 1819 gctl_error(req, "No 'force' argument"); 1820 return; 1821 } 1822 1823 for (i = 0; i < nargs; i++) { 1824 name = g_sched_argi(req, i); 1825 if (name == NULL) 1826 break; 1827 1828 gp = g_sched_find_geom(mp, name); 1829 if (gp == NULL) { 1830 G_SCHED_DEBUG(1, "Device %s is invalid.", name); 1831 gctl_error(req, "Device %s is invalid.", name); 1832 break; 1833 } 1834 1835 error = g_sched_destroy(gp, *force); 1836 if (error != 0) { 1837 gctl_error(req, "Cannot destroy device %s (error=%d).", 1838 gp->name, error); 1839 break; 1840 } 1841 } 1842} 1843 1844static void 1845g_sched_config(struct gctl_req *req, struct g_class *mp, const char *verb) 1846{ 1847 uint32_t *version; 1848 1849 g_topology_assert(); 1850 1851 version = gctl_get_paraml(req, "version", sizeof(*version)); 1852 if (version == NULL) { 1853 gctl_error(req, "No '%s' argument.", "version"); 1854 return; 1855 } 1856 1857 if (*version != G_SCHED_VERSION) { 1858 gctl_error(req, "Userland and kernel parts are " 1859 "out of sync."); 1860 return; 1861 } 1862 1863 if (strcmp(verb, "create") == 0) { 1864 g_sched_ctl_create(req, mp, 0); 1865 return; 1866 } else if (strcmp(verb, "insert") == 0) { 1867 g_sched_ctl_create(req, mp, 1); 1868 return; 1869 } else if (strcmp(verb, "configure") == 0) { 1870 g_sched_ctl_configure(req, mp); 1871 return; 1872 } else if (strcmp(verb, "destroy") == 0) { 1873 g_sched_ctl_destroy(req, mp); 1874 return; 1875 } 1876 1877 gctl_error(req, "Unknown verb."); 1878} 1879 1880static void 1881g_sched_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, 1882 struct g_consumer *cp, struct g_provider *pp) 1883{ 1884 struct g_sched_softc *sc = gp->softc; 1885 struct g_gsched *gsp = sc->sc_gsched; 1886 if (indent == NULL) { /* plaintext */ 1887 sbuf_printf(sb, " algo %s", gsp ? gsp->gs_name : "--"); 1888 } 1889 if (gsp->gs_dumpconf) 1890 gsp->gs_dumpconf(sb, indent, gp, cp, pp); 1891} 1892 1893DECLARE_GEOM_CLASS(g_sched_class, g_sched); 1894MODULE_VERSION(geom_sched, 0); 1895