geom_vinum_plex.c revision 135426
/*-
 * Copyright (c) 2004 Lukas Ertl
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
25130389Sle */ 26130389Sle 27130389Sle#include <sys/cdefs.h> 28130389Sle__FBSDID("$FreeBSD: head/sys/geom/vinum/geom_vinum_plex.c 135426 2004-09-18 13:44:43Z le $"); 29130389Sle 30130389Sle#include <sys/param.h> 31130389Sle#include <sys/bio.h> 32130389Sle#include <sys/kernel.h> 33130389Sle#include <sys/kthread.h> 34130389Sle#include <sys/libkern.h> 35130389Sle#include <sys/lock.h> 36130389Sle#include <sys/malloc.h> 37130389Sle#include <sys/module.h> 38130389Sle#include <sys/mutex.h> 39130389Sle#include <sys/systm.h> 40130389Sle 41130389Sle#include <geom/geom.h> 42130389Sle#include <geom/vinum/geom_vinum_var.h> 43130389Sle#include <geom/vinum/geom_vinum_raid5.h> 44130389Sle#include <geom/vinum/geom_vinum.h> 45130389Sle 46135426Slestatic void gv_plex_completed_request(struct gv_plex *, struct bio *); 47135426Slestatic void gv_plex_normal_request(struct gv_plex *, struct bio *); 48135426Slestatic void gv_plex_worker(void *); 49135426Sle 50130389Sle/* XXX: is this the place to catch dying subdisks? 
*/ 51130389Slestatic void 52130389Slegv_plex_orphan(struct g_consumer *cp) 53130389Sle{ 54130389Sle struct g_geom *gp; 55130389Sle struct gv_plex *p; 56130389Sle int error; 57130389Sle 58130389Sle g_topology_assert(); 59130389Sle gp = cp->geom; 60130389Sle g_trace(G_T_TOPOLOGY, "gv_plex_orphan(%s)", gp->name); 61130389Sle 62130389Sle if (cp->acr != 0 || cp->acw != 0 || cp->ace != 0) 63130389Sle g_access(cp, -cp->acr, -cp->acw, -cp->ace); 64130389Sle error = cp->provider->error; 65130389Sle if (error == 0) 66130389Sle error = ENXIO; 67130389Sle g_detach(cp); 68130389Sle g_destroy_consumer(cp); 69130389Sle if (!LIST_EMPTY(&gp->consumer)) 70130389Sle return; 71130389Sle 72130389Sle p = gp->softc; 73130697Sle if (p != NULL) { 74135164Sle gv_kill_plex_thread(p); 75130697Sle p->geom = NULL; 76130697Sle p->provider = NULL; 77130697Sle p->consumer = NULL; 78130697Sle } 79130597Sle gp->softc = NULL; 80130389Sle g_wither_geom(gp, error); 81130389Sle} 82130389Sle 83135426Slevoid 84130389Slegv_plex_done(struct bio *bp) 85130389Sle{ 86135426Sle struct gv_plex *p; 87135426Sle struct gv_bioq *bq; 88130389Sle 89135426Sle p = bp->bio_from->geom->softc; 90135426Sle bp->bio_cflags |= GV_BIO_DONE; 91135426Sle bq = g_malloc(sizeof(*bq), M_NOWAIT | M_ZERO); 92135426Sle bq->bp = bp; 93135426Sle mtx_lock(&p->bqueue_mtx); 94135426Sle TAILQ_INSERT_TAIL(&p->bqueue, bq, queue); 95135426Sle wakeup(p); 96135426Sle mtx_unlock(&p->bqueue_mtx); 97130389Sle} 98130389Sle 99130389Sle/* Find the correct subdisk to send the bio to and build a bio to send. 
*/ 100130389Slestatic int 101135426Slegv_plexbuffer(struct gv_plex *p, struct bio *bp, caddr_t addr, off_t boff, off_t bcount) 102130389Sle{ 103130389Sle struct g_geom *gp; 104130389Sle struct gv_sd *s; 105135426Sle struct bio *cbp, *pbp; 106130389Sle int i, sdno; 107135426Sle off_t len_left, real_len, real_off; 108135426Sle off_t stripeend, stripeno, stripestart; 109130389Sle 110135426Sle if (p == NULL || LIST_EMPTY(&p->subdisks)) 111135426Sle return (ENXIO); 112135426Sle 113130389Sle s = NULL; 114130389Sle gp = bp->bio_to->geom; 115130389Sle 116130389Sle /* 117130389Sle * We only handle concatenated and striped plexes here. RAID5 plexes 118130389Sle * are handled in build_raid5_request(). 119130389Sle */ 120130389Sle switch (p->org) { 121130389Sle case GV_PLEX_CONCAT: 122130389Sle /* 123130389Sle * Find the subdisk where this request starts. The subdisks in 124130389Sle * this list must be ordered by plex_offset. 125130389Sle */ 126130389Sle LIST_FOREACH(s, &p->subdisks, in_plex) { 127130389Sle if (s->plex_offset <= boff && 128130389Sle s->plex_offset + s->size > boff) 129130389Sle break; 130130389Sle } 131130389Sle /* Subdisk not found. */ 132130389Sle if (s == NULL) 133130389Sle return (ENXIO); 134130389Sle 135130389Sle /* Calculate corresponding offsets on disk. */ 136130389Sle real_off = boff - s->plex_offset; 137130389Sle len_left = s->size - real_off; 138130389Sle real_len = (bcount > len_left) ? len_left : bcount; 139130389Sle break; 140130389Sle 141130389Sle case GV_PLEX_STRIPED: 142130389Sle /* The number of the stripe where the request starts. */ 143130389Sle stripeno = boff / p->stripesize; 144130389Sle 145130389Sle /* The number of the subdisk where the stripe resides. */ 146130389Sle sdno = stripeno % p->sdcount; 147130389Sle 148130389Sle /* Find the right subdisk. 
*/ 149130389Sle i = 0; 150130389Sle LIST_FOREACH(s, &p->subdisks, in_plex) { 151130389Sle if (i == sdno) 152130389Sle break; 153130389Sle i++; 154130389Sle } 155130389Sle 156130389Sle /* Subdisk not found. */ 157130389Sle if (s == NULL) 158130389Sle return (ENXIO); 159130389Sle 160130389Sle /* The offset of the stripe from the start of the subdisk. */ 161130389Sle stripestart = (stripeno / p->sdcount) * 162130389Sle p->stripesize; 163130389Sle 164130389Sle /* The offset at the end of the stripe. */ 165130389Sle stripeend = stripestart + p->stripesize; 166130389Sle 167130389Sle /* The offset of the request on this subdisk. */ 168130389Sle real_off = boff - (stripeno * p->stripesize) + 169130389Sle stripestart; 170130389Sle 171130389Sle /* The length left in this stripe. */ 172130389Sle len_left = stripeend - real_off; 173130389Sle 174130389Sle real_len = (bcount <= len_left) ? bcount : len_left; 175130389Sle break; 176130389Sle 177130389Sle default: 178130389Sle return (EINVAL); 179130389Sle } 180130389Sle 181130389Sle /* Now check if we can handle the request on this subdisk. */ 182130389Sle switch (s->state) { 183130389Sle case GV_SD_UP: 184130389Sle /* If the subdisk is up, just continue. */ 185130389Sle break; 186130389Sle 187130389Sle case GV_SD_STALE: 188135426Sle if (!(bp->bio_cflags & GV_BIO_SYNCREQ)) 189130389Sle return (ENXIO); 190130389Sle 191135426Sle printf("GEOM_VINUM: sd %s is initializing\n", s->name); 192130389Sle gv_set_sd_state(s, GV_SD_INITIALIZING, GV_SETSTATE_FORCE); 193130389Sle break; 194130389Sle 195130389Sle case GV_SD_INITIALIZING: 196130389Sle if (bp->bio_cmd == BIO_READ) 197130389Sle return (ENXIO); 198130389Sle break; 199130389Sle 200130389Sle default: 201130389Sle /* All other subdisk states mean it's not accessible. */ 202130389Sle return (ENXIO); 203130389Sle } 204130389Sle 205130389Sle /* Clone the bio and adjust the offsets and sizes. 
*/ 206130389Sle cbp = g_clone_bio(bp); 207130389Sle if (cbp == NULL) 208130389Sle return (ENOMEM); 209130389Sle cbp->bio_offset = real_off; 210130389Sle cbp->bio_length = real_len; 211130389Sle cbp->bio_data = addr; 212135426Sle cbp->bio_done = g_std_done; 213135426Sle cbp->bio_caller2 = s->consumer; 214135426Sle if ((bp->bio_cflags & GV_BIO_SYNCREQ)) { 215135426Sle cbp->bio_cflags |= GV_BIO_SYNCREQ; 216130389Sle cbp->bio_done = gv_plex_done; 217135426Sle } 218135426Sle 219135426Sle if (bp->bio_driver1 == NULL) { 220135426Sle bp->bio_driver1 = cbp; 221135426Sle } else { 222135426Sle pbp = bp->bio_driver1; 223135426Sle while (pbp->bio_caller1 != NULL) 224135426Sle pbp = pbp->bio_caller1; 225135426Sle pbp->bio_caller1 = cbp; 226135426Sle } 227135426Sle 228130389Sle return (0); 229130389Sle} 230130389Sle 231130389Slestatic void 232130389Slegv_plex_start(struct bio *bp) 233130389Sle{ 234130389Sle struct gv_plex *p; 235135426Sle struct gv_bioq *bq; 236130389Sle 237135426Sle switch(bp->bio_cmd) { 238135426Sle case BIO_READ: 239135426Sle case BIO_WRITE: 240135426Sle case BIO_DELETE: 241135426Sle break; 242135426Sle case BIO_GETATTR: 243135426Sle default: 244135426Sle g_io_deliver(bp, EOPNOTSUPP); 245135426Sle return; 246135426Sle } 247130389Sle 248130389Sle /* 249130389Sle * We cannot handle this request if too many of our subdisks are 250130389Sle * inaccessible. 
251130389Sle */ 252135426Sle p = bp->bio_to->geom->softc; 253135426Sle if ((p->state < GV_PLEX_DEGRADED) && 254135426Sle !(bp->bio_cflags & GV_BIO_SYNCREQ)) { 255135426Sle g_io_deliver(bp, ENXIO); 256130389Sle return; 257130389Sle } 258130389Sle 259135426Sle bq = g_malloc(sizeof(*bq), M_NOWAIT | M_ZERO); 260135426Sle bq->bp = bp; 261135426Sle mtx_lock(&p->bqueue_mtx); 262135426Sle TAILQ_INSERT_TAIL(&p->bqueue, bq, queue); 263135426Sle wakeup(p); 264135426Sle mtx_unlock(&p->bqueue_mtx); 265135426Sle} 266135426Sle 267135426Slestatic void 268135426Slegv_plex_worker(void *arg) 269135426Sle{ 270135426Sle struct bio *bp; 271135426Sle struct gv_plex *p; 272135426Sle struct gv_sd *s; 273135426Sle struct gv_bioq *bq; 274135426Sle 275135426Sle p = arg; 276135426Sle KASSERT(p != NULL, ("NULL p")); 277135426Sle 278135426Sle mtx_lock(&p->bqueue_mtx); 279135426Sle for (;;) { 280135426Sle /* We were signaled to exit. */ 281135426Sle if (p->flags & GV_PLEX_THREAD_DIE) 282135426Sle break; 283135426Sle 284135426Sle /* Take the first BIO from our queue. */ 285135426Sle bq = TAILQ_FIRST(&p->bqueue); 286135426Sle if (bq == NULL) { 287135426Sle msleep(p, &p->bqueue_mtx, PRIBIO, "-", hz/10); 288135426Sle continue; 289135426Sle } 290135426Sle TAILQ_REMOVE(&p->bqueue, bq, queue); 291135426Sle mtx_unlock(&p->bqueue_mtx); 292135426Sle 293135426Sle bp = bq->bp; 294135426Sle 295135426Sle /* A completed request. 
*/ 296135426Sle if (bp->bio_cflags & GV_BIO_DONE) { 297135426Sle g_free(bq); 298135426Sle if (bp->bio_cflags & GV_BIO_SYNCREQ) { 299135426Sle s = bp->bio_to->private; 300135426Sle if (bp->bio_error == 0) 301135426Sle s->initialized += bp->bio_length; 302135426Sle if (s->initialized >= s->size) { 303135426Sle g_topology_lock(); 304135426Sle gv_set_sd_state(s, GV_SD_UP, 305135426Sle GV_SETSTATE_CONFIG); 306135426Sle g_topology_unlock(); 307135426Sle s->initialized = 0; 308135426Sle } 309135426Sle g_std_done(bp); 310135426Sle } else 311135426Sle gv_plex_completed_request(p, bp); 312130389Sle /* 313135426Sle * A sub-request that was hold back because it interfered with 314135426Sle * another sub-request. 315130389Sle */ 316135426Sle } else if (bp->bio_cflags & GV_BIO_ONHOLD) { 317135426Sle /* Is it still locked out? */ 318135426Sle if (gv_stripe_active(p, bp)) { 319135426Sle mtx_lock(&p->bqueue_mtx); 320135426Sle TAILQ_INSERT_TAIL(&p->bqueue, bq, queue); 321135426Sle mtx_unlock(&p->bqueue_mtx); 322135426Sle } else { 323135426Sle g_free(bq); 324135426Sle bp->bio_cflags &= ~GV_BIO_ONHOLD; 325135426Sle g_io_request(bp, bp->bio_caller2); 326135426Sle } 327130389Sle 328135426Sle /* A normal request to this plex. 
*/ 329135426Sle } else { 330135426Sle g_free(bq); 331135426Sle gv_plex_normal_request(p, bp); 332135426Sle } 333135426Sle 334135426Sle mtx_lock(&p->bqueue_mtx); 335135426Sle } 336135426Sle mtx_unlock(&p->bqueue_mtx); 337135426Sle p->flags |= GV_PLEX_THREAD_DEAD; 338135426Sle wakeup(p); 339135426Sle 340135426Sle kthread_exit(ENXIO); 341135426Sle} 342135426Sle 343135426Slevoid 344135426Slegv_plex_completed_request(struct gv_plex *p, struct bio *bp) 345135426Sle{ 346135426Sle struct bio *cbp, *pbp; 347135426Sle struct gv_bioq *bq, *bq2; 348135426Sle struct gv_raid5_packet *wp; 349135426Sle int i; 350135426Sle 351135426Sle wp = bp->bio_driver1; 352135426Sle 353135426Sle switch (bp->bio_parent->bio_cmd) { 354135426Sle case BIO_READ: 355135426Sle if (wp == NULL) 356135426Sle break; 357135426Sle 358135426Sle TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) { 359135426Sle if (bq->bp == bp) { 360135426Sle TAILQ_REMOVE(&wp->bits, bq, queue); 361135426Sle g_free(bq); 362135426Sle for (i = 0; i < wp->length; i++) 363135426Sle wp->data[i] ^= bp->bio_data[i]; 364135426Sle break; 365130389Sle } 366135426Sle } 367135426Sle if (TAILQ_EMPTY(&wp->bits)) { 368135426Sle bp->bio_parent->bio_completed += wp->length; 369135426Sle if (wp->lockbase != -1) 370135426Sle TAILQ_REMOVE(&p->packets, wp, list); 371135426Sle g_free(wp); 372135426Sle } 373130389Sle 374135426Sle break; 375135426Sle 376135426Sle case BIO_WRITE: 377135426Sle if (wp == NULL) 378135426Sle break; 379135426Sle 380135426Sle /* Check if we need to handle parity data. */ 381135426Sle TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) { 382135426Sle if (bq->bp == bp) { 383135426Sle TAILQ_REMOVE(&wp->bits, bq, queue); 384135426Sle g_free(bq); 385135426Sle cbp = wp->parity; 386135426Sle if (cbp != NULL) { 387135426Sle for (i = 0; i < wp->length; i++) 388135426Sle cbp->bio_data[i] ^= 389135426Sle bp->bio_data[i]; 390135426Sle } 391135426Sle break; 392135426Sle } 393135426Sle } 394135426Sle 395135426Sle /* Handle parity data. 
*/ 396135426Sle if (TAILQ_EMPTY(&wp->bits)) { 397135426Sle if (wp->waiting != NULL) { 398135426Sle pbp = wp->waiting; 399135426Sle wp->waiting = NULL; 400135426Sle cbp = wp->parity; 401135426Sle for (i = 0; i < wp->length; i++) 402135426Sle cbp->bio_data[i] ^= pbp->bio_data[i]; 403135426Sle g_io_request(pbp, pbp->bio_caller2); 404135426Sle } else if (wp->parity != NULL) { 405135426Sle cbp = wp->parity; 406135426Sle wp->parity = NULL; 407135426Sle g_io_request(cbp, cbp->bio_caller2); 408130389Sle } else { 409135426Sle bp->bio_parent->bio_completed += wp->length; 410135426Sle TAILQ_REMOVE(&p->packets, wp, list); 411135426Sle g_free(wp); 412130389Sle } 413135426Sle } 414130389Sle 415135426Sle break; 416135426Sle } 417135426Sle 418135426Sle pbp = bp->bio_parent; 419135426Sle if (pbp->bio_error == 0) 420135426Sle pbp->bio_error = bp->bio_error; 421135426Sle 422135426Sle /* When the original request is finished, we deliver it. */ 423135426Sle pbp->bio_inbed++; 424135426Sle if (pbp->bio_inbed == pbp->bio_children) 425135426Sle g_io_deliver(pbp, pbp->bio_error); 426135426Sle 427135426Sle /* Clean up what we allocated. */ 428135426Sle if (bp->bio_cflags & GV_BIO_MALLOC) 429135426Sle g_free(bp->bio_data); 430135426Sle g_destroy_bio(bp); 431135426Sle} 432135426Sle 433135426Slevoid 434135426Slegv_plex_normal_request(struct gv_plex *p, struct bio *bp) 435135426Sle{ 436135426Sle struct bio *cbp, *pbp; 437135426Sle struct gv_bioq *bq, *bq2; 438135426Sle struct gv_raid5_packet *wp, *wp2; 439135426Sle caddr_t addr; 440135426Sle off_t bcount, boff; 441135426Sle int err; 442135426Sle 443135426Sle bcount = bp->bio_length; 444135426Sle addr = bp->bio_data; 445135426Sle boff = bp->bio_offset; 446135426Sle 447135426Sle /* Walk over the whole length of the request, we might split it up. 
*/ 448135426Sle while (bcount > 0) { 449135426Sle wp = NULL; 450135426Sle 451135426Sle /* 452135426Sle * RAID5 plexes need special treatment, as a single write 453135426Sle * request involves several read/write sub-requests. 454135426Sle */ 455135426Sle if (p->org == GV_PLEX_RAID5) { 456135426Sle wp = g_malloc(sizeof(*wp), M_WAITOK | M_ZERO); 457135426Sle wp->bio = bp; 458135426Sle TAILQ_INIT(&wp->bits); 459135426Sle 460135426Sle err = gv_build_raid5_req(p, wp, bp, addr, boff, bcount); 461135426Sle 462135426Sle /* 463135426Sle * Building the sub-request failed, we probably need to 464135426Sle * clean up a lot. 465135426Sle */ 466135426Sle if (err) { 467135426Sle printf("GEOM_VINUM: plex request failed for "); 468135426Sle g_print_bio(bp); 469135426Sle printf("\n"); 470135426Sle TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) { 471135426Sle TAILQ_REMOVE(&wp->bits, bq, queue); 472135426Sle g_free(bq); 473135426Sle } 474135426Sle if (wp->waiting != NULL) { 475135426Sle if (wp->waiting->bio_cflags & 476135426Sle GV_BIO_MALLOC) 477135426Sle g_free(wp->waiting->bio_data); 478135426Sle g_destroy_bio(wp->waiting); 479135426Sle } 480135426Sle if (wp->parity != NULL) { 481135426Sle if (wp->parity->bio_cflags & 482135426Sle GV_BIO_MALLOC) 483135426Sle g_free(wp->parity->bio_data); 484135426Sle g_destroy_bio(wp->parity); 485135426Sle } 486135426Sle g_free(wp); 487135426Sle 488135426Sle TAILQ_FOREACH_SAFE(wp, &p->packets, list, wp2) { 489135426Sle if (wp->bio == bp) { 490135426Sle TAILQ_REMOVE(&p->packets, wp, 491135426Sle list); 492135426Sle TAILQ_FOREACH_SAFE(bq, 493135426Sle &wp->bits, queue, bq2) { 494135426Sle TAILQ_REMOVE(&wp->bits, 495135426Sle bq, queue); 496135426Sle g_free(bq); 497135426Sle } 498135426Sle g_free(wp); 499135426Sle } 500135426Sle } 501135426Sle 502135426Sle cbp = bp->bio_driver1; 503135426Sle while (cbp != NULL) { 504135426Sle pbp = cbp->bio_caller1; 505135426Sle if (cbp->bio_cflags & GV_BIO_MALLOC) 506135426Sle g_free(cbp->bio_data); 507135426Sle 
g_destroy_bio(cbp); 508135426Sle cbp = pbp; 509135426Sle } 510135426Sle 511135426Sle g_io_deliver(bp, err); 512135426Sle return; 513135426Sle } 514135426Sle 515135426Sle if (TAILQ_EMPTY(&wp->bits)) 516135426Sle g_free(wp); 517135426Sle else if (wp->lockbase != -1) 518135426Sle TAILQ_INSERT_TAIL(&p->packets, wp, list); 519135426Sle 520135426Sle /* 521135426Sle * Requests to concatenated and striped plexes go straight 522135426Sle * through. 523135426Sle */ 524135426Sle } else { 525135426Sle err = gv_plexbuffer(p, bp, addr, boff, bcount); 526135426Sle 527135426Sle /* Building the sub-request failed. */ 528135426Sle if (err) { 529135426Sle printf("GEOM_VINUM: plex request failed for "); 530135426Sle g_print_bio(bp); 531135426Sle printf("\n"); 532135426Sle cbp = bp->bio_driver1; 533135426Sle while (cbp != NULL) { 534135426Sle pbp = cbp->bio_caller1; 535135426Sle g_destroy_bio(cbp); 536135426Sle cbp = pbp; 537135426Sle } 538135426Sle g_io_deliver(bp, err); 539135426Sle return; 540135426Sle } 541130389Sle } 542135426Sle 543135426Sle /* Abuse bio_caller1 as linked list. */ 544135426Sle pbp = bp->bio_driver1; 545135426Sle while (pbp->bio_caller1 != NULL) 546135426Sle pbp = pbp->bio_caller1; 547135426Sle bcount -= pbp->bio_length; 548135426Sle addr += pbp->bio_length; 549135426Sle boff += pbp->bio_length; 550135426Sle } 551130389Sle 552135426Sle /* Fire off all sub-requests. */ 553135426Sle pbp = bp->bio_driver1; 554135426Sle while (pbp != NULL) { 555135426Sle /* 556135426Sle * RAID5 sub-requests need to come in correct order, otherwise 557135426Sle * we trip over the parity, as it might be overwritten by 558135426Sle * another sub-request. 
559135426Sle */ 560135426Sle if (pbp->bio_driver1 != NULL && 561135426Sle gv_stripe_active(p, pbp)) { 562135426Sle pbp->bio_cflags |= GV_BIO_ONHOLD; 563135426Sle bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 564135426Sle bq->bp = pbp; 565135426Sle mtx_lock(&p->bqueue_mtx); 566135426Sle TAILQ_INSERT_TAIL(&p->bqueue, bq, queue); 567135426Sle mtx_unlock(&p->bqueue_mtx); 568135426Sle } else 569135426Sle g_io_request(pbp, pbp->bio_caller2); 570135426Sle pbp = pbp->bio_caller1; 571130389Sle } 572130389Sle} 573130389Sle 574130389Slestatic int 575130389Slegv_plex_access(struct g_provider *pp, int dr, int dw, int de) 576130389Sle{ 577130389Sle struct g_geom *gp; 578130389Sle struct g_consumer *cp, *cp2; 579130389Sle int error; 580130389Sle 581130389Sle gp = pp->geom; 582130389Sle 583130389Sle error = ENXIO; 584130389Sle LIST_FOREACH(cp, &gp->consumer, consumer) { 585130389Sle error = g_access(cp, dr, dw, de); 586130389Sle if (error) { 587130389Sle LIST_FOREACH(cp2, &gp->consumer, consumer) { 588130389Sle if (cp == cp2) 589130389Sle break; 590130389Sle g_access(cp2, -dr, -dw, -de); 591130389Sle } 592130389Sle return (error); 593130389Sle } 594130389Sle } 595130389Sle return (error); 596130389Sle} 597130389Sle 598130389Slestatic struct g_geom * 599130389Slegv_plex_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) 600130389Sle{ 601130389Sle struct g_geom *gp; 602132906Sle struct g_consumer *cp, *cp2; 603130389Sle struct g_provider *pp2; 604130389Sle struct gv_plex *p; 605130389Sle struct gv_sd *s; 606130389Sle struct gv_softc *sc; 607132906Sle int error; 608130389Sle 609130389Sle g_trace(G_T_TOPOLOGY, "gv_plex_taste(%s, %s)", mp->name, pp->name); 610130389Sle g_topology_assert(); 611130389Sle 612130389Sle /* We only want to attach to subdisks. */ 613130389Sle if (strcmp(pp->geom->class->name, "VINUMDRIVE")) 614130389Sle return (NULL); 615130389Sle 616130389Sle /* Find the VINUM class and its associated geom. 
*/ 617130389Sle gp = find_vinum_geom(); 618130389Sle if (gp == NULL) 619130389Sle return (NULL); 620130389Sle sc = gp->softc; 621130389Sle KASSERT(sc != NULL, ("gv_plex_taste: NULL sc")); 622130389Sle 623130389Sle /* Find out which subdisk the offered provider corresponds to. */ 624130389Sle s = pp->private; 625130389Sle KASSERT(s != NULL, ("gv_plex_taste: NULL s")); 626130389Sle 627130389Sle /* Now find the correct plex where this subdisk belongs to. */ 628130389Sle p = gv_find_plex(sc, s->plex); 629130389Sle KASSERT(p != NULL, ("gv_plex_taste: NULL p")); 630130389Sle 631130389Sle /* 632130389Sle * Add this subdisk to this plex. Since we trust the on-disk 633130389Sle * configuration, we don't check the given value (should we?). 634130389Sle * XXX: shouldn't be done here 635130389Sle */ 636130389Sle gv_sd_to_plex(p, s, 0); 637130389Sle 638130389Sle /* Now check if there's already a geom for this plex. */ 639130389Sle gp = p->geom; 640130389Sle 641130389Sle /* Yes, there is already a geom, so we just add the consumer. */ 642130389Sle if (gp != NULL) { 643132906Sle cp2 = LIST_FIRST(&gp->consumer); 644130389Sle /* Need to attach a new consumer to this subdisk. */ 645130389Sle cp = g_new_consumer(gp); 646132906Sle error = g_attach(cp, pp); 647132906Sle if (error) { 648132906Sle printf("geom_vinum: couldn't attach consumer to %s\n", 649132906Sle pp->name); 650132906Sle g_destroy_consumer(cp); 651132906Sle return (NULL); 652132906Sle } 653132906Sle /* Adjust the access counts of the new consumer. 
*/ 654132906Sle if ((cp2 != NULL) && (cp2->acr || cp2->acw || cp2->ace)) { 655132906Sle error = g_access(cp, cp2->acr, cp2->acw, cp2->ace); 656132906Sle if (error) { 657132906Sle printf("geom_vinum: couldn't set access counts" 658132906Sle " for consumer on %s\n", pp->name); 659132906Sle g_detach(cp); 660132906Sle g_destroy_consumer(cp); 661132906Sle return (NULL); 662132906Sle } 663132906Sle } 664130389Sle s->consumer = cp; 665130389Sle 666130389Sle /* Adjust the size of the providers this plex has. */ 667130389Sle LIST_FOREACH(pp2, &gp->provider, provider) 668130389Sle pp2->mediasize = p->size; 669130389Sle 670132940Sle /* Update the size of the volume this plex is attached to. */ 671132940Sle if (p->vol_sc != NULL) 672132940Sle gv_update_vol_size(p->vol_sc, p->size); 673132940Sle 674130389Sle return (NULL); 675130389Sle 676130389Sle /* We need to create a new geom. */ 677130389Sle } else { 678130389Sle gp = g_new_geomf(mp, "%s", p->name); 679130389Sle gp->start = gv_plex_start; 680130389Sle gp->orphan = gv_plex_orphan; 681130389Sle gp->access = gv_plex_access; 682130389Sle gp->softc = p; 683130389Sle p->geom = gp; 684130389Sle 685135426Sle TAILQ_INIT(&p->packets); 686135426Sle TAILQ_INIT(&p->bqueue); 687135426Sle mtx_init(&p->bqueue_mtx, "gv_plex", NULL, MTX_DEF); 688135426Sle kthread_create(gv_plex_worker, p, NULL, 0, 0, "gv_p %s", 689135426Sle p->name); 690135426Sle p->flags |= GV_PLEX_THREAD_ACTIVE; 691130389Sle 692130389Sle /* Attach a consumer to this provider. */ 693130389Sle cp = g_new_consumer(gp); 694130389Sle g_attach(cp, pp); 695130389Sle s->consumer = cp; 696130389Sle 697130389Sle /* Create a provider for the outside world. 
*/ 698130389Sle pp2 = g_new_providerf(gp, "gvinum/plex/%s", p->name); 699130389Sle pp2->mediasize = p->size; 700130389Sle pp2->sectorsize = pp->sectorsize; 701130389Sle p->provider = pp2; 702130389Sle g_error_provider(pp2, 0); 703130389Sle return (gp); 704130389Sle } 705130389Sle} 706130389Sle 707130389Slestatic int 708130389Slegv_plex_destroy_geom(struct gctl_req *req, struct g_class *mp, 709130389Sle struct g_geom *gp) 710130389Sle{ 711130389Sle struct gv_plex *p; 712130389Sle 713130389Sle g_trace(G_T_TOPOLOGY, "gv_plex_destroy_geom: %s", gp->name); 714130389Sle g_topology_assert(); 715130389Sle 716130389Sle p = gp->softc; 717130389Sle 718130389Sle KASSERT(p != NULL, ("gv_plex_destroy_geom: null p of '%s'", gp->name)); 719130389Sle 720130389Sle /* 721130389Sle * If this is a RAID5 plex, check if its worker thread is still active 722130389Sle * and signal it to self destruct. 723130389Sle */ 724135164Sle gv_kill_plex_thread(p); 725130389Sle /* g_free(sc); */ 726130389Sle g_wither_geom(gp, ENXIO); 727130389Sle return (0); 728130389Sle} 729130389Sle 730130389Sle#define VINUMPLEX_CLASS_NAME "VINUMPLEX" 731130389Sle 732130389Slestatic struct g_class g_vinum_plex_class = { 733130389Sle .name = VINUMPLEX_CLASS_NAME, 734133318Sphk .version = G_VERSION, 735130389Sle .taste = gv_plex_taste, 736130389Sle .destroy_geom = gv_plex_destroy_geom, 737130389Sle}; 738130389Sle 739130389SleDECLARE_GEOM_CLASS(g_vinum_plex_class, g_vinum_plex); 740