geom_vinum_plex.c revision 133318
/*-
 * Copyright (c) 2004 Lukas Ertl
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/geom/vinum/geom_vinum_plex.c 133318 2004-08-08 07:57:53Z phk $");

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/libkern.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/systm.h>

#include <geom/geom.h>
#include <geom/vinum/geom_vinum_var.h>
#include <geom/vinum/geom_vinum_raid5.h>
#include <geom/vinum/geom_vinum.h>

/* XXX: is this the place to catch dying subdisks? */
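/*
 * Orphan method: GEOM calls this when the provider that one of our
 * consumers is attached to (an underlying subdisk) goes away.
 */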
static void
gv_plex_orphan(struct g_consumer *cp)
{
	struct g_geom *gp;
	struct gv_plex *p;
	int error;

	g_topology_assert();
	gp = cp->geom;
	g_trace(G_T_TOPOLOGY, "gv_plex_orphan(%s)", gp->name);

	if (cp->acr != 0 || cp->acw != 0 || cp->ace != 0)
		g_access(cp, -cp->acr, -cp->acw, -cp->ace);
	error = cp->provider->error;
	if (error == 0)
		error = ENXIO;
	g_detach(cp);
	g_destroy_consumer(cp);
	if (!LIST_EMPTY(&gp->consumer))
		return;

	p = gp->softc;
	if (p != NULL) {
		gv_kill_thread(p);
		p->geom = NULL;
		p->provider = NULL;
		p->consumer = NULL;
	}
	gp->softc = NULL;
	g_wither_geom(gp, error);
}

static void
gv_plex_done(struct bio *bp)
{
	struct g_geom *gp;
	struct gv_sd *s;

	gp = bp->bio_to->geom;

	s = bp->bio_caller1;
	KASSERT(s != NULL, ("gv_plex_done: NULL s"));

	if (bp->bio_error == 0)
		s->initialized += bp->bio_length;

	if (s->initialized >= s->size) {
		gv_set_sd_state(s, GV_SD_UP, 0);
		s->initialized = 0;
	}

	g_std_done(bp);
}

/* Find the correct subdisk to send the bio to and build a bio to send. */
static int
gv_plexbuffer(struct bio *bp, struct bio **bp2, struct g_consumer **cp,
    caddr_t addr, long bcount, off_t boff)
{
	struct g_geom *gp;
	struct gv_plex *p;
	struct gv_sd *s;
	struct bio *cbp;
	int i, sdno;
	off_t len_left, real_len, real_off, stripeend, stripeno, stripestart;

	s = NULL;

	gp = bp->bio_to->geom;
	p = gp->softc;

	if (p == NULL || LIST_EMPTY(&p->subdisks))
		return (ENXIO);

	/*
	 * We only handle concatenated and striped plexes here.  RAID5 plexes
	 * are handled in build_raid5_request().
	 */
	switch (p->org) {
	case GV_PLEX_CONCAT:
		/*
		 * Find the subdisk where this request starts.  The subdisks in
		 * this list must be ordered by plex_offset.
		 */
		LIST_FOREACH(s, &p->subdisks, in_plex) {
			if (s->plex_offset <= boff &&
			    s->plex_offset + s->size > boff)
				break;
		}
		/* Subdisk not found. */
		if (s == NULL)
			return (ENXIO);

		/* Calculate corresponding offsets on disk. */
		real_off = boff - s->plex_offset;
		len_left = s->size - real_off;
		real_len = (bcount > len_left) ? len_left : bcount;
		break;

	case GV_PLEX_STRIPED:
		/* The number of the stripe where the request starts. */
		stripeno = boff / p->stripesize;

		/* The number of the subdisk where the stripe resides. */
		sdno = stripeno % p->sdcount;

		/* Find the right subdisk. */
		i = 0;
		LIST_FOREACH(s, &p->subdisks, in_plex) {
			if (i == sdno)
				break;
			i++;
		}

		/* Subdisk not found. */
		if (s == NULL)
			return (ENXIO);

		/* The offset of the stripe from the start of the subdisk. */
		stripestart = (stripeno / p->sdcount) *
		    p->stripesize;

		/* The offset at the end of the stripe. */
		stripeend = stripestart + p->stripesize;

		/* The offset of the request on this subdisk. */
		real_off = boff - (stripeno * p->stripesize) +
		    stripestart;

		/* The length left in this stripe. */
		len_left = stripeend - real_off;

		real_len = (bcount <= len_left) ? bcount : len_left;
		break;

	default:
		return (EINVAL);
	}

	/* Now check if we can handle the request on this subdisk. */
	switch (s->state) {
	case GV_SD_UP:
		/* If the subdisk is up, just continue. */
		break;

	case GV_SD_STALE:
		if (bp->bio_caller1 != p)
			return (ENXIO);

		printf("FOO: setting sd %s to GV_SD_INITIALIZING\n", s->name);
		gv_set_sd_state(s, GV_SD_INITIALIZING, GV_SETSTATE_FORCE);
		break;

	case GV_SD_INITIALIZING:
		if (bp->bio_cmd == BIO_READ)
			return (ENXIO);
		break;

	default:
		/* All other subdisk states mean it's not accessible. */
		return (ENXIO);
	}

	/* Clone the bio and adjust the offsets and sizes. */
	cbp = g_clone_bio(bp);
	if (cbp == NULL)
		return (ENOMEM);
	cbp->bio_offset = real_off;
	cbp->bio_length = real_len;
	cbp->bio_data = addr;
	if (bp->bio_caller1 == p) {
		cbp->bio_caller1 = s;
		cbp->bio_done = gv_plex_done;
	} else
		cbp->bio_done = g_std_done;
	*bp2 = cbp;
	*cp = s->consumer;
	return (0);
}

static void
gv_plex_start(struct bio *bp)
{
	struct g_geom *gp;
	struct g_consumer *cp;
	struct gv_plex *p;
	struct gv_raid5_packet *wp;
	struct bio *bp2;
	caddr_t addr;
	off_t boff;
	long bcount, rcount;
	int err;

	gp = bp->bio_to->geom;
	p = gp->softc;

	/*
	 * We cannot handle this request if too many of our subdisks are
	 * inaccessible.
	 */
	if ((p->state < GV_PLEX_DEGRADED) && (bp->bio_caller1 != p)) {
		g_io_deliver(bp, ENXIO); /* XXX: correct way? */
		return;
	}

	switch(bp->bio_cmd) {
	case BIO_READ:
	case BIO_WRITE:
	case BIO_DELETE:
		/*
		 * We split up the request in smaller packets and hand them
		 * down to our subdisks.
		 */
		wp = NULL;
		addr = bp->bio_data;
		boff = bp->bio_offset;
		for (bcount = bp->bio_length; bcount > 0; bcount -= rcount) {
			/*
			 * RAID5 requests usually need to be split up in
			 * several subrequests.
			 */
			if (p->org == GV_PLEX_RAID5) {
				wp = gv_new_raid5_packet();
				wp->bio = bp;
				err = gv_build_raid5_req(wp, bp, addr, bcount,
				    boff);
			} else
				err = gv_plexbuffer(bp, &bp2, &cp, addr, bcount,
				    boff);

			if (err) {
				if (p->org == GV_PLEX_RAID5)
					gv_free_raid5_packet(wp);
				bp->bio_completed += bcount;
				if (bp->bio_error == 0)
					bp->bio_error = err;
				if (bp->bio_completed == bp->bio_length)
					g_io_deliver(bp, bp->bio_error);
				return;
			}

			if (p->org != GV_PLEX_RAID5) {
				rcount = bp2->bio_length;
				g_io_request(bp2, cp);

			/*
			 * RAID5 subrequests are queued on a worklist
			 * and picked up from the worker thread.  This
			 * ensures correct order.
			 */
			} else {
				mtx_lock(&p->worklist_mtx);
				TAILQ_INSERT_TAIL(&p->worklist, wp,
				    list);
				mtx_unlock(&p->worklist_mtx);
				wakeup(&p);
				rcount = wp->length;
			}

			boff += rcount;
			addr += rcount;
		}
		return;

	default:
		g_io_deliver(bp, EOPNOTSUPP);
		return;
	}
}

static int
gv_plex_access(struct g_provider *pp, int dr, int dw, int de)
{
	struct g_geom *gp;
	struct g_consumer *cp, *cp2;
	int error;

	gp = pp->geom;

	error = ENXIO;
	LIST_FOREACH(cp, &gp->consumer, consumer) {
		error = g_access(cp, dr, dw, de);
		if (error) {
			LIST_FOREACH(cp2, &gp->consumer, consumer) {
				if (cp == cp2)
					break;
				g_access(cp2, -dr, -dw, -de);
			}
			return (error);
		}
	}
	return (error);
}

static struct g_geom *
gv_plex_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
{
	struct g_geom *gp;
	struct g_consumer *cp, *cp2;
	struct g_provider *pp2;
	struct gv_plex *p;
	struct gv_sd *s;
	struct gv_softc *sc;
	int error;

	g_trace(G_T_TOPOLOGY, "gv_plex_taste(%s, %s)", mp->name, pp->name);
	g_topology_assert();

	/* We only want to attach to subdisks. */
	if (strcmp(pp->geom->class->name, "VINUMDRIVE"))
		return (NULL);

	/* Find the VINUM class and its associated geom. */
	gp = find_vinum_geom();
	if (gp == NULL)
		return (NULL);
	sc = gp->softc;
	KASSERT(sc != NULL, ("gv_plex_taste: NULL sc"));

	/* Find out which subdisk the offered provider corresponds to. */
	s = pp->private;
	KASSERT(s != NULL, ("gv_plex_taste: NULL s"));

	/* Now find the correct plex where this subdisk belongs to. */
	p = gv_find_plex(sc, s->plex);
	KASSERT(p != NULL, ("gv_plex_taste: NULL p"));

	/*
	 * Add this subdisk to this plex.  Since we trust the on-disk
	 * configuration, we don't check the given value (should we?).
	 * XXX: shouldn't be done here
	 */
	gv_sd_to_plex(p, s, 0);

	/* Now check if there's already a geom for this plex. */
	gp = p->geom;

	/* Yes, there is already a geom, so we just add the consumer. */
	if (gp != NULL) {
		cp2 = LIST_FIRST(&gp->consumer);
		/* Need to attach a new consumer to this subdisk. */
		cp = g_new_consumer(gp);
		error = g_attach(cp, pp);
		if (error) {
			printf("geom_vinum: couldn't attach consumer to %s\n",
			    pp->name);
			g_destroy_consumer(cp);
			return (NULL);
		}
		/* Adjust the access counts of the new consumer. */
		if ((cp2 != NULL) && (cp2->acr || cp2->acw || cp2->ace)) {
			error = g_access(cp, cp2->acr, cp2->acw, cp2->ace);
			if (error) {
				printf("geom_vinum: couldn't set access counts"
				    " for consumer on %s\n", pp->name);
				g_detach(cp);
				g_destroy_consumer(cp);
				return (NULL);
			}
		}
		s->consumer = cp;

		/* Adjust the size of the providers this plex has. */
		LIST_FOREACH(pp2, &gp->provider, provider)
			pp2->mediasize = p->size;

		/* Update the size of the volume this plex is attached to. */
		if (p->vol_sc != NULL)
			gv_update_vol_size(p->vol_sc, p->size);

		return (NULL);

	/* We need to create a new geom. */
	} else {
		gp = g_new_geomf(mp, "%s", p->name);
		gp->start = gv_plex_start;
		gp->orphan = gv_plex_orphan;
		gp->access = gv_plex_access;
		gp->softc = p;
		p->geom = gp;

		/* RAID5 plexes need a 'worker' thread, where IO is handled. */
		if (p->org == GV_PLEX_RAID5) {
			TAILQ_INIT(&p->worklist);
			mtx_init(&p->worklist_mtx, "gvinum_worklist", NULL,
			    MTX_DEF);
			p->flags &= ~GV_PLEX_THREAD_DIE;
			kthread_create(gv_raid5_worker, gp, NULL, 0, 0,
			    "gv_raid5");
			p->flags |= GV_PLEX_THREAD_ACTIVE;
		}

		/* Attach a consumer to this provider. */
		cp = g_new_consumer(gp);
		g_attach(cp, pp);
		s->consumer = cp;

		/* Create a provider for the outside world. */
		pp2 = g_new_providerf(gp, "gvinum/plex/%s", p->name);
		pp2->mediasize = p->size;
		pp2->sectorsize = pp->sectorsize;
		p->provider = pp2;
		g_error_provider(pp2, 0);
		return (gp);
	}
}

static int
gv_plex_destroy_geom(struct gctl_req *req, struct g_class *mp,
    struct g_geom *gp)
{
	struct gv_plex *p;

	g_trace(G_T_TOPOLOGY, "gv_plex_destroy_geom: %s", gp->name);
	g_topology_assert();

	p = gp->softc;

	KASSERT(p != NULL, ("gv_plex_destroy_geom: null p of '%s'", gp->name));

	/*
	 * If this is a RAID5 plex, check if its worker thread is still active
	 * and signal it to self destruct.
	 */
	gv_kill_thread(p);
	mtx_destroy(&p->worklist_mtx);
	/* g_free(sc); */
	g_wither_geom(gp, ENXIO);
	return (0);
}

#define VINUMPLEX_CLASS_NAME "VINUMPLEX"

static struct g_class g_vinum_plex_class = {
	.name = VINUMPLEX_CLASS_NAME,
	.version = G_VERSION,
	.taste = gv_plex_taste,
	.destroy_geom = gv_plex_destroy_geom,
};

DECLARE_GEOM_CLASS(g_vinum_plex_class, g_vinum_plex);