/* geom_vinum_plex.c — revision 133450 */
1/*- 2 * Copyright (c) 2004 Lukas Ertl 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 */ 26 27#include <sys/cdefs.h> 28__FBSDID("$FreeBSD: head/sys/geom/vinum/geom_vinum_plex.c 133450 2004-08-10 20:51:48Z le $"); 29 30#include <sys/param.h> 31#include <sys/bio.h> 32#include <sys/kernel.h> 33#include <sys/kthread.h> 34#include <sys/libkern.h> 35#include <sys/lock.h> 36#include <sys/malloc.h> 37#include <sys/module.h> 38#include <sys/mutex.h> 39#include <sys/systm.h> 40 41#include <geom/geom.h> 42#include <geom/vinum/geom_vinum_var.h> 43#include <geom/vinum/geom_vinum_raid5.h> 44#include <geom/vinum/geom_vinum.h> 45 46/* XXX: is this the place to catch dying subdisks? 
*/ 47static void 48gv_plex_orphan(struct g_consumer *cp) 49{ 50 struct g_geom *gp; 51 struct gv_plex *p; 52 int error; 53 54 g_topology_assert(); 55 gp = cp->geom; 56 g_trace(G_T_TOPOLOGY, "gv_plex_orphan(%s)", gp->name); 57 58 if (cp->acr != 0 || cp->acw != 0 || cp->ace != 0) 59 g_access(cp, -cp->acr, -cp->acw, -cp->ace); 60 error = cp->provider->error; 61 if (error == 0) 62 error = ENXIO; 63 g_detach(cp); 64 g_destroy_consumer(cp); 65 if (!LIST_EMPTY(&gp->consumer)) 66 return; 67 68 p = gp->softc; 69 if (p != NULL) { 70 gv_kill_thread(p); 71 p->geom = NULL; 72 p->provider = NULL; 73 p->consumer = NULL; 74 } 75 gp->softc = NULL; 76 g_wither_geom(gp, error); 77} 78 79static void 80gv_plex_done(struct bio *bp) 81{ 82 struct g_geom *gp; 83 struct gv_sd *s; 84 85 gp = bp->bio_to->geom; 86 87 s = bp->bio_caller1; 88 KASSERT(s != NULL, ("gv_plex_done: NULL s")); 89 90 if (bp->bio_error == 0) 91 s->initialized += bp->bio_length; 92 93 if (s->initialized >= s->size) { 94 gv_set_sd_state(s, GV_SD_UP, 0); 95 s->initialized = 0; 96 } 97 98 g_std_done(bp); 99} 100 101/* Find the correct subdisk to send the bio to and build a bio to send. */ 102static int 103gv_plexbuffer(struct bio *bp, struct bio **bp2, struct g_consumer **cp, 104 caddr_t addr, long bcount, off_t boff) 105{ 106 struct g_geom *gp; 107 struct gv_plex *p; 108 struct gv_sd *s; 109 struct bio *cbp; 110 int i, sdno; 111 off_t len_left, real_len, real_off, stripeend, stripeno, stripestart; 112 113 s = NULL; 114 115 gp = bp->bio_to->geom; 116 p = gp->softc; 117 118 if (p == NULL || LIST_EMPTY(&p->subdisks)) 119 return (ENXIO); 120 121 /* 122 * We only handle concatenated and striped plexes here. RAID5 plexes 123 * are handled in build_raid5_request(). 124 */ 125 switch (p->org) { 126 case GV_PLEX_CONCAT: 127 /* 128 * Find the subdisk where this request starts. The subdisks in 129 * this list must be ordered by plex_offset. 
130 */ 131 LIST_FOREACH(s, &p->subdisks, in_plex) { 132 if (s->plex_offset <= boff && 133 s->plex_offset + s->size > boff) 134 break; 135 } 136 /* Subdisk not found. */ 137 if (s == NULL) 138 return (ENXIO); 139 140 /* Calculate corresponding offsets on disk. */ 141 real_off = boff - s->plex_offset; 142 len_left = s->size - real_off; 143 real_len = (bcount > len_left) ? len_left : bcount; 144 break; 145 146 case GV_PLEX_STRIPED: 147 /* The number of the stripe where the request starts. */ 148 stripeno = boff / p->stripesize; 149 150 /* The number of the subdisk where the stripe resides. */ 151 sdno = stripeno % p->sdcount; 152 153 /* Find the right subdisk. */ 154 i = 0; 155 LIST_FOREACH(s, &p->subdisks, in_plex) { 156 if (i == sdno) 157 break; 158 i++; 159 } 160 161 /* Subdisk not found. */ 162 if (s == NULL) 163 return (ENXIO); 164 165 /* The offset of the stripe from the start of the subdisk. */ 166 stripestart = (stripeno / p->sdcount) * 167 p->stripesize; 168 169 /* The offset at the end of the stripe. */ 170 stripeend = stripestart + p->stripesize; 171 172 /* The offset of the request on this subdisk. */ 173 real_off = boff - (stripeno * p->stripesize) + 174 stripestart; 175 176 /* The length left in this stripe. */ 177 len_left = stripeend - real_off; 178 179 real_len = (bcount <= len_left) ? bcount : len_left; 180 break; 181 182 default: 183 return (EINVAL); 184 } 185 186 /* Now check if we can handle the request on this subdisk. */ 187 switch (s->state) { 188 case GV_SD_UP: 189 /* If the subdisk is up, just continue. */ 190 break; 191 192 case GV_SD_STALE: 193 if (bp->bio_caller1 != p) 194 return (ENXIO); 195 196 printf("FOO: setting sd %s to GV_SD_INITIALIZING\n", s->name); 197 gv_set_sd_state(s, GV_SD_INITIALIZING, GV_SETSTATE_FORCE); 198 break; 199 200 case GV_SD_INITIALIZING: 201 if (bp->bio_cmd == BIO_READ) 202 return (ENXIO); 203 break; 204 205 default: 206 /* All other subdisk states mean it's not accessible. 
*/ 207 return (ENXIO); 208 } 209 210 /* Clone the bio and adjust the offsets and sizes. */ 211 cbp = g_clone_bio(bp); 212 if (cbp == NULL) 213 return (ENOMEM); 214 cbp->bio_offset = real_off; 215 cbp->bio_length = real_len; 216 cbp->bio_data = addr; 217 if (bp->bio_caller1 == p) { 218 cbp->bio_caller1 = s; 219 cbp->bio_done = gv_plex_done; 220 } else 221 cbp->bio_done = g_std_done; 222 *bp2 = cbp; 223 *cp = s->consumer; 224 return (0); 225} 226 227static void 228gv_plex_start(struct bio *bp) 229{ 230 struct g_geom *gp; 231 struct g_consumer *cp; 232 struct gv_plex *p; 233 struct gv_raid5_packet *wp; 234 struct bio *bp2; 235 caddr_t addr; 236 off_t boff; 237 long bcount, rcount; 238 int err; 239 240 gp = bp->bio_to->geom; 241 p = gp->softc; 242 243 /* 244 * We cannot handle this request if too many of our subdisks are 245 * inaccessible. 246 */ 247 if ((p->state < GV_PLEX_DEGRADED) && (bp->bio_caller1 != p)) { 248 g_io_deliver(bp, ENXIO); /* XXX: correct way? */ 249 return; 250 } 251 252 switch(bp->bio_cmd) { 253 case BIO_READ: 254 case BIO_WRITE: 255 case BIO_DELETE: 256 /* 257 * We split up the request in smaller packets and hand them 258 * down to our subdisks. 259 */ 260 wp = NULL; 261 addr = bp->bio_data; 262 boff = bp->bio_offset; 263 for (bcount = bp->bio_length; bcount > 0; bcount -= rcount) { 264 /* 265 * RAID5 requests usually need to be split up in 266 * several subrequests. 
267 */ 268 if (p->org == GV_PLEX_RAID5) { 269 wp = gv_new_raid5_packet(); 270 wp->bio = bp; 271 err = gv_build_raid5_req(wp, bp, addr, bcount, 272 boff); 273 } else 274 err = gv_plexbuffer(bp, &bp2, &cp, addr, bcount, 275 boff); 276 277 if (err) { 278 if (p->org == GV_PLEX_RAID5) 279 gv_free_raid5_packet(wp); 280 bp->bio_completed += bcount; 281 if (bp->bio_error == 0) 282 bp->bio_error = err; 283 if (bp->bio_completed == bp->bio_length) 284 g_io_deliver(bp, bp->bio_error); 285 return; 286 } 287 288 if (p->org != GV_PLEX_RAID5) { 289 rcount = bp2->bio_length; 290 g_io_request(bp2, cp); 291 292 /* 293 * RAID5 subrequests are queued on a worklist 294 * and picked up from the worker thread. This 295 * ensures correct order. 296 */ 297 } else { 298 mtx_lock(&p->worklist_mtx); 299 TAILQ_INSERT_TAIL(&p->worklist, wp, 300 list); 301 mtx_unlock(&p->worklist_mtx); 302 wakeup(&p); 303 rcount = wp->length; 304 } 305 306 boff += rcount; 307 addr += rcount; 308 } 309 return; 310 311 default: 312 g_io_deliver(bp, EOPNOTSUPP); 313 return; 314 } 315} 316 317static int 318gv_plex_access(struct g_provider *pp, int dr, int dw, int de) 319{ 320 struct g_geom *gp; 321 struct g_consumer *cp, *cp2; 322 int error; 323 324 gp = pp->geom; 325 326 error = ENXIO; 327 LIST_FOREACH(cp, &gp->consumer, consumer) { 328 error = g_access(cp, dr, dw, de); 329 if (error) { 330 LIST_FOREACH(cp2, &gp->consumer, consumer) { 331 if (cp == cp2) 332 break; 333 g_access(cp2, -dr, -dw, -de); 334 } 335 return (error); 336 } 337 } 338 return (error); 339} 340 341static struct g_geom * 342gv_plex_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) 343{ 344 struct g_geom *gp; 345 struct g_consumer *cp, *cp2; 346 struct g_provider *pp2; 347 struct gv_plex *p; 348 struct gv_sd *s; 349 struct gv_softc *sc; 350 int error; 351 352 g_trace(G_T_TOPOLOGY, "gv_plex_taste(%s, %s)", mp->name, pp->name); 353 g_topology_assert(); 354 355 /* We only want to attach to subdisks. 
*/ 356 if (strcmp(pp->geom->class->name, "VINUMDRIVE")) 357 return (NULL); 358 359 /* Find the VINUM class and its associated geom. */ 360 gp = find_vinum_geom(); 361 if (gp == NULL) 362 return (NULL); 363 sc = gp->softc; 364 KASSERT(sc != NULL, ("gv_plex_taste: NULL sc")); 365 366 /* Find out which subdisk the offered provider corresponds to. */ 367 s = pp->private; 368 KASSERT(s != NULL, ("gv_plex_taste: NULL s")); 369 370 /* Now find the correct plex where this subdisk belongs to. */ 371 p = gv_find_plex(sc, s->plex); 372 KASSERT(p != NULL, ("gv_plex_taste: NULL p")); 373 374 /* 375 * Add this subdisk to this plex. Since we trust the on-disk 376 * configuration, we don't check the given value (should we?). 377 * XXX: shouldn't be done here 378 */ 379 gv_sd_to_plex(p, s, 0); 380 381 /* Now check if there's already a geom for this plex. */ 382 gp = p->geom; 383 384 /* Yes, there is already a geom, so we just add the consumer. */ 385 if (gp != NULL) { 386 cp2 = LIST_FIRST(&gp->consumer); 387 /* Need to attach a new consumer to this subdisk. */ 388 cp = g_new_consumer(gp); 389 error = g_attach(cp, pp); 390 if (error) { 391 printf("geom_vinum: couldn't attach consumer to %s\n", 392 pp->name); 393 g_destroy_consumer(cp); 394 return (NULL); 395 } 396 /* Adjust the access counts of the new consumer. */ 397 if ((cp2 != NULL) && (cp2->acr || cp2->acw || cp2->ace)) { 398 error = g_access(cp, cp2->acr, cp2->acw, cp2->ace); 399 if (error) { 400 printf("geom_vinum: couldn't set access counts" 401 " for consumer on %s\n", pp->name); 402 g_detach(cp); 403 g_destroy_consumer(cp); 404 return (NULL); 405 } 406 } 407 s->consumer = cp; 408 409 /* Adjust the size of the providers this plex has. */ 410 LIST_FOREACH(pp2, &gp->provider, provider) 411 pp2->mediasize = p->size; 412 413 /* Update the size of the volume this plex is attached to. */ 414 if (p->vol_sc != NULL) 415 gv_update_vol_size(p->vol_sc, p->size); 416 417 return (NULL); 418 419 /* We need to create a new geom. 
*/ 420 } else { 421 gp = g_new_geomf(mp, "%s", p->name); 422 gp->start = gv_plex_start; 423 gp->orphan = gv_plex_orphan; 424 gp->access = gv_plex_access; 425 gp->softc = p; 426 p->geom = gp; 427 428 /* RAID5 plexes need a 'worker' thread, where IO is handled. */ 429 if (p->org == GV_PLEX_RAID5) { 430 TAILQ_INIT(&p->worklist); 431 mtx_init(&p->worklist_mtx, "gvinum_worklist", NULL, 432 MTX_DEF); 433 p->flags &= ~GV_PLEX_THREAD_DIE; 434 kthread_create(gv_raid5_worker, gp, NULL, 0, 0, 435 "gv_raid5"); 436 p->flags |= GV_PLEX_THREAD_ACTIVE; 437 } 438 439 /* Attach a consumer to this provider. */ 440 cp = g_new_consumer(gp); 441 g_attach(cp, pp); 442 s->consumer = cp; 443 444 /* Create a provider for the outside world. */ 445 pp2 = g_new_providerf(gp, "gvinum/plex/%s", p->name); 446 pp2->mediasize = p->size; 447 pp2->sectorsize = pp->sectorsize; 448 p->provider = pp2; 449 g_error_provider(pp2, 0); 450 return (gp); 451 } 452} 453 454static int 455gv_plex_destroy_geom(struct gctl_req *req, struct g_class *mp, 456 struct g_geom *gp) 457{ 458 struct gv_plex *p; 459 460 g_trace(G_T_TOPOLOGY, "gv_plex_destroy_geom: %s", gp->name); 461 g_topology_assert(); 462 463 p = gp->softc; 464 465 KASSERT(p != NULL, ("gv_plex_destroy_geom: null p of '%s'", gp->name)); 466 467 /* 468 * If this is a RAID5 plex, check if its worker thread is still active 469 * and signal it to self destruct. 470 */ 471 gv_kill_thread(p); 472 /* g_free(sc); */ 473 g_wither_geom(gp, ENXIO); 474 return (0); 475} 476 477#define VINUMPLEX_CLASS_NAME "VINUMPLEX" 478 479static struct g_class g_vinum_plex_class = { 480 .name = VINUMPLEX_CLASS_NAME, 481 .version = G_VERSION, 482 .taste = gv_plex_taste, 483 .destroy_geom = gv_plex_destroy_geom, 484}; 485 486DECLARE_GEOM_CLASS(g_vinum_plex_class, g_vinum_plex); 487