/* geom_vinum_plex.c revision 130389 */
1/*- 2 * Copyright (c) 2004 Lukas Ertl 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 */ 26 27#include <sys/cdefs.h> 28__FBSDID("$FreeBSD: head/sys/geom/vinum/geom_vinum_plex.c 130389 2004-06-12 21:16:10Z le $"); 29 30#include <sys/param.h> 31#include <sys/bio.h> 32#include <sys/kernel.h> 33#include <sys/kthread.h> 34#include <sys/libkern.h> 35#include <sys/lock.h> 36#include <sys/malloc.h> 37#include <sys/module.h> 38#include <sys/mutex.h> 39#include <sys/systm.h> 40 41#include <geom/geom.h> 42#include <geom/vinum/geom_vinum_var.h> 43#include <geom/vinum/geom_vinum_raid5.h> 44#include <geom/vinum/geom_vinum.h> 45 46/* XXX: is this the place to catch dying subdisks? 
*/ 47static void 48gv_plex_orphan(struct g_consumer *cp) 49{ 50 struct g_geom *gp; 51 struct gv_plex *p; 52 int error; 53 54 g_topology_assert(); 55 gp = cp->geom; 56 g_trace(G_T_TOPOLOGY, "gv_plex_orphan(%s)", gp->name); 57 58 if (cp->acr != 0 || cp->acw != 0 || cp->ace != 0) 59 g_access(cp, -cp->acr, -cp->acw, -cp->ace); 60 error = cp->provider->error; 61 if (error == 0) 62 error = ENXIO; 63 g_detach(cp); 64 g_destroy_consumer(cp); 65 if (!LIST_EMPTY(&gp->consumer)) 66 return; 67 68 p = gp->softc; 69 gv_kill_thread(p); 70 g_free(p); 71 g_wither_geom(gp, error); 72} 73 74static void 75gv_plex_done(struct bio *bp) 76{ 77 struct g_geom *gp; 78 struct gv_sd *s; 79 80 gp = bp->bio_to->geom; 81 82 s = bp->bio_caller1; 83 KASSERT(s != NULL, ("gv_plex_done: NULL s")); 84 85 if (bp->bio_error == 0) 86 s->initialized += bp->bio_length; 87 88 if (s->initialized >= s->size) { 89 gv_set_sd_state(s, GV_SD_UP, 0); 90 s->initialized = 0; 91 } 92 93 g_std_done(bp); 94} 95 96/* Find the correct subdisk to send the bio to and build a bio to send. */ 97static int 98gv_plexbuffer(struct bio *bp, struct bio **bp2, struct g_consumer **cp, 99 caddr_t addr, long bcount, off_t boff) 100{ 101 struct g_geom *gp; 102 struct gv_plex *p; 103 struct gv_sd *s; 104 struct bio *cbp; 105 int i, sdno; 106 off_t len_left, real_len, real_off, stripeend, stripeno, stripestart; 107 108 s = NULL; 109 110 gp = bp->bio_to->geom; 111 p = gp->softc; 112 113 if (p == NULL || LIST_EMPTY(&p->subdisks)) 114 return (ENXIO); 115 116 /* 117 * We only handle concatenated and striped plexes here. RAID5 plexes 118 * are handled in build_raid5_request(). 119 */ 120 switch (p->org) { 121 case GV_PLEX_CONCAT: 122 /* 123 * Find the subdisk where this request starts. The subdisks in 124 * this list must be ordered by plex_offset. 125 */ 126 LIST_FOREACH(s, &p->subdisks, in_plex) { 127 if (s->plex_offset <= boff && 128 s->plex_offset + s->size > boff) 129 break; 130 } 131 /* Subdisk not found. 
*/ 132 if (s == NULL) 133 return (ENXIO); 134 135 /* Calculate corresponding offsets on disk. */ 136 real_off = boff - s->plex_offset; 137 len_left = s->size - real_off; 138 real_len = (bcount > len_left) ? len_left : bcount; 139 break; 140 141 case GV_PLEX_STRIPED: 142 /* The number of the stripe where the request starts. */ 143 stripeno = boff / p->stripesize; 144 145 /* The number of the subdisk where the stripe resides. */ 146 sdno = stripeno % p->sdcount; 147 148 /* Find the right subdisk. */ 149 i = 0; 150 LIST_FOREACH(s, &p->subdisks, in_plex) { 151 if (i == sdno) 152 break; 153 i++; 154 } 155 156 /* Subdisk not found. */ 157 if (s == NULL) 158 return (ENXIO); 159 160 /* The offset of the stripe from the start of the subdisk. */ 161 stripestart = (stripeno / p->sdcount) * 162 p->stripesize; 163 164 /* The offset at the end of the stripe. */ 165 stripeend = stripestart + p->stripesize; 166 167 /* The offset of the request on this subdisk. */ 168 real_off = boff - (stripeno * p->stripesize) + 169 stripestart; 170 171 /* The length left in this stripe. */ 172 len_left = stripeend - real_off; 173 174 real_len = (bcount <= len_left) ? bcount : len_left; 175 break; 176 177 default: 178 return (EINVAL); 179 } 180 181 /* Now check if we can handle the request on this subdisk. */ 182 switch (s->state) { 183 case GV_SD_UP: 184 /* If the subdisk is up, just continue. */ 185 break; 186 187 case GV_SD_STALE: 188 if (bp->bio_caller1 != p) 189 return (ENXIO); 190 191 printf("FOO: setting sd %s to GV_SD_INITIALIZING\n", s->name); 192 gv_set_sd_state(s, GV_SD_INITIALIZING, GV_SETSTATE_FORCE); 193 break; 194 195 case GV_SD_INITIALIZING: 196 if (bp->bio_cmd == BIO_READ) 197 return (ENXIO); 198 break; 199 200 default: 201 /* All other subdisk states mean it's not accessible. */ 202 return (ENXIO); 203 } 204 205 /* Clone the bio and adjust the offsets and sizes. 
*/ 206 cbp = g_clone_bio(bp); 207 if (cbp == NULL) 208 return (ENOMEM); 209 cbp->bio_offset = real_off; 210 cbp->bio_length = real_len; 211 cbp->bio_data = addr; 212 if (bp->bio_caller1 == p) { 213 cbp->bio_caller1 = s; 214 cbp->bio_done = gv_plex_done; 215 } else 216 cbp->bio_done = g_std_done; 217 *bp2 = cbp; 218 *cp = s->consumer; 219 return (0); 220} 221 222static void 223gv_plex_start(struct bio *bp) 224{ 225 struct g_geom *gp; 226 struct g_consumer *cp; 227 struct gv_plex *p; 228 struct gv_raid5_packet *wp; 229 struct bio *bp2; 230 caddr_t addr; 231 off_t boff; 232 long bcount, rcount; 233 int err; 234 235 gp = bp->bio_to->geom; 236 p = gp->softc; 237 238 /* 239 * We cannot handle this request if too many of our subdisks are 240 * inaccessible. 241 */ 242 if ((p->state < GV_PLEX_DEGRADED) && (bp->bio_caller1 != p)) { 243 g_io_deliver(bp, ENXIO); /* XXX: correct way? */ 244 return; 245 } 246 247 switch(bp->bio_cmd) { 248 case BIO_READ: 249 case BIO_WRITE: 250 case BIO_DELETE: 251 /* 252 * We split up the request in smaller packets and hand them 253 * down to our subdisks. 254 */ 255 wp = NULL; 256 addr = bp->bio_data; 257 boff = bp->bio_offset; 258 for (bcount = bp->bio_length; bcount > 0; bcount -= rcount) { 259 /* 260 * RAID5 requests usually need to be split up in 261 * several subrequests. 262 */ 263 if (p->org == GV_PLEX_RAID5) { 264 wp = gv_new_raid5_packet(); 265 wp->bio = bp; 266 err = gv_build_raid5_req(wp, bp, addr, bcount, 267 boff); 268 } else 269 err = gv_plexbuffer(bp, &bp2, &cp, addr, bcount, 270 boff); 271 272 if (err) { 273 bp->bio_completed += bcount; 274 if (bp->bio_error == 0) 275 bp->bio_error = err; 276 if (bp->bio_completed == bp->bio_length) 277 g_io_deliver(bp, bp->bio_error); 278 return; 279 } 280 281 if (p->org != GV_PLEX_RAID5) { 282 rcount = bp2->bio_length; 283 g_io_request(bp2, cp); 284 285 /* 286 * RAID5 subrequests are queued on a worklist 287 * and picked up from the worker thread. This 288 * ensures correct order. 
289 */ 290 } else { 291 mtx_lock(&p->worklist_mtx); 292 TAILQ_INSERT_TAIL(&p->worklist, wp, 293 list); 294 mtx_unlock(&p->worklist_mtx); 295 wakeup(&p); 296 rcount = wp->length; 297 } 298 299 boff += rcount; 300 addr += rcount; 301 } 302 return; 303 304 default: 305 g_io_deliver(bp, EOPNOTSUPP); 306 return; 307 } 308} 309 310static int 311gv_plex_access(struct g_provider *pp, int dr, int dw, int de) 312{ 313 struct g_geom *gp; 314 struct g_consumer *cp, *cp2; 315 int error; 316 317 gp = pp->geom; 318 319 error = ENXIO; 320 LIST_FOREACH(cp, &gp->consumer, consumer) { 321 error = g_access(cp, dr, dw, de); 322 if (error) { 323 LIST_FOREACH(cp2, &gp->consumer, consumer) { 324 if (cp == cp2) 325 break; 326 g_access(cp2, -dr, -dw, -de); 327 } 328 return (error); 329 } 330 } 331 return (error); 332} 333 334static struct g_geom * 335gv_plex_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) 336{ 337 struct g_geom *gp; 338 struct g_consumer *cp; 339 struct g_provider *pp2; 340 struct gv_plex *p; 341 struct gv_sd *s; 342 struct gv_softc *sc; 343 344 g_trace(G_T_TOPOLOGY, "gv_plex_taste(%s, %s)", mp->name, pp->name); 345 g_topology_assert(); 346 347 /* We only want to attach to subdisks. */ 348 if (strcmp(pp->geom->class->name, "VINUMDRIVE")) 349 return (NULL); 350 351 /* Find the VINUM class and its associated geom. */ 352 gp = find_vinum_geom(); 353 if (gp == NULL) 354 return (NULL); 355 sc = gp->softc; 356 KASSERT(sc != NULL, ("gv_plex_taste: NULL sc")); 357 358 /* Find out which subdisk the offered provider corresponds to. */ 359 s = pp->private; 360 KASSERT(s != NULL, ("gv_plex_taste: NULL s")); 361 362 /* Now find the correct plex where this subdisk belongs to. */ 363 p = gv_find_plex(sc, s->plex); 364 KASSERT(p != NULL, ("gv_plex_taste: NULL p")); 365 366 /* 367 * Add this subdisk to this plex. Since we trust the on-disk 368 * configuration, we don't check the given value (should we?). 
369 * XXX: shouldn't be done here 370 */ 371 gv_sd_to_plex(p, s, 0); 372 373 /* Now check if there's already a geom for this plex. */ 374 gp = p->geom; 375 376 /* Yes, there is already a geom, so we just add the consumer. */ 377 if (gp != NULL) { 378 /* Need to attach a new consumer to this subdisk. */ 379 cp = g_new_consumer(gp); 380 g_attach(cp, pp); 381 s->consumer = cp; 382 383 /* Adjust the size of the providers this plex has. */ 384 LIST_FOREACH(pp2, &gp->provider, provider) 385 pp2->mediasize = p->size; 386 387 return (NULL); 388 389 /* We need to create a new geom. */ 390 } else { 391 gp = g_new_geomf(mp, "%s", p->name); 392 gp->start = gv_plex_start; 393 gp->orphan = gv_plex_orphan; 394 gp->access = gv_plex_access; 395 gp->softc = p; 396 p->geom = gp; 397 398 /* RAID5 plexes need a 'worker' thread, where IO is handled. */ 399 if (p->org == GV_PLEX_RAID5) { 400 TAILQ_INIT(&p->worklist); 401 mtx_init(&p->worklist_mtx, "gvinum_worklist", NULL, 402 MTX_DEF); 403 p->flags &= ~GV_PLEX_THREAD_DIE; 404 kthread_create(gv_raid5_worker, gp, NULL, 0, 0, 405 "gv_raid5"); 406 p->flags |= GV_PLEX_THREAD_ACTIVE; 407 } 408 409 /* Attach a consumer to this provider. */ 410 cp = g_new_consumer(gp); 411 g_attach(cp, pp); 412 s->consumer = cp; 413 414 /* Create a provider for the outside world. */ 415 pp2 = g_new_providerf(gp, "gvinum/plex/%s", p->name); 416 pp2->mediasize = p->size; 417 pp2->sectorsize = pp->sectorsize; 418 p->provider = pp2; 419 g_error_provider(pp2, 0); 420 return (gp); 421 } 422} 423 424static int 425gv_plex_destroy_geom(struct gctl_req *req, struct g_class *mp, 426 struct g_geom *gp) 427{ 428 struct gv_plex *p; 429 430 g_trace(G_T_TOPOLOGY, "gv_plex_destroy_geom: %s", gp->name); 431 g_topology_assert(); 432 433 p = gp->softc; 434 435 KASSERT(p != NULL, ("gv_plex_destroy_geom: null p of '%s'", gp->name)); 436 437 /* 438 * If this is a RAID5 plex, check if its worker thread is still active 439 * and signal it to self destruct. 
440 */ 441 gv_kill_thread(p); 442 mtx_destroy(&p->worklist_mtx); 443 /* g_free(sc); */ 444 g_wither_geom(gp, ENXIO); 445 return (0); 446} 447 448#define VINUMPLEX_CLASS_NAME "VINUMPLEX" 449 450static struct g_class g_vinum_plex_class = { 451 .name = VINUMPLEX_CLASS_NAME, 452 .taste = gv_plex_taste, 453 .destroy_geom = gv_plex_destroy_geom, 454}; 455 456DECLARE_GEOM_CLASS(g_vinum_plex_class, g_vinum_plex); 457