1/*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2004, 2007 Lukas Ertl 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/systm.h>

#include <geom/geom.h>
#include <geom/vinum/geom_vinum_var.h>
#include <geom/vinum/geom_vinum_raid5.h>
#include <geom/vinum/geom_vinum.h>

static int	gv_raid5_offset(struct gv_plex *, off_t, off_t,
		    off_t *, off_t *, int *, int *, int);
static struct bio *	gv_raid5_clone_bio(struct bio *, struct gv_sd *,
			    struct gv_raid5_packet *, caddr_t, int);
static int	gv_raid5_request(struct gv_plex *, struct gv_raid5_packet *,
		    struct bio *, caddr_t, off_t, off_t, int *);
static int	gv_raid5_check(struct gv_plex *, struct gv_raid5_packet *,
		    struct bio *, caddr_t, off_t, off_t);
static int	gv_raid5_rebuild(struct gv_plex *, struct gv_raid5_packet *,
		    struct bio *, caddr_t, off_t, off_t);

/*
 * Build a RAID5 work packet for (part of) the request 'bp' covering 'bcount'
 * bytes at plex offset 'boff', with 'addr' pointing at the corresponding data.
 *
 * Dispatches to gv_raid5_rebuild()/gv_raid5_check() for internal rebuild or
 * parity-check requests (selected via bp->bio_pflags), and to
 * gv_raid5_request() for regular I/O.
 *
 * Returns the work packet on success.  Returns NULL in two distinct cases:
 *  - the sub-request was delayed (queued on p->rqueue by gv_raid5_request());
 *    the caller must not touch bp further, or
 *  - building the sub-request failed; all partial state is torn down and bp
 *    is either destroyed (internal request) or completed with the error.
 */
struct gv_raid5_packet *
gv_raid5_start(struct gv_plex *p, struct bio *bp, caddr_t addr, off_t boff,
    off_t bcount)
{
	struct bio *cbp;
	struct gv_raid5_packet *wp, *wp2;
	struct gv_bioq *bq, *bq2;
	int err, delay;

	delay = 0;
	wp = g_malloc(sizeof(*wp), M_WAITOK | M_ZERO);
	wp->bio = bp;
	wp->waiting = NULL;
	wp->parity = NULL;
	TAILQ_INIT(&wp->bits);

	/* Internal rebuild/check requests are flagged on bio_pflags. */
	if (bp->bio_pflags & GV_BIO_REBUILD)
		err = gv_raid5_rebuild(p, wp, bp, addr, boff, bcount);
	else if (bp->bio_pflags & GV_BIO_CHECK)
		err = gv_raid5_check(p, wp, bp, addr, boff, bcount);
	else
		err = gv_raid5_request(p, wp, bp, addr, boff, bcount, &delay);

	/* Means we have a delayed request. */
	if (delay) {
		g_free(wp);
		return (NULL);
	}

	/*
	 * Building the sub-request failed, we probably need to clean up a lot.
	 */
	if (err) {
		G_VINUM_LOGREQ(0, bp, "raid5 plex request failed.");

		/* Drop the bookkeeping entries of this packet. */
		TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) {
			TAILQ_REMOVE(&wp->bits, bq, queue);
			g_free(bq);
		}
		/*
		 * Destroy the held-back and parity BIOs, freeing any data
		 * buffers we allocated for them (marked GV_BIO_MALLOC).
		 */
		if (wp->waiting != NULL) {
			if (wp->waiting->bio_cflags & GV_BIO_MALLOC)
				g_free(wp->waiting->bio_data);
			gv_drive_done(wp->waiting->bio_caller1);
			g_destroy_bio(wp->waiting);
		}
		if (wp->parity != NULL) {
			if (wp->parity->bio_cflags & GV_BIO_MALLOC)
				g_free(wp->parity->bio_data);
			gv_drive_done(wp->parity->bio_caller1);
			g_destroy_bio(wp->parity);
		}
		g_free(wp);

		/*
		 * Tear down any other packets on this plex that belong to the
		 * same original request.
		 */
		TAILQ_FOREACH_SAFE(wp, &p->packets, list, wp2) {
			if (wp->bio != bp)
				continue;

			TAILQ_REMOVE(&p->packets, wp, list);
			TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) {
				TAILQ_REMOVE(&wp->bits, bq, queue);
				g_free(bq);
			}
			g_free(wp);
		}

		/* Drain and destroy all cloned BIOs still queued on the plex. */
		cbp = bioq_takefirst(p->bqueue);
		while (cbp != NULL) {
			if (cbp->bio_cflags & GV_BIO_MALLOC)
				g_free(cbp->bio_data);
			gv_drive_done(cbp->bio_caller1);
			g_destroy_bio(cbp);
			cbp = bioq_takefirst(p->bqueue);
		}

		/* If internal, stop and reset state. */
		if (bp->bio_pflags & GV_BIO_INTERNAL) {
			if (bp->bio_pflags & GV_BIO_MALLOC)
				g_free(bp->bio_data);
			g_destroy_bio(bp);
			/* Reset flags. */
			p->flags &= ~(GV_PLEX_SYNCING | GV_PLEX_REBUILDING |
			    GV_PLEX_GROWING);
			return (NULL);
		}
		/* External request: complete it with the error. */
		g_io_deliver(bp, err);
		return (NULL);
	}

	return (wp);
}

/*
 * Check if the stripe that the work packet wants is already being used by
 * some other work packet.
 */
int
gv_stripe_active(struct gv_plex *p, struct bio *bp)
{
	struct gv_raid5_packet *wp, *owp;
	int overlap;

	wp = bp->bio_caller2;
	/* lockbase == -1 means this packet takes no stripe lock. */
	if (wp->lockbase == -1)
		return (0);

	overlap = 0;
	/*
	 * Only packets queued ahead of 'wp' are considered; the scan stops
	 * when it reaches 'wp' itself.
	 */
	TAILQ_FOREACH(owp, &p->packets, list) {
		if (owp == wp)
			break;
		/* wp's start falls inside owp's locked range. */
		if ((wp->lockbase >= owp->lockbase) &&
		    (wp->lockbase <= owp->lockbase + owp->length)) {
			overlap++;
			break;
		}
		/* owp's start falls inside wp's locked range. */
		if ((wp->lockbase <= owp->lockbase) &&
		    (wp->lockbase + wp->length >= owp->lockbase)) {
			overlap++;
			break;
		}
	}

	return (overlap);
}

/*
 * Build the sub-requests for a parity check of one stripe: read every data
 * subdisk plus the parity subdisk, with the parity read held back in
 * wp->waiting.  wp->parity is an extra BIO aimed at the parity subdisk that
 * doubles as the buffer for the XOR comparison (and for writing corrected
 * parity back out if requested).
 *
 * Returns 0 on success, ENXIO if the plex/parity subdisk is unusable,
 * ENOMEM if cloning a BIO fails.
 */
static int
gv_raid5_check(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp,
    caddr_t addr, off_t boff, off_t bcount)
{
	struct gv_sd *parity, *s;
	struct gv_bioq *bq;
	struct bio *cbp;
	int i, psdno;
	off_t real_len, real_off;

	if (p == NULL || LIST_EMPTY(&p->subdisks))
		return (ENXIO);

	gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, &psdno, 1);

	/* Find the right subdisk. */
	parity = NULL;
	i = 0;
	LIST_FOREACH(s, &p->subdisks, in_plex) {
		if (i == psdno) {
			parity = s;
			break;
		}
		i++;
	}

	/* Parity stripe not found. */
	if (parity == NULL)
		return (ENXIO);

	if (parity->state != GV_SD_UP)
		return (ENXIO);

	wp->length = real_len;
	wp->data = addr;
	wp->lockbase = real_off;

	/* Read all subdisks. */
	LIST_FOREACH(s, &p->subdisks, in_plex) {
		/* Skip the parity subdisk. */
		if (s == parity)
			continue;
		/* Skip growing subdisks. */
		if (s->flags & GV_SD_GROW)
			continue;

		cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1);
		if (cbp == NULL)
			return (ENOMEM);
		cbp->bio_cmd = BIO_READ;

		bioq_insert_tail(p->bqueue, cbp);

		/* Remember this BIO in the packet's bookkeeping list. */
		bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
		bq->bp = cbp;
		TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
	}

	/* Read the parity data; held back until the data reads are issued. */
	cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1);
	if (cbp == NULL)
		return (ENOMEM);
	cbp->bio_cmd = BIO_READ;
	wp->waiting = cbp;

	/*
	 * In case we want to rebuild the parity, create an extra BIO to write
	 * it out. It also acts as buffer for the XOR operations.
	 */
	cbp = gv_raid5_clone_bio(bp, parity, wp, addr, 1);
	if (cbp == NULL)
		return (ENOMEM);
	wp->parity = cbp;

	return (0);
}

/* Rebuild a degraded RAID5 plex. */
static int
gv_raid5_rebuild(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp,
    caddr_t addr, off_t boff, off_t bcount)
{
	struct gv_sd *broken, *s;
	struct gv_bioq *bq;
	struct bio *cbp;
	off_t real_len, real_off;

	if (p == NULL || LIST_EMPTY(&p->subdisks))
		return (ENXIO);

	gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, NULL, 1);

	/* Find the right subdisk (the last one that is not up). */
	broken = NULL;
	LIST_FOREACH(s, &p->subdisks, in_plex) {
		if (s->state != GV_SD_UP)
			broken = s;
	}

	/* Broken stripe not found. */
	if (broken == NULL)
		return (ENXIO);

	switch (broken->state) {
	case GV_SD_UP:
		return (EINVAL);

	case GV_SD_STALE:
		/* Only an explicit rebuild request may start reviving. */
		if (!(bp->bio_pflags & GV_BIO_REBUILD))
			return (ENXIO);

		G_VINUM_DEBUG(1, "sd %s is reviving", broken->name);
		gv_set_sd_state(broken, GV_SD_REVIVING, GV_SETSTATE_FORCE);
		/* Set this bit now, but should be set at end. */
		broken->flags |= GV_SD_CANGOUP;
		break;

	case GV_SD_REVIVING:
		break;

	default:
		/* All other subdisk states mean it's not accessible. */
		return (ENXIO);
	}

	wp->length = real_len;
	wp->data = addr;
	wp->lockbase = real_off;

	KASSERT(wp->length >= 0, ("gv_rebuild_raid5: wp->length < 0"));

	/* Read all subdisks. */
	LIST_FOREACH(s, &p->subdisks, in_plex) {
		/* Skip the broken subdisk. */
		if (s == broken)
			continue;

		/* Skip growing subdisks. */
		if (s->flags & GV_SD_GROW)
			continue;

		cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1);
		if (cbp == NULL)
			return (ENOMEM);
		cbp->bio_cmd = BIO_READ;

		bioq_insert_tail(p->bqueue, cbp);

		bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
		bq->bp = cbp;
		TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
	}

	/*
	 * Write the parity data: this BIO targets the broken subdisk and is
	 * the XOR buffer into which the reconstructed data is accumulated.
	 */
	cbp = gv_raid5_clone_bio(bp, broken, wp, NULL, 1);
	if (cbp == NULL)
		return (ENOMEM);
	wp->parity = cbp;

	/* Record rebuild progress. */
	p->synced = boff;

	/* Post notification that we're finished. */
	return (0);
}

/* Build a request group to perform (part of) a RAID5 request. */
static int
gv_raid5_request(struct gv_plex *p, struct gv_raid5_packet *wp,
    struct bio *bp, caddr_t addr, off_t boff, off_t bcount, int *delay)
{
	struct g_geom *gp;
	struct gv_sd *broken, *original, *parity, *s;
	struct gv_bioq *bq;
	struct bio *cbp;
	int i, psdno, sdno, type, grow;
	off_t real_len, real_off;

	gp = bp->bio_to->geom;

	if (p == NULL || LIST_EMPTY(&p->subdisks))
		return (ENXIO);

	/* We are optimistic and assume that this request will be OK. */
#define	REQ_TYPE_NORMAL		0
#define	REQ_TYPE_DEGRADED	1
#define	REQ_TYPE_NOPARITY	2

	type = REQ_TYPE_NORMAL;
	original = parity = broken = NULL;

	/* XXX: The resize won't crash with rebuild or sync, but we should still
	 * be aware of it. Also this should perhaps be done on rebuild/check as
	 * well?
	 */
	/* If we're over, we must use the old. */
	if (boff >= p->synced) {
		grow = 1;
	/* Or if over the resized offset, we use all drives. */
	} else if (boff + bcount <= p->synced) {
		grow = 0;
	/* Else, we're in the middle, and must wait a bit. */
	} else {
		/* Defer: requeue and tell the caller via *delay. */
		bioq_disksort(p->rqueue, bp);
		*delay = 1;
		return (0);
	}
	gv_raid5_offset(p, boff, bcount, &real_off, &real_len,
	    &sdno, &psdno, grow);

	/* Find the right subdisks. */
	i = 0;
	LIST_FOREACH(s, &p->subdisks, in_plex) {
		if (i == sdno)
			original = s;
		if (i == psdno)
			parity = s;
		if (s->state != GV_SD_UP)
			broken = s;
		i++;
	}

	if ((original == NULL) || (parity == NULL))
		return (ENXIO);

	/* Our data stripe is missing. */
	if (original->state != GV_SD_UP)
		type = REQ_TYPE_DEGRADED;

	/* If synchronizing request, just write it if disks are stale. */
	if (original->state == GV_SD_STALE && parity->state == GV_SD_STALE &&
	    bp->bio_pflags & GV_BIO_SYNCREQ && bp->bio_cmd == BIO_WRITE) {
		type = REQ_TYPE_NORMAL;
	/* Our parity stripe is missing. */
	} else if (parity->state != GV_SD_UP) {
		/* We cannot take another failure if we're already degraded. */
		if (type != REQ_TYPE_NORMAL)
			return (ENXIO);
		else
			type = REQ_TYPE_NOPARITY;
	}

	wp->length = real_len;
	wp->data = addr;
	wp->lockbase = real_off;

	KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0"));

	/* Below the synced offset a rebuilding plex behaves normally. */
	if ((p->flags & GV_PLEX_REBUILDING) && (boff + real_len < p->synced))
		type = REQ_TYPE_NORMAL;

	/* At or past the rebuild point, delay until the rebuild catches up. */
	if ((p->flags & GV_PLEX_REBUILDING) && (boff + real_len >= p->synced)) {
		bioq_disksort(p->rqueue, bp);
		*delay = 1;
		return (0);
	}

	switch (bp->bio_cmd) {
	case BIO_READ:
		/*
		 * For a degraded read we need to read in all stripes except
		 * the broken one plus the parity stripe and then recalculate
		 * the desired data.
		 */
		if (type == REQ_TYPE_DEGRADED) {
			bzero(wp->data, wp->length);
			LIST_FOREACH(s, &p->subdisks, in_plex) {
				/* Skip the broken subdisk. */
				if (s == broken)
					continue;
				/* Skip growing if within offset. */
				if (grow && s->flags & GV_SD_GROW)
					continue;
				cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1);
				if (cbp == NULL)
					return (ENOMEM);

				bioq_insert_tail(p->bqueue, cbp);

				bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
				bq->bp = cbp;
				TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
			}

		/* A normal read can be fulfilled with the original subdisk. */
		} else {
			cbp = gv_raid5_clone_bio(bp, original, wp, addr, 0);
			if (cbp == NULL)
				return (ENOMEM);

			bioq_insert_tail(p->bqueue, cbp);
		}
		/* Reads take no stripe lock. */
		wp->lockbase = -1;

		break;

	case BIO_WRITE:
		/*
		 * A degraded write means we cannot write to the original data
		 * subdisk. Thus we need to read in all valid stripes,
		 * recalculate the parity from the original data, and then
		 * write the parity stripe back out.
		 */
		if (type == REQ_TYPE_DEGRADED) {
			/* Read all subdisks. */
			LIST_FOREACH(s, &p->subdisks, in_plex) {
				/* Skip the broken and the parity subdisk. */
				if ((s == broken) || (s == parity))
					continue;
				/* Skip growing if within offset. */
				if (grow && s->flags & GV_SD_GROW)
					continue;

				cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1);
				if (cbp == NULL)
					return (ENOMEM);
				cbp->bio_cmd = BIO_READ;

				bioq_insert_tail(p->bqueue, cbp);

				bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
				bq->bp = cbp;
				TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
			}

			/* Write the parity data. */
			cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1);
			if (cbp == NULL)
				return (ENOMEM);
			/* Seed the XOR buffer with the new data. */
			bcopy(addr, cbp->bio_data, wp->length);
			wp->parity = cbp;

		/*
		 * When the parity stripe is missing we just write out the data.
		 */
		} else if (type == REQ_TYPE_NOPARITY) {
			cbp = gv_raid5_clone_bio(bp, original, wp, addr, 1);
			if (cbp == NULL)
				return (ENOMEM);

			bioq_insert_tail(p->bqueue, cbp);

			bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
			bq->bp = cbp;
			TAILQ_INSERT_TAIL(&wp->bits, bq, queue);

		/*
		 * A normal write request goes to the original subdisk, then we
		 * read in all other stripes, recalculate the parity and write
		 * out the parity again.
		 */
		} else {
			/* Read old parity. */
			cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1);
			if (cbp == NULL)
				return (ENOMEM);
			cbp->bio_cmd = BIO_READ;

			bioq_insert_tail(p->bqueue, cbp);

			bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
			bq->bp = cbp;
			TAILQ_INSERT_TAIL(&wp->bits, bq, queue);

			/* Read old data. */
			cbp = gv_raid5_clone_bio(bp, original, wp, NULL, 1);
			if (cbp == NULL)
				return (ENOMEM);
			cbp->bio_cmd = BIO_READ;

			bioq_insert_tail(p->bqueue, cbp);

			bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
			bq->bp = cbp;
			TAILQ_INSERT_TAIL(&wp->bits, bq, queue);

			/* Write new data. */
			cbp = gv_raid5_clone_bio(bp, original, wp, addr, 1);
			if (cbp == NULL)
				return (ENOMEM);

			/*
			 * We must not write the new data until the old data
			 * was read, so hold this BIO back until we're ready
			 * for it.
			 */
			wp->waiting = cbp;

			/* The final bio for the parity. */
			cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1);
			if (cbp == NULL)
				return (ENOMEM);

			/* Remember that this is the BIO for the parity data. */
			wp->parity = cbp;
		}
		break;

	default:
		return (EINVAL);
	}

	return (0);
}

/*
 * Calculate the offsets in the various subdisks for a RAID5 request. Also take
 * care of new subdisks in an expanded RAID5 array.
 */
584 * XXX: This assumes that the new subdisks are inserted after the others (which 585 * is okay as long as plex_offset is larger). If subdisks are inserted into the 586 * plexlist before, we get problems. 587 */ 588static int 589gv_raid5_offset(struct gv_plex *p, off_t boff, off_t bcount, off_t *real_off, 590 off_t *real_len, int *sdno, int *psdno, int growing) 591{ 592 struct gv_sd *s; 593 int sd, psd, sdcount; 594 off_t len_left, stripeend, stripeoff, stripestart; 595 596 sdcount = p->sdcount; 597 if (growing) { 598 LIST_FOREACH(s, &p->subdisks, in_plex) { 599 if (s->flags & GV_SD_GROW) 600 sdcount--; 601 } 602 } 603 604 /* The number of the subdisk containing the parity stripe. */ 605 psd = sdcount - 1 - ( boff / (p->stripesize * (sdcount - 1))) % 606 sdcount; 607 KASSERT(psdno >= 0, ("gv_raid5_offset: psdno < 0")); 608 609 /* Offset of the start address from the start of the stripe. */ 610 stripeoff = boff % (p->stripesize * (sdcount - 1)); 611 KASSERT(stripeoff >= 0, ("gv_raid5_offset: stripeoff < 0")); 612 613 /* The number of the subdisk where the stripe resides. */ 614 sd = stripeoff / p->stripesize; 615 KASSERT(sdno >= 0, ("gv_raid5_offset: sdno < 0")); 616 617 /* At or past parity subdisk. */ 618 if (sd >= psd) 619 sd++; 620 621 /* The offset of the stripe on this subdisk. */ 622 stripestart = (boff - stripeoff) / (sdcount - 1); 623 KASSERT(stripestart >= 0, ("gv_raid5_offset: stripestart < 0")); 624 625 stripeoff %= p->stripesize; 626 627 /* The offset of the request on this subdisk. */ 628 *real_off = stripestart + stripeoff; 629 630 stripeend = stripestart + p->stripesize; 631 len_left = stripeend - *real_off; 632 KASSERT(len_left >= 0, ("gv_raid5_offset: len_left < 0")); 633 634 *real_len = (bcount <= len_left) ? 
bcount : len_left; 635 636 if (sdno != NULL) 637 *sdno = sd; 638 if (psdno != NULL) 639 *psdno = psd; 640 641 return (0); 642} 643 644static struct bio * 645gv_raid5_clone_bio(struct bio *bp, struct gv_sd *s, struct gv_raid5_packet *wp, 646 caddr_t addr, int use_wp) 647{ 648 struct bio *cbp; 649 650 cbp = g_clone_bio(bp); 651 if (cbp == NULL) 652 return (NULL); 653 if (addr == NULL) { 654 cbp->bio_data = g_malloc(wp->length, M_WAITOK | M_ZERO); 655 cbp->bio_cflags |= GV_BIO_MALLOC; 656 } else 657 cbp->bio_data = addr; 658 cbp->bio_offset = wp->lockbase + s->drive_offset; 659 cbp->bio_length = wp->length; 660 cbp->bio_done = gv_done; 661 cbp->bio_caller1 = s; 662 s->drive_sc->active++; 663 if (use_wp) 664 cbp->bio_caller2 = wp; 665 666 return (cbp); 667} 668