/*-
 * Copyright (c) 2004, 2007 Lukas Ertl
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
25130389Sle */ 26130389Sle 27130389Sle#include <sys/cdefs.h> 28130389Sle__FBSDID("$FreeBSD$"); 29130389Sle 30130389Sle#include <sys/param.h> 31130389Sle#include <sys/bio.h> 32130389Sle#include <sys/lock.h> 33130389Sle#include <sys/malloc.h> 34130389Sle#include <sys/systm.h> 35130389Sle 36130389Sle#include <geom/geom.h> 37130389Sle#include <geom/vinum/geom_vinum_var.h> 38130389Sle#include <geom/vinum/geom_vinum_raid5.h> 39130389Sle#include <geom/vinum/geom_vinum.h> 40130389Sle 41190507Slulfstatic int gv_raid5_offset(struct gv_plex *, off_t, off_t, 42190507Slulf off_t *, off_t *, int *, int *, int); 43190507Slulfstatic struct bio * gv_raid5_clone_bio(struct bio *, struct gv_sd *, 44190507Slulf struct gv_raid5_packet *, caddr_t, int); 45190507Slulfstatic int gv_raid5_request(struct gv_plex *, struct gv_raid5_packet *, 46190507Slulf struct bio *, caddr_t, off_t, off_t, int *); 47190507Slulfstatic int gv_raid5_check(struct gv_plex *, struct gv_raid5_packet *, 48190507Slulf struct bio *, caddr_t, off_t, off_t); 49190507Slulfstatic int gv_raid5_rebuild(struct gv_plex *, struct gv_raid5_packet *, 50190507Slulf struct bio *, caddr_t, off_t, off_t); 51137730Sle 52190507Slulfstruct gv_raid5_packet * 53190507Slulfgv_raid5_start(struct gv_plex *p, struct bio *bp, caddr_t addr, off_t boff, 54190507Slulf off_t bcount) 55190507Slulf{ 56190507Slulf struct bio *cbp; 57190507Slulf struct gv_raid5_packet *wp, *wp2; 58190507Slulf struct gv_bioq *bq, *bq2; 59190507Slulf int err, delay; 60190507Slulf 61190507Slulf delay = 0; 62190507Slulf wp = g_malloc(sizeof(*wp), M_WAITOK | M_ZERO); 63190507Slulf wp->bio = bp; 64190507Slulf wp->waiting = NULL; 65190507Slulf wp->parity = NULL; 66190507Slulf TAILQ_INIT(&wp->bits); 67190507Slulf 68191856Slulf if (bp->bio_pflags & GV_BIO_REBUILD) 69190507Slulf err = gv_raid5_rebuild(p, wp, bp, addr, boff, bcount); 70191856Slulf else if (bp->bio_pflags & GV_BIO_CHECK) 71190507Slulf err = gv_raid5_check(p, wp, bp, addr, boff, bcount); 72190507Slulf else 
73190507Slulf err = gv_raid5_request(p, wp, bp, addr, boff, bcount, &delay); 74190507Slulf 75190507Slulf /* Means we have a delayed request. */ 76190507Slulf if (delay) { 77190507Slulf g_free(wp); 78190507Slulf return (NULL); 79190507Slulf } 80190507Slulf 81190507Slulf /* 82190507Slulf * Building the sub-request failed, we probably need to clean up a lot. 83190507Slulf */ 84190507Slulf if (err) { 85190507Slulf G_VINUM_LOGREQ(0, bp, "raid5 plex request failed."); 86190507Slulf TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) { 87190507Slulf TAILQ_REMOVE(&wp->bits, bq, queue); 88190507Slulf g_free(bq); 89190507Slulf } 90190507Slulf if (wp->waiting != NULL) { 91190507Slulf if (wp->waiting->bio_cflags & GV_BIO_MALLOC) 92190507Slulf g_free(wp->waiting->bio_data); 93190507Slulf g_destroy_bio(wp->waiting); 94190507Slulf } 95190507Slulf if (wp->parity != NULL) { 96190507Slulf if (wp->parity->bio_cflags & GV_BIO_MALLOC) 97190507Slulf g_free(wp->parity->bio_data); 98190507Slulf g_destroy_bio(wp->parity); 99190507Slulf } 100190507Slulf g_free(wp); 101190507Slulf 102190507Slulf TAILQ_FOREACH_SAFE(wp, &p->packets, list, wp2) { 103190507Slulf if (wp->bio != bp) 104190507Slulf continue; 105190507Slulf 106190507Slulf TAILQ_REMOVE(&p->packets, wp, list); 107190507Slulf TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) { 108190507Slulf TAILQ_REMOVE(&wp->bits, bq, queue); 109190507Slulf g_free(bq); 110190507Slulf } 111190507Slulf g_free(wp); 112190507Slulf } 113190507Slulf 114190507Slulf cbp = bioq_takefirst(p->bqueue); 115190507Slulf while (cbp != NULL) { 116190507Slulf if (cbp->bio_cflags & GV_BIO_MALLOC) 117190507Slulf g_free(cbp->bio_data); 118190507Slulf g_destroy_bio(cbp); 119190507Slulf cbp = bioq_takefirst(p->bqueue); 120190507Slulf } 121190507Slulf 122190507Slulf /* If internal, stop and reset state. 
*/ 123191856Slulf if (bp->bio_pflags & GV_BIO_INTERNAL) { 124191856Slulf if (bp->bio_pflags & GV_BIO_MALLOC) 125191852Slulf g_free(bp->bio_data); 126190507Slulf g_destroy_bio(bp); 127190507Slulf /* Reset flags. */ 128190507Slulf p->flags &= ~(GV_PLEX_SYNCING | GV_PLEX_REBUILDING | 129190507Slulf GV_PLEX_GROWING); 130190507Slulf return (NULL); 131190507Slulf } 132190507Slulf g_io_deliver(bp, err); 133190507Slulf return (NULL); 134190507Slulf } 135190507Slulf 136190507Slulf return (wp); 137190507Slulf} 138190507Slulf 139130389Sle/* 140130389Sle * Check if the stripe that the work packet wants is already being used by 141130389Sle * some other work packet. 142130389Sle */ 143130389Sleint 144135426Slegv_stripe_active(struct gv_plex *p, struct bio *bp) 145130389Sle{ 146135426Sle struct gv_raid5_packet *wp, *owp; 147135426Sle int overlap; 148130389Sle 149190507Slulf wp = bp->bio_caller2; 150135426Sle if (wp->lockbase == -1) 151135426Sle return (0); 152130389Sle 153135426Sle overlap = 0; 154135426Sle TAILQ_FOREACH(owp, &p->packets, list) { 155135426Sle if (owp == wp) 156135426Sle break; 157135426Sle if ((wp->lockbase >= owp->lockbase) && 158135426Sle (wp->lockbase <= owp->lockbase + owp->length)) { 159135426Sle overlap++; 160135426Sle break; 161130389Sle } 162135426Sle if ((wp->lockbase <= owp->lockbase) && 163135426Sle (wp->lockbase + wp->length >= owp->lockbase)) { 164135426Sle overlap++; 165135426Sle break; 166130389Sle } 167130389Sle } 168130389Sle 169135426Sle return (overlap); 170130389Sle} 171130389Sle 172190507Slulfstatic int 173190507Slulfgv_raid5_check(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp, 174138110Sle caddr_t addr, off_t boff, off_t bcount) 175138110Sle{ 176138110Sle struct gv_sd *parity, *s; 177138110Sle struct gv_bioq *bq; 178190507Slulf struct bio *cbp; 179138110Sle int i, psdno; 180138110Sle off_t real_len, real_off; 181138110Sle 182138110Sle if (p == NULL || LIST_EMPTY(&p->subdisks)) 183138110Sle return (ENXIO); 184138110Sle 
185190507Slulf gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, &psdno, 1); 186138110Sle 187138110Sle /* Find the right subdisk. */ 188138110Sle parity = NULL; 189138110Sle i = 0; 190138110Sle LIST_FOREACH(s, &p->subdisks, in_plex) { 191138110Sle if (i == psdno) { 192138110Sle parity = s; 193138110Sle break; 194138110Sle } 195138110Sle i++; 196138110Sle } 197138110Sle 198138110Sle /* Parity stripe not found. */ 199138110Sle if (parity == NULL) 200138110Sle return (ENXIO); 201138110Sle 202138110Sle if (parity->state != GV_SD_UP) 203138110Sle return (ENXIO); 204138110Sle 205138110Sle wp->length = real_len; 206138110Sle wp->data = addr; 207138110Sle wp->lockbase = real_off; 208138110Sle 209138110Sle /* Read all subdisks. */ 210138110Sle LIST_FOREACH(s, &p->subdisks, in_plex) { 211138110Sle /* Skip the parity subdisk. */ 212138110Sle if (s == parity) 213138110Sle continue; 214190507Slulf /* Skip growing subdisks. */ 215190507Slulf if (s->flags & GV_SD_GROW) 216190507Slulf continue; 217138110Sle 218190507Slulf cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1); 219138110Sle if (cbp == NULL) 220138110Sle return (ENOMEM); 221138110Sle cbp->bio_cmd = BIO_READ; 222138110Sle 223190507Slulf bioq_insert_tail(p->bqueue, cbp); 224138110Sle 225138110Sle bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 226138110Sle bq->bp = cbp; 227138110Sle TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 228138110Sle } 229138110Sle 230138110Sle /* Read the parity data. */ 231190507Slulf cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1); 232138110Sle if (cbp == NULL) 233138110Sle return (ENOMEM); 234138110Sle cbp->bio_cmd = BIO_READ; 235138110Sle wp->waiting = cbp; 236138110Sle 237138110Sle /* 238138110Sle * In case we want to rebuild the parity, create an extra BIO to write 239138110Sle * it out. It also acts as buffer for the XOR operations. 
240138110Sle */ 241190507Slulf cbp = gv_raid5_clone_bio(bp, parity, wp, addr, 1); 242138110Sle if (cbp == NULL) 243138110Sle return (ENOMEM); 244138110Sle wp->parity = cbp; 245138110Sle 246138110Sle return (0); 247138110Sle} 248138110Sle 249138110Sle/* Rebuild a degraded RAID5 plex. */ 250190507Slulfstatic int 251190507Slulfgv_raid5_rebuild(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp, 252135966Sle caddr_t addr, off_t boff, off_t bcount) 253135966Sle{ 254135966Sle struct gv_sd *broken, *s; 255135966Sle struct gv_bioq *bq; 256190507Slulf struct bio *cbp; 257137730Sle off_t real_len, real_off; 258135966Sle 259135966Sle if (p == NULL || LIST_EMPTY(&p->subdisks)) 260135966Sle return (ENXIO); 261135966Sle 262190507Slulf gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, NULL, 1); 263135966Sle 264135966Sle /* Find the right subdisk. */ 265135966Sle broken = NULL; 266135966Sle LIST_FOREACH(s, &p->subdisks, in_plex) { 267135966Sle if (s->state != GV_SD_UP) 268135966Sle broken = s; 269135966Sle } 270135966Sle 271138110Sle /* Broken stripe not found. */ 272135966Sle if (broken == NULL) 273135966Sle return (ENXIO); 274135966Sle 275135966Sle switch (broken->state) { 276135966Sle case GV_SD_UP: 277135966Sle return (EINVAL); 278135966Sle 279135966Sle case GV_SD_STALE: 280191856Slulf if (!(bp->bio_pflags & GV_BIO_REBUILD)) 281135966Sle return (ENXIO); 282135966Sle 283184292Slulf G_VINUM_DEBUG(1, "sd %s is reviving", broken->name); 284135966Sle gv_set_sd_state(broken, GV_SD_REVIVING, GV_SETSTATE_FORCE); 285190507Slulf /* Set this bit now, but should be set at end. */ 286190507Slulf broken->flags |= GV_SD_CANGOUP; 287135966Sle break; 288135966Sle 289135966Sle case GV_SD_REVIVING: 290135966Sle break; 291135966Sle 292135966Sle default: 293135966Sle /* All other subdisk states mean it's not accessible. 
*/ 294135966Sle return (ENXIO); 295135966Sle } 296135966Sle 297135966Sle wp->length = real_len; 298135966Sle wp->data = addr; 299135966Sle wp->lockbase = real_off; 300135966Sle 301137730Sle KASSERT(wp->length >= 0, ("gv_rebuild_raid5: wp->length < 0")); 302135966Sle 303135966Sle /* Read all subdisks. */ 304135966Sle LIST_FOREACH(s, &p->subdisks, in_plex) { 305135966Sle /* Skip the broken subdisk. */ 306135966Sle if (s == broken) 307135966Sle continue; 308135966Sle 309190507Slulf /* Skip growing subdisks. */ 310190507Slulf if (s->flags & GV_SD_GROW) 311190507Slulf continue; 312190507Slulf 313190507Slulf cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1); 314135966Sle if (cbp == NULL) 315135966Sle return (ENOMEM); 316135966Sle cbp->bio_cmd = BIO_READ; 317135966Sle 318190507Slulf bioq_insert_tail(p->bqueue, cbp); 319135966Sle 320135966Sle bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 321135966Sle bq->bp = cbp; 322135966Sle TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 323135966Sle } 324135966Sle 325135966Sle /* Write the parity data. */ 326190507Slulf cbp = gv_raid5_clone_bio(bp, broken, wp, NULL, 1); 327135966Sle if (cbp == NULL) 328135966Sle return (ENOMEM); 329135966Sle wp->parity = cbp; 330135966Sle 331135966Sle p->synced = boff; 332135966Sle 333190507Slulf /* Post notification that we're finished. */ 334135966Sle return (0); 335135966Sle} 336135966Sle 337130389Sle/* Build a request group to perform (part of) a RAID5 request. 
*/ 338190507Slulfstatic int 339190507Slulfgv_raid5_request(struct gv_plex *p, struct gv_raid5_packet *wp, 340190507Slulf struct bio *bp, caddr_t addr, off_t boff, off_t bcount, int *delay) 341130389Sle{ 342130389Sle struct g_geom *gp; 343130389Sle struct gv_sd *broken, *original, *parity, *s; 344135426Sle struct gv_bioq *bq; 345190507Slulf struct bio *cbp; 346190507Slulf int i, psdno, sdno, type, grow; 347137730Sle off_t real_len, real_off; 348130389Sle 349130389Sle gp = bp->bio_to->geom; 350130389Sle 351130389Sle if (p == NULL || LIST_EMPTY(&p->subdisks)) 352130389Sle return (ENXIO); 353130389Sle 354130389Sle /* We are optimistic and assume that this request will be OK. */ 355135426Sle#define REQ_TYPE_NORMAL 0 356135426Sle#define REQ_TYPE_DEGRADED 1 357135426Sle#define REQ_TYPE_NOPARITY 2 358135426Sle 359135426Sle type = REQ_TYPE_NORMAL; 360130389Sle original = parity = broken = NULL; 361130389Sle 362190507Slulf /* XXX: The resize won't crash with rebuild or sync, but we should still 363190507Slulf * be aware of it. Also this should perhaps be done on rebuild/check as 364190507Slulf * well? 365190507Slulf */ 366190507Slulf /* If we're over, we must use the old. */ 367190507Slulf if (boff >= p->synced) { 368190507Slulf grow = 1; 369190507Slulf /* Or if over the resized offset, we use all drives. */ 370190507Slulf } else if (boff + bcount <= p->synced) { 371190507Slulf grow = 0; 372190507Slulf /* Else, we're in the middle, and must wait a bit. */ 373190507Slulf } else { 374190507Slulf bioq_disksort(p->rqueue, bp); 375190507Slulf *delay = 1; 376190507Slulf return (0); 377190507Slulf } 378190507Slulf gv_raid5_offset(p, boff, bcount, &real_off, &real_len, 379190507Slulf &sdno, &psdno, grow); 380130389Sle 381130389Sle /* Find the right subdisks. 
*/ 382130389Sle i = 0; 383130389Sle LIST_FOREACH(s, &p->subdisks, in_plex) { 384130389Sle if (i == sdno) 385130389Sle original = s; 386130389Sle if (i == psdno) 387130389Sle parity = s; 388130389Sle if (s->state != GV_SD_UP) 389130389Sle broken = s; 390130389Sle i++; 391130389Sle } 392130389Sle 393130389Sle if ((original == NULL) || (parity == NULL)) 394130389Sle return (ENXIO); 395130389Sle 396130389Sle /* Our data stripe is missing. */ 397130389Sle if (original->state != GV_SD_UP) 398135426Sle type = REQ_TYPE_DEGRADED; 399190507Slulf 400190507Slulf /* If synchronizing request, just write it if disks are stale. */ 401190507Slulf if (original->state == GV_SD_STALE && parity->state == GV_SD_STALE && 402191856Slulf bp->bio_pflags & GV_BIO_SYNCREQ && bp->bio_cmd == BIO_WRITE) { 403190507Slulf type = REQ_TYPE_NORMAL; 404130389Sle /* Our parity stripe is missing. */ 405190507Slulf } else if (parity->state != GV_SD_UP) { 406130389Sle /* We cannot take another failure if we're already degraded. */ 407135426Sle if (type != REQ_TYPE_NORMAL) 408130389Sle return (ENXIO); 409130389Sle else 410135426Sle type = REQ_TYPE_NOPARITY; 411130389Sle } 412130389Sle 413135426Sle wp->length = real_len; 414130389Sle wp->data = addr; 415135426Sle wp->lockbase = real_off; 416130389Sle 417130389Sle KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0")); 418130389Sle 419190507Slulf if ((p->flags & GV_PLEX_REBUILDING) && (boff + real_len < p->synced)) 420135966Sle type = REQ_TYPE_NORMAL; 421135966Sle 422190507Slulf if ((p->flags & GV_PLEX_REBUILDING) && (boff + real_len >= p->synced)) { 423190507Slulf bioq_disksort(p->rqueue, bp); 424190507Slulf *delay = 1; 425190507Slulf return (0); 426190507Slulf } 427190507Slulf 428130389Sle switch (bp->bio_cmd) { 429130389Sle case BIO_READ: 430130389Sle /* 431130389Sle * For a degraded read we need to read in all stripes except 432130389Sle * the broken one plus the parity stripe and then recalculate 433130389Sle * the desired data. 
434130389Sle */ 435135426Sle if (type == REQ_TYPE_DEGRADED) { 436135426Sle bzero(wp->data, wp->length); 437130389Sle LIST_FOREACH(s, &p->subdisks, in_plex) { 438130389Sle /* Skip the broken subdisk. */ 439130389Sle if (s == broken) 440130389Sle continue; 441190507Slulf /* Skip growing if within offset. */ 442190507Slulf if (grow && s->flags & GV_SD_GROW) 443190507Slulf continue; 444190507Slulf cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1); 445135426Sle if (cbp == NULL) 446130389Sle return (ENOMEM); 447135426Sle 448190507Slulf bioq_insert_tail(p->bqueue, cbp); 449135426Sle 450135426Sle bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 451135426Sle bq->bp = cbp; 452135426Sle TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 453130389Sle } 454130389Sle 455130389Sle /* A normal read can be fulfilled with the original subdisk. */ 456130389Sle } else { 457190507Slulf cbp = gv_raid5_clone_bio(bp, original, wp, addr, 0); 458135426Sle if (cbp == NULL) 459130389Sle return (ENOMEM); 460135426Sle 461190507Slulf bioq_insert_tail(p->bqueue, cbp); 462130389Sle } 463135426Sle wp->lockbase = -1; 464135426Sle 465130389Sle break; 466130389Sle 467130389Sle case BIO_WRITE: 468130389Sle /* 469130389Sle * A degraded write means we cannot write to the original data 470130389Sle * subdisk. Thus we need to read in all valid stripes, 471130389Sle * recalculate the parity from the original data, and then 472130389Sle * write the parity stripe back out. 473130389Sle */ 474135426Sle if (type == REQ_TYPE_DEGRADED) { 475135426Sle /* Read all subdisks. */ 476130389Sle LIST_FOREACH(s, &p->subdisks, in_plex) { 477130389Sle /* Skip the broken and the parity subdisk. */ 478135426Sle if ((s == broken) || (s == parity)) 479130389Sle continue; 480190507Slulf /* Skip growing if within offset. 
*/ 481190507Slulf if (grow && s->flags & GV_SD_GROW) 482190507Slulf continue; 483130389Sle 484190507Slulf cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1); 485135426Sle if (cbp == NULL) 486130389Sle return (ENOMEM); 487135426Sle cbp->bio_cmd = BIO_READ; 488135426Sle 489190507Slulf bioq_insert_tail(p->bqueue, cbp); 490135426Sle 491135426Sle bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 492135426Sle bq->bp = cbp; 493135426Sle TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 494130389Sle } 495130389Sle 496135426Sle /* Write the parity data. */ 497190507Slulf cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1); 498135426Sle if (cbp == NULL) 499130389Sle return (ENOMEM); 500190507Slulf bcopy(addr, cbp->bio_data, wp->length); 501135426Sle wp->parity = cbp; 502130389Sle 503130389Sle /* 504135426Sle * When the parity stripe is missing we just write out the data. 505130389Sle */ 506135426Sle } else if (type == REQ_TYPE_NOPARITY) { 507190507Slulf cbp = gv_raid5_clone_bio(bp, original, wp, addr, 1); 508135426Sle if (cbp == NULL) 509130925Sle return (ENOMEM); 510130389Sle 511190507Slulf bioq_insert_tail(p->bqueue, cbp); 512130389Sle 513135426Sle bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 514135426Sle bq->bp = cbp; 515135426Sle TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 516130389Sle 517130389Sle /* 518130389Sle * A normal write request goes to the original subdisk, then we 519130389Sle * read in all other stripes, recalculate the parity and write 520130389Sle * out the parity again. 521130389Sle */ 522130389Sle } else { 523135426Sle /* Read old parity. */ 524190507Slulf cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1); 525135426Sle if (cbp == NULL) 526130925Sle return (ENOMEM); 527135426Sle cbp->bio_cmd = BIO_READ; 528130389Sle 529190507Slulf bioq_insert_tail(p->bqueue, cbp); 530135426Sle 531135426Sle bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 532135426Sle bq->bp = cbp; 533135426Sle TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 534135426Sle 535135426Sle /* Read old data. 
*/ 536190507Slulf cbp = gv_raid5_clone_bio(bp, original, wp, NULL, 1); 537135426Sle if (cbp == NULL) 538135426Sle return (ENOMEM); 539135426Sle cbp->bio_cmd = BIO_READ; 540135426Sle 541190507Slulf bioq_insert_tail(p->bqueue, cbp); 542135426Sle 543135426Sle bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 544135426Sle bq->bp = cbp; 545135426Sle TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 546135426Sle 547135426Sle /* Write new data. */ 548190507Slulf cbp = gv_raid5_clone_bio(bp, original, wp, addr, 1); 549135426Sle if (cbp == NULL) 550135426Sle return (ENOMEM); 551135426Sle 552135426Sle /* 553135426Sle * We must not write the new data until the old data 554135426Sle * was read, so hold this BIO back until we're ready 555135426Sle * for it. 556135426Sle */ 557135426Sle wp->waiting = cbp; 558135426Sle 559135426Sle /* The final bio for the parity. */ 560190507Slulf cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1); 561135426Sle if (cbp == NULL) 562135426Sle return (ENOMEM); 563135426Sle 564135426Sle /* Remember that this is the BIO for the parity data. */ 565135426Sle wp->parity = cbp; 566130389Sle } 567130389Sle break; 568135426Sle 569130389Sle default: 570130389Sle return (EINVAL); 571130389Sle } 572130389Sle 573130389Sle return (0); 574130389Sle} 575137730Sle 576190507Slulf/* 577190507Slulf * Calculate the offsets in the various subdisks for a RAID5 request. Also take 578190507Slulf * care of new subdisks in an expanded RAID5 array. 579190507Slulf * XXX: This assumes that the new subdisks are inserted after the others (which 580190507Slulf * is okay as long as plex_offset is larger). If subdisks are inserted into the 581190507Slulf * plexlist before, we get problems. 
582190507Slulf */ 583190507Slulfstatic int 584137730Slegv_raid5_offset(struct gv_plex *p, off_t boff, off_t bcount, off_t *real_off, 585190507Slulf off_t *real_len, int *sdno, int *psdno, int growing) 586137730Sle{ 587190507Slulf struct gv_sd *s; 588190507Slulf int sd, psd, sdcount; 589137730Sle off_t len_left, stripeend, stripeoff, stripestart; 590137730Sle 591190507Slulf sdcount = p->sdcount; 592190507Slulf if (growing) { 593190507Slulf LIST_FOREACH(s, &p->subdisks, in_plex) { 594190507Slulf if (s->flags & GV_SD_GROW) 595190507Slulf sdcount--; 596190507Slulf } 597190507Slulf } 598190507Slulf 599137730Sle /* The number of the subdisk containing the parity stripe. */ 600190507Slulf psd = sdcount - 1 - ( boff / (p->stripesize * (sdcount - 1))) % 601190507Slulf sdcount; 602137730Sle KASSERT(psdno >= 0, ("gv_raid5_offset: psdno < 0")); 603137730Sle 604137730Sle /* Offset of the start address from the start of the stripe. */ 605190507Slulf stripeoff = boff % (p->stripesize * (sdcount - 1)); 606137730Sle KASSERT(stripeoff >= 0, ("gv_raid5_offset: stripeoff < 0")); 607137730Sle 608137730Sle /* The number of the subdisk where the stripe resides. */ 609137730Sle sd = stripeoff / p->stripesize; 610137730Sle KASSERT(sdno >= 0, ("gv_raid5_offset: sdno < 0")); 611137730Sle 612137730Sle /* At or past parity subdisk. */ 613137730Sle if (sd >= psd) 614137730Sle sd++; 615137730Sle 616137730Sle /* The offset of the stripe on this subdisk. */ 617190507Slulf stripestart = (boff - stripeoff) / (sdcount - 1); 618137730Sle KASSERT(stripestart >= 0, ("gv_raid5_offset: stripestart < 0")); 619137730Sle 620137730Sle stripeoff %= p->stripesize; 621137730Sle 622137730Sle /* The offset of the request on this subdisk. 
*/ 623137730Sle *real_off = stripestart + stripeoff; 624137730Sle 625137730Sle stripeend = stripestart + p->stripesize; 626137730Sle len_left = stripeend - *real_off; 627137730Sle KASSERT(len_left >= 0, ("gv_raid5_offset: len_left < 0")); 628137730Sle 629137730Sle *real_len = (bcount <= len_left) ? bcount : len_left; 630137730Sle 631137730Sle if (sdno != NULL) 632137730Sle *sdno = sd; 633137730Sle if (psdno != NULL) 634137730Sle *psdno = psd; 635137730Sle 636137730Sle return (0); 637137730Sle} 638190507Slulf 639190507Slulfstatic struct bio * 640190507Slulfgv_raid5_clone_bio(struct bio *bp, struct gv_sd *s, struct gv_raid5_packet *wp, 641190507Slulf caddr_t addr, int use_wp) 642190507Slulf{ 643190507Slulf struct bio *cbp; 644190507Slulf 645190507Slulf cbp = g_clone_bio(bp); 646190507Slulf if (cbp == NULL) 647190507Slulf return (NULL); 648190507Slulf if (addr == NULL) { 649190507Slulf cbp->bio_data = g_malloc(wp->length, M_WAITOK | M_ZERO); 650190507Slulf cbp->bio_cflags |= GV_BIO_MALLOC; 651190507Slulf } else 652190507Slulf cbp->bio_data = addr; 653190507Slulf cbp->bio_offset = wp->lockbase + s->drive_offset; 654190507Slulf cbp->bio_length = wp->length; 655190507Slulf cbp->bio_done = gv_done; 656190507Slulf cbp->bio_caller1 = s; 657190507Slulf if (use_wp) 658190507Slulf cbp->bio_caller2 = wp; 659190507Slulf 660190507Slulf return (cbp); 661190507Slulf} 662