/*-
 * geom_vinum_raid5.c (revision 130477)
 *
 * Copyright (c) 2004 Lukas Ertl
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
25130389Sle */ 26130389Sle 27130389Sle#include <sys/cdefs.h> 28130389Sle__FBSDID("$FreeBSD: head/sys/geom/vinum/geom_vinum_raid5.c 130477 2004-06-14 17:06:55Z le $"); 29130389Sle 30130389Sle#include <sys/param.h> 31130389Sle#include <sys/bio.h> 32130389Sle#include <sys/conf.h> 33130389Sle#include <sys/errno.h> 34130389Sle#include <sys/kernel.h> 35130389Sle#include <sys/kthread.h> 36130389Sle#include <sys/libkern.h> 37130389Sle#include <sys/lock.h> 38130389Sle#include <sys/malloc.h> 39130389Sle#include <sys/mutex.h> 40130389Sle#include <sys/systm.h> 41130389Sle 42130389Sle#include <geom/geom.h> 43130389Sle#include <geom/vinum/geom_vinum_var.h> 44130389Sle#include <geom/vinum/geom_vinum_raid5.h> 45130389Sle#include <geom/vinum/geom_vinum.h> 46130389Sle 47130389Sleint gv_raid5_parity(struct gv_raid5_packet *); 48130389Sleint gv_stripe_active(struct gv_raid5_packet *, struct gv_plex *); 49130389Sle 50130389Slestruct gv_raid5_bit * 51130389Slegv_new_raid5_bit(void) 52130389Sle{ 53130389Sle struct gv_raid5_bit *r; 54130389Sle r = g_malloc(sizeof(*r), M_NOWAIT | M_ZERO); 55130389Sle KASSERT(r != NULL, ("gv_new_raid5_bit: NULL r")); 56130389Sle return (r); 57130389Sle} 58130389Sle 59130389Slestruct gv_raid5_packet * 60130389Slegv_new_raid5_packet(void) 61130389Sle{ 62130389Sle struct gv_raid5_packet *wp; 63130389Sle 64130389Sle wp = g_malloc(sizeof(*wp), M_NOWAIT | M_ZERO); 65130389Sle KASSERT(wp != NULL, ("gv_new_raid5_packet: NULL wp")); 66130389Sle wp->state = SETUP; 67130389Sle wp->type = JUNK; 68130389Sle TAILQ_INIT(&wp->bits); 69130389Sle 70130389Sle return (wp); 71130389Sle} 72130389Sle 73130389Sle/* 74130389Sle * Check if the stripe that the work packet wants is already being used by 75130389Sle * some other work packet. 
76130389Sle */ 77130389Sleint 78130389Slegv_stripe_active(struct gv_raid5_packet *wp, struct gv_plex *sc) 79130389Sle{ 80130389Sle struct gv_raid5_packet *wpa; 81130389Sle 82130389Sle TAILQ_FOREACH(wpa, &sc->worklist, list) { 83130389Sle if (wpa->lockbase == wp->lockbase) { 84130389Sle if (wpa->bio == wp->bio) 85130389Sle return (0); 86130389Sle return (1); 87130389Sle } 88130389Sle } 89130389Sle return (0); 90130389Sle} 91130389Sle 92130389Sle/* 93130389Sle * The "worker" thread that runs through the worklist and fires off the 94130389Sle * "subrequests" needed to fulfill a RAID5 read or write request. 95130389Sle */ 96130389Slevoid 97130389Slegv_raid5_worker(void *arg) 98130389Sle{ 99130389Sle struct bio *bp; 100130389Sle struct g_geom *gp; 101130389Sle struct gv_plex *p; 102130389Sle struct gv_raid5_packet *wp, *wpt; 103130389Sle struct gv_raid5_bit *rbp, *rbpt; 104130389Sle int error, restart; 105130389Sle 106130389Sle gp = arg; 107130389Sle p = gp->softc; 108130389Sle 109130389Sle mtx_lock(&p->worklist_mtx); 110130389Sle for (;;) { 111130389Sle restart = 0; 112130389Sle g_trace(G_T_TOPOLOGY, "gv_raid5_worker scan"); 113130389Sle TAILQ_FOREACH_SAFE(wp, &p->worklist, list, wpt) { 114130389Sle /* This request packet is already being processed. */ 115130389Sle if (wp->state == IO) 116130389Sle continue; 117130389Sle /* This request packet is ready for processing. */ 118130389Sle if (wp->state == VALID) { 119130389Sle /* Couldn't get the lock, try again. 
*/ 120130389Sle if ((wp->lockbase != -1) && 121130389Sle gv_stripe_active(wp, p)) 122130389Sle continue; 123130389Sle 124130389Sle wp->state = IO; 125130389Sle mtx_unlock(&p->worklist_mtx); 126130389Sle TAILQ_FOREACH_SAFE(rbp, &wp->bits, list, rbpt) 127130389Sle g_io_request(rbp->bio, rbp->consumer); 128130389Sle mtx_lock(&p->worklist_mtx); 129130389Sle continue; 130130389Sle } 131130389Sle if (wp->state == FINISH) { 132130389Sle bp = wp->bio; 133130389Sle bp->bio_completed += wp->length; 134130389Sle /* 135130389Sle * Deliver the original request if we have 136130389Sle * finished. 137130389Sle */ 138130389Sle if (bp->bio_completed == bp->bio_length) { 139130389Sle mtx_unlock(&p->worklist_mtx); 140130389Sle g_io_deliver(bp, 0); 141130389Sle mtx_lock(&p->worklist_mtx); 142130389Sle } 143130389Sle TAILQ_REMOVE(&p->worklist, wp, list); 144130389Sle if (wp->bufmalloc == 1) 145130389Sle g_free(wp->buf); 146130389Sle g_free(wp); 147130389Sle restart++; 148130389Sle /*break;*/ 149130389Sle } 150130389Sle } 151130389Sle if (!restart) { 152130389Sle /* Self-destruct. */ 153130389Sle if (p->flags & GV_PLEX_THREAD_DIE) 154130389Sle break; 155130389Sle g_trace(G_T_TOPOLOGY, "gv_raid5_worker sleep"); 156130389Sle error = msleep(p, &p->worklist_mtx, PRIBIO, "-", 157130389Sle hz/100); 158130389Sle } 159130389Sle } 160130389Sle mtx_unlock(&p->worklist_mtx); 161130389Sle 162130389Sle g_trace(G_T_TOPOLOGY, "gv_raid5_worker die"); 163130389Sle 164130389Sle /* Signal our plex that we are dead. */ 165130389Sle p->flags |= GV_PLEX_THREAD_DEAD; 166130389Sle wakeup(p); 167130389Sle kthread_exit(0); 168130389Sle} 169130389Sle 170130389Sle/* Final bio transaction to write out the parity data. 
*/ 171130389Sleint 172130389Slegv_raid5_parity(struct gv_raid5_packet *wp) 173130389Sle{ 174130389Sle struct bio *bp; 175130389Sle 176130389Sle bp = g_new_bio(); 177130389Sle if (bp == NULL) 178130389Sle return (ENOMEM); 179130389Sle 180130389Sle wp->type = ISPARITY; 181130389Sle bp->bio_cmd = BIO_WRITE; 182130389Sle bp->bio_data = wp->buf; 183130389Sle bp->bio_offset = wp->offset; 184130389Sle bp->bio_length = wp->length; 185130389Sle bp->bio_done = gv_raid5_done; 186130389Sle bp->bio_caller1 = wp; 187130389Sle bp->bio_caller2 = NULL; 188130389Sle g_io_request(bp, wp->parity); 189130389Sle 190130389Sle return (0); 191130389Sle} 192130389Sle 193130389Sle/* We end up here after each subrequest. */ 194130389Slevoid 195130389Slegv_raid5_done(struct bio *bp) 196130389Sle{ 197130389Sle struct bio *obp; 198130389Sle struct g_geom *gp; 199130389Sle struct gv_plex *p; 200130389Sle struct gv_raid5_packet *wp; 201130389Sle struct gv_raid5_bit *rbp; 202130389Sle off_t i; 203130389Sle int error; 204130389Sle 205130389Sle wp = bp->bio_caller1; 206130389Sle rbp = bp->bio_caller2; 207130389Sle obp = wp->bio; 208130389Sle gp = bp->bio_from->geom; 209130389Sle p = gp->softc; 210130389Sle 211130389Sle /* One less active subrequest. */ 212130389Sle wp->active--; 213130389Sle 214130389Sle switch (obp->bio_cmd) { 215130389Sle case BIO_READ: 216130389Sle /* Degraded reads need to handle parity data. */ 217130389Sle if (wp->type == DEGRADED) { 218130389Sle for (i = 0; i < wp->length; i++) 219130389Sle wp->buf[i] ^= bp->bio_data[i]; 220130389Sle 221130389Sle /* When we're finished copy back the data we want. */ 222130389Sle if (wp->active == 0) 223130389Sle bcopy(wp->buf, wp->data, wp->length); 224130389Sle } 225130389Sle 226130389Sle break; 227130389Sle 228130389Sle case BIO_WRITE: 229130389Sle /* Handle the parity data, if needed. 
*/ 230130389Sle if ((wp->type != NOPARITY) && (wp->type != ISPARITY)) { 231130389Sle for (i = 0; i < wp->length; i++) 232130389Sle wp->buf[i] ^= bp->bio_data[i]; 233130389Sle 234130389Sle /* Write out the parity data we calculated. */ 235130389Sle if (wp->active == 0) { 236130389Sle wp->active++; 237130389Sle error = gv_raid5_parity(wp); 238130389Sle } 239130389Sle } 240130389Sle break; 241130389Sle } 242130389Sle 243130389Sle g_destroy_bio(bp); 244130389Sle 245130389Sle if (rbp != NULL) { 246130389Sle if (rbp->malloc == 1) 247130389Sle g_free(rbp->buf); 248130389Sle TAILQ_REMOVE(&wp->bits, rbp, list); 249130389Sle g_free(rbp); 250130389Sle } 251130389Sle 252130389Sle /* This request group is done. */ 253130389Sle if (wp->active == 0) 254130389Sle wp->state = FINISH; 255130389Sle} 256130389Sle 257130389Sle/* Build a request group to perform (part of) a RAID5 request. */ 258130389Sleint 259130389Slegv_build_raid5_req(struct gv_raid5_packet *wp, struct bio *bp, caddr_t addr, 260130389Sle long bcount, off_t boff) 261130389Sle{ 262130389Sle struct g_geom *gp; 263130389Sle struct gv_plex *p; 264130389Sle struct gv_raid5_bit *rbp; 265130389Sle struct gv_sd *broken, *original, *parity, *s; 266130389Sle int i, psdno, sdno; 267130389Sle off_t len_left, real_off, stripeend, stripeoff, stripestart; 268130389Sle 269130389Sle gp = bp->bio_to->geom; 270130389Sle p = gp->softc; 271130389Sle 272130389Sle if (p == NULL || LIST_EMPTY(&p->subdisks)) 273130389Sle return (ENXIO); 274130389Sle 275130389Sle /* We are optimistic and assume that this request will be OK. */ 276130389Sle wp->type = NORMAL; 277130389Sle original = parity = broken = NULL; 278130389Sle 279130389Sle /* The number of the subdisk containing the parity stripe. 
*/ 280130389Sle psdno = p->sdcount - 1 - ( boff / (p->stripesize * (p->sdcount - 1))) % 281130389Sle p->sdcount; 282130389Sle KASSERT(psdno >= 0, ("gv_build_raid5_request: psdno < 0")); 283130389Sle 284130389Sle /* Offset of the start address from the start of the stripe. */ 285130389Sle stripeoff = boff % (p->stripesize * (p->sdcount - 1)); 286130389Sle KASSERT(stripeoff >= 0, ("gv_build_raid5_request: stripeoff < 0")); 287130389Sle 288130389Sle /* The number of the subdisk where the stripe resides. */ 289130389Sle sdno = stripeoff / p->stripesize; 290130389Sle KASSERT(sdno >= 0, ("gv_build_raid5_request: sdno < 0")); 291130389Sle 292130389Sle /* At or past parity subdisk. */ 293130389Sle if (sdno >= psdno) 294130389Sle sdno++; 295130389Sle 296130389Sle /* The offset of the stripe on this subdisk. */ 297130389Sle stripestart = (boff - stripeoff) / (p->sdcount - 1); 298130389Sle KASSERT(stripestart >= 0, ("gv_build_raid5_request: stripestart < 0")); 299130389Sle 300130477Sle stripeoff %= p->stripesize; 301130389Sle 302130389Sle /* The offset of the request on this subdisk. */ 303130389Sle real_off = stripestart + stripeoff; 304130389Sle 305130389Sle stripeend = stripestart + p->stripesize; 306130389Sle len_left = stripeend - real_off; 307130389Sle KASSERT(len_left >= 0, ("gv_build_raid5_request: len_left < 0")); 308130389Sle 309130389Sle /* Find the right subdisks. */ 310130389Sle i = 0; 311130389Sle LIST_FOREACH(s, &p->subdisks, in_plex) { 312130389Sle if (i == sdno) 313130389Sle original = s; 314130389Sle if (i == psdno) 315130389Sle parity = s; 316130389Sle if (s->state != GV_SD_UP) 317130389Sle broken = s; 318130389Sle i++; 319130389Sle } 320130389Sle 321130389Sle if ((original == NULL) || (parity == NULL)) 322130389Sle return (ENXIO); 323130389Sle 324130389Sle /* Our data stripe is missing. */ 325130389Sle if (original->state != GV_SD_UP) 326130389Sle wp->type = DEGRADED; 327130389Sle /* Our parity stripe is missing. 
*/ 328130389Sle if (parity->state != GV_SD_UP) { 329130389Sle /* We cannot take another failure if we're already degraded. */ 330130389Sle if (wp->type != NORMAL) 331130389Sle return (ENXIO); 332130389Sle else 333130389Sle wp->type = NOPARITY; 334130389Sle } 335130389Sle 336130389Sle /* 337130389Sle * A combined write is necessary when the original data subdisk and the 338130389Sle * parity subdisk are both up, but one of the other subdisks isn't. 339130389Sle */ 340130389Sle if ((broken != NULL) && (broken != parity) && (broken != original)) 341130389Sle wp->type = COMBINED; 342130389Sle 343130389Sle wp->offset = real_off; 344130389Sle wp->length = (bcount <= len_left) ? bcount : len_left; 345130389Sle wp->data = addr; 346130389Sle wp->original = original->consumer; 347130389Sle wp->parity = parity->consumer; 348130389Sle wp->lockbase = stripestart; 349130389Sle 350130389Sle KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0")); 351130389Sle 352130389Sle switch (bp->bio_cmd) { 353130389Sle case BIO_READ: 354130389Sle /* 355130389Sle * For a degraded read we need to read in all stripes except 356130389Sle * the broken one plus the parity stripe and then recalculate 357130389Sle * the desired data. 358130389Sle */ 359130389Sle if (wp->type == DEGRADED) { 360130389Sle wp->buf = g_malloc(wp->length, M_WAITOK | M_ZERO); 361130389Sle wp->bufmalloc = 1; 362130389Sle LIST_FOREACH(s, &p->subdisks, in_plex) { 363130389Sle /* Skip the broken subdisk. 
*/ 364130389Sle if (s == broken) 365130389Sle continue; 366130389Sle rbp = gv_new_raid5_bit(); 367130389Sle rbp->consumer = s->consumer; 368130389Sle rbp->bio = g_new_bio(); 369130389Sle if (rbp->bio == NULL) 370130389Sle return (ENOMEM); 371130389Sle rbp->buf = g_malloc(wp->length, 372130389Sle M_WAITOK | M_ZERO); 373130389Sle rbp->malloc = 1; 374130389Sle rbp->bio->bio_cmd = BIO_READ; 375130389Sle rbp->bio->bio_offset = wp->offset; 376130389Sle rbp->bio->bio_length = wp->length; 377130389Sle rbp->bio->bio_data = rbp->buf; 378130389Sle rbp->bio->bio_done = gv_raid5_done; 379130389Sle rbp->bio->bio_caller1 = wp; 380130389Sle rbp->bio->bio_caller2 = rbp; 381130389Sle TAILQ_INSERT_HEAD(&wp->bits, rbp, list); 382130389Sle wp->active++; 383130389Sle wp->rqcount++; 384130389Sle } 385130389Sle 386130389Sle /* A normal read can be fulfilled with the original subdisk. */ 387130389Sle } else { 388130389Sle rbp = gv_new_raid5_bit(); 389130389Sle rbp->consumer = wp->original; 390130389Sle rbp->bio = g_new_bio(); 391130389Sle if (rbp->bio == NULL) 392130389Sle return (ENOMEM); 393130389Sle rbp->bio->bio_cmd = BIO_READ; 394130389Sle rbp->bio->bio_offset = wp->offset; 395130389Sle rbp->bio->bio_length = wp->length; 396130389Sle rbp->buf = addr; 397130389Sle rbp->bio->bio_data = rbp->buf; 398130389Sle rbp->bio->bio_done = gv_raid5_done; 399130389Sle rbp->bio->bio_caller1 = wp; 400130389Sle rbp->bio->bio_caller2 = rbp; 401130389Sle TAILQ_INSERT_HEAD(&wp->bits, rbp, list); 402130389Sle wp->active++; 403130389Sle wp->rqcount++; 404130389Sle } 405130389Sle if (wp->type != COMBINED) 406130389Sle wp->lockbase = -1; 407130389Sle break; 408130389Sle 409130389Sle case BIO_WRITE: 410130389Sle /* 411130389Sle * A degraded write means we cannot write to the original data 412130389Sle * subdisk. Thus we need to read in all valid stripes, 413130389Sle * recalculate the parity from the original data, and then 414130389Sle * write the parity stripe back out. 
415130389Sle */ 416130389Sle if (wp->type == DEGRADED) { 417130389Sle wp->buf = g_malloc(wp->length, M_WAITOK | M_ZERO); 418130389Sle wp->bufmalloc = 1; 419130389Sle 420130389Sle /* Copy the original data. */ 421130389Sle bcopy(wp->data, wp->buf, wp->length); 422130389Sle 423130389Sle LIST_FOREACH(s, &p->subdisks, in_plex) { 424130389Sle /* Skip the broken and the parity subdisk. */ 425130389Sle if ((s == broken) || 426130389Sle (s->consumer == wp->parity)) 427130389Sle continue; 428130389Sle 429130389Sle rbp = gv_new_raid5_bit(); 430130389Sle rbp->consumer = s->consumer; 431130389Sle rbp->bio = g_new_bio(); 432130389Sle if (rbp->bio == NULL) 433130389Sle return (ENOMEM); 434130389Sle rbp->buf = g_malloc(wp->length, 435130389Sle M_WAITOK | M_ZERO); 436130389Sle rbp->malloc = 1; 437130389Sle rbp->bio->bio_cmd = BIO_READ; 438130389Sle rbp->bio->bio_data = rbp->buf; 439130389Sle rbp->bio->bio_offset = wp->offset; 440130389Sle rbp->bio->bio_length = wp->length; 441130389Sle rbp->bio->bio_done = gv_raid5_done; 442130389Sle rbp->bio->bio_caller1 = wp; 443130389Sle rbp->bio->bio_caller2 = rbp; 444130389Sle TAILQ_INSERT_HEAD(&wp->bits, rbp, list); 445130389Sle wp->active++; 446130389Sle wp->rqcount++; 447130389Sle } 448130389Sle 449130389Sle /* 450130389Sle * When we don't have the parity stripe we just write out the 451130389Sle * data. 
452130389Sle */ 453130389Sle } else if (wp->type == NOPARITY) { 454130389Sle rbp = gv_new_raid5_bit(); 455130389Sle rbp->consumer = wp->original; 456130389Sle rbp->bio = g_new_bio(); 457130389Sle if (rbp->bio == NULL) 458130389Sle return (ENOMEM); 459130389Sle rbp->bio->bio_cmd = BIO_WRITE; 460130389Sle rbp->bio->bio_offset = wp->offset; 461130389Sle rbp->bio->bio_length = wp->length; 462130389Sle rbp->bio->bio_data = addr; 463130389Sle rbp->bio->bio_done = gv_raid5_done; 464130389Sle rbp->bio->bio_caller1 = wp; 465130389Sle rbp->bio->bio_caller2 = rbp; 466130389Sle TAILQ_INSERT_HEAD(&wp->bits, rbp, list); 467130389Sle wp->active++; 468130389Sle wp->rqcount++; 469130389Sle 470130389Sle /* 471130389Sle * A combined write means that our data subdisk and the parity 472130389Sle * subdisks are both up, but another subdisk isn't. We need to 473130389Sle * read all valid stripes including the parity to recalculate 474130389Sle * the data of the stripe that is missing. Then we write our 475130389Sle * original data, and together with the other data stripes 476130389Sle * recalculate the parity again. 477130389Sle */ 478130389Sle } else if (wp->type == COMBINED) { 479130389Sle wp->buf = g_malloc(wp->length, M_WAITOK | M_ZERO); 480130389Sle wp->bufmalloc = 1; 481130389Sle 482130389Sle /* Get the data from all subdisks. */ 483130389Sle LIST_FOREACH(s, &p->subdisks, in_plex) { 484130389Sle /* Skip the broken subdisk. 
*/ 485130389Sle if (s == broken) 486130389Sle continue; 487130389Sle 488130389Sle rbp = gv_new_raid5_bit(); 489130389Sle rbp->consumer = s->consumer; 490130389Sle rbp->bio = g_new_bio(); 491130389Sle if (rbp->bio == NULL) 492130389Sle return (ENOMEM); 493130389Sle rbp->bio->bio_cmd = BIO_READ; 494130389Sle rbp->buf = g_malloc(wp->length, 495130389Sle M_WAITOK | M_ZERO); 496130389Sle rbp->malloc = 1; 497130389Sle rbp->bio->bio_data = rbp->buf; 498130389Sle rbp->bio->bio_offset = wp->offset; 499130389Sle rbp->bio->bio_length = wp->length; 500130389Sle rbp->bio->bio_done = gv_raid5_done; 501130389Sle rbp->bio->bio_caller1 = wp; 502130389Sle rbp->bio->bio_caller2 = rbp; 503130389Sle TAILQ_INSERT_HEAD(&wp->bits, rbp, list); 504130389Sle wp->active++; 505130389Sle wp->rqcount++; 506130389Sle } 507130389Sle 508130389Sle /* Write the original data. */ 509130389Sle rbp = gv_new_raid5_bit(); 510130389Sle rbp->consumer = wp->original; 511130389Sle rbp->buf = addr; 512130389Sle rbp->bio = g_new_bio(); 513130389Sle if (rbp->bio == NULL) 514130389Sle return (ENOMEM); 515130389Sle rbp->bio->bio_cmd = BIO_WRITE; 516130389Sle rbp->bio->bio_data = rbp->buf; 517130389Sle rbp->bio->bio_offset = wp->offset; 518130389Sle rbp->bio->bio_length = wp->length; 519130389Sle rbp->bio->bio_done = gv_raid5_done; 520130389Sle rbp->bio->bio_caller1 = wp; 521130389Sle rbp->bio->bio_caller2 = rbp; 522130389Sle /* 523130389Sle * Insert at the tail, because we want to read the old 524130389Sle * data first. 525130389Sle */ 526130389Sle TAILQ_INSERT_TAIL(&wp->bits, rbp, list); 527130389Sle wp->active++; 528130389Sle wp->rqcount++; 529130389Sle 530130389Sle /* Get the rest of the data again. */ 531130389Sle LIST_FOREACH(s, &p->subdisks, in_plex) { 532130389Sle /* 533130389Sle * Skip the broken subdisk, the parity, and the 534130389Sle * one we just wrote. 
535130389Sle */ 536130389Sle if ((s == broken) || 537130389Sle (s->consumer == wp->parity) || 538130389Sle (s->consumer == wp->original)) 539130389Sle continue; 540130389Sle rbp = gv_new_raid5_bit(); 541130389Sle rbp->consumer = s->consumer; 542130389Sle rbp->bio = g_new_bio(); 543130389Sle if (rbp->bio == NULL) 544130389Sle return (ENOMEM); 545130389Sle rbp->bio->bio_cmd = BIO_READ; 546130389Sle rbp->buf = g_malloc(wp->length, 547130389Sle M_WAITOK | M_ZERO); 548130389Sle rbp->malloc = 1; 549130389Sle rbp->bio->bio_data = rbp->buf; 550130389Sle rbp->bio->bio_offset = wp->offset; 551130389Sle rbp->bio->bio_length = wp->length; 552130389Sle rbp->bio->bio_done = gv_raid5_done; 553130389Sle rbp->bio->bio_caller1 = wp; 554130389Sle rbp->bio->bio_caller2 = rbp; 555130389Sle /* 556130389Sle * Again, insert at the tail to keep correct 557130389Sle * order. 558130389Sle */ 559130389Sle TAILQ_INSERT_TAIL(&wp->bits, rbp, list); 560130389Sle wp->active++; 561130389Sle wp->rqcount++; 562130389Sle } 563130389Sle 564130389Sle 565130389Sle /* 566130389Sle * A normal write request goes to the original subdisk, then we 567130389Sle * read in all other stripes, recalculate the parity and write 568130389Sle * out the parity again. 569130389Sle */ 570130389Sle } else { 571130389Sle wp->buf = g_malloc(wp->length, M_WAITOK | M_ZERO); 572130389Sle wp->bufmalloc = 1; 573130389Sle LIST_FOREACH(s, &p->subdisks, in_plex) { 574130389Sle /* Skip the parity stripe. */ 575130389Sle if (s->consumer == wp->parity) 576130389Sle continue; 577130389Sle 578130389Sle rbp = gv_new_raid5_bit(); 579130389Sle rbp->consumer = s->consumer; 580130389Sle rbp->bio = g_new_bio(); 581130389Sle if (rbp->bio == NULL) 582130389Sle return (ENOMEM); 583130389Sle /* 584130389Sle * The data for the original stripe is written, 585130389Sle * the others need to be read in for the parity 586130389Sle * calculation. 
587130389Sle */ 588130389Sle if (s->consumer == wp->original) { 589130389Sle rbp->bio->bio_cmd = BIO_WRITE; 590130389Sle rbp->buf = addr; 591130389Sle } else { 592130389Sle rbp->bio->bio_cmd = BIO_READ; 593130389Sle rbp->buf = g_malloc(wp->length, 594130389Sle M_WAITOK | M_ZERO); 595130389Sle rbp->malloc = 1; 596130389Sle } 597130389Sle rbp->bio->bio_data = rbp->buf; 598130389Sle rbp->bio->bio_offset = wp->offset; 599130389Sle rbp->bio->bio_length = wp->length; 600130389Sle rbp->bio->bio_done = gv_raid5_done; 601130389Sle rbp->bio->bio_caller1 = wp; 602130389Sle rbp->bio->bio_caller2 = rbp; 603130389Sle TAILQ_INSERT_HEAD(&wp->bits, rbp, list); 604130389Sle wp->active++; 605130389Sle wp->rqcount++; 606130389Sle } 607130389Sle } 608130389Sle break; 609130389Sle default: 610130389Sle return (EINVAL); 611130389Sle } 612130389Sle 613130389Sle wp->state = VALID; 614130389Sle return (0); 615130389Sle} 616