/* geom_vinum_raid5.c — FreeBSD revision 135426 */
1130389Sle/*- 2130389Sle * Copyright (c) 2004 Lukas Ertl 3130389Sle * All rights reserved. 4130389Sle * 5130389Sle * Redistribution and use in source and binary forms, with or without 6130389Sle * modification, are permitted provided that the following conditions 7130389Sle * are met: 8130389Sle * 1. Redistributions of source code must retain the above copyright 9130389Sle * notice, this list of conditions and the following disclaimer. 10130389Sle * 2. Redistributions in binary form must reproduce the above copyright 11130389Sle * notice, this list of conditions and the following disclaimer in the 12130389Sle * documentation and/or other materials provided with the distribution. 13130389Sle * 14130389Sle * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15130389Sle * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16130389Sle * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17130389Sle * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18130389Sle * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19130389Sle * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20130389Sle * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21130389Sle * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22130389Sle * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23130389Sle * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24130389Sle * SUCH DAMAGE. 
25130389Sle */ 26130389Sle 27130389Sle#include <sys/cdefs.h> 28130389Sle__FBSDID("$FreeBSD: head/sys/geom/vinum/geom_vinum_raid5.c 135426 2004-09-18 13:44:43Z le $"); 29130389Sle 30130389Sle#include <sys/param.h> 31130389Sle#include <sys/bio.h> 32130389Sle#include <sys/conf.h> 33130389Sle#include <sys/errno.h> 34130389Sle#include <sys/kernel.h> 35130389Sle#include <sys/kthread.h> 36130389Sle#include <sys/libkern.h> 37130389Sle#include <sys/lock.h> 38130389Sle#include <sys/malloc.h> 39130389Sle#include <sys/mutex.h> 40130389Sle#include <sys/systm.h> 41130389Sle 42130389Sle#include <geom/geom.h> 43130389Sle#include <geom/vinum/geom_vinum_var.h> 44130389Sle#include <geom/vinum/geom_vinum_raid5.h> 45130389Sle#include <geom/vinum/geom_vinum.h> 46130389Sle 47130389Sle/* 48130389Sle * Check if the stripe that the work packet wants is already being used by 49130389Sle * some other work packet. 50130389Sle */ 51130389Sleint 52135426Slegv_stripe_active(struct gv_plex *p, struct bio *bp) 53130389Sle{ 54135426Sle struct gv_raid5_packet *wp, *owp; 55135426Sle int overlap; 56130389Sle 57135426Sle wp = bp->bio_driver1; 58135426Sle if (wp->lockbase == -1) 59135426Sle return (0); 60130389Sle 61135426Sle overlap = 0; 62135426Sle TAILQ_FOREACH(owp, &p->packets, list) { 63135426Sle if (owp == wp) 64135426Sle break; 65135426Sle if ((wp->lockbase >= owp->lockbase) && 66135426Sle (wp->lockbase <= owp->lockbase + owp->length)) { 67135426Sle overlap++; 68135426Sle break; 69130389Sle } 70135426Sle if ((wp->lockbase <= owp->lockbase) && 71135426Sle (wp->lockbase + wp->length >= owp->lockbase)) { 72135426Sle overlap++; 73135426Sle break; 74130389Sle } 75130389Sle } 76130389Sle 77135426Sle return (overlap); 78130389Sle} 79130389Sle 80130389Sle/* Build a request group to perform (part of) a RAID5 request. 
*/ 81130389Sleint 82135426Slegv_build_raid5_req(struct gv_plex *p, struct gv_raid5_packet *wp, 83135426Sle struct bio *bp, caddr_t addr, off_t boff, off_t bcount) 84130389Sle{ 85130389Sle struct g_geom *gp; 86130389Sle struct gv_sd *broken, *original, *parity, *s; 87135426Sle struct gv_bioq *bq; 88135426Sle struct bio *cbp, *pbp; 89135426Sle int i, psdno, sdno, type; 90135426Sle off_t len_left, real_len, real_off, stripeend, stripeoff, stripestart; 91130389Sle 92130389Sle gp = bp->bio_to->geom; 93130389Sle 94130389Sle if (p == NULL || LIST_EMPTY(&p->subdisks)) 95130389Sle return (ENXIO); 96130389Sle 97130389Sle /* We are optimistic and assume that this request will be OK. */ 98135426Sle#define REQ_TYPE_NORMAL 0 99135426Sle#define REQ_TYPE_DEGRADED 1 100135426Sle#define REQ_TYPE_NOPARITY 2 101135426Sle 102135426Sle type = REQ_TYPE_NORMAL; 103130389Sle original = parity = broken = NULL; 104130389Sle 105130389Sle /* The number of the subdisk containing the parity stripe. */ 106130389Sle psdno = p->sdcount - 1 - ( boff / (p->stripesize * (p->sdcount - 1))) % 107130389Sle p->sdcount; 108130389Sle KASSERT(psdno >= 0, ("gv_build_raid5_request: psdno < 0")); 109130389Sle 110130389Sle /* Offset of the start address from the start of the stripe. */ 111130389Sle stripeoff = boff % (p->stripesize * (p->sdcount - 1)); 112130389Sle KASSERT(stripeoff >= 0, ("gv_build_raid5_request: stripeoff < 0")); 113130389Sle 114130389Sle /* The number of the subdisk where the stripe resides. */ 115130389Sle sdno = stripeoff / p->stripesize; 116130389Sle KASSERT(sdno >= 0, ("gv_build_raid5_request: sdno < 0")); 117130389Sle 118130389Sle /* At or past parity subdisk. */ 119130389Sle if (sdno >= psdno) 120130389Sle sdno++; 121130389Sle 122130389Sle /* The offset of the stripe on this subdisk. 
*/ 123130389Sle stripestart = (boff - stripeoff) / (p->sdcount - 1); 124130389Sle KASSERT(stripestart >= 0, ("gv_build_raid5_request: stripestart < 0")); 125130389Sle 126130477Sle stripeoff %= p->stripesize; 127130389Sle 128130389Sle /* The offset of the request on this subdisk. */ 129130389Sle real_off = stripestart + stripeoff; 130130389Sle 131130389Sle stripeend = stripestart + p->stripesize; 132130389Sle len_left = stripeend - real_off; 133130389Sle KASSERT(len_left >= 0, ("gv_build_raid5_request: len_left < 0")); 134130389Sle 135130389Sle /* Find the right subdisks. */ 136130389Sle i = 0; 137130389Sle LIST_FOREACH(s, &p->subdisks, in_plex) { 138130389Sle if (i == sdno) 139130389Sle original = s; 140130389Sle if (i == psdno) 141130389Sle parity = s; 142130389Sle if (s->state != GV_SD_UP) 143130389Sle broken = s; 144130389Sle i++; 145130389Sle } 146130389Sle 147130389Sle if ((original == NULL) || (parity == NULL)) 148130389Sle return (ENXIO); 149130389Sle 150130389Sle /* Our data stripe is missing. */ 151130389Sle if (original->state != GV_SD_UP) 152135426Sle type = REQ_TYPE_DEGRADED; 153130389Sle /* Our parity stripe is missing. */ 154130389Sle if (parity->state != GV_SD_UP) { 155130389Sle /* We cannot take another failure if we're already degraded. */ 156135426Sle if (type != REQ_TYPE_NORMAL) 157130389Sle return (ENXIO); 158130389Sle else 159135426Sle type = REQ_TYPE_NOPARITY; 160130389Sle } 161130389Sle 162135426Sle real_len = (bcount <= len_left) ? bcount : len_left; 163135426Sle wp->length = real_len; 164130389Sle wp->data = addr; 165135426Sle wp->lockbase = real_off; 166130389Sle 167130389Sle KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0")); 168130389Sle 169130389Sle switch (bp->bio_cmd) { 170130389Sle case BIO_READ: 171130389Sle /* 172130389Sle * For a degraded read we need to read in all stripes except 173130389Sle * the broken one plus the parity stripe and then recalculate 174130389Sle * the desired data. 
175130389Sle */ 176135426Sle if (type == REQ_TYPE_DEGRADED) { 177135426Sle bzero(wp->data, wp->length); 178130389Sle LIST_FOREACH(s, &p->subdisks, in_plex) { 179130389Sle /* Skip the broken subdisk. */ 180130389Sle if (s == broken) 181130389Sle continue; 182135426Sle cbp = g_clone_bio(bp); 183135426Sle if (cbp == NULL) 184130389Sle return (ENOMEM); 185135426Sle cbp->bio_data = g_malloc(real_len, M_WAITOK); 186135426Sle cbp->bio_cflags |= GV_BIO_MALLOC; 187135426Sle cbp->bio_offset = real_off; 188135426Sle cbp->bio_length = real_len; 189135426Sle cbp->bio_done = gv_plex_done; 190135426Sle cbp->bio_caller2 = s->consumer; 191135426Sle cbp->bio_driver1 = wp; 192135426Sle 193135426Sle GV_ENQUEUE(bp, cbp, pbp); 194135426Sle 195135426Sle bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 196135426Sle bq->bp = cbp; 197135426Sle TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 198130389Sle } 199130389Sle 200130389Sle /* A normal read can be fulfilled with the original subdisk. */ 201130389Sle } else { 202135426Sle cbp = g_clone_bio(bp); 203135426Sle if (cbp == NULL) 204130389Sle return (ENOMEM); 205135426Sle cbp->bio_offset = real_off; 206135426Sle cbp->bio_length = real_len; 207135426Sle cbp->bio_data = addr; 208135426Sle cbp->bio_done = g_std_done; 209135426Sle cbp->bio_caller2 = original->consumer; 210135426Sle 211135426Sle GV_ENQUEUE(bp, cbp, pbp); 212130389Sle } 213135426Sle wp->lockbase = -1; 214135426Sle 215130389Sle break; 216130389Sle 217130389Sle case BIO_WRITE: 218130389Sle /* 219130389Sle * A degraded write means we cannot write to the original data 220130389Sle * subdisk. Thus we need to read in all valid stripes, 221130389Sle * recalculate the parity from the original data, and then 222130389Sle * write the parity stripe back out. 223130389Sle */ 224135426Sle if (type == REQ_TYPE_DEGRADED) { 225135426Sle /* Read all subdisks. */ 226130389Sle LIST_FOREACH(s, &p->subdisks, in_plex) { 227130389Sle /* Skip the broken and the parity subdisk. 
*/ 228135426Sle if ((s == broken) || (s == parity)) 229130389Sle continue; 230130389Sle 231135426Sle cbp = g_clone_bio(bp); 232135426Sle if (cbp == NULL) 233130389Sle return (ENOMEM); 234135426Sle cbp->bio_cmd = BIO_READ; 235135426Sle cbp->bio_data = g_malloc(real_len, M_WAITOK); 236135426Sle cbp->bio_cflags |= GV_BIO_MALLOC; 237135426Sle cbp->bio_offset = real_off; 238135426Sle cbp->bio_length = real_len; 239135426Sle cbp->bio_done = gv_plex_done; 240135426Sle cbp->bio_caller2 = s->consumer; 241135426Sle cbp->bio_driver1 = wp; 242135426Sle 243135426Sle GV_ENQUEUE(bp, cbp, pbp); 244135426Sle 245135426Sle bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 246135426Sle bq->bp = cbp; 247135426Sle TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 248130389Sle } 249130389Sle 250135426Sle /* Write the parity data. */ 251135426Sle cbp = g_clone_bio(bp); 252135426Sle if (cbp == NULL) 253130389Sle return (ENOMEM); 254135426Sle cbp->bio_data = g_malloc(real_len, M_WAITOK); 255135426Sle cbp->bio_cflags |= GV_BIO_MALLOC; 256135426Sle bcopy(addr, cbp->bio_data, real_len); 257135426Sle cbp->bio_offset = real_off; 258135426Sle cbp->bio_length = real_len; 259135426Sle cbp->bio_done = gv_plex_done; 260135426Sle cbp->bio_caller2 = parity->consumer; 261135426Sle cbp->bio_driver1 = wp; 262135426Sle wp->parity = cbp; 263130389Sle 264130389Sle /* 265135426Sle * When the parity stripe is missing we just write out the data. 
266130389Sle */ 267135426Sle } else if (type == REQ_TYPE_NOPARITY) { 268135426Sle cbp = g_clone_bio(bp); 269135426Sle if (cbp == NULL) 270130925Sle return (ENOMEM); 271135426Sle cbp->bio_offset = real_off; 272135426Sle cbp->bio_length = real_len; 273135426Sle cbp->bio_data = addr; 274135426Sle cbp->bio_done = gv_plex_done; 275135426Sle cbp->bio_caller2 = original->consumer; 276135426Sle cbp->bio_driver1 = wp; 277130389Sle 278135426Sle GV_ENQUEUE(bp, cbp, pbp); 279130389Sle 280135426Sle bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 281135426Sle bq->bp = cbp; 282135426Sle TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 283130389Sle 284130389Sle /* 285130389Sle * A normal write request goes to the original subdisk, then we 286130389Sle * read in all other stripes, recalculate the parity and write 287130389Sle * out the parity again. 288130389Sle */ 289130389Sle } else { 290135426Sle /* Read old parity. */ 291135426Sle cbp = g_clone_bio(bp); 292135426Sle if (cbp == NULL) 293130925Sle return (ENOMEM); 294135426Sle cbp->bio_cmd = BIO_READ; 295135426Sle cbp->bio_data = g_malloc(real_len, M_WAITOK); 296135426Sle cbp->bio_cflags |= GV_BIO_MALLOC; 297135426Sle cbp->bio_offset = real_off; 298135426Sle cbp->bio_length = real_len; 299135426Sle cbp->bio_done = gv_plex_done; 300135426Sle cbp->bio_caller2 = parity->consumer; 301135426Sle cbp->bio_driver1 = wp; 302130389Sle 303135426Sle GV_ENQUEUE(bp, cbp, pbp); 304135426Sle 305135426Sle bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 306135426Sle bq->bp = cbp; 307135426Sle TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 308135426Sle 309135426Sle /* Read old data. 
*/ 310135426Sle cbp = g_clone_bio(bp); 311135426Sle if (cbp == NULL) 312135426Sle return (ENOMEM); 313135426Sle cbp->bio_cmd = BIO_READ; 314135426Sle cbp->bio_data = g_malloc(real_len, M_WAITOK); 315135426Sle cbp->bio_cflags |= GV_BIO_MALLOC; 316135426Sle cbp->bio_offset = real_off; 317135426Sle cbp->bio_length = real_len; 318135426Sle cbp->bio_done = gv_plex_done; 319135426Sle cbp->bio_caller2 = original->consumer; 320135426Sle cbp->bio_driver1 = wp; 321135426Sle 322135426Sle GV_ENQUEUE(bp, cbp, pbp); 323135426Sle 324135426Sle bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 325135426Sle bq->bp = cbp; 326135426Sle TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 327135426Sle 328135426Sle /* Write new data. */ 329135426Sle cbp = g_clone_bio(bp); 330135426Sle if (cbp == NULL) 331135426Sle return (ENOMEM); 332135426Sle cbp->bio_data = addr; 333135426Sle cbp->bio_offset = real_off; 334135426Sle cbp->bio_length = real_len; 335135426Sle cbp->bio_done = gv_plex_done; 336135426Sle cbp->bio_caller2 = original->consumer; 337135426Sle 338135426Sle cbp->bio_driver1 = wp; 339135426Sle 340135426Sle /* 341135426Sle * We must not write the new data until the old data 342135426Sle * was read, so hold this BIO back until we're ready 343135426Sle * for it. 344135426Sle */ 345135426Sle wp->waiting = cbp; 346135426Sle 347135426Sle /* The final bio for the parity. */ 348135426Sle cbp = g_clone_bio(bp); 349135426Sle if (cbp == NULL) 350135426Sle return (ENOMEM); 351135426Sle cbp->bio_data = g_malloc(real_len, M_WAITOK | M_ZERO); 352135426Sle cbp->bio_cflags |= GV_BIO_MALLOC; 353135426Sle cbp->bio_offset = real_off; 354135426Sle cbp->bio_length = real_len; 355135426Sle cbp->bio_done = gv_plex_done; 356135426Sle cbp->bio_caller2 = parity->consumer; 357135426Sle cbp->bio_driver1 = wp; 358135426Sle 359135426Sle /* Remember that this is the BIO for the parity data. 
*/ 360135426Sle wp->parity = cbp; 361130389Sle } 362130389Sle break; 363135426Sle 364130389Sle default: 365130389Sle return (EINVAL); 366130389Sle } 367130389Sle 368130389Sle return (0); 369130389Sle} 370