geom_vinum_raid5.c (184292) | geom_vinum_raid5.c (190507) |
---|---|
1/*- | 1/*- |
2 * Copyright (c) 2004 Lukas Ertl | 2 * Copyright (c) 2004, 2007 Lukas Ertl |
3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright --- 9 unchanged lines hidden (view full) --- 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 */ 26 27#include <sys/cdefs.h> | 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright --- 9 unchanged lines hidden (view full) --- 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 */ 26 27#include <sys/cdefs.h> |
28__FBSDID("$FreeBSD: head/sys/geom/vinum/geom_vinum_raid5.c 184292 2008-10-26 17:20:37Z lulf $"); | 28__FBSDID("$FreeBSD: head/sys/geom/vinum/geom_vinum_raid5.c 190507 2009-03-28 17:20:08Z lulf $"); |
29 30#include <sys/param.h> 31#include <sys/bio.h> | 29 30#include <sys/param.h> 31#include <sys/bio.h> |
32#include <sys/conf.h> 33#include <sys/errno.h> 34#include <sys/kernel.h> 35#include <sys/kthread.h> 36#include <sys/libkern.h> | |
37#include <sys/lock.h> 38#include <sys/malloc.h> | 32#include <sys/lock.h> 33#include <sys/malloc.h> |
39#include <sys/mutex.h> | |
40#include <sys/systm.h> 41 42#include <geom/geom.h> 43#include <geom/vinum/geom_vinum_var.h> 44#include <geom/vinum/geom_vinum_raid5.h> 45#include <geom/vinum/geom_vinum.h> 46 | 34#include <sys/systm.h> 35 36#include <geom/geom.h> 37#include <geom/vinum/geom_vinum_var.h> 38#include <geom/vinum/geom_vinum_raid5.h> 39#include <geom/vinum/geom_vinum.h> 40 |
47int gv_raid5_offset(struct gv_plex *, off_t, off_t, off_t *, off_t *, 48 int *, int *); | 41static int gv_raid5_offset(struct gv_plex *, off_t, off_t, 42 off_t *, off_t *, int *, int *, int); 43static struct bio * gv_raid5_clone_bio(struct bio *, struct gv_sd *, 44 struct gv_raid5_packet *, caddr_t, int); 45static int gv_raid5_request(struct gv_plex *, struct gv_raid5_packet *, 46 struct bio *, caddr_t, off_t, off_t, int *); 47static int gv_raid5_check(struct gv_plex *, struct gv_raid5_packet *, 48 struct bio *, caddr_t, off_t, off_t); 49static int gv_raid5_rebuild(struct gv_plex *, struct gv_raid5_packet *, 50 struct bio *, caddr_t, off_t, off_t); |
49 | 51 |
52struct gv_raid5_packet * 53gv_raid5_start(struct gv_plex *p, struct bio *bp, caddr_t addr, off_t boff, 54 off_t bcount) 55{ 56 struct bio *cbp; 57 struct gv_raid5_packet *wp, *wp2; 58 struct gv_bioq *bq, *bq2; 59 int err, delay; 60 61 delay = 0; 62 wp = g_malloc(sizeof(*wp), M_WAITOK | M_ZERO); 63 wp->bio = bp; 64 wp->waiting = NULL; 65 wp->parity = NULL; 66 TAILQ_INIT(&wp->bits); 67 68 if (bp->bio_cflags & GV_BIO_REBUILD) 69 err = gv_raid5_rebuild(p, wp, bp, addr, boff, bcount); 70 else if (bp->bio_cflags & GV_BIO_CHECK) 71 err = gv_raid5_check(p, wp, bp, addr, boff, bcount); 72 else 73 err = gv_raid5_request(p, wp, bp, addr, boff, bcount, &delay); 74 75 /* Means we have a delayed request. */ 76 if (delay) { 77 g_free(wp); 78 return (NULL); 79 } 80 81 /* 82 * Building the sub-request failed, we probably need to clean up a lot. 83 */ 84 if (err) { 85 G_VINUM_LOGREQ(0, bp, "raid5 plex request failed."); 86 TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) { 87 TAILQ_REMOVE(&wp->bits, bq, queue); 88 g_free(bq); 89 } 90 if (wp->waiting != NULL) { 91 if (wp->waiting->bio_cflags & GV_BIO_MALLOC) 92 g_free(wp->waiting->bio_data); 93 g_destroy_bio(wp->waiting); 94 } 95 if (wp->parity != NULL) { 96 if (wp->parity->bio_cflags & GV_BIO_MALLOC) 97 g_free(wp->parity->bio_data); 98 g_destroy_bio(wp->parity); 99 } 100 g_free(wp); 101 102 TAILQ_FOREACH_SAFE(wp, &p->packets, list, wp2) { 103 if (wp->bio != bp) 104 continue; 105 106 TAILQ_REMOVE(&p->packets, wp, list); 107 TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) { 108 TAILQ_REMOVE(&wp->bits, bq, queue); 109 g_free(bq); 110 } 111 g_free(wp); 112 } 113 114 cbp = bioq_takefirst(p->bqueue); 115 while (cbp != NULL) { 116 if (cbp->bio_cflags & GV_BIO_MALLOC) 117 g_free(cbp->bio_data); 118 g_destroy_bio(cbp); 119 cbp = bioq_takefirst(p->bqueue); 120 } 121 122 /* If internal, stop and reset state. */ 123 if (bp->bio_cflags & GV_BIO_INTERNAL) { 124 if (bp->bio_cflags & GV_BIO_MALLOC) 125 g_free(cbp->bio_data); 126 g_destroy_bio(bp); 127 /* Reset flags. */ 128 p->flags &= ~(GV_PLEX_SYNCING | GV_PLEX_REBUILDING | 129 GV_PLEX_GROWING); 130 return (NULL); 131 } 132 g_io_deliver(bp, err); 133 return (NULL); 134 } 135 136 return (wp); 137} 138 |
|
50/* 51 * Check if the stripe that the work packet wants is already being used by 52 * some other work packet. 53 */ 54int 55gv_stripe_active(struct gv_plex *p, struct bio *bp) 56{ 57 struct gv_raid5_packet *wp, *owp; 58 int overlap; 59 | 139/* 140 * Check if the stripe that the work packet wants is already being used by 141 * some other work packet. 142 */ 143int 144gv_stripe_active(struct gv_plex *p, struct bio *bp) 145{ 146 struct gv_raid5_packet *wp, *owp; 147 int overlap; 148 |
60 wp = bp->bio_driver1; | 149 wp = bp->bio_caller2; |
61 if (wp->lockbase == -1) 62 return (0); 63 64 overlap = 0; 65 TAILQ_FOREACH(owp, &p->packets, list) { 66 if (owp == wp) 67 break; 68 if ((wp->lockbase >= owp->lockbase) && --- 6 unchanged lines hidden (view full) --- 75 overlap++; 76 break; 77 } 78 } 79 80 return (overlap); 81} 82 | 150 if (wp->lockbase == -1) 151 return (0); 152 153 overlap = 0; 154 TAILQ_FOREACH(owp, &p->packets, list) { 155 if (owp == wp) 156 break; 157 if ((wp->lockbase >= owp->lockbase) && --- 6 unchanged lines hidden (view full) --- 164 overlap++; 165 break; 166 } 167 } 168 169 return (overlap); 170} 171 |
83int 84gv_check_raid5(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp, | 172static int 173gv_raid5_check(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp, |
85 caddr_t addr, off_t boff, off_t bcount) 86{ 87 struct gv_sd *parity, *s; 88 struct gv_bioq *bq; | 174 caddr_t addr, off_t boff, off_t bcount) 175{ 176 struct gv_sd *parity, *s; 177 struct gv_bioq *bq; |
89 struct bio *cbp, *pbp; | 178 struct bio *cbp; |
90 int i, psdno; 91 off_t real_len, real_off; 92 93 if (p == NULL || LIST_EMPTY(&p->subdisks)) 94 return (ENXIO); 95 | 179 int i, psdno; 180 off_t real_len, real_off; 181 182 if (p == NULL || LIST_EMPTY(&p->subdisks)) 183 return (ENXIO); 184 |
96 gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, &psdno); | 185 gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, &psdno, 1); |
97 98 /* Find the right subdisk. */ 99 parity = NULL; 100 i = 0; 101 LIST_FOREACH(s, &p->subdisks, in_plex) { 102 if (i == psdno) { 103 parity = s; 104 break; --- 12 unchanged lines hidden (view full) --- 117 wp->data = addr; 118 wp->lockbase = real_off; 119 120 /* Read all subdisks. */ 121 LIST_FOREACH(s, &p->subdisks, in_plex) { 122 /* Skip the parity subdisk. */ 123 if (s == parity) 124 continue; | 186 187 /* Find the right subdisk. */ 188 parity = NULL; 189 i = 0; 190 LIST_FOREACH(s, &p->subdisks, in_plex) { 191 if (i == psdno) { 192 parity = s; 193 break; --- 12 unchanged lines hidden (view full) --- 206 wp->data = addr; 207 wp->lockbase = real_off; 208 209 /* Read all subdisks. */ 210 LIST_FOREACH(s, &p->subdisks, in_plex) { 211 /* Skip the parity subdisk. */ 212 if (s == parity) 213 continue; |
214 /* Skip growing subdisks. */ 215 if (s->flags & GV_SD_GROW) 216 continue; |
|
125 | 217 |
126 cbp = g_clone_bio(bp); | 218 cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1); |
127 if (cbp == NULL) 128 return (ENOMEM); 129 cbp->bio_cmd = BIO_READ; | 219 if (cbp == NULL) 220 return (ENOMEM); 221 cbp->bio_cmd = BIO_READ; |
130 cbp->bio_data = g_malloc(real_len, M_WAITOK); 131 cbp->bio_cflags |= GV_BIO_MALLOC; 132 cbp->bio_offset = real_off; 133 cbp->bio_length = real_len; 134 cbp->bio_done = gv_plex_done; 135 cbp->bio_caller2 = s->consumer; 136 cbp->bio_driver1 = wp; | |
137 | 222 |
138 GV_ENQUEUE(bp, cbp, pbp); | 223 bioq_insert_tail(p->bqueue, cbp); |
139 140 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 141 bq->bp = cbp; 142 TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 143 } 144 145 /* Read the parity data. */ | 224 225 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 226 bq->bp = cbp; 227 TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 228 } 229 230 /* Read the parity data. */ |
146 cbp = g_clone_bio(bp); | 231 cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1); |
147 if (cbp == NULL) 148 return (ENOMEM); 149 cbp->bio_cmd = BIO_READ; | 232 if (cbp == NULL) 233 return (ENOMEM); 234 cbp->bio_cmd = BIO_READ; |
150 cbp->bio_data = g_malloc(real_len, M_WAITOK | M_ZERO); 151 cbp->bio_cflags |= GV_BIO_MALLOC; 152 cbp->bio_offset = real_off; 153 cbp->bio_length = real_len; 154 cbp->bio_done = gv_plex_done; 155 cbp->bio_caller2 = parity->consumer; 156 cbp->bio_driver1 = wp; | |
157 wp->waiting = cbp; 158 159 /* 160 * In case we want to rebuild the parity, create an extra BIO to write 161 * it out. It also acts as buffer for the XOR operations. 162 */ | 235 wp->waiting = cbp; 236 237 /* 238 * In case we want to rebuild the parity, create an extra BIO to write 239 * it out. It also acts as buffer for the XOR operations. 240 */ |
163 cbp = g_clone_bio(bp); | 241 cbp = gv_raid5_clone_bio(bp, parity, wp, addr, 1); |
164 if (cbp == NULL) 165 return (ENOMEM); | 242 if (cbp == NULL) 243 return (ENOMEM); |
166 cbp->bio_data = addr; 167 cbp->bio_offset = real_off; 168 cbp->bio_length = real_len; 169 cbp->bio_done = gv_plex_done; 170 cbp->bio_caller2 = parity->consumer; 171 cbp->bio_driver1 = wp; | |
172 wp->parity = cbp; 173 174 return (0); 175} 176 177/* Rebuild a degraded RAID5 plex. */ | 244 wp->parity = cbp; 245 246 return (0); 247} 248 249/* Rebuild a degraded RAID5 plex. */ |
178int 179gv_rebuild_raid5(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp, | 250static int 251gv_raid5_rebuild(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp, |
180 caddr_t addr, off_t boff, off_t bcount) 181{ 182 struct gv_sd *broken, *s; 183 struct gv_bioq *bq; | 252 caddr_t addr, off_t boff, off_t bcount) 253{ 254 struct gv_sd *broken, *s; 255 struct gv_bioq *bq; |
184 struct bio *cbp, *pbp; | 256 struct bio *cbp; |
185 off_t real_len, real_off; 186 187 if (p == NULL || LIST_EMPTY(&p->subdisks)) 188 return (ENXIO); 189 | 257 off_t real_len, real_off; 258 259 if (p == NULL || LIST_EMPTY(&p->subdisks)) 260 return (ENXIO); 261 |
190 gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, NULL); | 262 gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, NULL, 1); |
191 192 /* Find the right subdisk. */ 193 broken = NULL; 194 LIST_FOREACH(s, &p->subdisks, in_plex) { 195 if (s->state != GV_SD_UP) 196 broken = s; 197 } 198 --- 6 unchanged lines hidden (view full) --- 205 return (EINVAL); 206 207 case GV_SD_STALE: 208 if (!(bp->bio_cflags & GV_BIO_REBUILD)) 209 return (ENXIO); 210 211 G_VINUM_DEBUG(1, "sd %s is reviving", broken->name); 212 gv_set_sd_state(broken, GV_SD_REVIVING, GV_SETSTATE_FORCE); | 263 264 /* Find the right subdisk. */ 265 broken = NULL; 266 LIST_FOREACH(s, &p->subdisks, in_plex) { 267 if (s->state != GV_SD_UP) 268 broken = s; 269 } 270 --- 6 unchanged lines hidden (view full) --- 277 return (EINVAL); 278 279 case GV_SD_STALE: 280 if (!(bp->bio_cflags & GV_BIO_REBUILD)) 281 return (ENXIO); 282 283 G_VINUM_DEBUG(1, "sd %s is reviving", broken->name); 284 gv_set_sd_state(broken, GV_SD_REVIVING, GV_SETSTATE_FORCE); |
285 /* Set this bit now, but should be set at end. */ 286 broken->flags |= GV_SD_CANGOUP; |
|
213 break; 214 215 case GV_SD_REVIVING: 216 break; 217 218 default: 219 /* All other subdisk states mean it's not accessible. */ 220 return (ENXIO); --- 6 unchanged lines hidden (view full) --- 227 KASSERT(wp->length >= 0, ("gv_rebuild_raid5: wp->length < 0")); 228 229 /* Read all subdisks. */ 230 LIST_FOREACH(s, &p->subdisks, in_plex) { 231 /* Skip the broken subdisk. */ 232 if (s == broken) 233 continue; 234 | 287 break; 288 289 case GV_SD_REVIVING: 290 break; 291 292 default: 293 /* All other subdisk states mean it's not accessible. */ 294 return (ENXIO); --- 6 unchanged lines hidden (view full) --- 301 KASSERT(wp->length >= 0, ("gv_rebuild_raid5: wp->length < 0")); 302 303 /* Read all subdisks. */ 304 LIST_FOREACH(s, &p->subdisks, in_plex) { 305 /* Skip the broken subdisk. */ 306 if (s == broken) 307 continue; 308 |
235 cbp = g_clone_bio(bp); | 309 /* Skip growing subdisks. */ 310 if (s->flags & GV_SD_GROW) 311 continue; 312 313 cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1); |
236 if (cbp == NULL) 237 return (ENOMEM); 238 cbp->bio_cmd = BIO_READ; | 314 if (cbp == NULL) 315 return (ENOMEM); 316 cbp->bio_cmd = BIO_READ; |
239 cbp->bio_data = g_malloc(real_len, M_WAITOK); 240 cbp->bio_cflags |= GV_BIO_MALLOC; 241 cbp->bio_offset = real_off; 242 cbp->bio_length = real_len; 243 cbp->bio_done = gv_plex_done; 244 cbp->bio_caller2 = s->consumer; 245 cbp->bio_driver1 = wp; | |
246 | 317 |
247 GV_ENQUEUE(bp, cbp, pbp); | 318 bioq_insert_tail(p->bqueue, cbp); |
248 249 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 250 bq->bp = cbp; 251 TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 252 } 253 254 /* Write the parity data. */ | 319 320 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 321 bq->bp = cbp; 322 TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 323 } 324 325 /* Write the parity data. */ |
255 cbp = g_clone_bio(bp); | 326 cbp = gv_raid5_clone_bio(bp, broken, wp, NULL, 1); |
256 if (cbp == NULL) 257 return (ENOMEM); | 327 if (cbp == NULL) 328 return (ENOMEM); |
258 cbp->bio_data = g_malloc(real_len, M_WAITOK | M_ZERO); 259 cbp->bio_cflags |= GV_BIO_MALLOC; 260 cbp->bio_offset = real_off; 261 cbp->bio_length = real_len; 262 cbp->bio_done = gv_plex_done; 263 cbp->bio_caller2 = broken->consumer; 264 cbp->bio_driver1 = wp; | |
265 cbp->bio_cflags |= GV_BIO_REBUILD; 266 wp->parity = cbp; 267 268 p->synced = boff; 269 | 329 cbp->bio_cflags |= GV_BIO_REBUILD; 330 wp->parity = cbp; 331 332 p->synced = boff; 333 |
334 /* Post notification that we're finished. */ |
|
270 return (0); 271} 272 273/* Build a request group to perform (part of) a RAID5 request. */ | 335 return (0); 336} 337 338/* Build a request group to perform (part of) a RAID5 request. */ |
274int 275gv_build_raid5_req(struct gv_plex *p, struct gv_raid5_packet *wp, 276 struct bio *bp, caddr_t addr, off_t boff, off_t bcount) | 339static int 340gv_raid5_request(struct gv_plex *p, struct gv_raid5_packet *wp, 341 struct bio *bp, caddr_t addr, off_t boff, off_t bcount, int *delay) |
277{ 278 struct g_geom *gp; 279 struct gv_sd *broken, *original, *parity, *s; 280 struct gv_bioq *bq; | 342{ 343 struct g_geom *gp; 344 struct gv_sd *broken, *original, *parity, *s; 345 struct gv_bioq *bq; |
281 struct bio *cbp, *pbp; 282 int i, psdno, sdno, type; | 346 struct bio *cbp; 347 int i, psdno, sdno, type, grow; |
283 off_t real_len, real_off; 284 285 gp = bp->bio_to->geom; 286 287 if (p == NULL || LIST_EMPTY(&p->subdisks)) 288 return (ENXIO); 289 290 /* We are optimistic and assume that this request will be OK. */ 291#define REQ_TYPE_NORMAL 0 292#define REQ_TYPE_DEGRADED 1 293#define REQ_TYPE_NOPARITY 2 294 295 type = REQ_TYPE_NORMAL; 296 original = parity = broken = NULL; 297 | 348 off_t real_len, real_off; 349 350 gp = bp->bio_to->geom; 351 352 if (p == NULL || LIST_EMPTY(&p->subdisks)) 353 return (ENXIO); 354 355 /* We are optimistic and assume that this request will be OK. */ 356#define REQ_TYPE_NORMAL 0 357#define REQ_TYPE_DEGRADED 1 358#define REQ_TYPE_NOPARITY 2 359 360 type = REQ_TYPE_NORMAL; 361 original = parity = broken = NULL; 362 |
298 gv_raid5_offset(p, boff, bcount, &real_off, &real_len, &sdno, &psdno); | 363 /* XXX: The resize won't crash with rebuild or sync, but we should still 364 * be aware of it. Also this should perhaps be done on rebuild/check as 365 * well? 366 */ 367 /* If we're over, we must use the old. */ 368 if (boff >= p->synced) { 369 grow = 1; 370 /* Or if over the resized offset, we use all drives. */ 371 } else if (boff + bcount <= p->synced) { 372 grow = 0; 373 /* Else, we're in the middle, and must wait a bit. */ 374 } else { 375 bioq_disksort(p->rqueue, bp); 376 *delay = 1; 377 return (0); 378 } 379 gv_raid5_offset(p, boff, bcount, &real_off, &real_len, 380 &sdno, &psdno, grow); |
299 300 /* Find the right subdisks. */ 301 i = 0; 302 LIST_FOREACH(s, &p->subdisks, in_plex) { 303 if (i == sdno) 304 original = s; 305 if (i == psdno) 306 parity = s; 307 if (s->state != GV_SD_UP) 308 broken = s; 309 i++; 310 } 311 312 if ((original == NULL) || (parity == NULL)) 313 return (ENXIO); 314 315 /* Our data stripe is missing. */ 316 if (original->state != GV_SD_UP) 317 type = REQ_TYPE_DEGRADED; | 381 382 /* Find the right subdisks. */ 383 i = 0; 384 LIST_FOREACH(s, &p->subdisks, in_plex) { 385 if (i == sdno) 386 original = s; 387 if (i == psdno) 388 parity = s; 389 if (s->state != GV_SD_UP) 390 broken = s; 391 i++; 392 } 393 394 if ((original == NULL) || (parity == NULL)) 395 return (ENXIO); 396 397 /* Our data stripe is missing. */ 398 if (original->state != GV_SD_UP) 399 type = REQ_TYPE_DEGRADED; |
400 401 /* If synchronizing request, just write it if disks are stale. */ 402 if (original->state == GV_SD_STALE && parity->state == GV_SD_STALE && 403 bp->bio_cflags & GV_BIO_SYNCREQ && bp->bio_cmd == BIO_WRITE) { 404 type = REQ_TYPE_NORMAL; |
|
318 /* Our parity stripe is missing. */ | 405 /* Our parity stripe is missing. */ |
319 if (parity->state != GV_SD_UP) { | 406 } else if (parity->state != GV_SD_UP) { |
320 /* We cannot take another failure if we're already degraded. */ 321 if (type != REQ_TYPE_NORMAL) 322 return (ENXIO); 323 else 324 type = REQ_TYPE_NOPARITY; 325 } 326 327 wp->length = real_len; 328 wp->data = addr; 329 wp->lockbase = real_off; 330 331 KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0")); 332 | 407 /* We cannot take another failure if we're already degraded. */ 408 if (type != REQ_TYPE_NORMAL) 409 return (ENXIO); 410 else 411 type = REQ_TYPE_NOPARITY; 412 } 413 414 wp->length = real_len; 415 wp->data = addr; 416 wp->lockbase = real_off; 417 418 KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0")); 419 |
333 if ((p->flags & GV_PLEX_SYNCING) && (boff + real_len < p->synced)) | 420 if ((p->flags & GV_PLEX_REBUILDING) && (boff + real_len < p->synced)) |
334 type = REQ_TYPE_NORMAL; 335 | 421 type = REQ_TYPE_NORMAL; 422 |
423 if ((p->flags & GV_PLEX_REBUILDING) && (boff + real_len >= p->synced)) { 424 bioq_disksort(p->rqueue, bp); 425 *delay = 1; 426 return (0); 427 } 428 |
|
336 switch (bp->bio_cmd) { 337 case BIO_READ: 338 /* 339 * For a degraded read we need to read in all stripes except 340 * the broken one plus the parity stripe and then recalculate 341 * the desired data. 342 */ 343 if (type == REQ_TYPE_DEGRADED) { 344 bzero(wp->data, wp->length); 345 LIST_FOREACH(s, &p->subdisks, in_plex) { 346 /* Skip the broken subdisk. */ 347 if (s == broken) 348 continue; | 429 switch (bp->bio_cmd) { 430 case BIO_READ: 431 /* 432 * For a degraded read we need to read in all stripes except 433 * the broken one plus the parity stripe and then recalculate 434 * the desired data. 435 */ 436 if (type == REQ_TYPE_DEGRADED) { 437 bzero(wp->data, wp->length); 438 LIST_FOREACH(s, &p->subdisks, in_plex) { 439 /* Skip the broken subdisk. */ 440 if (s == broken) 441 continue; |
349 cbp = g_clone_bio(bp); | 442 /* Skip growing if within offset. */ 443 if (grow && s->flags & GV_SD_GROW) 444 continue; 445 cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1); |
350 if (cbp == NULL) 351 return (ENOMEM); | 446 if (cbp == NULL) 447 return (ENOMEM); |
352 cbp->bio_data = g_malloc(real_len, M_WAITOK); 353 cbp->bio_cflags |= GV_BIO_MALLOC; 354 cbp->bio_offset = real_off; 355 cbp->bio_length = real_len; 356 cbp->bio_done = gv_plex_done; 357 cbp->bio_caller2 = s->consumer; 358 cbp->bio_driver1 = wp; | |
359 | 448 |
360 GV_ENQUEUE(bp, cbp, pbp); | 449 bioq_insert_tail(p->bqueue, cbp); |
361 362 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 363 bq->bp = cbp; 364 TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 365 } 366 367 /* A normal read can be fulfilled with the original subdisk. */ 368 } else { | 450 451 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 452 bq->bp = cbp; 453 TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 454 } 455 456 /* A normal read can be fulfilled with the original subdisk. */ 457 } else { |
369 cbp = g_clone_bio(bp); | 458 cbp = gv_raid5_clone_bio(bp, original, wp, addr, 0); |
370 if (cbp == NULL) 371 return (ENOMEM); | 459 if (cbp == NULL) 460 return (ENOMEM); |
372 cbp->bio_offset = real_off; 373 cbp->bio_length = real_len; 374 cbp->bio_data = addr; 375 cbp->bio_done = g_std_done; 376 cbp->bio_caller2 = original->consumer; | |
377 | 461 |
378 GV_ENQUEUE(bp, cbp, pbp); | 462 bioq_insert_tail(p->bqueue, cbp); |
379 } 380 wp->lockbase = -1; 381 382 break; 383 384 case BIO_WRITE: 385 /* 386 * A degraded write means we cannot write to the original data 387 * subdisk. Thus we need to read in all valid stripes, 388 * recalculate the parity from the original data, and then 389 * write the parity stripe back out. 390 */ 391 if (type == REQ_TYPE_DEGRADED) { 392 /* Read all subdisks. */ 393 LIST_FOREACH(s, &p->subdisks, in_plex) { 394 /* Skip the broken and the parity subdisk. */ 395 if ((s == broken) || (s == parity)) 396 continue; | 463 } 464 wp->lockbase = -1; 465 466 break; 467 468 case BIO_WRITE: 469 /* 470 * A degraded write means we cannot write to the original data 471 * subdisk. Thus we need to read in all valid stripes, 472 * recalculate the parity from the original data, and then 473 * write the parity stripe back out. 474 */ 475 if (type == REQ_TYPE_DEGRADED) { 476 /* Read all subdisks. */ 477 LIST_FOREACH(s, &p->subdisks, in_plex) { 478 /* Skip the broken and the parity subdisk. */ 479 if ((s == broken) || (s == parity)) 480 continue; |
481 /* Skip growing if within offset. */ 482 if (grow && s->flags & GV_SD_GROW) 483 continue; |
|
397 | 484 |
398 cbp = g_clone_bio(bp); | 485 cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1); |
399 if (cbp == NULL) 400 return (ENOMEM); 401 cbp->bio_cmd = BIO_READ; | 486 if (cbp == NULL) 487 return (ENOMEM); 488 cbp->bio_cmd = BIO_READ; |
402 cbp->bio_data = g_malloc(real_len, M_WAITOK); 403 cbp->bio_cflags |= GV_BIO_MALLOC; 404 cbp->bio_offset = real_off; 405 cbp->bio_length = real_len; 406 cbp->bio_done = gv_plex_done; 407 cbp->bio_caller2 = s->consumer; 408 cbp->bio_driver1 = wp; | |
409 | 489 |
410 GV_ENQUEUE(bp, cbp, pbp); | 490 bioq_insert_tail(p->bqueue, cbp); |
411 412 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 413 bq->bp = cbp; 414 TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 415 } 416 417 /* Write the parity data. */ | 491 492 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 493 bq->bp = cbp; 494 TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 495 } 496 497 /* Write the parity data. */ |
418 cbp = g_clone_bio(bp); | 498 cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1); |
419 if (cbp == NULL) 420 return (ENOMEM); | 499 if (cbp == NULL) 500 return (ENOMEM); |
421 cbp->bio_data = g_malloc(real_len, M_WAITOK); 422 cbp->bio_cflags |= GV_BIO_MALLOC; 423 bcopy(addr, cbp->bio_data, real_len); 424 cbp->bio_offset = real_off; 425 cbp->bio_length = real_len; 426 cbp->bio_done = gv_plex_done; 427 cbp->bio_caller2 = parity->consumer; 428 cbp->bio_driver1 = wp; | 501 bcopy(addr, cbp->bio_data, wp->length); |
429 wp->parity = cbp; 430 431 /* 432 * When the parity stripe is missing we just write out the data. 433 */ 434 } else if (type == REQ_TYPE_NOPARITY) { | 502 wp->parity = cbp; 503 504 /* 505 * When the parity stripe is missing we just write out the data. 506 */ 507 } else if (type == REQ_TYPE_NOPARITY) { |
435 cbp = g_clone_bio(bp); | 508 cbp = gv_raid5_clone_bio(bp, original, wp, addr, 1); |
436 if (cbp == NULL) 437 return (ENOMEM); | 509 if (cbp == NULL) 510 return (ENOMEM); |
438 cbp->bio_offset = real_off; 439 cbp->bio_length = real_len; 440 cbp->bio_data = addr; 441 cbp->bio_done = gv_plex_done; 442 cbp->bio_caller2 = original->consumer; 443 cbp->bio_driver1 = wp; | |
444 | 511 |
445 GV_ENQUEUE(bp, cbp, pbp); | 512 bioq_insert_tail(p->bqueue, cbp); |
446 447 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 448 bq->bp = cbp; 449 TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 450 451 /* 452 * A normal write request goes to the original subdisk, then we 453 * read in all other stripes, recalculate the parity and write 454 * out the parity again. 455 */ 456 } else { 457 /* Read old parity. */ | 513 514 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 515 bq->bp = cbp; 516 TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 517 518 /* 519 * A normal write request goes to the original subdisk, then we 520 * read in all other stripes, recalculate the parity and write 521 * out the parity again. 522 */ 523 } else { 524 /* Read old parity. */ |
458 cbp = g_clone_bio(bp); | 525 cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1); |
459 if (cbp == NULL) 460 return (ENOMEM); 461 cbp->bio_cmd = BIO_READ; | 526 if (cbp == NULL) 527 return (ENOMEM); 528 cbp->bio_cmd = BIO_READ; |
462 cbp->bio_data = g_malloc(real_len, M_WAITOK); 463 cbp->bio_cflags |= GV_BIO_MALLOC; 464 cbp->bio_offset = real_off; 465 cbp->bio_length = real_len; 466 cbp->bio_done = gv_plex_done; 467 cbp->bio_caller2 = parity->consumer; 468 cbp->bio_driver1 = wp; | |
469 | 529 |
470 GV_ENQUEUE(bp, cbp, pbp); | 530 bioq_insert_tail(p->bqueue, cbp); |
471 472 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 473 bq->bp = cbp; 474 TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 475 476 /* Read old data. */ | 531 532 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 533 bq->bp = cbp; 534 TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 535 536 /* Read old data. */ |
477 cbp = g_clone_bio(bp); | 537 cbp = gv_raid5_clone_bio(bp, original, wp, NULL, 1); |
478 if (cbp == NULL) 479 return (ENOMEM); 480 cbp->bio_cmd = BIO_READ; | 538 if (cbp == NULL) 539 return (ENOMEM); 540 cbp->bio_cmd = BIO_READ; |
481 cbp->bio_data = g_malloc(real_len, M_WAITOK); 482 cbp->bio_cflags |= GV_BIO_MALLOC; 483 cbp->bio_offset = real_off; 484 cbp->bio_length = real_len; 485 cbp->bio_done = gv_plex_done; 486 cbp->bio_caller2 = original->consumer; 487 cbp->bio_driver1 = wp; | |
488 | 541 |
489 GV_ENQUEUE(bp, cbp, pbp); | 542 bioq_insert_tail(p->bqueue, cbp); |
490 491 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 492 bq->bp = cbp; 493 TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 494 495 /* Write new data. */ | 543 544 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 545 bq->bp = cbp; 546 TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 547 548 /* Write new data. */ |
496 cbp = g_clone_bio(bp); | 549 cbp = gv_raid5_clone_bio(bp, original, wp, addr, 1); |
497 if (cbp == NULL) 498 return (ENOMEM); | 550 if (cbp == NULL) 551 return (ENOMEM); |
499 cbp->bio_data = addr; 500 cbp->bio_offset = real_off; 501 cbp->bio_length = real_len; 502 cbp->bio_done = gv_plex_done; 503 cbp->bio_caller2 = original->consumer; | |
504 | 552 |
505 cbp->bio_driver1 = wp; 506 | |
507 /* 508 * We must not write the new data until the old data 509 * was read, so hold this BIO back until we're ready 510 * for it. 511 */ 512 wp->waiting = cbp; 513 514 /* The final bio for the parity. */ | 553 /* 554 * We must not write the new data until the old data 555 * was read, so hold this BIO back until we're ready 556 * for it. 557 */ 558 wp->waiting = cbp; 559 560 /* The final bio for the parity. */ |
515 cbp = g_clone_bio(bp); | 561 cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1); |
516 if (cbp == NULL) 517 return (ENOMEM); | 562 if (cbp == NULL) 563 return (ENOMEM); |
518 cbp->bio_data = g_malloc(real_len, M_WAITOK | M_ZERO); 519 cbp->bio_cflags |= GV_BIO_MALLOC; 520 cbp->bio_offset = real_off; 521 cbp->bio_length = real_len; 522 cbp->bio_done = gv_plex_done; 523 cbp->bio_caller2 = parity->consumer; 524 cbp->bio_driver1 = wp; | |
525 526 /* Remember that this is the BIO for the parity data. */ 527 wp->parity = cbp; 528 } 529 break; 530 531 default: 532 return (EINVAL); 533 } 534 535 return (0); 536} 537 | 564 565 /* Remember that this is the BIO for the parity data. */ 566 wp->parity = cbp; 567 } 568 break; 569 570 default: 571 return (EINVAL); 572 } 573 574 return (0); 575} 576 |
538/* Calculate the offsets in the various subdisks for a RAID5 request. */ 539int | 577/* 578 * Calculate the offsets in the various subdisks for a RAID5 request. Also take 579 * care of new subdisks in an expanded RAID5 array. 580 * XXX: This assumes that the new subdisks are inserted after the others (which 581 * is okay as long as plex_offset is larger). If subdisks are inserted into the 582 * plexlist before, we get problems. 583 */ 584static int |
540gv_raid5_offset(struct gv_plex *p, off_t boff, off_t bcount, off_t *real_off, | 585gv_raid5_offset(struct gv_plex *p, off_t boff, off_t bcount, off_t *real_off, |
541 off_t *real_len, int *sdno, int *psdno) | 586 off_t *real_len, int *sdno, int *psdno, int growing) |
542{ | 587{ |
543 int sd, psd; | 588 struct gv_sd *s; 589 int sd, psd, sdcount; |
544 off_t len_left, stripeend, stripeoff, stripestart; 545 | 590 off_t len_left, stripeend, stripeoff, stripestart; 591 |
592 sdcount = p->sdcount; 593 if (growing) { 594 LIST_FOREACH(s, &p->subdisks, in_plex) { 595 if (s->flags & GV_SD_GROW) 596 sdcount--; 597 } 598 } 599 |
|
546 /* The number of the subdisk containing the parity stripe. */ | 600 /* The number of the subdisk containing the parity stripe. */ |
547 psd = p->sdcount - 1 - ( boff / (p->stripesize * (p->sdcount - 1))) % 548 p->sdcount; | 601 psd = sdcount - 1 - ( boff / (p->stripesize * (sdcount - 1))) % 602 sdcount; |
549 KASSERT(psdno >= 0, ("gv_raid5_offset: psdno < 0")); 550 551 /* Offset of the start address from the start of the stripe. */ | 603 KASSERT(psdno >= 0, ("gv_raid5_offset: psdno < 0")); 604 605 /* Offset of the start address from the start of the stripe. */ |
552 stripeoff = boff % (p->stripesize * (p->sdcount - 1)); | 606 stripeoff = boff % (p->stripesize * (sdcount - 1)); |
553 KASSERT(stripeoff >= 0, ("gv_raid5_offset: stripeoff < 0")); 554 555 /* The number of the subdisk where the stripe resides. */ 556 sd = stripeoff / p->stripesize; 557 KASSERT(sdno >= 0, ("gv_raid5_offset: sdno < 0")); 558 559 /* At or past parity subdisk. */ 560 if (sd >= psd) 561 sd++; 562 563 /* The offset of the stripe on this subdisk. */ | 607 KASSERT(stripeoff >= 0, ("gv_raid5_offset: stripeoff < 0")); 608 609 /* The number of the subdisk where the stripe resides. */ 610 sd = stripeoff / p->stripesize; 611 KASSERT(sdno >= 0, ("gv_raid5_offset: sdno < 0")); 612 613 /* At or past parity subdisk. */ 614 if (sd >= psd) 615 sd++; 616 617 /* The offset of the stripe on this subdisk. */ |
564 stripestart = (boff - stripeoff) / (p->sdcount - 1); | 618 stripestart = (boff - stripeoff) / (sdcount - 1); |
565 KASSERT(stripestart >= 0, ("gv_raid5_offset: stripestart < 0")); 566 567 stripeoff %= p->stripesize; 568 569 /* The offset of the request on this subdisk. */ 570 *real_off = stripestart + stripeoff; 571 572 stripeend = stripestart + p->stripesize; --- 4 unchanged lines hidden (view full) --- 577 578 if (sdno != NULL) 579 *sdno = sd; 580 if (psdno != NULL) 581 *psdno = psd; 582 583 return (0); 584} | 619 KASSERT(stripestart >= 0, ("gv_raid5_offset: stripestart < 0")); 620 621 stripeoff %= p->stripesize; 622 623 /* The offset of the request on this subdisk. */ 624 *real_off = stripestart + stripeoff; 625 626 stripeend = stripestart + p->stripesize; --- 4 unchanged lines hidden (view full) --- 631 632 if (sdno != NULL) 633 *sdno = sd; 634 if (psdno != NULL) 635 *psdno = psd; 636 637 return (0); 638} |
639 640static struct bio * 641gv_raid5_clone_bio(struct bio *bp, struct gv_sd *s, struct gv_raid5_packet *wp, 642 caddr_t addr, int use_wp) 643{ 644 struct bio *cbp; 645 646 cbp = g_clone_bio(bp); 647 if (cbp == NULL) 648 return (NULL); 649 if (addr == NULL) { 650 cbp->bio_data = g_malloc(wp->length, M_WAITOK | M_ZERO); 651 cbp->bio_cflags |= GV_BIO_MALLOC; 652 } else 653 cbp->bio_data = addr; 654 cbp->bio_offset = wp->lockbase + s->drive_offset; 655 cbp->bio_length = wp->length; 656 cbp->bio_done = gv_done; 657 cbp->bio_caller1 = s; 658 if (use_wp) 659 cbp->bio_caller2 = wp; 660 661 return (cbp); 662} |
|