geom_vinum_raid5.c revision 191852
1/*-
2 * Copyright (c) 2004, 2007 Lukas Ertl
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27#include <sys/cdefs.h>
28__FBSDID("$FreeBSD: head/sys/geom/vinum/geom_vinum_raid5.c 191852 2009-05-06 18:27:28Z lulf $");
29
30#include <sys/param.h>
31#include <sys/bio.h>
32#include <sys/lock.h>
33#include <sys/malloc.h>
34#include <sys/systm.h>
35
36#include <geom/geom.h>
37#include <geom/vinum/geom_vinum_var.h>
38#include <geom/vinum/geom_vinum_raid5.h>
39#include <geom/vinum/geom_vinum.h>
40
/* Forward declarations of the file-local helpers defined further down. */
static int		gv_raid5_offset(struct gv_plex *, off_t, off_t,
			    off_t *, off_t *, int *, int *, int);
static struct bio *	gv_raid5_clone_bio(struct bio *, struct gv_sd *,
			    struct gv_raid5_packet *, caddr_t, int);
static int	gv_raid5_request(struct gv_plex *, struct gv_raid5_packet *,
		    struct bio *, caddr_t, off_t, off_t, int *);
static int	gv_raid5_check(struct gv_plex *, struct gv_raid5_packet *,
		    struct bio *, caddr_t, off_t, off_t);
static int	gv_raid5_rebuild(struct gv_plex *, struct gv_raid5_packet *,
		    struct bio *, caddr_t, off_t, off_t);
51
52struct gv_raid5_packet *
53gv_raid5_start(struct gv_plex *p, struct bio *bp, caddr_t addr, off_t boff,
54    off_t bcount)
55{
56	struct bio *cbp;
57	struct gv_raid5_packet *wp, *wp2;
58	struct gv_bioq *bq, *bq2;
59	int err, delay;
60
61	delay = 0;
62	wp = g_malloc(sizeof(*wp), M_WAITOK | M_ZERO);
63	wp->bio = bp;
64	wp->waiting = NULL;
65	wp->parity = NULL;
66	TAILQ_INIT(&wp->bits);
67
68	if (bp->bio_cflags & GV_BIO_REBUILD)
69		err = gv_raid5_rebuild(p, wp, bp, addr, boff, bcount);
70	else if (bp->bio_cflags & GV_BIO_CHECK)
71		err = gv_raid5_check(p, wp, bp, addr, boff, bcount);
72	else
73		err = gv_raid5_request(p, wp, bp, addr, boff, bcount, &delay);
74
75	/* Means we have a delayed request. */
76	if (delay) {
77		g_free(wp);
78		return (NULL);
79	}
80
81	/*
82	 * Building the sub-request failed, we probably need to clean up a lot.
83	 */
84	if (err) {
85		G_VINUM_LOGREQ(0, bp, "raid5 plex request failed.");
86		TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) {
87			TAILQ_REMOVE(&wp->bits, bq, queue);
88			g_free(bq);
89		}
90		if (wp->waiting != NULL) {
91			if (wp->waiting->bio_cflags & GV_BIO_MALLOC)
92				g_free(wp->waiting->bio_data);
93			g_destroy_bio(wp->waiting);
94		}
95		if (wp->parity != NULL) {
96			if (wp->parity->bio_cflags & GV_BIO_MALLOC)
97				g_free(wp->parity->bio_data);
98			g_destroy_bio(wp->parity);
99		}
100		g_free(wp);
101
102		TAILQ_FOREACH_SAFE(wp, &p->packets, list, wp2) {
103			if (wp->bio != bp)
104				continue;
105
106			TAILQ_REMOVE(&p->packets, wp, list);
107			TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) {
108				TAILQ_REMOVE(&wp->bits, bq, queue);
109				g_free(bq);
110			}
111			g_free(wp);
112		}
113
114		cbp = bioq_takefirst(p->bqueue);
115		while (cbp != NULL) {
116			if (cbp->bio_cflags & GV_BIO_MALLOC)
117				g_free(cbp->bio_data);
118			g_destroy_bio(cbp);
119			cbp = bioq_takefirst(p->bqueue);
120		}
121
122		/* If internal, stop and reset state. */
123		if (bp->bio_cflags & GV_BIO_INTERNAL) {
124			if (bp->bio_cflags & GV_BIO_MALLOC)
125				g_free(bp->bio_data);
126			g_destroy_bio(bp);
127			/* Reset flags. */
128			p->flags &= ~(GV_PLEX_SYNCING | GV_PLEX_REBUILDING |
129			    GV_PLEX_GROWING);
130			return (NULL);
131		}
132		g_io_deliver(bp, err);
133		return (NULL);
134	}
135
136	return (wp);
137}
138
139/*
140 * Check if the stripe that the work packet wants is already being used by
141 * some other work packet.
142 */
143int
144gv_stripe_active(struct gv_plex *p, struct bio *bp)
145{
146	struct gv_raid5_packet *wp, *owp;
147	int overlap;
148
149	wp = bp->bio_caller2;
150	if (wp->lockbase == -1)
151		return (0);
152
153	overlap = 0;
154	TAILQ_FOREACH(owp, &p->packets, list) {
155		if (owp == wp)
156			break;
157		if ((wp->lockbase >= owp->lockbase) &&
158		    (wp->lockbase <= owp->lockbase + owp->length)) {
159			overlap++;
160			break;
161		}
162		if ((wp->lockbase <= owp->lockbase) &&
163		    (wp->lockbase + wp->length >= owp->lockbase)) {
164			overlap++;
165			break;
166		}
167	}
168
169	return (overlap);
170}
171
172static int
173gv_raid5_check(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp,
174    caddr_t addr, off_t boff, off_t bcount)
175{
176	struct gv_sd *parity, *s;
177	struct gv_bioq *bq;
178	struct bio *cbp;
179	int i, psdno;
180	off_t real_len, real_off;
181
182	if (p == NULL || LIST_EMPTY(&p->subdisks))
183		return (ENXIO);
184
185	gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, &psdno, 1);
186
187	/* Find the right subdisk. */
188	parity = NULL;
189	i = 0;
190	LIST_FOREACH(s, &p->subdisks, in_plex) {
191		if (i == psdno) {
192			parity = s;
193			break;
194		}
195		i++;
196	}
197
198	/* Parity stripe not found. */
199	if (parity == NULL)
200		return (ENXIO);
201
202	if (parity->state != GV_SD_UP)
203		return (ENXIO);
204
205	wp->length = real_len;
206	wp->data = addr;
207	wp->lockbase = real_off;
208
209	/* Read all subdisks. */
210	LIST_FOREACH(s, &p->subdisks, in_plex) {
211		/* Skip the parity subdisk. */
212		if (s == parity)
213			continue;
214		/* Skip growing subdisks. */
215		if (s->flags & GV_SD_GROW)
216			continue;
217
218		cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1);
219		if (cbp == NULL)
220			return (ENOMEM);
221		cbp->bio_cmd = BIO_READ;
222
223		bioq_insert_tail(p->bqueue, cbp);
224
225		bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
226		bq->bp = cbp;
227		TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
228	}
229
230	/* Read the parity data. */
231	cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1);
232	if (cbp == NULL)
233		return (ENOMEM);
234	cbp->bio_cmd = BIO_READ;
235	wp->waiting = cbp;
236
237	/*
238	 * In case we want to rebuild the parity, create an extra BIO to write
239	 * it out.  It also acts as buffer for the XOR operations.
240	 */
241	cbp = gv_raid5_clone_bio(bp, parity, wp, addr, 1);
242	if (cbp == NULL)
243		return (ENOMEM);
244	wp->parity = cbp;
245
246	return (0);
247}
248
249/* Rebuild a degraded RAID5 plex. */
250static int
251gv_raid5_rebuild(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp,
252    caddr_t addr, off_t boff, off_t bcount)
253{
254	struct gv_sd *broken, *s;
255	struct gv_bioq *bq;
256	struct bio *cbp;
257	off_t real_len, real_off;
258
259	if (p == NULL || LIST_EMPTY(&p->subdisks))
260		return (ENXIO);
261
262	gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, NULL, 1);
263
264	/* Find the right subdisk. */
265	broken = NULL;
266	LIST_FOREACH(s, &p->subdisks, in_plex) {
267		if (s->state != GV_SD_UP)
268			broken = s;
269	}
270
271	/* Broken stripe not found. */
272	if (broken == NULL)
273		return (ENXIO);
274
275	switch (broken->state) {
276	case GV_SD_UP:
277		return (EINVAL);
278
279	case GV_SD_STALE:
280		if (!(bp->bio_cflags & GV_BIO_REBUILD))
281			return (ENXIO);
282
283		G_VINUM_DEBUG(1, "sd %s is reviving", broken->name);
284		gv_set_sd_state(broken, GV_SD_REVIVING, GV_SETSTATE_FORCE);
285		/* Set this bit now, but should be set at end. */
286		broken->flags |= GV_SD_CANGOUP;
287		break;
288
289	case GV_SD_REVIVING:
290		break;
291
292	default:
293		/* All other subdisk states mean it's not accessible. */
294		return (ENXIO);
295	}
296
297	wp->length = real_len;
298	wp->data = addr;
299	wp->lockbase = real_off;
300
301	KASSERT(wp->length >= 0, ("gv_rebuild_raid5: wp->length < 0"));
302
303	/* Read all subdisks. */
304	LIST_FOREACH(s, &p->subdisks, in_plex) {
305		/* Skip the broken subdisk. */
306		if (s == broken)
307			continue;
308
309		/* Skip growing subdisks. */
310		if (s->flags & GV_SD_GROW)
311			continue;
312
313		cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1);
314		if (cbp == NULL)
315			return (ENOMEM);
316		cbp->bio_cmd = BIO_READ;
317
318		bioq_insert_tail(p->bqueue, cbp);
319
320		bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
321		bq->bp = cbp;
322		TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
323	}
324
325	/* Write the parity data. */
326	cbp = gv_raid5_clone_bio(bp, broken, wp, NULL, 1);
327	if (cbp == NULL)
328		return (ENOMEM);
329	cbp->bio_cflags |= GV_BIO_REBUILD;
330	wp->parity = cbp;
331
332	p->synced = boff;
333
334	/* Post notification that we're finished. */
335	return (0);
336}
337
338/* Build a request group to perform (part of) a RAID5 request. */
339static int
340gv_raid5_request(struct gv_plex *p, struct gv_raid5_packet *wp,
341    struct bio *bp, caddr_t addr, off_t boff, off_t bcount, int *delay)
342{
343	struct g_geom *gp;
344	struct gv_sd *broken, *original, *parity, *s;
345	struct gv_bioq *bq;
346	struct bio *cbp;
347	int i, psdno, sdno, type, grow;
348	off_t real_len, real_off;
349
350	gp = bp->bio_to->geom;
351
352	if (p == NULL || LIST_EMPTY(&p->subdisks))
353		return (ENXIO);
354
355	/* We are optimistic and assume that this request will be OK. */
356#define	REQ_TYPE_NORMAL		0
357#define	REQ_TYPE_DEGRADED	1
358#define	REQ_TYPE_NOPARITY	2
359
360	type = REQ_TYPE_NORMAL;
361	original = parity = broken = NULL;
362
363	/* XXX: The resize won't crash with rebuild or sync, but we should still
364	 * be aware of it. Also this should perhaps be done on rebuild/check as
365	 * well?
366	 */
367	/* If we're over, we must use the old. */
368	if (boff >= p->synced) {
369		grow = 1;
370	/* Or if over the resized offset, we use all drives. */
371	} else if (boff + bcount <= p->synced) {
372		grow = 0;
373	/* Else, we're in the middle, and must wait a bit. */
374	} else {
375		bioq_disksort(p->rqueue, bp);
376		*delay = 1;
377		return (0);
378	}
379	gv_raid5_offset(p, boff, bcount, &real_off, &real_len,
380	    &sdno, &psdno, grow);
381
382	/* Find the right subdisks. */
383	i = 0;
384	LIST_FOREACH(s, &p->subdisks, in_plex) {
385		if (i == sdno)
386			original = s;
387		if (i == psdno)
388			parity = s;
389		if (s->state != GV_SD_UP)
390			broken = s;
391		i++;
392	}
393
394	if ((original == NULL) || (parity == NULL))
395		return (ENXIO);
396
397	/* Our data stripe is missing. */
398	if (original->state != GV_SD_UP)
399		type = REQ_TYPE_DEGRADED;
400
401	/* If synchronizing request, just write it if disks are stale. */
402	if (original->state == GV_SD_STALE && parity->state == GV_SD_STALE &&
403	    bp->bio_cflags & GV_BIO_SYNCREQ && bp->bio_cmd == BIO_WRITE) {
404		type = REQ_TYPE_NORMAL;
405	/* Our parity stripe is missing. */
406	} else if (parity->state != GV_SD_UP) {
407		/* We cannot take another failure if we're already degraded. */
408		if (type != REQ_TYPE_NORMAL)
409			return (ENXIO);
410		else
411			type = REQ_TYPE_NOPARITY;
412	}
413
414	wp->length = real_len;
415	wp->data = addr;
416	wp->lockbase = real_off;
417
418	KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0"));
419
420	if ((p->flags & GV_PLEX_REBUILDING) && (boff + real_len < p->synced))
421		type = REQ_TYPE_NORMAL;
422
423	if ((p->flags & GV_PLEX_REBUILDING) && (boff + real_len >= p->synced)) {
424		bioq_disksort(p->rqueue, bp);
425		*delay = 1;
426		return (0);
427	}
428
429	switch (bp->bio_cmd) {
430	case BIO_READ:
431		/*
432		 * For a degraded read we need to read in all stripes except
433		 * the broken one plus the parity stripe and then recalculate
434		 * the desired data.
435		 */
436		if (type == REQ_TYPE_DEGRADED) {
437			bzero(wp->data, wp->length);
438			LIST_FOREACH(s, &p->subdisks, in_plex) {
439				/* Skip the broken subdisk. */
440				if (s == broken)
441					continue;
442				/* Skip growing if within offset. */
443				if (grow && s->flags & GV_SD_GROW)
444					continue;
445				cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1);
446				if (cbp == NULL)
447					return (ENOMEM);
448
449				bioq_insert_tail(p->bqueue, cbp);
450
451				bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
452				bq->bp = cbp;
453				TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
454			}
455
456		/* A normal read can be fulfilled with the original subdisk. */
457		} else {
458			cbp = gv_raid5_clone_bio(bp, original, wp, addr, 0);
459			if (cbp == NULL)
460				return (ENOMEM);
461
462			bioq_insert_tail(p->bqueue, cbp);
463		}
464		wp->lockbase = -1;
465
466		break;
467
468	case BIO_WRITE:
469		/*
470		 * A degraded write means we cannot write to the original data
471		 * subdisk.  Thus we need to read in all valid stripes,
472		 * recalculate the parity from the original data, and then
473		 * write the parity stripe back out.
474		 */
475		if (type == REQ_TYPE_DEGRADED) {
476			/* Read all subdisks. */
477			LIST_FOREACH(s, &p->subdisks, in_plex) {
478				/* Skip the broken and the parity subdisk. */
479				if ((s == broken) || (s == parity))
480					continue;
481				/* Skip growing if within offset. */
482				if (grow && s->flags & GV_SD_GROW)
483					continue;
484
485				cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1);
486				if (cbp == NULL)
487					return (ENOMEM);
488				cbp->bio_cmd = BIO_READ;
489
490				bioq_insert_tail(p->bqueue, cbp);
491
492				bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
493				bq->bp = cbp;
494				TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
495			}
496
497			/* Write the parity data. */
498			cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1);
499			if (cbp == NULL)
500				return (ENOMEM);
501			bcopy(addr, cbp->bio_data, wp->length);
502			wp->parity = cbp;
503
504		/*
505		 * When the parity stripe is missing we just write out the data.
506		 */
507		} else if (type == REQ_TYPE_NOPARITY) {
508			cbp = gv_raid5_clone_bio(bp, original, wp, addr, 1);
509			if (cbp == NULL)
510				return (ENOMEM);
511
512			bioq_insert_tail(p->bqueue, cbp);
513
514			bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
515			bq->bp = cbp;
516			TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
517
518		/*
519		 * A normal write request goes to the original subdisk, then we
520		 * read in all other stripes, recalculate the parity and write
521		 * out the parity again.
522		 */
523		} else {
524			/* Read old parity. */
525			cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1);
526			if (cbp == NULL)
527				return (ENOMEM);
528			cbp->bio_cmd = BIO_READ;
529
530			bioq_insert_tail(p->bqueue, cbp);
531
532			bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
533			bq->bp = cbp;
534			TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
535
536			/* Read old data. */
537			cbp = gv_raid5_clone_bio(bp, original, wp, NULL, 1);
538			if (cbp == NULL)
539				return (ENOMEM);
540			cbp->bio_cmd = BIO_READ;
541
542			bioq_insert_tail(p->bqueue, cbp);
543
544			bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
545			bq->bp = cbp;
546			TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
547
548			/* Write new data. */
549			cbp = gv_raid5_clone_bio(bp, original, wp, addr, 1);
550			if (cbp == NULL)
551				return (ENOMEM);
552
553			/*
554			 * We must not write the new data until the old data
555			 * was read, so hold this BIO back until we're ready
556			 * for it.
557			 */
558			wp->waiting = cbp;
559
560			/* The final bio for the parity. */
561			cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1);
562			if (cbp == NULL)
563				return (ENOMEM);
564
565			/* Remember that this is the BIO for the parity data. */
566			wp->parity = cbp;
567		}
568		break;
569
570	default:
571		return (EINVAL);
572	}
573
574	return (0);
575}
576
577/*
578 * Calculate the offsets in the various subdisks for a RAID5 request. Also take
579 * care of new subdisks in an expanded RAID5 array.
580 * XXX: This assumes that the new subdisks are inserted after the others (which
581 * is okay as long as plex_offset is larger). If subdisks are inserted into the
582 * plexlist before, we get problems.
583 */
584static int
585gv_raid5_offset(struct gv_plex *p, off_t boff, off_t bcount, off_t *real_off,
586    off_t *real_len, int *sdno, int *psdno, int growing)
587{
588	struct gv_sd *s;
589	int sd, psd, sdcount;
590	off_t len_left, stripeend, stripeoff, stripestart;
591
592	sdcount = p->sdcount;
593	if (growing) {
594		LIST_FOREACH(s, &p->subdisks, in_plex) {
595			if (s->flags & GV_SD_GROW)
596				sdcount--;
597		}
598	}
599
600	/* The number of the subdisk containing the parity stripe. */
601	psd = sdcount - 1 - ( boff / (p->stripesize * (sdcount - 1))) %
602	    sdcount;
603	KASSERT(psdno >= 0, ("gv_raid5_offset: psdno < 0"));
604
605	/* Offset of the start address from the start of the stripe. */
606	stripeoff = boff % (p->stripesize * (sdcount - 1));
607	KASSERT(stripeoff >= 0, ("gv_raid5_offset: stripeoff < 0"));
608
609	/* The number of the subdisk where the stripe resides. */
610	sd = stripeoff / p->stripesize;
611	KASSERT(sdno >= 0, ("gv_raid5_offset: sdno < 0"));
612
613	/* At or past parity subdisk. */
614	if (sd >= psd)
615		sd++;
616
617	/* The offset of the stripe on this subdisk. */
618	stripestart = (boff - stripeoff) / (sdcount - 1);
619	KASSERT(stripestart >= 0, ("gv_raid5_offset: stripestart < 0"));
620
621	stripeoff %= p->stripesize;
622
623	/* The offset of the request on this subdisk. */
624	*real_off = stripestart + stripeoff;
625
626	stripeend = stripestart + p->stripesize;
627	len_left = stripeend - *real_off;
628	KASSERT(len_left >= 0, ("gv_raid5_offset: len_left < 0"));
629
630	*real_len = (bcount <= len_left) ? bcount : len_left;
631
632	if (sdno != NULL)
633		*sdno = sd;
634	if (psdno != NULL)
635		*psdno = psd;
636
637	return (0);
638}
639
640static struct bio *
641gv_raid5_clone_bio(struct bio *bp, struct gv_sd *s, struct gv_raid5_packet *wp,
642    caddr_t addr, int use_wp)
643{
644	struct bio *cbp;
645
646	cbp = g_clone_bio(bp);
647	if (cbp == NULL)
648		return (NULL);
649	if (addr == NULL) {
650		cbp->bio_data = g_malloc(wp->length, M_WAITOK | M_ZERO);
651		cbp->bio_cflags |= GV_BIO_MALLOC;
652	} else
653		cbp->bio_data = addr;
654	cbp->bio_offset = wp->lockbase + s->drive_offset;
655	cbp->bio_length = wp->length;
656	cbp->bio_done = gv_done;
657	cbp->bio_caller1 = s;
658	if (use_wp)
659		cbp->bio_caller2 = wp;
660
661	return (cbp);
662}
663