1/*-
2 * Copyright (c) 2004, 2007 Lukas Ertl
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27#include <sys/cdefs.h>
28__FBSDID("$FreeBSD: stable/11/sys/geom/vinum/geom_vinum_raid5.c 356577 2020-01-10 00:42:05Z mav $");
29
30#include <sys/param.h>
31#include <sys/bio.h>
32#include <sys/lock.h>
33#include <sys/malloc.h>
34#include <sys/systm.h>
35
36#include <geom/geom.h>
37#include <geom/vinum/geom_vinum_var.h>
38#include <geom/vinum/geom_vinum_raid5.h>
39#include <geom/vinum/geom_vinum.h>
40
41static int		gv_raid5_offset(struct gv_plex *, off_t, off_t,
42			    off_t *, off_t *, int *, int *, int);
43static struct bio *	gv_raid5_clone_bio(struct bio *, struct gv_sd *,
44			    struct gv_raid5_packet *, caddr_t, int);
45static int	gv_raid5_request(struct gv_plex *, struct gv_raid5_packet *,
46		    struct bio *, caddr_t, off_t, off_t, int *);
47static int	gv_raid5_check(struct gv_plex *, struct gv_raid5_packet *,
48		    struct bio *, caddr_t, off_t, off_t);
49static int	gv_raid5_rebuild(struct gv_plex *, struct gv_raid5_packet *,
50		    struct bio *, caddr_t, off_t, off_t);
51
52struct gv_raid5_packet *
53gv_raid5_start(struct gv_plex *p, struct bio *bp, caddr_t addr, off_t boff,
54    off_t bcount)
55{
56	struct bio *cbp;
57	struct gv_raid5_packet *wp, *wp2;
58	struct gv_bioq *bq, *bq2;
59	int err, delay;
60
61	delay = 0;
62	wp = g_malloc(sizeof(*wp), M_WAITOK | M_ZERO);
63	wp->bio = bp;
64	wp->waiting = NULL;
65	wp->parity = NULL;
66	TAILQ_INIT(&wp->bits);
67
68	if (bp->bio_pflags & GV_BIO_REBUILD)
69		err = gv_raid5_rebuild(p, wp, bp, addr, boff, bcount);
70	else if (bp->bio_pflags & GV_BIO_CHECK)
71		err = gv_raid5_check(p, wp, bp, addr, boff, bcount);
72	else
73		err = gv_raid5_request(p, wp, bp, addr, boff, bcount, &delay);
74
75	/* Means we have a delayed request. */
76	if (delay) {
77		g_free(wp);
78		return (NULL);
79	}
80
81	/*
82	 * Building the sub-request failed, we probably need to clean up a lot.
83	 */
84	if (err) {
85		G_VINUM_LOGREQ(0, bp, "raid5 plex request failed.");
86		TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) {
87			TAILQ_REMOVE(&wp->bits, bq, queue);
88			g_free(bq);
89		}
90		if (wp->waiting != NULL) {
91			if (wp->waiting->bio_cflags & GV_BIO_MALLOC)
92				g_free(wp->waiting->bio_data);
93			gv_drive_done(wp->waiting->bio_caller1);
94			g_destroy_bio(wp->waiting);
95		}
96		if (wp->parity != NULL) {
97			if (wp->parity->bio_cflags & GV_BIO_MALLOC)
98				g_free(wp->parity->bio_data);
99			gv_drive_done(wp->parity->bio_caller1);
100			g_destroy_bio(wp->parity);
101		}
102		g_free(wp);
103
104		TAILQ_FOREACH_SAFE(wp, &p->packets, list, wp2) {
105			if (wp->bio != bp)
106				continue;
107
108			TAILQ_REMOVE(&p->packets, wp, list);
109			TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) {
110				TAILQ_REMOVE(&wp->bits, bq, queue);
111				g_free(bq);
112			}
113			g_free(wp);
114		}
115
116		cbp = bioq_takefirst(p->bqueue);
117		while (cbp != NULL) {
118			if (cbp->bio_cflags & GV_BIO_MALLOC)
119				g_free(cbp->bio_data);
120			gv_drive_done(cbp->bio_caller1);
121			g_destroy_bio(cbp);
122			cbp = bioq_takefirst(p->bqueue);
123		}
124
125		/* If internal, stop and reset state. */
126		if (bp->bio_pflags & GV_BIO_INTERNAL) {
127			if (bp->bio_pflags & GV_BIO_MALLOC)
128				g_free(bp->bio_data);
129			g_destroy_bio(bp);
130			/* Reset flags. */
131			p->flags &= ~(GV_PLEX_SYNCING | GV_PLEX_REBUILDING |
132			    GV_PLEX_GROWING);
133			return (NULL);
134		}
135		g_io_deliver(bp, err);
136		return (NULL);
137	}
138
139	return (wp);
140}
141
142/*
143 * Check if the stripe that the work packet wants is already being used by
144 * some other work packet.
145 */
146int
147gv_stripe_active(struct gv_plex *p, struct bio *bp)
148{
149	struct gv_raid5_packet *wp, *owp;
150	int overlap;
151
152	wp = bp->bio_caller2;
153	if (wp->lockbase == -1)
154		return (0);
155
156	overlap = 0;
157	TAILQ_FOREACH(owp, &p->packets, list) {
158		if (owp == wp)
159			break;
160		if ((wp->lockbase >= owp->lockbase) &&
161		    (wp->lockbase <= owp->lockbase + owp->length)) {
162			overlap++;
163			break;
164		}
165		if ((wp->lockbase <= owp->lockbase) &&
166		    (wp->lockbase + wp->length >= owp->lockbase)) {
167			overlap++;
168			break;
169		}
170	}
171
172	return (overlap);
173}
174
175static int
176gv_raid5_check(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp,
177    caddr_t addr, off_t boff, off_t bcount)
178{
179	struct gv_sd *parity, *s;
180	struct gv_bioq *bq;
181	struct bio *cbp;
182	int i, psdno;
183	off_t real_len, real_off;
184
185	if (p == NULL || LIST_EMPTY(&p->subdisks))
186		return (ENXIO);
187
188	gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, &psdno, 1);
189
190	/* Find the right subdisk. */
191	parity = NULL;
192	i = 0;
193	LIST_FOREACH(s, &p->subdisks, in_plex) {
194		if (i == psdno) {
195			parity = s;
196			break;
197		}
198		i++;
199	}
200
201	/* Parity stripe not found. */
202	if (parity == NULL)
203		return (ENXIO);
204
205	if (parity->state != GV_SD_UP)
206		return (ENXIO);
207
208	wp->length = real_len;
209	wp->data = addr;
210	wp->lockbase = real_off;
211
212	/* Read all subdisks. */
213	LIST_FOREACH(s, &p->subdisks, in_plex) {
214		/* Skip the parity subdisk. */
215		if (s == parity)
216			continue;
217		/* Skip growing subdisks. */
218		if (s->flags & GV_SD_GROW)
219			continue;
220
221		cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1);
222		if (cbp == NULL)
223			return (ENOMEM);
224		cbp->bio_cmd = BIO_READ;
225
226		bioq_insert_tail(p->bqueue, cbp);
227
228		bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
229		bq->bp = cbp;
230		TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
231	}
232
233	/* Read the parity data. */
234	cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1);
235	if (cbp == NULL)
236		return (ENOMEM);
237	cbp->bio_cmd = BIO_READ;
238	wp->waiting = cbp;
239
240	/*
241	 * In case we want to rebuild the parity, create an extra BIO to write
242	 * it out.  It also acts as buffer for the XOR operations.
243	 */
244	cbp = gv_raid5_clone_bio(bp, parity, wp, addr, 1);
245	if (cbp == NULL)
246		return (ENOMEM);
247	wp->parity = cbp;
248
249	return (0);
250}
251
252/* Rebuild a degraded RAID5 plex. */
253static int
254gv_raid5_rebuild(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp,
255    caddr_t addr, off_t boff, off_t bcount)
256{
257	struct gv_sd *broken, *s;
258	struct gv_bioq *bq;
259	struct bio *cbp;
260	off_t real_len, real_off;
261
262	if (p == NULL || LIST_EMPTY(&p->subdisks))
263		return (ENXIO);
264
265	gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, NULL, 1);
266
267	/* Find the right subdisk. */
268	broken = NULL;
269	LIST_FOREACH(s, &p->subdisks, in_plex) {
270		if (s->state != GV_SD_UP)
271			broken = s;
272	}
273
274	/* Broken stripe not found. */
275	if (broken == NULL)
276		return (ENXIO);
277
278	switch (broken->state) {
279	case GV_SD_UP:
280		return (EINVAL);
281
282	case GV_SD_STALE:
283		if (!(bp->bio_pflags & GV_BIO_REBUILD))
284			return (ENXIO);
285
286		G_VINUM_DEBUG(1, "sd %s is reviving", broken->name);
287		gv_set_sd_state(broken, GV_SD_REVIVING, GV_SETSTATE_FORCE);
288		/* Set this bit now, but should be set at end. */
289		broken->flags |= GV_SD_CANGOUP;
290		break;
291
292	case GV_SD_REVIVING:
293		break;
294
295	default:
296		/* All other subdisk states mean it's not accessible. */
297		return (ENXIO);
298	}
299
300	wp->length = real_len;
301	wp->data = addr;
302	wp->lockbase = real_off;
303
304	KASSERT(wp->length >= 0, ("gv_rebuild_raid5: wp->length < 0"));
305
306	/* Read all subdisks. */
307	LIST_FOREACH(s, &p->subdisks, in_plex) {
308		/* Skip the broken subdisk. */
309		if (s == broken)
310			continue;
311
312		/* Skip growing subdisks. */
313		if (s->flags & GV_SD_GROW)
314			continue;
315
316		cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1);
317		if (cbp == NULL)
318			return (ENOMEM);
319		cbp->bio_cmd = BIO_READ;
320
321		bioq_insert_tail(p->bqueue, cbp);
322
323		bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
324		bq->bp = cbp;
325		TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
326	}
327
328	/* Write the parity data. */
329	cbp = gv_raid5_clone_bio(bp, broken, wp, NULL, 1);
330	if (cbp == NULL)
331		return (ENOMEM);
332	wp->parity = cbp;
333
334	p->synced = boff;
335
336	/* Post notification that we're finished. */
337	return (0);
338}
339
340/* Build a request group to perform (part of) a RAID5 request. */
341static int
342gv_raid5_request(struct gv_plex *p, struct gv_raid5_packet *wp,
343    struct bio *bp, caddr_t addr, off_t boff, off_t bcount, int *delay)
344{
345	struct g_geom *gp;
346	struct gv_sd *broken, *original, *parity, *s;
347	struct gv_bioq *bq;
348	struct bio *cbp;
349	int i, psdno, sdno, type, grow;
350	off_t real_len, real_off;
351
352	gp = bp->bio_to->geom;
353
354	if (p == NULL || LIST_EMPTY(&p->subdisks))
355		return (ENXIO);
356
357	/* We are optimistic and assume that this request will be OK. */
358#define	REQ_TYPE_NORMAL		0
359#define	REQ_TYPE_DEGRADED	1
360#define	REQ_TYPE_NOPARITY	2
361
362	type = REQ_TYPE_NORMAL;
363	original = parity = broken = NULL;
364
365	/* XXX: The resize won't crash with rebuild or sync, but we should still
366	 * be aware of it. Also this should perhaps be done on rebuild/check as
367	 * well?
368	 */
369	/* If we're over, we must use the old. */
370	if (boff >= p->synced) {
371		grow = 1;
372	/* Or if over the resized offset, we use all drives. */
373	} else if (boff + bcount <= p->synced) {
374		grow = 0;
375	/* Else, we're in the middle, and must wait a bit. */
376	} else {
377		bioq_disksort(p->rqueue, bp);
378		*delay = 1;
379		return (0);
380	}
381	gv_raid5_offset(p, boff, bcount, &real_off, &real_len,
382	    &sdno, &psdno, grow);
383
384	/* Find the right subdisks. */
385	i = 0;
386	LIST_FOREACH(s, &p->subdisks, in_plex) {
387		if (i == sdno)
388			original = s;
389		if (i == psdno)
390			parity = s;
391		if (s->state != GV_SD_UP)
392			broken = s;
393		i++;
394	}
395
396	if ((original == NULL) || (parity == NULL))
397		return (ENXIO);
398
399	/* Our data stripe is missing. */
400	if (original->state != GV_SD_UP)
401		type = REQ_TYPE_DEGRADED;
402
403	/* If synchronizing request, just write it if disks are stale. */
404	if (original->state == GV_SD_STALE && parity->state == GV_SD_STALE &&
405	    bp->bio_pflags & GV_BIO_SYNCREQ && bp->bio_cmd == BIO_WRITE) {
406		type = REQ_TYPE_NORMAL;
407	/* Our parity stripe is missing. */
408	} else if (parity->state != GV_SD_UP) {
409		/* We cannot take another failure if we're already degraded. */
410		if (type != REQ_TYPE_NORMAL)
411			return (ENXIO);
412		else
413			type = REQ_TYPE_NOPARITY;
414	}
415
416	wp->length = real_len;
417	wp->data = addr;
418	wp->lockbase = real_off;
419
420	KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0"));
421
422	if ((p->flags & GV_PLEX_REBUILDING) && (boff + real_len < p->synced))
423		type = REQ_TYPE_NORMAL;
424
425	if ((p->flags & GV_PLEX_REBUILDING) && (boff + real_len >= p->synced)) {
426		bioq_disksort(p->rqueue, bp);
427		*delay = 1;
428		return (0);
429	}
430
431	switch (bp->bio_cmd) {
432	case BIO_READ:
433		/*
434		 * For a degraded read we need to read in all stripes except
435		 * the broken one plus the parity stripe and then recalculate
436		 * the desired data.
437		 */
438		if (type == REQ_TYPE_DEGRADED) {
439			bzero(wp->data, wp->length);
440			LIST_FOREACH(s, &p->subdisks, in_plex) {
441				/* Skip the broken subdisk. */
442				if (s == broken)
443					continue;
444				/* Skip growing if within offset. */
445				if (grow && s->flags & GV_SD_GROW)
446					continue;
447				cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1);
448				if (cbp == NULL)
449					return (ENOMEM);
450
451				bioq_insert_tail(p->bqueue, cbp);
452
453				bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
454				bq->bp = cbp;
455				TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
456			}
457
458		/* A normal read can be fulfilled with the original subdisk. */
459		} else {
460			cbp = gv_raid5_clone_bio(bp, original, wp, addr, 0);
461			if (cbp == NULL)
462				return (ENOMEM);
463
464			bioq_insert_tail(p->bqueue, cbp);
465		}
466		wp->lockbase = -1;
467
468		break;
469
470	case BIO_WRITE:
471		/*
472		 * A degraded write means we cannot write to the original data
473		 * subdisk.  Thus we need to read in all valid stripes,
474		 * recalculate the parity from the original data, and then
475		 * write the parity stripe back out.
476		 */
477		if (type == REQ_TYPE_DEGRADED) {
478			/* Read all subdisks. */
479			LIST_FOREACH(s, &p->subdisks, in_plex) {
480				/* Skip the broken and the parity subdisk. */
481				if ((s == broken) || (s == parity))
482					continue;
483				/* Skip growing if within offset. */
484				if (grow && s->flags & GV_SD_GROW)
485					continue;
486
487				cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1);
488				if (cbp == NULL)
489					return (ENOMEM);
490				cbp->bio_cmd = BIO_READ;
491
492				bioq_insert_tail(p->bqueue, cbp);
493
494				bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
495				bq->bp = cbp;
496				TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
497			}
498
499			/* Write the parity data. */
500			cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1);
501			if (cbp == NULL)
502				return (ENOMEM);
503			bcopy(addr, cbp->bio_data, wp->length);
504			wp->parity = cbp;
505
506		/*
507		 * When the parity stripe is missing we just write out the data.
508		 */
509		} else if (type == REQ_TYPE_NOPARITY) {
510			cbp = gv_raid5_clone_bio(bp, original, wp, addr, 1);
511			if (cbp == NULL)
512				return (ENOMEM);
513
514			bioq_insert_tail(p->bqueue, cbp);
515
516			bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
517			bq->bp = cbp;
518			TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
519
520		/*
521		 * A normal write request goes to the original subdisk, then we
522		 * read in all other stripes, recalculate the parity and write
523		 * out the parity again.
524		 */
525		} else {
526			/* Read old parity. */
527			cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1);
528			if (cbp == NULL)
529				return (ENOMEM);
530			cbp->bio_cmd = BIO_READ;
531
532			bioq_insert_tail(p->bqueue, cbp);
533
534			bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
535			bq->bp = cbp;
536			TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
537
538			/* Read old data. */
539			cbp = gv_raid5_clone_bio(bp, original, wp, NULL, 1);
540			if (cbp == NULL)
541				return (ENOMEM);
542			cbp->bio_cmd = BIO_READ;
543
544			bioq_insert_tail(p->bqueue, cbp);
545
546			bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
547			bq->bp = cbp;
548			TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
549
550			/* Write new data. */
551			cbp = gv_raid5_clone_bio(bp, original, wp, addr, 1);
552			if (cbp == NULL)
553				return (ENOMEM);
554
555			/*
556			 * We must not write the new data until the old data
557			 * was read, so hold this BIO back until we're ready
558			 * for it.
559			 */
560			wp->waiting = cbp;
561
562			/* The final bio for the parity. */
563			cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1);
564			if (cbp == NULL)
565				return (ENOMEM);
566
567			/* Remember that this is the BIO for the parity data. */
568			wp->parity = cbp;
569		}
570		break;
571
572	default:
573		return (EINVAL);
574	}
575
576	return (0);
577}
578
579/*
580 * Calculate the offsets in the various subdisks for a RAID5 request. Also take
581 * care of new subdisks in an expanded RAID5 array.
582 * XXX: This assumes that the new subdisks are inserted after the others (which
583 * is okay as long as plex_offset is larger). If subdisks are inserted into the
584 * plexlist before, we get problems.
585 */
586static int
587gv_raid5_offset(struct gv_plex *p, off_t boff, off_t bcount, off_t *real_off,
588    off_t *real_len, int *sdno, int *psdno, int growing)
589{
590	struct gv_sd *s;
591	int sd, psd, sdcount;
592	off_t len_left, stripeend, stripeoff, stripestart;
593
594	sdcount = p->sdcount;
595	if (growing) {
596		LIST_FOREACH(s, &p->subdisks, in_plex) {
597			if (s->flags & GV_SD_GROW)
598				sdcount--;
599		}
600	}
601
602	/* The number of the subdisk containing the parity stripe. */
603	psd = sdcount - 1 - ( boff / (p->stripesize * (sdcount - 1))) %
604	    sdcount;
605	KASSERT(psdno >= 0, ("gv_raid5_offset: psdno < 0"));
606
607	/* Offset of the start address from the start of the stripe. */
608	stripeoff = boff % (p->stripesize * (sdcount - 1));
609	KASSERT(stripeoff >= 0, ("gv_raid5_offset: stripeoff < 0"));
610
611	/* The number of the subdisk where the stripe resides. */
612	sd = stripeoff / p->stripesize;
613	KASSERT(sdno >= 0, ("gv_raid5_offset: sdno < 0"));
614
615	/* At or past parity subdisk. */
616	if (sd >= psd)
617		sd++;
618
619	/* The offset of the stripe on this subdisk. */
620	stripestart = (boff - stripeoff) / (sdcount - 1);
621	KASSERT(stripestart >= 0, ("gv_raid5_offset: stripestart < 0"));
622
623	stripeoff %= p->stripesize;
624
625	/* The offset of the request on this subdisk. */
626	*real_off = stripestart + stripeoff;
627
628	stripeend = stripestart + p->stripesize;
629	len_left = stripeend - *real_off;
630	KASSERT(len_left >= 0, ("gv_raid5_offset: len_left < 0"));
631
632	*real_len = (bcount <= len_left) ? bcount : len_left;
633
634	if (sdno != NULL)
635		*sdno = sd;
636	if (psdno != NULL)
637		*psdno = psd;
638
639	return (0);
640}
641
642static struct bio *
643gv_raid5_clone_bio(struct bio *bp, struct gv_sd *s, struct gv_raid5_packet *wp,
644    caddr_t addr, int use_wp)
645{
646	struct bio *cbp;
647
648	cbp = g_clone_bio(bp);
649	if (cbp == NULL)
650		return (NULL);
651	if (addr == NULL) {
652		cbp->bio_data = g_malloc(wp->length, M_WAITOK | M_ZERO);
653		cbp->bio_cflags |= GV_BIO_MALLOC;
654	} else
655		cbp->bio_data = addr;
656	cbp->bio_offset = wp->lockbase + s->drive_offset;
657	cbp->bio_length = wp->length;
658	cbp->bio_done = gv_done;
659	cbp->bio_caller1 = s;
660	s->drive_sc->active++;
661	if (use_wp)
662		cbp->bio_caller2 = wp;
663
664	return (cbp);
665}
666