1/*-
2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 *
4 * Copyright (c) 2004, 2007 Lukas Ertl
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29#include <sys/cdefs.h>
30__FBSDID("$FreeBSD$");
31
32#include <sys/param.h>
33#include <sys/bio.h>
34#include <sys/lock.h>
35#include <sys/malloc.h>
36#include <sys/systm.h>
37
38#include <geom/geom.h>
39#include <geom/vinum/geom_vinum_var.h>
40#include <geom/vinum/geom_vinum_raid5.h>
41#include <geom/vinum/geom_vinum.h>
42
/* Local helpers that build the per-stripe sub-requests for a RAID5 plex. */
static int		gv_raid5_offset(struct gv_plex *, off_t, off_t,
			    off_t *, off_t *, int *, int *, int);
static struct bio *	gv_raid5_clone_bio(struct bio *, struct gv_sd *,
			    struct gv_raid5_packet *, caddr_t, int);
static int	gv_raid5_request(struct gv_plex *, struct gv_raid5_packet *,
		    struct bio *, caddr_t, off_t, off_t, int *);
static int	gv_raid5_check(struct gv_plex *, struct gv_raid5_packet *,
		    struct bio *, caddr_t, off_t, off_t);
static int	gv_raid5_rebuild(struct gv_plex *, struct gv_raid5_packet *,
		    struct bio *, caddr_t, off_t, off_t);
53
/*
 * Start a RAID5 plex request: allocate a work packet, build its set of
 * sub-requests (normal, rebuild or parity-check, depending on the parent
 * bio's flags) and return the packet to the caller.  Returns NULL when
 * the request was delayed (re-queued on the plex) or when building the
 * sub-requests failed; in the failure case all partially-built state is
 * torn down here before returning.
 */
struct gv_raid5_packet *
gv_raid5_start(struct gv_plex *p, struct bio *bp, caddr_t addr, off_t boff,
    off_t bcount)
{
	struct bio *cbp;
	struct gv_raid5_packet *wp, *wp2;
	struct gv_bioq *bq, *bq2;
	int err, delay;

	delay = 0;
	wp = g_malloc(sizeof(*wp), M_WAITOK | M_ZERO);
	wp->bio = bp;
	wp->waiting = NULL;
	wp->parity = NULL;
	TAILQ_INIT(&wp->bits);

	/* Dispatch on the request type carried in the parent bio's flags. */
	if (bp->bio_pflags & GV_BIO_REBUILD)
		err = gv_raid5_rebuild(p, wp, bp, addr, boff, bcount);
	else if (bp->bio_pflags & GV_BIO_CHECK)
		err = gv_raid5_check(p, wp, bp, addr, boff, bcount);
	else
		err = gv_raid5_request(p, wp, bp, addr, boff, bcount, &delay);

	/* Means we have a delayed request. */
	if (delay) {
		g_free(wp);
		return (NULL);
	}

	/*
	 * Building the sub-request failed, we probably need to clean up a lot.
	 */
	if (err) {
		G_VINUM_LOGREQ(0, bp, "raid5 plex request failed.");
		/* Drop the tracking entries for this packet's sub-BIOs. */
		TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) {
			TAILQ_REMOVE(&wp->bits, bq, queue);
			g_free(bq);
		}
		/* Release the held-back write BIO, if one was built. */
		if (wp->waiting != NULL) {
			if (wp->waiting->bio_cflags & GV_BIO_MALLOC)
				g_free(wp->waiting->bio_data);
			gv_drive_done(wp->waiting->bio_caller1);
			g_destroy_bio(wp->waiting);
		}
		/* Release the parity BIO, if one was built. */
		if (wp->parity != NULL) {
			if (wp->parity->bio_cflags & GV_BIO_MALLOC)
				g_free(wp->parity->bio_data);
			gv_drive_done(wp->parity->bio_caller1);
			g_destroy_bio(wp->parity);
		}
		g_free(wp);

		/* Also tear down earlier packets queued for the same bio. */
		TAILQ_FOREACH_SAFE(wp, &p->packets, list, wp2) {
			if (wp->bio != bp)
				continue;

			TAILQ_REMOVE(&p->packets, wp, list);
			TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) {
				TAILQ_REMOVE(&wp->bits, bq, queue);
				g_free(bq);
			}
			g_free(wp);
		}

		/* Drain and destroy all cloned BIOs waiting for dispatch. */
		cbp = bioq_takefirst(p->bqueue);
		while (cbp != NULL) {
			if (cbp->bio_cflags & GV_BIO_MALLOC)
				g_free(cbp->bio_data);
			gv_drive_done(cbp->bio_caller1);
			g_destroy_bio(cbp);
			cbp = bioq_takefirst(p->bqueue);
		}

		/* If internal, stop and reset state. */
		if (bp->bio_pflags & GV_BIO_INTERNAL) {
			if (bp->bio_pflags & GV_BIO_MALLOC)
				g_free(bp->bio_data);
			g_destroy_bio(bp);
			/* Reset flags. */
			p->flags &= ~(GV_PLEX_SYNCING | GV_PLEX_REBUILDING |
			    GV_PLEX_GROWING);
			return (NULL);
		}
		/* External request: complete it with the error. */
		g_io_deliver(bp, err);
		return (NULL);
	}

	return (wp);
}
143
144/*
145 * Check if the stripe that the work packet wants is already being used by
146 * some other work packet.
147 */
148int
149gv_stripe_active(struct gv_plex *p, struct bio *bp)
150{
151	struct gv_raid5_packet *wp, *owp;
152	int overlap;
153
154	wp = bp->bio_caller2;
155	if (wp->lockbase == -1)
156		return (0);
157
158	overlap = 0;
159	TAILQ_FOREACH(owp, &p->packets, list) {
160		if (owp == wp)
161			break;
162		if ((wp->lockbase >= owp->lockbase) &&
163		    (wp->lockbase <= owp->lockbase + owp->length)) {
164			overlap++;
165			break;
166		}
167		if ((wp->lockbase <= owp->lockbase) &&
168		    (wp->lockbase + wp->length >= owp->lockbase)) {
169			overlap++;
170			break;
171		}
172	}
173
174	return (overlap);
175}
176
/*
 * Build the sub-requests for a parity check of one stripe: read every
 * data subdisk and the current parity, and set up a spare parity BIO
 * that acts as the buffer for the XOR comparison (and as the write-back
 * BIO in case the caller wants the parity rebuilt).
 */
static int
gv_raid5_check(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp,
    caddr_t addr, off_t boff, off_t bcount)
{
	struct gv_sd *parity, *s;
	struct gv_bioq *bq;
	struct bio *cbp;
	int i, psdno;
	off_t real_len, real_off;

	if (p == NULL || LIST_EMPTY(&p->subdisks))
		return (ENXIO);

	/* Only the parity subdisk index is needed here (sdno is NULL). */
	gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, &psdno, 1);

	/* Find the right subdisk. */
	parity = NULL;
	i = 0;
	LIST_FOREACH(s, &p->subdisks, in_plex) {
		if (i == psdno) {
			parity = s;
			break;
		}
		i++;
	}

	/* Parity stripe not found. */
	if (parity == NULL)
		return (ENXIO);

	/* Checking parity is pointless unless the parity subdisk is up. */
	if (parity->state != GV_SD_UP)
		return (ENXIO);

	wp->length = real_len;
	wp->data = addr;
	wp->lockbase = real_off;

	/* Read all subdisks. */
	LIST_FOREACH(s, &p->subdisks, in_plex) {
		/* Skip the parity subdisk. */
		if (s == parity)
			continue;
		/* Skip growing subdisks. */
		if (s->flags & GV_SD_GROW)
			continue;

		cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1);
		if (cbp == NULL)
			return (ENOMEM);
		cbp->bio_cmd = BIO_READ;

		bioq_insert_tail(p->bqueue, cbp);

		/* Track the sub-BIO in the packet for later cleanup. */
		bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
		bq->bp = cbp;
		TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
	}

	/* Read the parity data. */
	cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1);
	if (cbp == NULL)
		return (ENOMEM);
	cbp->bio_cmd = BIO_READ;
	/* Held back until the data reads have completed. */
	wp->waiting = cbp;

	/*
	 * In case we want to rebuild the parity, create an extra BIO to write
	 * it out.  It also acts as buffer for the XOR operations.
	 */
	cbp = gv_raid5_clone_bio(bp, parity, wp, addr, 1);
	if (cbp == NULL)
		return (ENOMEM);
	wp->parity = cbp;

	return (0);
}
253
/*
 * Rebuild a degraded RAID5 plex: build the sub-requests that read all
 * healthy subdisks for one stripe and write the reconstructed data back
 * to the broken subdisk (carried in wp->parity, which also serves as the
 * XOR buffer).
 */
static int
gv_raid5_rebuild(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp,
    caddr_t addr, off_t boff, off_t bcount)
{
	struct gv_sd *broken, *s;
	struct gv_bioq *bq;
	struct bio *cbp;
	off_t real_len, real_off;

	if (p == NULL || LIST_EMPTY(&p->subdisks))
		return (ENXIO);

	/* Neither subdisk index is needed here, only offset and length. */
	gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, NULL, 1);

	/* Find the right subdisk. */
	broken = NULL;
	LIST_FOREACH(s, &p->subdisks, in_plex) {
		if (s->state != GV_SD_UP)
			broken = s;
	}

	/* Broken stripe not found. */
	if (broken == NULL)
		return (ENXIO);

	switch (broken->state) {
	case GV_SD_UP:
		/* Cannot happen: broken was picked because state != GV_SD_UP. */
		return (EINVAL);

	case GV_SD_STALE:
		if (!(bp->bio_pflags & GV_BIO_REBUILD))
			return (ENXIO);

		G_VINUM_DEBUG(1, "sd %s is reviving", broken->name);
		gv_set_sd_state(broken, GV_SD_REVIVING, GV_SETSTATE_FORCE);
		/* Set this bit now, but should be set at end. */
		broken->flags |= GV_SD_CANGOUP;
		break;

	case GV_SD_REVIVING:
		break;

	default:
		/* All other subdisk states mean it's not accessible. */
		return (ENXIO);
	}

	wp->length = real_len;
	wp->data = addr;
	wp->lockbase = real_off;

	KASSERT(wp->length >= 0, ("gv_rebuild_raid5: wp->length < 0"));

	/* Read all subdisks. */
	LIST_FOREACH(s, &p->subdisks, in_plex) {
		/* Skip the broken subdisk. */
		if (s == broken)
			continue;

		/* Skip growing subdisks. */
		if (s->flags & GV_SD_GROW)
			continue;

		cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1);
		if (cbp == NULL)
			return (ENOMEM);
		cbp->bio_cmd = BIO_READ;

		bioq_insert_tail(p->bqueue, cbp);

		/* Track the sub-BIO in the packet for later cleanup. */
		bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
		bq->bp = cbp;
		TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
	}

	/* Write the reconstructed data to the broken subdisk. */
	cbp = gv_raid5_clone_bio(bp, broken, wp, NULL, 1);
	if (cbp == NULL)
		return (ENOMEM);
	wp->parity = cbp;

	/* Record how far the rebuild has progressed. */
	p->synced = boff;

	/* Post notification that we're finished. */
	return (0);
}
341
342/* Build a request group to perform (part of) a RAID5 request. */
343static int
344gv_raid5_request(struct gv_plex *p, struct gv_raid5_packet *wp,
345    struct bio *bp, caddr_t addr, off_t boff, off_t bcount, int *delay)
346{
347	struct g_geom *gp;
348	struct gv_sd *broken, *original, *parity, *s;
349	struct gv_bioq *bq;
350	struct bio *cbp;
351	int i, psdno, sdno, type, grow;
352	off_t real_len, real_off;
353
354	gp = bp->bio_to->geom;
355
356	if (p == NULL || LIST_EMPTY(&p->subdisks))
357		return (ENXIO);
358
359	/* We are optimistic and assume that this request will be OK. */
360#define	REQ_TYPE_NORMAL		0
361#define	REQ_TYPE_DEGRADED	1
362#define	REQ_TYPE_NOPARITY	2
363
364	type = REQ_TYPE_NORMAL;
365	original = parity = broken = NULL;
366
367	/* XXX: The resize won't crash with rebuild or sync, but we should still
368	 * be aware of it. Also this should perhaps be done on rebuild/check as
369	 * well?
370	 */
371	/* If we're over, we must use the old. */
372	if (boff >= p->synced) {
373		grow = 1;
374	/* Or if over the resized offset, we use all drives. */
375	} else if (boff + bcount <= p->synced) {
376		grow = 0;
377	/* Else, we're in the middle, and must wait a bit. */
378	} else {
379		bioq_disksort(p->rqueue, bp);
380		*delay = 1;
381		return (0);
382	}
383	gv_raid5_offset(p, boff, bcount, &real_off, &real_len,
384	    &sdno, &psdno, grow);
385
386	/* Find the right subdisks. */
387	i = 0;
388	LIST_FOREACH(s, &p->subdisks, in_plex) {
389		if (i == sdno)
390			original = s;
391		if (i == psdno)
392			parity = s;
393		if (s->state != GV_SD_UP)
394			broken = s;
395		i++;
396	}
397
398	if ((original == NULL) || (parity == NULL))
399		return (ENXIO);
400
401	/* Our data stripe is missing. */
402	if (original->state != GV_SD_UP)
403		type = REQ_TYPE_DEGRADED;
404
405	/* If synchronizing request, just write it if disks are stale. */
406	if (original->state == GV_SD_STALE && parity->state == GV_SD_STALE &&
407	    bp->bio_pflags & GV_BIO_SYNCREQ && bp->bio_cmd == BIO_WRITE) {
408		type = REQ_TYPE_NORMAL;
409	/* Our parity stripe is missing. */
410	} else if (parity->state != GV_SD_UP) {
411		/* We cannot take another failure if we're already degraded. */
412		if (type != REQ_TYPE_NORMAL)
413			return (ENXIO);
414		else
415			type = REQ_TYPE_NOPARITY;
416	}
417
418	wp->length = real_len;
419	wp->data = addr;
420	wp->lockbase = real_off;
421
422	KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0"));
423
424	if ((p->flags & GV_PLEX_REBUILDING) && (boff + real_len < p->synced))
425		type = REQ_TYPE_NORMAL;
426
427	if ((p->flags & GV_PLEX_REBUILDING) && (boff + real_len >= p->synced)) {
428		bioq_disksort(p->rqueue, bp);
429		*delay = 1;
430		return (0);
431	}
432
433	switch (bp->bio_cmd) {
434	case BIO_READ:
435		/*
436		 * For a degraded read we need to read in all stripes except
437		 * the broken one plus the parity stripe and then recalculate
438		 * the desired data.
439		 */
440		if (type == REQ_TYPE_DEGRADED) {
441			bzero(wp->data, wp->length);
442			LIST_FOREACH(s, &p->subdisks, in_plex) {
443				/* Skip the broken subdisk. */
444				if (s == broken)
445					continue;
446				/* Skip growing if within offset. */
447				if (grow && s->flags & GV_SD_GROW)
448					continue;
449				cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1);
450				if (cbp == NULL)
451					return (ENOMEM);
452
453				bioq_insert_tail(p->bqueue, cbp);
454
455				bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
456				bq->bp = cbp;
457				TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
458			}
459
460		/* A normal read can be fulfilled with the original subdisk. */
461		} else {
462			cbp = gv_raid5_clone_bio(bp, original, wp, addr, 0);
463			if (cbp == NULL)
464				return (ENOMEM);
465
466			bioq_insert_tail(p->bqueue, cbp);
467		}
468		wp->lockbase = -1;
469
470		break;
471
472	case BIO_WRITE:
473		/*
474		 * A degraded write means we cannot write to the original data
475		 * subdisk.  Thus we need to read in all valid stripes,
476		 * recalculate the parity from the original data, and then
477		 * write the parity stripe back out.
478		 */
479		if (type == REQ_TYPE_DEGRADED) {
480			/* Read all subdisks. */
481			LIST_FOREACH(s, &p->subdisks, in_plex) {
482				/* Skip the broken and the parity subdisk. */
483				if ((s == broken) || (s == parity))
484					continue;
485				/* Skip growing if within offset. */
486				if (grow && s->flags & GV_SD_GROW)
487					continue;
488
489				cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1);
490				if (cbp == NULL)
491					return (ENOMEM);
492				cbp->bio_cmd = BIO_READ;
493
494				bioq_insert_tail(p->bqueue, cbp);
495
496				bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
497				bq->bp = cbp;
498				TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
499			}
500
501			/* Write the parity data. */
502			cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1);
503			if (cbp == NULL)
504				return (ENOMEM);
505			bcopy(addr, cbp->bio_data, wp->length);
506			wp->parity = cbp;
507
508		/*
509		 * When the parity stripe is missing we just write out the data.
510		 */
511		} else if (type == REQ_TYPE_NOPARITY) {
512			cbp = gv_raid5_clone_bio(bp, original, wp, addr, 1);
513			if (cbp == NULL)
514				return (ENOMEM);
515
516			bioq_insert_tail(p->bqueue, cbp);
517
518			bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
519			bq->bp = cbp;
520			TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
521
522		/*
523		 * A normal write request goes to the original subdisk, then we
524		 * read in all other stripes, recalculate the parity and write
525		 * out the parity again.
526		 */
527		} else {
528			/* Read old parity. */
529			cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1);
530			if (cbp == NULL)
531				return (ENOMEM);
532			cbp->bio_cmd = BIO_READ;
533
534			bioq_insert_tail(p->bqueue, cbp);
535
536			bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
537			bq->bp = cbp;
538			TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
539
540			/* Read old data. */
541			cbp = gv_raid5_clone_bio(bp, original, wp, NULL, 1);
542			if (cbp == NULL)
543				return (ENOMEM);
544			cbp->bio_cmd = BIO_READ;
545
546			bioq_insert_tail(p->bqueue, cbp);
547
548			bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
549			bq->bp = cbp;
550			TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
551
552			/* Write new data. */
553			cbp = gv_raid5_clone_bio(bp, original, wp, addr, 1);
554			if (cbp == NULL)
555				return (ENOMEM);
556
557			/*
558			 * We must not write the new data until the old data
559			 * was read, so hold this BIO back until we're ready
560			 * for it.
561			 */
562			wp->waiting = cbp;
563
564			/* The final bio for the parity. */
565			cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1);
566			if (cbp == NULL)
567				return (ENOMEM);
568
569			/* Remember that this is the BIO for the parity data. */
570			wp->parity = cbp;
571		}
572		break;
573
574	default:
575		return (EINVAL);
576	}
577
578	return (0);
579}
580
581/*
582 * Calculate the offsets in the various subdisks for a RAID5 request. Also take
583 * care of new subdisks in an expanded RAID5 array.
584 * XXX: This assumes that the new subdisks are inserted after the others (which
585 * is okay as long as plex_offset is larger). If subdisks are inserted into the
586 * plexlist before, we get problems.
587 */
588static int
589gv_raid5_offset(struct gv_plex *p, off_t boff, off_t bcount, off_t *real_off,
590    off_t *real_len, int *sdno, int *psdno, int growing)
591{
592	struct gv_sd *s;
593	int sd, psd, sdcount;
594	off_t len_left, stripeend, stripeoff, stripestart;
595
596	sdcount = p->sdcount;
597	if (growing) {
598		LIST_FOREACH(s, &p->subdisks, in_plex) {
599			if (s->flags & GV_SD_GROW)
600				sdcount--;
601		}
602	}
603
604	/* The number of the subdisk containing the parity stripe. */
605	psd = sdcount - 1 - ( boff / (p->stripesize * (sdcount - 1))) %
606	    sdcount;
607	KASSERT(psdno >= 0, ("gv_raid5_offset: psdno < 0"));
608
609	/* Offset of the start address from the start of the stripe. */
610	stripeoff = boff % (p->stripesize * (sdcount - 1));
611	KASSERT(stripeoff >= 0, ("gv_raid5_offset: stripeoff < 0"));
612
613	/* The number of the subdisk where the stripe resides. */
614	sd = stripeoff / p->stripesize;
615	KASSERT(sdno >= 0, ("gv_raid5_offset: sdno < 0"));
616
617	/* At or past parity subdisk. */
618	if (sd >= psd)
619		sd++;
620
621	/* The offset of the stripe on this subdisk. */
622	stripestart = (boff - stripeoff) / (sdcount - 1);
623	KASSERT(stripestart >= 0, ("gv_raid5_offset: stripestart < 0"));
624
625	stripeoff %= p->stripesize;
626
627	/* The offset of the request on this subdisk. */
628	*real_off = stripestart + stripeoff;
629
630	stripeend = stripestart + p->stripesize;
631	len_left = stripeend - *real_off;
632	KASSERT(len_left >= 0, ("gv_raid5_offset: len_left < 0"));
633
634	*real_len = (bcount <= len_left) ? bcount : len_left;
635
636	if (sdno != NULL)
637		*sdno = sd;
638	if (psdno != NULL)
639		*psdno = psd;
640
641	return (0);
642}
643
644static struct bio *
645gv_raid5_clone_bio(struct bio *bp, struct gv_sd *s, struct gv_raid5_packet *wp,
646    caddr_t addr, int use_wp)
647{
648	struct bio *cbp;
649
650	cbp = g_clone_bio(bp);
651	if (cbp == NULL)
652		return (NULL);
653	if (addr == NULL) {
654		cbp->bio_data = g_malloc(wp->length, M_WAITOK | M_ZERO);
655		cbp->bio_cflags |= GV_BIO_MALLOC;
656	} else
657		cbp->bio_data = addr;
658	cbp->bio_offset = wp->lockbase + s->drive_offset;
659	cbp->bio_length = wp->length;
660	cbp->bio_done = gv_done;
661	cbp->bio_caller1 = s;
662	s->drive_sc->active++;
663	if (use_wp)
664		cbp->bio_caller2 = wp;
665
666	return (cbp);
667}
668