1130389Sle/*-
2190507Slulf * Copyright (c) 2004, 2007 Lukas Ertl
3130389Sle * All rights reserved.
4130389Sle *
5130389Sle * Redistribution and use in source and binary forms, with or without
6130389Sle * modification, are permitted provided that the following conditions
7130389Sle * are met:
8130389Sle * 1. Redistributions of source code must retain the above copyright
9130389Sle *    notice, this list of conditions and the following disclaimer.
10130389Sle * 2. Redistributions in binary form must reproduce the above copyright
11130389Sle *    notice, this list of conditions and the following disclaimer in the
12130389Sle *    documentation and/or other materials provided with the distribution.
13130389Sle *
14130389Sle * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15130389Sle * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16130389Sle * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17130389Sle * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18130389Sle * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19130389Sle * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20130389Sle * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21130389Sle * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22130389Sle * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23130389Sle * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24130389Sle * SUCH DAMAGE.
25130389Sle */
26130389Sle
27130389Sle#include <sys/cdefs.h>
28130389Sle__FBSDID("$FreeBSD$");
29130389Sle
30130389Sle#include <sys/param.h>
31130389Sle#include <sys/bio.h>
32130389Sle#include <sys/lock.h>
33130389Sle#include <sys/malloc.h>
34130389Sle#include <sys/systm.h>
35130389Sle
36130389Sle#include <geom/geom.h>
37130389Sle#include <geom/vinum/geom_vinum_var.h>
38130389Sle#include <geom/vinum/geom_vinum_raid5.h>
39130389Sle#include <geom/vinum/geom_vinum.h>
40130389Sle
/*
 * Forward declarations for the file-local RAID5 helpers defined below.
 */
static int		gv_raid5_offset(struct gv_plex *, off_t, off_t,
			    off_t *, off_t *, int *, int *, int);
static struct bio *	gv_raid5_clone_bio(struct bio *, struct gv_sd *,
			    struct gv_raid5_packet *, caddr_t, int);
static int	gv_raid5_request(struct gv_plex *, struct gv_raid5_packet *,
		    struct bio *, caddr_t, off_t, off_t, int *);
static int	gv_raid5_check(struct gv_plex *, struct gv_raid5_packet *,
		    struct bio *, caddr_t, off_t, off_t);
static int	gv_raid5_rebuild(struct gv_plex *, struct gv_raid5_packet *,
		    struct bio *, caddr_t, off_t, off_t);
51137730Sle
/*
 * Allocate and build a RAID5 work packet for (part of) the request 'bp',
 * covering 'bcount' bytes at plex offset 'boff' with data buffer 'addr'.
 * Dispatches to the rebuild, parity-check or normal request builder based
 * on the bio's pflags.  Returns the packet on success, or NULL when the
 * request was delayed (re-queued internally) or when building failed (in
 * which case all partial state is torn down and the bio is completed or
 * destroyed here).
 */
struct gv_raid5_packet *
gv_raid5_start(struct gv_plex *p, struct bio *bp, caddr_t addr, off_t boff,
    off_t bcount)
{
	struct bio *cbp;
	struct gv_raid5_packet *wp, *wp2;
	struct gv_bioq *bq, *bq2;
	int err, delay;

	delay = 0;
	wp = g_malloc(sizeof(*wp), M_WAITOK | M_ZERO);
	wp->bio = bp;
	wp->waiting = NULL;
	wp->parity = NULL;
	TAILQ_INIT(&wp->bits);

	/* Pick the builder that matches the request type. */
	if (bp->bio_pflags & GV_BIO_REBUILD)
		err = gv_raid5_rebuild(p, wp, bp, addr, boff, bcount);
	else if (bp->bio_pflags & GV_BIO_CHECK)
		err = gv_raid5_check(p, wp, bp, addr, boff, bcount);
	else
		err = gv_raid5_request(p, wp, bp, addr, boff, bcount, &delay);

	/* Means we have a delayed request. */
	if (delay) {
		g_free(wp);
		return (NULL);
	}

	/*
	 * Building the sub-request failed, we probably need to clean up a lot.
	 */
	if (err) {
		G_VINUM_LOGREQ(0, bp, "raid5 plex request failed.");
		/* Release the bit list attached to this packet. */
		TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) {
			TAILQ_REMOVE(&wp->bits, bq, queue);
			g_free(bq);
		}
		/* Destroy held-back and parity bios, including any
		 * buffers we allocated for them. */
		if (wp->waiting != NULL) {
			if (wp->waiting->bio_cflags & GV_BIO_MALLOC)
				g_free(wp->waiting->bio_data);
			g_destroy_bio(wp->waiting);
		}
		if (wp->parity != NULL) {
			if (wp->parity->bio_cflags & GV_BIO_MALLOC)
				g_free(wp->parity->bio_data);
			g_destroy_bio(wp->parity);
		}
		g_free(wp);

		/* Tear down any earlier packets queued for the same bio. */
		TAILQ_FOREACH_SAFE(wp, &p->packets, list, wp2) {
			if (wp->bio != bp)
				continue;

			TAILQ_REMOVE(&p->packets, wp, list);
			TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) {
				TAILQ_REMOVE(&wp->bits, bq, queue);
				g_free(bq);
			}
			g_free(wp);
		}

		/* Drain the plex bio queue, freeing malloc'ed buffers. */
		cbp = bioq_takefirst(p->bqueue);
		while (cbp != NULL) {
			if (cbp->bio_cflags & GV_BIO_MALLOC)
				g_free(cbp->bio_data);
			g_destroy_bio(cbp);
			cbp = bioq_takefirst(p->bqueue);
		}

		/* If internal, stop and reset state. */
		if (bp->bio_pflags & GV_BIO_INTERNAL) {
			if (bp->bio_pflags & GV_BIO_MALLOC)
				g_free(bp->bio_data);
			g_destroy_bio(bp);
			/* Reset flags. */
			p->flags &= ~(GV_PLEX_SYNCING | GV_PLEX_REBUILDING |
			    GV_PLEX_GROWING);
			return (NULL);
		}
		g_io_deliver(bp, err);
		return (NULL);
	}

	return (wp);
}
138190507Slulf
139130389Sle/*
140130389Sle * Check if the stripe that the work packet wants is already being used by
141130389Sle * some other work packet.
142130389Sle */
143130389Sleint
144135426Slegv_stripe_active(struct gv_plex *p, struct bio *bp)
145130389Sle{
146135426Sle	struct gv_raid5_packet *wp, *owp;
147135426Sle	int overlap;
148130389Sle
149190507Slulf	wp = bp->bio_caller2;
150135426Sle	if (wp->lockbase == -1)
151135426Sle		return (0);
152130389Sle
153135426Sle	overlap = 0;
154135426Sle	TAILQ_FOREACH(owp, &p->packets, list) {
155135426Sle		if (owp == wp)
156135426Sle			break;
157135426Sle		if ((wp->lockbase >= owp->lockbase) &&
158135426Sle		    (wp->lockbase <= owp->lockbase + owp->length)) {
159135426Sle			overlap++;
160135426Sle			break;
161130389Sle		}
162135426Sle		if ((wp->lockbase <= owp->lockbase) &&
163135426Sle		    (wp->lockbase + wp->length >= owp->lockbase)) {
164135426Sle			overlap++;
165135426Sle			break;
166130389Sle		}
167130389Sle	}
168130389Sle
169135426Sle	return (overlap);
170130389Sle}
171130389Sle
172190507Slulfstatic int
173190507Slulfgv_raid5_check(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp,
174138110Sle    caddr_t addr, off_t boff, off_t bcount)
175138110Sle{
176138110Sle	struct gv_sd *parity, *s;
177138110Sle	struct gv_bioq *bq;
178190507Slulf	struct bio *cbp;
179138110Sle	int i, psdno;
180138110Sle	off_t real_len, real_off;
181138110Sle
182138110Sle	if (p == NULL || LIST_EMPTY(&p->subdisks))
183138110Sle		return (ENXIO);
184138110Sle
185190507Slulf	gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, &psdno, 1);
186138110Sle
187138110Sle	/* Find the right subdisk. */
188138110Sle	parity = NULL;
189138110Sle	i = 0;
190138110Sle	LIST_FOREACH(s, &p->subdisks, in_plex) {
191138110Sle		if (i == psdno) {
192138110Sle			parity = s;
193138110Sle			break;
194138110Sle		}
195138110Sle		i++;
196138110Sle	}
197138110Sle
198138110Sle	/* Parity stripe not found. */
199138110Sle	if (parity == NULL)
200138110Sle		return (ENXIO);
201138110Sle
202138110Sle	if (parity->state != GV_SD_UP)
203138110Sle		return (ENXIO);
204138110Sle
205138110Sle	wp->length = real_len;
206138110Sle	wp->data = addr;
207138110Sle	wp->lockbase = real_off;
208138110Sle
209138110Sle	/* Read all subdisks. */
210138110Sle	LIST_FOREACH(s, &p->subdisks, in_plex) {
211138110Sle		/* Skip the parity subdisk. */
212138110Sle		if (s == parity)
213138110Sle			continue;
214190507Slulf		/* Skip growing subdisks. */
215190507Slulf		if (s->flags & GV_SD_GROW)
216190507Slulf			continue;
217138110Sle
218190507Slulf		cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1);
219138110Sle		if (cbp == NULL)
220138110Sle			return (ENOMEM);
221138110Sle		cbp->bio_cmd = BIO_READ;
222138110Sle
223190507Slulf		bioq_insert_tail(p->bqueue, cbp);
224138110Sle
225138110Sle		bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
226138110Sle		bq->bp = cbp;
227138110Sle		TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
228138110Sle	}
229138110Sle
230138110Sle	/* Read the parity data. */
231190507Slulf	cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1);
232138110Sle	if (cbp == NULL)
233138110Sle		return (ENOMEM);
234138110Sle	cbp->bio_cmd = BIO_READ;
235138110Sle	wp->waiting = cbp;
236138110Sle
237138110Sle	/*
238138110Sle	 * In case we want to rebuild the parity, create an extra BIO to write
239138110Sle	 * it out.  It also acts as buffer for the XOR operations.
240138110Sle	 */
241190507Slulf	cbp = gv_raid5_clone_bio(bp, parity, wp, addr, 1);
242138110Sle	if (cbp == NULL)
243138110Sle		return (ENOMEM);
244138110Sle	wp->parity = cbp;
245138110Sle
246138110Sle	return (0);
247138110Sle}
248138110Sle
249138110Sle/* Rebuild a degraded RAID5 plex. */
250190507Slulfstatic int
251190507Slulfgv_raid5_rebuild(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp,
252135966Sle    caddr_t addr, off_t boff, off_t bcount)
253135966Sle{
254135966Sle	struct gv_sd *broken, *s;
255135966Sle	struct gv_bioq *bq;
256190507Slulf	struct bio *cbp;
257137730Sle	off_t real_len, real_off;
258135966Sle
259135966Sle	if (p == NULL || LIST_EMPTY(&p->subdisks))
260135966Sle		return (ENXIO);
261135966Sle
262190507Slulf	gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, NULL, 1);
263135966Sle
264135966Sle	/* Find the right subdisk. */
265135966Sle	broken = NULL;
266135966Sle	LIST_FOREACH(s, &p->subdisks, in_plex) {
267135966Sle		if (s->state != GV_SD_UP)
268135966Sle			broken = s;
269135966Sle	}
270135966Sle
271138110Sle	/* Broken stripe not found. */
272135966Sle	if (broken == NULL)
273135966Sle		return (ENXIO);
274135966Sle
275135966Sle	switch (broken->state) {
276135966Sle	case GV_SD_UP:
277135966Sle		return (EINVAL);
278135966Sle
279135966Sle	case GV_SD_STALE:
280191856Slulf		if (!(bp->bio_pflags & GV_BIO_REBUILD))
281135966Sle			return (ENXIO);
282135966Sle
283184292Slulf		G_VINUM_DEBUG(1, "sd %s is reviving", broken->name);
284135966Sle		gv_set_sd_state(broken, GV_SD_REVIVING, GV_SETSTATE_FORCE);
285190507Slulf		/* Set this bit now, but should be set at end. */
286190507Slulf		broken->flags |= GV_SD_CANGOUP;
287135966Sle		break;
288135966Sle
289135966Sle	case GV_SD_REVIVING:
290135966Sle		break;
291135966Sle
292135966Sle	default:
293135966Sle		/* All other subdisk states mean it's not accessible. */
294135966Sle		return (ENXIO);
295135966Sle	}
296135966Sle
297135966Sle	wp->length = real_len;
298135966Sle	wp->data = addr;
299135966Sle	wp->lockbase = real_off;
300135966Sle
301137730Sle	KASSERT(wp->length >= 0, ("gv_rebuild_raid5: wp->length < 0"));
302135966Sle
303135966Sle	/* Read all subdisks. */
304135966Sle	LIST_FOREACH(s, &p->subdisks, in_plex) {
305135966Sle		/* Skip the broken subdisk. */
306135966Sle		if (s == broken)
307135966Sle			continue;
308135966Sle
309190507Slulf		/* Skip growing subdisks. */
310190507Slulf		if (s->flags & GV_SD_GROW)
311190507Slulf			continue;
312190507Slulf
313190507Slulf		cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1);
314135966Sle		if (cbp == NULL)
315135966Sle			return (ENOMEM);
316135966Sle		cbp->bio_cmd = BIO_READ;
317135966Sle
318190507Slulf		bioq_insert_tail(p->bqueue, cbp);
319135966Sle
320135966Sle		bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
321135966Sle		bq->bp = cbp;
322135966Sle		TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
323135966Sle	}
324135966Sle
325135966Sle	/* Write the parity data. */
326190507Slulf	cbp = gv_raid5_clone_bio(bp, broken, wp, NULL, 1);
327135966Sle	if (cbp == NULL)
328135966Sle		return (ENOMEM);
329135966Sle	wp->parity = cbp;
330135966Sle
331135966Sle	p->synced = boff;
332135966Sle
333190507Slulf	/* Post notification that we're finished. */
334135966Sle	return (0);
335135966Sle}
336135966Sle
337130389Sle/* Build a request group to perform (part of) a RAID5 request. */
338190507Slulfstatic int
339190507Slulfgv_raid5_request(struct gv_plex *p, struct gv_raid5_packet *wp,
340190507Slulf    struct bio *bp, caddr_t addr, off_t boff, off_t bcount, int *delay)
341130389Sle{
342130389Sle	struct g_geom *gp;
343130389Sle	struct gv_sd *broken, *original, *parity, *s;
344135426Sle	struct gv_bioq *bq;
345190507Slulf	struct bio *cbp;
346190507Slulf	int i, psdno, sdno, type, grow;
347137730Sle	off_t real_len, real_off;
348130389Sle
349130389Sle	gp = bp->bio_to->geom;
350130389Sle
351130389Sle	if (p == NULL || LIST_EMPTY(&p->subdisks))
352130389Sle		return (ENXIO);
353130389Sle
354130389Sle	/* We are optimistic and assume that this request will be OK. */
355135426Sle#define	REQ_TYPE_NORMAL		0
356135426Sle#define	REQ_TYPE_DEGRADED	1
357135426Sle#define	REQ_TYPE_NOPARITY	2
358135426Sle
359135426Sle	type = REQ_TYPE_NORMAL;
360130389Sle	original = parity = broken = NULL;
361130389Sle
362190507Slulf	/* XXX: The resize won't crash with rebuild or sync, but we should still
363190507Slulf	 * be aware of it. Also this should perhaps be done on rebuild/check as
364190507Slulf	 * well?
365190507Slulf	 */
366190507Slulf	/* If we're over, we must use the old. */
367190507Slulf	if (boff >= p->synced) {
368190507Slulf		grow = 1;
369190507Slulf	/* Or if over the resized offset, we use all drives. */
370190507Slulf	} else if (boff + bcount <= p->synced) {
371190507Slulf		grow = 0;
372190507Slulf	/* Else, we're in the middle, and must wait a bit. */
373190507Slulf	} else {
374190507Slulf		bioq_disksort(p->rqueue, bp);
375190507Slulf		*delay = 1;
376190507Slulf		return (0);
377190507Slulf	}
378190507Slulf	gv_raid5_offset(p, boff, bcount, &real_off, &real_len,
379190507Slulf	    &sdno, &psdno, grow);
380130389Sle
381130389Sle	/* Find the right subdisks. */
382130389Sle	i = 0;
383130389Sle	LIST_FOREACH(s, &p->subdisks, in_plex) {
384130389Sle		if (i == sdno)
385130389Sle			original = s;
386130389Sle		if (i == psdno)
387130389Sle			parity = s;
388130389Sle		if (s->state != GV_SD_UP)
389130389Sle			broken = s;
390130389Sle		i++;
391130389Sle	}
392130389Sle
393130389Sle	if ((original == NULL) || (parity == NULL))
394130389Sle		return (ENXIO);
395130389Sle
396130389Sle	/* Our data stripe is missing. */
397130389Sle	if (original->state != GV_SD_UP)
398135426Sle		type = REQ_TYPE_DEGRADED;
399190507Slulf
400190507Slulf	/* If synchronizing request, just write it if disks are stale. */
401190507Slulf	if (original->state == GV_SD_STALE && parity->state == GV_SD_STALE &&
402191856Slulf	    bp->bio_pflags & GV_BIO_SYNCREQ && bp->bio_cmd == BIO_WRITE) {
403190507Slulf		type = REQ_TYPE_NORMAL;
404130389Sle	/* Our parity stripe is missing. */
405190507Slulf	} else if (parity->state != GV_SD_UP) {
406130389Sle		/* We cannot take another failure if we're already degraded. */
407135426Sle		if (type != REQ_TYPE_NORMAL)
408130389Sle			return (ENXIO);
409130389Sle		else
410135426Sle			type = REQ_TYPE_NOPARITY;
411130389Sle	}
412130389Sle
413135426Sle	wp->length = real_len;
414130389Sle	wp->data = addr;
415135426Sle	wp->lockbase = real_off;
416130389Sle
417130389Sle	KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0"));
418130389Sle
419190507Slulf	if ((p->flags & GV_PLEX_REBUILDING) && (boff + real_len < p->synced))
420135966Sle		type = REQ_TYPE_NORMAL;
421135966Sle
422190507Slulf	if ((p->flags & GV_PLEX_REBUILDING) && (boff + real_len >= p->synced)) {
423190507Slulf		bioq_disksort(p->rqueue, bp);
424190507Slulf		*delay = 1;
425190507Slulf		return (0);
426190507Slulf	}
427190507Slulf
428130389Sle	switch (bp->bio_cmd) {
429130389Sle	case BIO_READ:
430130389Sle		/*
431130389Sle		 * For a degraded read we need to read in all stripes except
432130389Sle		 * the broken one plus the parity stripe and then recalculate
433130389Sle		 * the desired data.
434130389Sle		 */
435135426Sle		if (type == REQ_TYPE_DEGRADED) {
436135426Sle			bzero(wp->data, wp->length);
437130389Sle			LIST_FOREACH(s, &p->subdisks, in_plex) {
438130389Sle				/* Skip the broken subdisk. */
439130389Sle				if (s == broken)
440130389Sle					continue;
441190507Slulf				/* Skip growing if within offset. */
442190507Slulf				if (grow && s->flags & GV_SD_GROW)
443190507Slulf					continue;
444190507Slulf				cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1);
445135426Sle				if (cbp == NULL)
446130389Sle					return (ENOMEM);
447135426Sle
448190507Slulf				bioq_insert_tail(p->bqueue, cbp);
449135426Sle
450135426Sle				bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
451135426Sle				bq->bp = cbp;
452135426Sle				TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
453130389Sle			}
454130389Sle
455130389Sle		/* A normal read can be fulfilled with the original subdisk. */
456130389Sle		} else {
457190507Slulf			cbp = gv_raid5_clone_bio(bp, original, wp, addr, 0);
458135426Sle			if (cbp == NULL)
459130389Sle				return (ENOMEM);
460135426Sle
461190507Slulf			bioq_insert_tail(p->bqueue, cbp);
462130389Sle		}
463135426Sle		wp->lockbase = -1;
464135426Sle
465130389Sle		break;
466130389Sle
467130389Sle	case BIO_WRITE:
468130389Sle		/*
469130389Sle		 * A degraded write means we cannot write to the original data
470130389Sle		 * subdisk.  Thus we need to read in all valid stripes,
471130389Sle		 * recalculate the parity from the original data, and then
472130389Sle		 * write the parity stripe back out.
473130389Sle		 */
474135426Sle		if (type == REQ_TYPE_DEGRADED) {
475135426Sle			/* Read all subdisks. */
476130389Sle			LIST_FOREACH(s, &p->subdisks, in_plex) {
477130389Sle				/* Skip the broken and the parity subdisk. */
478135426Sle				if ((s == broken) || (s == parity))
479130389Sle					continue;
480190507Slulf				/* Skip growing if within offset. */
481190507Slulf				if (grow && s->flags & GV_SD_GROW)
482190507Slulf					continue;
483130389Sle
484190507Slulf				cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1);
485135426Sle				if (cbp == NULL)
486130389Sle					return (ENOMEM);
487135426Sle				cbp->bio_cmd = BIO_READ;
488135426Sle
489190507Slulf				bioq_insert_tail(p->bqueue, cbp);
490135426Sle
491135426Sle				bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
492135426Sle				bq->bp = cbp;
493135426Sle				TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
494130389Sle			}
495130389Sle
496135426Sle			/* Write the parity data. */
497190507Slulf			cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1);
498135426Sle			if (cbp == NULL)
499130389Sle				return (ENOMEM);
500190507Slulf			bcopy(addr, cbp->bio_data, wp->length);
501135426Sle			wp->parity = cbp;
502130389Sle
503130389Sle		/*
504135426Sle		 * When the parity stripe is missing we just write out the data.
505130389Sle		 */
506135426Sle		} else if (type == REQ_TYPE_NOPARITY) {
507190507Slulf			cbp = gv_raid5_clone_bio(bp, original, wp, addr, 1);
508135426Sle			if (cbp == NULL)
509130925Sle				return (ENOMEM);
510130389Sle
511190507Slulf			bioq_insert_tail(p->bqueue, cbp);
512130389Sle
513135426Sle			bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
514135426Sle			bq->bp = cbp;
515135426Sle			TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
516130389Sle
517130389Sle		/*
518130389Sle		 * A normal write request goes to the original subdisk, then we
519130389Sle		 * read in all other stripes, recalculate the parity and write
520130389Sle		 * out the parity again.
521130389Sle		 */
522130389Sle		} else {
523135426Sle			/* Read old parity. */
524190507Slulf			cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1);
525135426Sle			if (cbp == NULL)
526130925Sle				return (ENOMEM);
527135426Sle			cbp->bio_cmd = BIO_READ;
528130389Sle
529190507Slulf			bioq_insert_tail(p->bqueue, cbp);
530135426Sle
531135426Sle			bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
532135426Sle			bq->bp = cbp;
533135426Sle			TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
534135426Sle
535135426Sle			/* Read old data. */
536190507Slulf			cbp = gv_raid5_clone_bio(bp, original, wp, NULL, 1);
537135426Sle			if (cbp == NULL)
538135426Sle				return (ENOMEM);
539135426Sle			cbp->bio_cmd = BIO_READ;
540135426Sle
541190507Slulf			bioq_insert_tail(p->bqueue, cbp);
542135426Sle
543135426Sle			bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
544135426Sle			bq->bp = cbp;
545135426Sle			TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
546135426Sle
547135426Sle			/* Write new data. */
548190507Slulf			cbp = gv_raid5_clone_bio(bp, original, wp, addr, 1);
549135426Sle			if (cbp == NULL)
550135426Sle				return (ENOMEM);
551135426Sle
552135426Sle			/*
553135426Sle			 * We must not write the new data until the old data
554135426Sle			 * was read, so hold this BIO back until we're ready
555135426Sle			 * for it.
556135426Sle			 */
557135426Sle			wp->waiting = cbp;
558135426Sle
559135426Sle			/* The final bio for the parity. */
560190507Slulf			cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1);
561135426Sle			if (cbp == NULL)
562135426Sle				return (ENOMEM);
563135426Sle
564135426Sle			/* Remember that this is the BIO for the parity data. */
565135426Sle			wp->parity = cbp;
566130389Sle		}
567130389Sle		break;
568135426Sle
569130389Sle	default:
570130389Sle		return (EINVAL);
571130389Sle	}
572130389Sle
573130389Sle	return (0);
574130389Sle}
575137730Sle
576190507Slulf/*
577190507Slulf * Calculate the offsets in the various subdisks for a RAID5 request. Also take
578190507Slulf * care of new subdisks in an expanded RAID5 array.
579190507Slulf * XXX: This assumes that the new subdisks are inserted after the others (which
580190507Slulf * is okay as long as plex_offset is larger). If subdisks are inserted into the
581190507Slulf * plexlist before, we get problems.
582190507Slulf */
583190507Slulfstatic int
584137730Slegv_raid5_offset(struct gv_plex *p, off_t boff, off_t bcount, off_t *real_off,
585190507Slulf    off_t *real_len, int *sdno, int *psdno, int growing)
586137730Sle{
587190507Slulf	struct gv_sd *s;
588190507Slulf	int sd, psd, sdcount;
589137730Sle	off_t len_left, stripeend, stripeoff, stripestart;
590137730Sle
591190507Slulf	sdcount = p->sdcount;
592190507Slulf	if (growing) {
593190507Slulf		LIST_FOREACH(s, &p->subdisks, in_plex) {
594190507Slulf			if (s->flags & GV_SD_GROW)
595190507Slulf				sdcount--;
596190507Slulf		}
597190507Slulf	}
598190507Slulf
599137730Sle	/* The number of the subdisk containing the parity stripe. */
600190507Slulf	psd = sdcount - 1 - ( boff / (p->stripesize * (sdcount - 1))) %
601190507Slulf	    sdcount;
602137730Sle	KASSERT(psdno >= 0, ("gv_raid5_offset: psdno < 0"));
603137730Sle
604137730Sle	/* Offset of the start address from the start of the stripe. */
605190507Slulf	stripeoff = boff % (p->stripesize * (sdcount - 1));
606137730Sle	KASSERT(stripeoff >= 0, ("gv_raid5_offset: stripeoff < 0"));
607137730Sle
608137730Sle	/* The number of the subdisk where the stripe resides. */
609137730Sle	sd = stripeoff / p->stripesize;
610137730Sle	KASSERT(sdno >= 0, ("gv_raid5_offset: sdno < 0"));
611137730Sle
612137730Sle	/* At or past parity subdisk. */
613137730Sle	if (sd >= psd)
614137730Sle		sd++;
615137730Sle
616137730Sle	/* The offset of the stripe on this subdisk. */
617190507Slulf	stripestart = (boff - stripeoff) / (sdcount - 1);
618137730Sle	KASSERT(stripestart >= 0, ("gv_raid5_offset: stripestart < 0"));
619137730Sle
620137730Sle	stripeoff %= p->stripesize;
621137730Sle
622137730Sle	/* The offset of the request on this subdisk. */
623137730Sle	*real_off = stripestart + stripeoff;
624137730Sle
625137730Sle	stripeend = stripestart + p->stripesize;
626137730Sle	len_left = stripeend - *real_off;
627137730Sle	KASSERT(len_left >= 0, ("gv_raid5_offset: len_left < 0"));
628137730Sle
629137730Sle	*real_len = (bcount <= len_left) ? bcount : len_left;
630137730Sle
631137730Sle	if (sdno != NULL)
632137730Sle		*sdno = sd;
633137730Sle	if (psdno != NULL)
634137730Sle		*psdno = psd;
635137730Sle
636137730Sle	return (0);
637137730Sle}
638190507Slulf
639190507Slulfstatic struct bio *
640190507Slulfgv_raid5_clone_bio(struct bio *bp, struct gv_sd *s, struct gv_raid5_packet *wp,
641190507Slulf    caddr_t addr, int use_wp)
642190507Slulf{
643190507Slulf	struct bio *cbp;
644190507Slulf
645190507Slulf	cbp = g_clone_bio(bp);
646190507Slulf	if (cbp == NULL)
647190507Slulf		return (NULL);
648190507Slulf	if (addr == NULL) {
649190507Slulf		cbp->bio_data = g_malloc(wp->length, M_WAITOK | M_ZERO);
650190507Slulf		cbp->bio_cflags |= GV_BIO_MALLOC;
651190507Slulf	} else
652190507Slulf		cbp->bio_data = addr;
653190507Slulf	cbp->bio_offset = wp->lockbase + s->drive_offset;
654190507Slulf	cbp->bio_length = wp->length;
655190507Slulf	cbp->bio_done = gv_done;
656190507Slulf	cbp->bio_caller1 = s;
657190507Slulf	if (use_wp)
658190507Slulf		cbp->bio_caller2 = wp;
659190507Slulf
660190507Slulf	return (cbp);
661190507Slulf}
662