1130389Sle/*-
2190507Slulf * Copyright (c) 2004, 2007 Lukas Ertl
3130389Sle * All rights reserved.
4130389Sle *
5130389Sle * Redistribution and use in source and binary forms, with or without
6130389Sle * modification, are permitted provided that the following conditions
7130389Sle * are met:
8130389Sle * 1. Redistributions of source code must retain the above copyright
9130389Sle *    notice, this list of conditions and the following disclaimer.
10130389Sle * 2. Redistributions in binary form must reproduce the above copyright
11130389Sle *    notice, this list of conditions and the following disclaimer in the
12130389Sle *    documentation and/or other materials provided with the distribution.
13130389Sle *
14130389Sle * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15130389Sle * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16130389Sle * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17130389Sle * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18130389Sle * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19130389Sle * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20130389Sle * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21130389Sle * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22130389Sle * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23130389Sle * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24130389Sle * SUCH DAMAGE.
25130389Sle */
26130389Sle
27130389Sle#include <sys/cdefs.h>
28130389Sle__FBSDID("$FreeBSD$");
29130389Sle
30130389Sle#include <sys/param.h>
31130389Sle#include <sys/bio.h>
32130389Sle#include <sys/lock.h>
33130389Sle#include <sys/malloc.h>
34130389Sle#include <sys/systm.h>
35130389Sle
36130389Sle#include <geom/geom.h>
37130389Sle#include <geom/vinum/geom_vinum_var.h>
38130389Sle#include <geom/vinum/geom_vinum_raid5.h>
39130389Sle#include <geom/vinum/geom_vinum.h>
40130389Sle
/*
 * Forward declarations for the file-local RAID5 helpers defined below.
 */
static int		gv_raid5_offset(struct gv_plex *, off_t, off_t,
			    off_t *, off_t *, int *, int *, int);
static struct bio *	gv_raid5_clone_bio(struct bio *, struct gv_sd *,
			    struct gv_raid5_packet *, caddr_t, int);
static int	gv_raid5_request(struct gv_plex *, struct gv_raid5_packet *,
		    struct bio *, caddr_t, off_t, off_t, int *);
static int	gv_raid5_check(struct gv_plex *, struct gv_raid5_packet *,
		    struct bio *, caddr_t, off_t, off_t);
static int	gv_raid5_rebuild(struct gv_plex *, struct gv_raid5_packet *,
		    struct bio *, caddr_t, off_t, off_t);
51137730Sle
/*
 * Allocate and build a RAID5 work packet for (part of) the request 'bp',
 * covering 'bcount' bytes at plex offset 'boff' with data buffer 'addr'.
 * Dispatches to the rebuild, parity-check or normal request builder based
 * on the bio's pflags.  Returns the packet on success, or NULL when the
 * request was delayed (re-queued internally) or when building failed (in
 * which case all partial state is torn down and the bio is completed or
 * destroyed here).
 */
struct gv_raid5_packet *
gv_raid5_start(struct gv_plex *p, struct bio *bp, caddr_t addr, off_t boff,
    off_t bcount)
{
	struct bio *cbp;
	struct gv_raid5_packet *wp, *wp2;
	struct gv_bioq *bq, *bq2;
	int err, delay;

	delay = 0;
	wp = g_malloc(sizeof(*wp), M_WAITOK | M_ZERO);
	wp->bio = bp;
	wp->waiting = NULL;
	wp->parity = NULL;
	TAILQ_INIT(&wp->bits);

	/* Pick the builder that matches the request type. */
	if (bp->bio_pflags & GV_BIO_REBUILD)
		err = gv_raid5_rebuild(p, wp, bp, addr, boff, bcount);
	else if (bp->bio_pflags & GV_BIO_CHECK)
		err = gv_raid5_check(p, wp, bp, addr, boff, bcount);
	else
		err = gv_raid5_request(p, wp, bp, addr, boff, bcount, &delay);

	/* Means we have a delayed request. */
	if (delay) {
		g_free(wp);
		return (NULL);
	}

	/*
	 * Building the sub-request failed, we probably need to clean up a lot.
	 */
	if (err) {
		G_VINUM_LOGREQ(0, bp, "raid5 plex request failed.");
		/* Release the bit list attached to this packet. */
		TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) {
			TAILQ_REMOVE(&wp->bits, bq, queue);
			g_free(bq);
		}
		/* Destroy held-back and parity bios, including any
		 * buffers we allocated for them. */
		if (wp->waiting != NULL) {
			if (wp->waiting->bio_cflags & GV_BIO_MALLOC)
				g_free(wp->waiting->bio_data);
			g_destroy_bio(wp->waiting);
		}
		if (wp->parity != NULL) {
			if (wp->parity->bio_cflags & GV_BIO_MALLOC)
				g_free(wp->parity->bio_data);
			g_destroy_bio(wp->parity);
		}
		g_free(wp);

		/* Tear down any earlier packets queued for the same bio. */
		TAILQ_FOREACH_SAFE(wp, &p->packets, list, wp2) {
			if (wp->bio != bp)
				continue;

			TAILQ_REMOVE(&p->packets, wp, list);
			TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) {
				TAILQ_REMOVE(&wp->bits, bq, queue);
				g_free(bq);
			}
			g_free(wp);
		}

		/* Drain the plex bio queue, freeing malloc'ed buffers. */
		cbp = bioq_takefirst(p->bqueue);
		while (cbp != NULL) {
			if (cbp->bio_cflags & GV_BIO_MALLOC)
				g_free(cbp->bio_data);
			g_destroy_bio(cbp);
			cbp = bioq_takefirst(p->bqueue);
		}

		/* If internal, stop and reset state. */
		if (bp->bio_pflags & GV_BIO_INTERNAL) {
			if (bp->bio_pflags & GV_BIO_MALLOC)
				g_free(bp->bio_data);
			g_destroy_bio(bp);
			/* Reset flags. */
			p->flags &= ~(GV_PLEX_SYNCING | GV_PLEX_REBUILDING |
			    GV_PLEX_GROWING);
			return (NULL);
		}
		g_io_deliver(bp, err);
		return (NULL);
	}

	return (wp);
}
138190507Slulf
139130389Sle/*
140130389Sle * Check if the stripe that the work packet wants is already being used by
141130389Sle * some other work packet.
142130389Sle */
143130389Sleint
144135426Slegv_stripe_active(struct gv_plex *p, struct bio *bp)
145130389Sle{
146135426Sle	struct gv_raid5_packet *wp, *owp;
147135426Sle	int overlap;
148130389Sle
149190507Slulf	wp = bp->bio_caller2;
150135426Sle	if (wp->lockbase == -1)
151135426Sle		return (0);
152130389Sle
153135426Sle	overlap = 0;
154135426Sle	TAILQ_FOREACH(owp, &p->packets, list) {
155135426Sle		if (owp == wp)
156135426Sle			break;
157135426Sle		if ((wp->lockbase >= owp->lockbase) &&
158135426Sle		    (wp->lockbase <= owp->lockbase + owp->length)) {
159135426Sle			overlap++;
160135426Sle			break;
161130389Sle		}
162135426Sle		if ((wp->lockbase <= owp->lockbase) &&
163135426Sle		    (wp->lockbase + wp->length >= owp->lockbase)) {
164135426Sle			overlap++;
165135426Sle			break;
166130389Sle		}
167130389Sle	}
168130389Sle
169135426Sle	return (overlap);
170130389Sle}
171130389Sle
172190507Slulfstatic int
173190507Slulfgv_raid5_check(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp,
174138110Sle    caddr_t addr, off_t boff, off_t bcount)
175138110Sle{
176138110Sle	struct gv_sd *parity, *s;
177138110Sle	struct gv_bioq *bq;
178190507Slulf	struct bio *cbp;
179138110Sle	int i, psdno;
180138110Sle	off_t real_len, real_off;
181138110Sle
182138110Sle	if (p == NULL || LIST_EMPTY(&p->subdisks))
183138110Sle		return (ENXIO);
184138110Sle
185190507Slulf	gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, &psdno, 1);
186138110Sle
187138110Sle	/* Find the right subdisk. */
188138110Sle	parity = NULL;
189138110Sle	i = 0;
190138110Sle	LIST_FOREACH(s, &p->subdisks, in_plex) {
191138110Sle		if (i == psdno) {
192138110Sle			parity = s;
193138110Sle			break;
194138110Sle		}
195138110Sle		i++;
196138110Sle	}
197138110Sle
198138110Sle	/* Parity stripe not found. */
199138110Sle	if (parity == NULL)
200138110Sle		return (ENXIO);
201138110Sle
202138110Sle	if (parity->state != GV_SD_UP)
203138110Sle		return (ENXIO);
204138110Sle
205138110Sle	wp->length = real_len;
206138110Sle	wp->data = addr;
207138110Sle	wp->lockbase = real_off;
208138110Sle
209138110Sle	/* Read all subdisks. */
210138110Sle	LIST_FOREACH(s, &p->subdisks, in_plex) {
211138110Sle		/* Skip the parity subdisk. */
212138110Sle		if (s == parity)
213138110Sle			continue;
214190507Slulf		/* Skip growing subdisks. */
215190507Slulf		if (s->flags & GV_SD_GROW)
216190507Slulf			continue;
217138110Sle
218190507Slulf		cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1);
219138110Sle		if (cbp == NULL)
220138110Sle			return (ENOMEM);
221138110Sle		cbp->bio_cmd = BIO_READ;
222138110Sle
223190507Slulf		bioq_insert_tail(p->bqueue, cbp);
224138110Sle
225138110Sle		bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
226138110Sle		bq->bp = cbp;
227138110Sle		TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
228138110Sle	}
229138110Sle
230138110Sle	/* Read the parity data. */
231190507Slulf	cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1);
232138110Sle	if (cbp == NULL)
233138110Sle		return (ENOMEM);
234138110Sle	cbp->bio_cmd = BIO_READ;
235138110Sle	wp->waiting = cbp;
236138110Sle
237138110Sle	/*
238138110Sle	 * In case we want to rebuild the parity, create an extra BIO to write
239138110Sle	 * it out.  It also acts as buffer for the XOR operations.
240138110Sle	 */
241190507Slulf	cbp = gv_raid5_clone_bio(bp, parity, wp, addr, 1);
242138110Sle	if (cbp == NULL)
243138110Sle		return (ENOMEM);
244138110Sle	wp->parity = cbp;
245138110Sle
246138110Sle	return (0);
247138110Sle}
248138110Sle
249138110Sle/* Rebuild a degraded RAID5 plex. */
250190507Slulfstatic int
251190507Slulfgv_raid5_rebuild(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp,
252135966Sle    caddr_t addr, off_t boff, off_t bcount)
253135966Sle{
254135966Sle	struct gv_sd *broken, *s;
255135966Sle	struct gv_bioq *bq;
256190507Slulf	struct bio *cbp;
257137730Sle	off_t real_len, real_off;
258135966Sle
259135966Sle	if (p == NULL || LIST_EMPTY(&p->subdisks))
260135966Sle		return (ENXIO);
261135966Sle
262190507Slulf	gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, NULL, 1);
263135966Sle
264135966Sle	/* Find the right subdisk. */
265135966Sle	broken = NULL;
266135966Sle	LIST_FOREACH(s, &p->subdisks, in_plex) {
267135966Sle		if (s->state != GV_SD_UP)
268135966Sle			broken = s;
269135966Sle	}
270135966Sle
271138110Sle	/* Broken stripe not found. */
272135966Sle	if (broken == NULL)
273135966Sle		return (ENXIO);
274135966Sle
275135966Sle	switch (broken->state) {
276135966Sle	case GV_SD_UP:
277135966Sle		return (EINVAL);
278135966Sle
279135966Sle	case GV_SD_STALE:
280191856Slulf		if (!(bp->bio_pflags & GV_BIO_REBUILD))
281135966Sle			return (ENXIO);
282135966Sle
283184292Slulf		G_VINUM_DEBUG(1, "sd %s is reviving", broken->name);
284135966Sle		gv_set_sd_state(broken, GV_SD_REVIVING, GV_SETSTATE_FORCE);
285190507Slulf		/* Set this bit now, but should be set at end. */
286190507Slulf		broken->flags |= GV_SD_CANGOUP;
287135966Sle		break;
288135966Sle
289135966Sle	case GV_SD_REVIVING:
290135966Sle		break;
291135966Sle
292135966Sle	default:
293135966Sle		/* All other subdisk states mean it's not accessible. */
294135966Sle		return (ENXIO);
295135966Sle	}
296135966Sle
297135966Sle	wp->length = real_len;
298135966Sle	wp->data = addr;
299135966Sle	wp->lockbase = real_off;
300135966Sle
301137730Sle	KASSERT(wp->length >= 0, ("gv_rebuild_raid5: wp->length < 0"));
302135966Sle
303135966Sle	/* Read all subdisks. */
304135966Sle	LIST_FOREACH(s, &p->subdisks, in_plex) {
305135966Sle		/* Skip the broken subdisk. */
306135966Sle		if (s == broken)
307135966Sle			continue;
308135966Sle
309190507Slulf		/* Skip growing subdisks. */
310190507Slulf		if (s->flags & GV_SD_GROW)
311190507Slulf			continue;
312190507Slulf
313190507Slulf		cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1);
314135966Sle		if (cbp == NULL)
315135966Sle			return (ENOMEM);
316135966Sle		cbp->bio_cmd = BIO_READ;
317135966Sle
318190507Slulf		bioq_insert_tail(p->bqueue, cbp);
319135966Sle
320135966Sle		bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
321135966Sle		bq->bp = cbp;
322135966Sle		TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
323135966Sle	}
324135966Sle
325135966Sle	/* Write the parity data. */
326190507Slulf	cbp = gv_raid5_clone_bio(bp, broken, wp, NULL, 1);
327135966Sle	if (cbp == NULL)
328135966Sle		return (ENOMEM);
329135966Sle	wp->parity = cbp;
330135966Sle
331135966Sle	p->synced = boff;
332135966Sle
333190507Slulf	/* Post notification that we're finished. */
334135966Sle	return (0);
335135966Sle}
336135966Sle
337130389Sle/* Build a request group to perform (part of) a RAID5 request. */
338190507Slulfstatic int
339190507Slulfgv_raid5_request(struct gv_plex *p, struct gv_raid5_packet *wp,
340190507Slulf    struct bio *bp, caddr_t addr, off_t boff, off_t bcount, int *delay)
341130389Sle{
342130389Sle	struct g_geom *gp;
343130389Sle	struct gv_sd *broken, *original, *parity, *s;
344135426Sle	struct gv_bioq *bq;
345190507Slulf	struct bio *cbp;
346190507Slulf	int i, psdno, sdno, type, grow;
347137730Sle	off_t real_len, real_off;
348130389Sle
349130389Sle	gp = bp->bio_to->geom;
350130389Sle
351130389Sle	if (p == NULL || LIST_EMPTY(&p->subdisks))
352130389Sle		return (ENXIO);
353130389Sle
354130389Sle	/* We are optimistic and assume that this request will be OK. */
355135426Sle#define	REQ_TYPE_NORMAL		0
356135426Sle#define	REQ_TYPE_DEGRADED	1
357135426Sle#define	REQ_TYPE_NOPARITY	2
358135426Sle
359135426Sle	type = REQ_TYPE_NORMAL;
360130389Sle	original = parity = broken = NULL;
361130389Sle
362190507Slulf	/* XXX: The resize won't crash with rebuild or sync, but we should still
363190507Slulf	 * be aware of it. Also this should perhaps be done on rebuild/check as
364190507Slulf	 * well?
365190507Slulf	 */
366190507Slulf	/* If we're over, we must use the old. */
367190507Slulf	if (boff >= p->synced) {
368190507Slulf		grow = 1;
369190507Slulf	/* Or if over the resized offset, we use all drives. */
370190507Slulf	} else if (boff + bcount <= p->synced) {
371190507Slulf		grow = 0;
372190507Slulf	/* Else, we're in the middle, and must wait a bit. */
373190507Slulf	} else {
374190507Slulf		bioq_disksort(p->rqueue, bp);
375190507Slulf		*delay = 1;
376190507Slulf		return (0);
377190507Slulf	}
378190507Slulf	gv_raid5_offset(p, boff, bcount, &real_off, &real_len,
379190507Slulf	    &sdno, &psdno, grow);
380130389Sle
381130389Sle	/* Find the right subdisks. */
382130389Sle	i = 0;
383130389Sle	LIST_FOREACH(s, &p->subdisks, in_plex) {
384130389Sle		if (i == sdno)
385130389Sle			original = s;
386130389Sle		if (i == psdno)
387130389Sle			parity = s;
388130389Sle		if (s->state != GV_SD_UP)
389130389Sle			broken = s;
390130389Sle		i++;
391130389Sle	}
392130389Sle
393130389Sle	if ((original == NULL) || (parity == NULL))
394130389Sle		return (ENXIO);
395130389Sle
396130389Sle	/* Our data stripe is missing. */
397130389Sle	if (original->state != GV_SD_UP)
398135426Sle		type = REQ_TYPE_DEGRADED;
399190507Slulf
400190507Slulf	/* If synchronizing request, just write it if disks are stale. */
401190507Slulf	if (original->state == GV_SD_STALE && parity->state == GV_SD_STALE &&
402191856Slulf	    bp->bio_pflags & GV_BIO_SYNCREQ && bp->bio_cmd == BIO_WRITE) {
403190507Slulf		type = REQ_TYPE_NORMAL;
404130389Sle	/* Our parity stripe is missing. */
405190507Slulf	} else if (parity->state != GV_SD_UP) {
406130389Sle		/* We cannot take another failure if we're already degraded. */
407135426Sle		if (type != REQ_TYPE_NORMAL)
408130389Sle			return (ENXIO);
409130389Sle		else
410135426Sle			type = REQ_TYPE_NOPARITY;
411130389Sle	}
412130389Sle
413135426Sle	wp->length = real_len;
414130389Sle	wp->data = addr;
415135426Sle	wp->lockbase = real_off;
416130389Sle
417130389Sle	KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0"));
418130389Sle
419190507Slulf	if ((p->flags & GV_PLEX_REBUILDING) && (boff + real_len < p->synced))
420135966Sle		type = REQ_TYPE_NORMAL;
421135966Sle
422190507Slulf	if ((p->flags & GV_PLEX_REBUILDING) && (boff + real_len >= p->synced)) {
423190507Slulf		bioq_disksort(p->rqueue, bp);
424190507Slulf		*delay = 1;
425190507Slulf		return (0);
426190507Slulf	}
427190507Slulf
428130389Sle	switch (bp->bio_cmd) {
429130389Sle	case BIO_READ:
430130389Sle		/*
431130389Sle		 * For a degraded read we need to read in all stripes except
432130389Sle		 * the broken one plus the parity stripe and then recalculate
433130389Sle		 * the desired data.
434130389Sle		 */
435135426Sle		if (type == REQ_TYPE_DEGRADED) {
436135426Sle			bzero(wp->data, wp->length);
437130389Sle			LIST_FOREACH(s, &p->subdisks, in_plex) {
438130389Sle				/* Skip the broken subdisk. */
439130389Sle				if (s == broken)
440130389Sle					continue;
441190507Slulf				/* Skip growing if within offset. */
442190507Slulf				if (grow && s->flags & GV_SD_GROW)
443190507Slulf					continue;
444190507Slulf				cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1);
445135426Sle				if (cbp == NULL)
446130389Sle					return (ENOMEM);
447135426Sle
448190507Slulf				bioq_insert_tail(p->bqueue, cbp);
449135426Sle
450135426Sle				bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
451135426Sle				bq->bp = cbp;
452135426Sle				TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
453130389Sle			}
454130389Sle
455130389Sle		/* A normal read can be fulfilled with the original subdisk. */
456130389Sle		} else {
457190507Slulf			cbp = gv_raid5_clone_bio(bp, original, wp, addr, 0);
458135426Sle			if (cbp == NULL)
459130389Sle				return (ENOMEM);
460135426Sle
461190507Slulf			bioq_insert_tail(p->bqueue, cbp);
462130389Sle		}
463135426Sle		wp->lockbase = -1;
464135426Sle
465130389Sle		break;
466130389Sle
467130389Sle	case BIO_WRITE:
468130389Sle		/*
469130389Sle		 * A degraded write means we cannot write to the original data
470130389Sle		 * subdisk.  Thus we need to read in all valid stripes,
471130389Sle		 * recalculate the parity from the original data, and then
472130389Sle		 * write the parity stripe back out.
473130389Sle		 */
474135426Sle		if (type == REQ_TYPE_DEGRADED) {
475135426Sle			/* Read all subdisks. */
476130389Sle			LIST_FOREACH(s, &p->subdisks, in_plex) {
477130389Sle				/* Skip the broken and the parity subdisk. */
478135426Sle				if ((s == broken) || (s == parity))
479130389Sle					continue;
480190507Slulf				/* Skip growing if within offset. */
481190507Slulf				if (grow && s->flags & GV_SD_GROW)
482190507Slulf					continue;
483130389Sle
484190507Slulf				cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1);
485135426Sle				if (cbp == NULL)
486130389Sle					return (ENOMEM);
487135426Sle				cbp->bio_cmd = BIO_READ;
488135426Sle
489190507Slulf				bioq_insert_tail(p->bqueue, cbp);
490135426Sle
491135426Sle				bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
492135426Sle				bq->bp = cbp;
493135426Sle				TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
494130389Sle			}
495130389Sle
496135426Sle			/* Write the parity data. */
497190507Slulf			cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1);
498135426Sle			if (cbp == NULL)
499130389Sle				return (ENOMEM);
500190507Slulf			bcopy(addr, cbp->bio_data, wp->length);
501135426Sle			wp->parity = cbp;
502130389Sle
503130389Sle		/*
504135426Sle		 * When the parity stripe is missing we just write out the data.
505130389Sle		 */
506135426Sle		} else if (type == REQ_TYPE_NOPARITY) {
507190507Slulf			cbp = gv_raid5_clone_bio(bp, original, wp, addr, 1);
508135426Sle			if (cbp == NULL)
509130925Sle				return (ENOMEM);
510130389Sle
511190507Slulf			bioq_insert_tail(p->bqueue, cbp);
512130389Sle
513135426Sle			bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
514135426Sle			bq->bp = cbp;
515135426Sle			TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
516130389Sle
517130389Sle		/*
518130389Sle		 * A normal write request goes to the original subdisk, then we
519130389Sle		 * read in all other stripes, recalculate the parity and write
520130389Sle		 * out the parity again.
521130389Sle		 */
522130389Sle		} else {
523135426Sle			/* Read old parity. */
524190507Slulf			cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1);
525135426Sle			if (cbp == NULL)
526130925Sle				return (ENOMEM);
527135426Sle			cbp->bio_cmd = BIO_READ;
528130389Sle
529190507Slulf			bioq_insert_tail(p->bqueue, cbp);
530135426Sle
531135426Sle			bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
532135426Sle			bq->bp = cbp;
533135426Sle			TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
534135426Sle
535135426Sle			/* Read old data. */
536190507Slulf			cbp = gv_raid5_clone_bio(bp, original, wp, NULL, 1);
537135426Sle			if (cbp == NULL)
538135426Sle				return (ENOMEM);
539135426Sle			cbp->bio_cmd = BIO_READ;
540135426Sle
541190507Slulf			bioq_insert_tail(p->bqueue, cbp);
542135426Sle
543135426Sle			bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
544135426Sle			bq->bp = cbp;
545135426Sle			TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
546135426Sle
547135426Sle			/* Write new data. */
548190507Slulf			cbp = gv_raid5_clone_bio(bp, original, wp, addr, 1);
549135426Sle			if (cbp == NULL)
550135426Sle				return (ENOMEM);
551135426Sle
552135426Sle			/*
553135426Sle			 * We must not write the new data until the old data
554135426Sle			 * was read, so hold this BIO back until we're ready
555135426Sle			 * for it.
556135426Sle			 */
557135426Sle			wp->waiting = cbp;
558135426Sle
559135426Sle			/* The final bio for the parity. */
560190507Slulf			cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1);
561135426Sle			if (cbp == NULL)
562135426Sle				return (ENOMEM);
563135426Sle
564135426Sle			/* Remember that this is the BIO for the parity data. */
565135426Sle			wp->parity = cbp;
566130389Sle		}
567130389Sle		break;
568135426Sle
569130389Sle	default:
570130389Sle		return (EINVAL);
571130389Sle	}
572130389Sle
573130389Sle	return (0);
574130389Sle}
575137730Sle
576190507Slulf/*
577190507Slulf * Calculate the offsets in the various subdisks for a RAID5 request. Also take
578190507Slulf * care of new subdisks in an expanded RAID5 array.
579190507Slulf * XXX: This assumes that the new subdisks are inserted after the others (which
580190507Slulf * is okay as long as plex_offset is larger). If subdisks are inserted into the
581190507Slulf * plexlist before, we get problems.
582190507Slulf */
583190507Slulfstatic int
584137730Slegv_raid5_offset(struct gv_plex *p, off_t boff, off_t bcount, off_t *real_off,
585190507Slulf    off_t *real_len, int *sdno, int *psdno, int growing)
586137730Sle{
587190507Slulf	struct gv_sd *s;
588190507Slulf	int sd, psd, sdcount;
589137730Sle	off_t len_left, stripeend, stripeoff, stripestart;
590137730Sle
591190507Slulf	sdcount = p->sdcount;
592190507Slulf	if (growing) {
593190507Slulf		LIST_FOREACH(s, &p->subdisks, in_plex) {
594190507Slulf			if (s->flags & GV_SD_GROW)
595190507Slulf				sdcount--;
596190507Slulf		}
597190507Slulf	}
598190507Slulf
599137730Sle	/* The number of the subdisk containing the parity stripe. */
600190507Slulf	psd = sdcount - 1 - ( boff / (p->stripesize * (sdcount - 1))) %
601190507Slulf	    sdcount;
602137730Sle	KASSERT(psdno >= 0, ("gv_raid5_offset: psdno < 0"));
603137730Sle
604137730Sle	/* Offset of the start address from the start of the stripe. */
605190507Slulf	stripeoff = boff % (p->stripesize * (sdcount - 1));
606137730Sle	KASSERT(stripeoff >= 0, ("gv_raid5_offset: stripeoff < 0"));
607137730Sle
608137730Sle	/* The number of the subdisk where the stripe resides. */
609137730Sle	sd = stripeoff / p->stripesize;
610137730Sle	KASSERT(sdno >= 0, ("gv_raid5_offset: sdno < 0"));
611137730Sle
612137730Sle	/* At or past parity subdisk. */
613137730Sle	if (sd >= psd)
614137730Sle		sd++;
615137730Sle
616137730Sle	/* The offset of the stripe on this subdisk. */
617190507Slulf	stripestart = (boff - stripeoff) / (sdcount - 1);
618137730Sle	KASSERT(stripestart >= 0, ("gv_raid5_offset: stripestart < 0"));
619137730Sle
620137730Sle	stripeoff %= p->stripesize;
621137730Sle
622137730Sle	/* The offset of the request on this subdisk. */
623137730Sle	*real_off = stripestart + stripeoff;
624137730Sle
625137730Sle	stripeend = stripestart + p->stripesize;
626137730Sle	len_left = stripeend - *real_off;
627137730Sle	KASSERT(len_left >= 0, ("gv_raid5_offset: len_left < 0"));
628137730Sle
629137730Sle	*real_len = (bcount <= len_left) ? bcount : len_left;
630137730Sle
631137730Sle	if (sdno != NULL)
632137730Sle		*sdno = sd;
633137730Sle	if (psdno != NULL)
634137730Sle		*psdno = psd;
635137730Sle
636137730Sle	return (0);
637137730Sle}
638190507Slulf
639190507Slulfstatic struct bio *
640190507Slulfgv_raid5_clone_bio(struct bio *bp, struct gv_sd *s, struct gv_raid5_packet *wp,
641190507Slulf    caddr_t addr, int use_wp)
642190507Slulf{
643190507Slulf	struct bio *cbp;
644190507Slulf
645190507Slulf	cbp = g_clone_bio(bp);
646190507Slulf	if (cbp == NULL)
647190507Slulf		return (NULL);
648190507Slulf	if (addr == NULL) {
649190507Slulf		cbp->bio_data = g_malloc(wp->length, M_WAITOK | M_ZERO);
650190507Slulf		cbp->bio_cflags |= GV_BIO_MALLOC;
651190507Slulf	} else
652190507Slulf		cbp->bio_data = addr;
653190507Slulf	cbp->bio_offset = wp->lockbase + s->drive_offset;
654190507Slulf	cbp->bio_length = wp->length;
655190507Slulf	cbp->bio_done = gv_done;
656190507Slulf	cbp->bio_caller1 = s;
657190507Slulf	if (use_wp)
658190507Slulf		cbp->bio_caller2 = wp;
659190507Slulf
660190507Slulf	return (cbp);
661190507Slulf}
662