/* geom_vinum_raid5.c, revision 130477 */
1130389Sle/*-
2130389Sle * Copyright (c) 2004 Lukas Ertl
3130389Sle * All rights reserved.
4130389Sle *
5130389Sle * Redistribution and use in source and binary forms, with or without
6130389Sle * modification, are permitted provided that the following conditions
7130389Sle * are met:
8130389Sle * 1. Redistributions of source code must retain the above copyright
9130389Sle *    notice, this list of conditions and the following disclaimer.
10130389Sle * 2. Redistributions in binary form must reproduce the above copyright
11130389Sle *    notice, this list of conditions and the following disclaimer in the
12130389Sle *    documentation and/or other materials provided with the distribution.
13130389Sle *
14130389Sle * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15130389Sle * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16130389Sle * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17130389Sle * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18130389Sle * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19130389Sle * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20130389Sle * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21130389Sle * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22130389Sle * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23130389Sle * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24130389Sle * SUCH DAMAGE.
25130389Sle */
26130389Sle
27130389Sle#include <sys/cdefs.h>
28130389Sle__FBSDID("$FreeBSD: head/sys/geom/vinum/geom_vinum_raid5.c 130477 2004-06-14 17:06:55Z le $");
29130389Sle
30130389Sle#include <sys/param.h>
31130389Sle#include <sys/bio.h>
32130389Sle#include <sys/conf.h>
33130389Sle#include <sys/errno.h>
34130389Sle#include <sys/kernel.h>
35130389Sle#include <sys/kthread.h>
36130389Sle#include <sys/libkern.h>
37130389Sle#include <sys/lock.h>
38130389Sle#include <sys/malloc.h>
39130389Sle#include <sys/mutex.h>
40130389Sle#include <sys/systm.h>
41130389Sle
42130389Sle#include <geom/geom.h>
43130389Sle#include <geom/vinum/geom_vinum_var.h>
44130389Sle#include <geom/vinum/geom_vinum_raid5.h>
45130389Sle#include <geom/vinum/geom_vinum.h>
46130389Sle
47130389Sleint	gv_raid5_parity(struct gv_raid5_packet *);
48130389Sleint	gv_stripe_active(struct gv_raid5_packet *, struct gv_plex *);
49130389Sle
50130389Slestruct gv_raid5_bit *
51130389Slegv_new_raid5_bit(void)
52130389Sle{
53130389Sle	struct gv_raid5_bit *r;
54130389Sle	r = g_malloc(sizeof(*r), M_NOWAIT | M_ZERO);
55130389Sle	KASSERT(r != NULL, ("gv_new_raid5_bit: NULL r"));
56130389Sle	return (r);
57130389Sle}
58130389Sle
59130389Slestruct gv_raid5_packet *
60130389Slegv_new_raid5_packet(void)
61130389Sle{
62130389Sle	struct gv_raid5_packet *wp;
63130389Sle
64130389Sle	wp = g_malloc(sizeof(*wp), M_NOWAIT | M_ZERO);
65130389Sle	KASSERT(wp != NULL, ("gv_new_raid5_packet: NULL wp"));
66130389Sle	wp->state = SETUP;
67130389Sle	wp->type = JUNK;
68130389Sle	TAILQ_INIT(&wp->bits);
69130389Sle
70130389Sle	return (wp);
71130389Sle}
72130389Sle
73130389Sle/*
74130389Sle * Check if the stripe that the work packet wants is already being used by
75130389Sle * some other work packet.
76130389Sle */
77130389Sleint
78130389Slegv_stripe_active(struct gv_raid5_packet *wp, struct gv_plex *sc)
79130389Sle{
80130389Sle	struct gv_raid5_packet *wpa;
81130389Sle
82130389Sle	TAILQ_FOREACH(wpa, &sc->worklist, list) {
83130389Sle		if (wpa->lockbase == wp->lockbase) {
84130389Sle			if (wpa->bio == wp->bio)
85130389Sle				return (0);
86130389Sle			return (1);
87130389Sle		}
88130389Sle	}
89130389Sle	return (0);
90130389Sle}
91130389Sle
/*
 * The "worker" thread that runs through the worklist and fires off the
 * "subrequests" needed to fulfill a RAID5 read or write request.
 *
 * Packet states handled here:
 *   IO     - subrequests in flight, nothing to do;
 *   VALID  - ready: acquire the stripe lock and issue all queued bits;
 *   FINISH - all bits done: account completion, possibly deliver the
 *            original bio, and free the packet.
 * The thread exits when GV_PLEX_THREAD_DIE is set and a scan finds no
 * finished packets.
 */
void
gv_raid5_worker(void *arg)
{
	struct bio *bp;
	struct g_geom *gp;
	struct gv_plex *p;
	struct gv_raid5_packet *wp, *wpt;
	struct gv_raid5_bit *rbp, *rbpt;
	int error, restart;

	gp = arg;
	p = gp->softc;

	mtx_lock(&p->worklist_mtx);
	for (;;) {
		restart = 0;
		g_trace(G_T_TOPOLOGY, "gv_raid5_worker scan");
		/* SAFE variant: FINISH handling removes entries mid-scan. */
		TAILQ_FOREACH_SAFE(wp, &p->worklist, list, wpt) {
			/* This request packet is already being processed. */
			if (wp->state == IO)
				continue;
			/* This request packet is ready for processing. */
			if (wp->state == VALID) {
				/* Couldn't get the lock, try again. */
				if ((wp->lockbase != -1) &&
				    gv_stripe_active(wp, p))
					continue;

				/*
				 * Mark the packet in flight, then drop the
				 * worklist mutex while issuing I/O so that
				 * g_io_request() is not called with the
				 * mutex held.
				 */
				wp->state = IO;
				mtx_unlock(&p->worklist_mtx);
				TAILQ_FOREACH_SAFE(rbp, &wp->bits, list, rbpt)
					g_io_request(rbp->bio, rbp->consumer);
				mtx_lock(&p->worklist_mtx);
				continue;
			}
			if (wp->state == FINISH) {
				bp = wp->bio;
				bp->bio_completed += wp->length;
				/*
				 * Deliver the original request if we have
				 * finished.
				 */
				if (bp->bio_completed == bp->bio_length) {
					/* Again, no mutex across GEOM calls. */
					mtx_unlock(&p->worklist_mtx);
					g_io_deliver(bp, 0);
					mtx_lock(&p->worklist_mtx);
				}
				TAILQ_REMOVE(&p->worklist, wp, list);
				if (wp->bufmalloc == 1)
					g_free(wp->buf);
				g_free(wp);
				/* Rescan: freeing may unblock locked packets. */
				restart++;
				/*break;*/
			}
		}
		if (!restart) {
			/* Self-destruct. */
			if (p->flags & GV_PLEX_THREAD_DIE)
				break;
			g_trace(G_T_TOPOLOGY, "gv_raid5_worker sleep");
			/*
			 * Sleep until woken by new work, with a hz/100
			 * timeout as a polling fallback.
			 * NOTE(review): the msleep() return value is
			 * ignored; timeouts and wakeups are treated alike.
			 */
			error = msleep(p, &p->worklist_mtx, PRIBIO, "-",
			    hz/100);
		}
	}
	mtx_unlock(&p->worklist_mtx);

	g_trace(G_T_TOPOLOGY, "gv_raid5_worker die");

	/* Signal our plex that we are dead. */
	p->flags |= GV_PLEX_THREAD_DEAD;
	wakeup(p);
	kthread_exit(0);
}
169130389Sle
170130389Sle/* Final bio transaction to write out the parity data. */
171130389Sleint
172130389Slegv_raid5_parity(struct gv_raid5_packet *wp)
173130389Sle{
174130389Sle	struct bio *bp;
175130389Sle
176130389Sle	bp = g_new_bio();
177130389Sle	if (bp == NULL)
178130389Sle		return (ENOMEM);
179130389Sle
180130389Sle	wp->type = ISPARITY;
181130389Sle	bp->bio_cmd = BIO_WRITE;
182130389Sle	bp->bio_data = wp->buf;
183130389Sle	bp->bio_offset = wp->offset;
184130389Sle	bp->bio_length = wp->length;
185130389Sle	bp->bio_done = gv_raid5_done;
186130389Sle	bp->bio_caller1 = wp;
187130389Sle	bp->bio_caller2 = NULL;
188130389Sle	g_io_request(bp, wp->parity);
189130389Sle
190130389Sle	return (0);
191130389Sle}
192130389Sle
/*
 * We end up here after each subrequest.
 *
 * Completion handler for every bio issued on behalf of a RAID5 work
 * packet.  Folds read data into the parity buffer where needed (XOR),
 * kicks off the parity write once all data writes of a write request
 * have completed, frees the finished subrequest, and moves the packet
 * to FINISH state when its last subrequest returns.
 */
void
gv_raid5_done(struct bio *bp)
{
	struct bio *obp;
	struct g_geom *gp;
	struct gv_plex *p;
	struct gv_raid5_packet *wp;
	struct gv_raid5_bit *rbp;
	off_t i;
	int error;

	wp = bp->bio_caller1;
	rbp = bp->bio_caller2;	/* NULL for the parity write itself. */
	obp = wp->bio;
	gp = bp->bio_from->geom;
	p = gp->softc;

	/* One less active subrequest. */
	wp->active--;

	switch (obp->bio_cmd) {
	case BIO_READ:
		/* Degraded reads need to handle parity data. */
		if (wp->type == DEGRADED) {
			/* XOR this stripe's data into the rebuild buffer. */
			for (i = 0; i < wp->length; i++)
				wp->buf[i] ^= bp->bio_data[i];

			/* When we're finished copy back the data we want. */
			if (wp->active == 0)
				bcopy(wp->buf, wp->data, wp->length);
		}

		break;

	case BIO_WRITE:
		/* Handle the parity data, if needed. */
		if ((wp->type != NOPARITY) && (wp->type != ISPARITY)) {
			/* Accumulate this subrequest into the parity buffer. */
			for (i = 0; i < wp->length; i++)
				wp->buf[i] ^= bp->bio_data[i];

			/* Write out the parity data we calculated. */
			if (wp->active == 0) {
				wp->active++;
				/*
				 * NOTE(review): the ENOMEM return of
				 * gv_raid5_parity() is ignored here; on
				 * failure wp->active stays 1 and the packet
				 * presumably never reaches FINISH — verify
				 * against the worker's handling.
				 */
				error = gv_raid5_parity(wp);
			}
		}
		break;
	}

	g_destroy_bio(bp);

	/* Release the subrequest and its private buffer, if any. */
	if (rbp != NULL) {
		if (rbp->malloc == 1)
			g_free(rbp->buf);
		TAILQ_REMOVE(&wp->bits, rbp, list);
		g_free(rbp);
	}

	/* This request group is done. */
	if (wp->active == 0)
		wp->state = FINISH;
}
256130389Sle
257130389Sle/* Build a request group to perform (part of) a RAID5 request. */
258130389Sleint
259130389Slegv_build_raid5_req(struct gv_raid5_packet *wp, struct bio *bp, caddr_t addr,
260130389Sle    long bcount, off_t boff)
261130389Sle{
262130389Sle	struct g_geom *gp;
263130389Sle	struct gv_plex *p;
264130389Sle	struct gv_raid5_bit *rbp;
265130389Sle	struct gv_sd *broken, *original, *parity, *s;
266130389Sle	int i, psdno, sdno;
267130389Sle	off_t len_left, real_off, stripeend, stripeoff, stripestart;
268130389Sle
269130389Sle	gp = bp->bio_to->geom;
270130389Sle	p = gp->softc;
271130389Sle
272130389Sle	if (p == NULL || LIST_EMPTY(&p->subdisks))
273130389Sle		return (ENXIO);
274130389Sle
275130389Sle	/* We are optimistic and assume that this request will be OK. */
276130389Sle	wp->type = NORMAL;
277130389Sle	original = parity = broken = NULL;
278130389Sle
279130389Sle	/* The number of the subdisk containing the parity stripe. */
280130389Sle	psdno = p->sdcount - 1 - ( boff / (p->stripesize * (p->sdcount - 1))) %
281130389Sle	    p->sdcount;
282130389Sle	KASSERT(psdno >= 0, ("gv_build_raid5_request: psdno < 0"));
283130389Sle
284130389Sle	/* Offset of the start address from the start of the stripe. */
285130389Sle	stripeoff = boff % (p->stripesize * (p->sdcount - 1));
286130389Sle	KASSERT(stripeoff >= 0, ("gv_build_raid5_request: stripeoff < 0"));
287130389Sle
288130389Sle	/* The number of the subdisk where the stripe resides. */
289130389Sle	sdno = stripeoff / p->stripesize;
290130389Sle	KASSERT(sdno >= 0, ("gv_build_raid5_request: sdno < 0"));
291130389Sle
292130389Sle	/* At or past parity subdisk. */
293130389Sle	if (sdno >= psdno)
294130389Sle		sdno++;
295130389Sle
296130389Sle	/* The offset of the stripe on this subdisk. */
297130389Sle	stripestart = (boff - stripeoff) / (p->sdcount - 1);
298130389Sle	KASSERT(stripestart >= 0, ("gv_build_raid5_request: stripestart < 0"));
299130389Sle
300130477Sle	stripeoff %= p->stripesize;
301130389Sle
302130389Sle	/* The offset of the request on this subdisk. */
303130389Sle	real_off = stripestart + stripeoff;
304130389Sle
305130389Sle	stripeend = stripestart + p->stripesize;
306130389Sle	len_left = stripeend - real_off;
307130389Sle	KASSERT(len_left >= 0, ("gv_build_raid5_request: len_left < 0"));
308130389Sle
309130389Sle	/* Find the right subdisks. */
310130389Sle	i = 0;
311130389Sle	LIST_FOREACH(s, &p->subdisks, in_plex) {
312130389Sle		if (i == sdno)
313130389Sle			original = s;
314130389Sle		if (i == psdno)
315130389Sle			parity = s;
316130389Sle		if (s->state != GV_SD_UP)
317130389Sle			broken = s;
318130389Sle		i++;
319130389Sle	}
320130389Sle
321130389Sle	if ((original == NULL) || (parity == NULL))
322130389Sle		return (ENXIO);
323130389Sle
324130389Sle	/* Our data stripe is missing. */
325130389Sle	if (original->state != GV_SD_UP)
326130389Sle		wp->type = DEGRADED;
327130389Sle	/* Our parity stripe is missing. */
328130389Sle	if (parity->state != GV_SD_UP) {
329130389Sle		/* We cannot take another failure if we're already degraded. */
330130389Sle		if (wp->type != NORMAL)
331130389Sle			return (ENXIO);
332130389Sle		else
333130389Sle			wp->type = NOPARITY;
334130389Sle	}
335130389Sle
336130389Sle	/*
337130389Sle	 * A combined write is necessary when the original data subdisk and the
338130389Sle	 * parity subdisk are both up, but one of the other subdisks isn't.
339130389Sle	 */
340130389Sle	if ((broken != NULL) && (broken != parity) && (broken != original))
341130389Sle		wp->type = COMBINED;
342130389Sle
343130389Sle	wp->offset = real_off;
344130389Sle	wp->length = (bcount <= len_left) ? bcount : len_left;
345130389Sle	wp->data = addr;
346130389Sle	wp->original = original->consumer;
347130389Sle	wp->parity = parity->consumer;
348130389Sle	wp->lockbase = stripestart;
349130389Sle
350130389Sle	KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0"));
351130389Sle
352130389Sle	switch (bp->bio_cmd) {
353130389Sle	case BIO_READ:
354130389Sle		/*
355130389Sle		 * For a degraded read we need to read in all stripes except
356130389Sle		 * the broken one plus the parity stripe and then recalculate
357130389Sle		 * the desired data.
358130389Sle		 */
359130389Sle		if (wp->type == DEGRADED) {
360130389Sle			wp->buf = g_malloc(wp->length, M_WAITOK | M_ZERO);
361130389Sle			wp->bufmalloc = 1;
362130389Sle			LIST_FOREACH(s, &p->subdisks, in_plex) {
363130389Sle				/* Skip the broken subdisk. */
364130389Sle				if (s == broken)
365130389Sle					continue;
366130389Sle				rbp = gv_new_raid5_bit();
367130389Sle				rbp->consumer = s->consumer;
368130389Sle				rbp->bio = g_new_bio();
369130389Sle				if (rbp->bio == NULL)
370130389Sle					return (ENOMEM);
371130389Sle				rbp->buf = g_malloc(wp->length,
372130389Sle					M_WAITOK | M_ZERO);
373130389Sle				rbp->malloc = 1;
374130389Sle				rbp->bio->bio_cmd = BIO_READ;
375130389Sle				rbp->bio->bio_offset = wp->offset;
376130389Sle				rbp->bio->bio_length = wp->length;
377130389Sle				rbp->bio->bio_data = rbp->buf;
378130389Sle				rbp->bio->bio_done = gv_raid5_done;
379130389Sle				rbp->bio->bio_caller1 = wp;
380130389Sle				rbp->bio->bio_caller2 = rbp;
381130389Sle				TAILQ_INSERT_HEAD(&wp->bits, rbp, list);
382130389Sle				wp->active++;
383130389Sle				wp->rqcount++;
384130389Sle			}
385130389Sle
386130389Sle		/* A normal read can be fulfilled with the original subdisk. */
387130389Sle		} else {
388130389Sle			rbp = gv_new_raid5_bit();
389130389Sle			rbp->consumer = wp->original;
390130389Sle			rbp->bio = g_new_bio();
391130389Sle			if (rbp->bio == NULL)
392130389Sle				return (ENOMEM);
393130389Sle			rbp->bio->bio_cmd = BIO_READ;
394130389Sle			rbp->bio->bio_offset = wp->offset;
395130389Sle			rbp->bio->bio_length = wp->length;
396130389Sle			rbp->buf = addr;
397130389Sle			rbp->bio->bio_data = rbp->buf;
398130389Sle			rbp->bio->bio_done = gv_raid5_done;
399130389Sle			rbp->bio->bio_caller1 = wp;
400130389Sle			rbp->bio->bio_caller2 = rbp;
401130389Sle			TAILQ_INSERT_HEAD(&wp->bits, rbp, list);
402130389Sle			wp->active++;
403130389Sle			wp->rqcount++;
404130389Sle		}
405130389Sle		if (wp->type != COMBINED)
406130389Sle			wp->lockbase = -1;
407130389Sle		break;
408130389Sle
409130389Sle	case BIO_WRITE:
410130389Sle		/*
411130389Sle		 * A degraded write means we cannot write to the original data
412130389Sle		 * subdisk.  Thus we need to read in all valid stripes,
413130389Sle		 * recalculate the parity from the original data, and then
414130389Sle		 * write the parity stripe back out.
415130389Sle		 */
416130389Sle		if (wp->type == DEGRADED) {
417130389Sle			wp->buf = g_malloc(wp->length, M_WAITOK | M_ZERO);
418130389Sle			wp->bufmalloc = 1;
419130389Sle
420130389Sle			/* Copy the original data. */
421130389Sle			bcopy(wp->data, wp->buf, wp->length);
422130389Sle
423130389Sle			LIST_FOREACH(s, &p->subdisks, in_plex) {
424130389Sle				/* Skip the broken and the parity subdisk. */
425130389Sle				if ((s == broken) ||
426130389Sle				    (s->consumer == wp->parity))
427130389Sle					continue;
428130389Sle
429130389Sle				rbp = gv_new_raid5_bit();
430130389Sle				rbp->consumer = s->consumer;
431130389Sle				rbp->bio = g_new_bio();
432130389Sle				if (rbp->bio == NULL)
433130389Sle					return (ENOMEM);
434130389Sle				rbp->buf = g_malloc(wp->length,
435130389Sle				    M_WAITOK | M_ZERO);
436130389Sle				rbp->malloc = 1;
437130389Sle				rbp->bio->bio_cmd = BIO_READ;
438130389Sle				rbp->bio->bio_data = rbp->buf;
439130389Sle				rbp->bio->bio_offset = wp->offset;
440130389Sle				rbp->bio->bio_length = wp->length;
441130389Sle				rbp->bio->bio_done = gv_raid5_done;
442130389Sle				rbp->bio->bio_caller1 = wp;
443130389Sle				rbp->bio->bio_caller2 = rbp;
444130389Sle				TAILQ_INSERT_HEAD(&wp->bits, rbp, list);
445130389Sle				wp->active++;
446130389Sle				wp->rqcount++;
447130389Sle			}
448130389Sle
449130389Sle		/*
450130389Sle		 * When we don't have the parity stripe we just write out the
451130389Sle		 * data.
452130389Sle		 */
453130389Sle		} else if (wp->type == NOPARITY) {
454130389Sle			rbp = gv_new_raid5_bit();
455130389Sle			rbp->consumer = wp->original;
456130389Sle			rbp->bio = g_new_bio();
457130389Sle			if (rbp->bio == NULL)
458130389Sle				return (ENOMEM);
459130389Sle			rbp->bio->bio_cmd = BIO_WRITE;
460130389Sle			rbp->bio->bio_offset = wp->offset;
461130389Sle			rbp->bio->bio_length = wp->length;
462130389Sle			rbp->bio->bio_data = addr;
463130389Sle			rbp->bio->bio_done = gv_raid5_done;
464130389Sle			rbp->bio->bio_caller1 = wp;
465130389Sle			rbp->bio->bio_caller2 = rbp;
466130389Sle			TAILQ_INSERT_HEAD(&wp->bits, rbp, list);
467130389Sle			wp->active++;
468130389Sle			wp->rqcount++;
469130389Sle
470130389Sle		/*
471130389Sle		 * A combined write means that our data subdisk and the parity
472130389Sle		 * subdisks are both up, but another subdisk isn't.  We need to
473130389Sle		 * read all valid stripes including the parity to recalculate
474130389Sle		 * the data of the stripe that is missing.  Then we write our
475130389Sle		 * original data, and together with the other data stripes
476130389Sle		 * recalculate the parity again.
477130389Sle		 */
478130389Sle		} else if (wp->type == COMBINED) {
479130389Sle			wp->buf = g_malloc(wp->length, M_WAITOK | M_ZERO);
480130389Sle			wp->bufmalloc = 1;
481130389Sle
482130389Sle			/* Get the data from all subdisks. */
483130389Sle			LIST_FOREACH(s, &p->subdisks, in_plex) {
484130389Sle				/* Skip the broken subdisk. */
485130389Sle				if (s == broken)
486130389Sle					continue;
487130389Sle
488130389Sle				rbp = gv_new_raid5_bit();
489130389Sle				rbp->consumer = s->consumer;
490130389Sle				rbp->bio = g_new_bio();
491130389Sle				if (rbp->bio == NULL)
492130389Sle					return (ENOMEM);
493130389Sle				rbp->bio->bio_cmd = BIO_READ;
494130389Sle				rbp->buf = g_malloc(wp->length,
495130389Sle				    M_WAITOK | M_ZERO);
496130389Sle				rbp->malloc = 1;
497130389Sle				rbp->bio->bio_data = rbp->buf;
498130389Sle				rbp->bio->bio_offset = wp->offset;
499130389Sle				rbp->bio->bio_length = wp->length;
500130389Sle				rbp->bio->bio_done = gv_raid5_done;
501130389Sle				rbp->bio->bio_caller1 = wp;
502130389Sle				rbp->bio->bio_caller2 = rbp;
503130389Sle				TAILQ_INSERT_HEAD(&wp->bits, rbp, list);
504130389Sle				wp->active++;
505130389Sle				wp->rqcount++;
506130389Sle			}
507130389Sle
508130389Sle			/* Write the original data. */
509130389Sle			rbp = gv_new_raid5_bit();
510130389Sle			rbp->consumer = wp->original;
511130389Sle			rbp->buf = addr;
512130389Sle			rbp->bio = g_new_bio();
513130389Sle			if (rbp->bio == NULL)
514130389Sle				return (ENOMEM);
515130389Sle			rbp->bio->bio_cmd = BIO_WRITE;
516130389Sle			rbp->bio->bio_data = rbp->buf;
517130389Sle			rbp->bio->bio_offset = wp->offset;
518130389Sle			rbp->bio->bio_length = wp->length;
519130389Sle			rbp->bio->bio_done = gv_raid5_done;
520130389Sle			rbp->bio->bio_caller1 = wp;
521130389Sle			rbp->bio->bio_caller2 = rbp;
522130389Sle			/*
523130389Sle			 * Insert at the tail, because we want to read the old
524130389Sle			 * data first.
525130389Sle			 */
526130389Sle			TAILQ_INSERT_TAIL(&wp->bits, rbp, list);
527130389Sle			wp->active++;
528130389Sle			wp->rqcount++;
529130389Sle
530130389Sle			/* Get the rest of the data again. */
531130389Sle			LIST_FOREACH(s, &p->subdisks, in_plex) {
532130389Sle				/*
533130389Sle				 * Skip the broken subdisk, the parity, and the
534130389Sle				 * one we just wrote.
535130389Sle				 */
536130389Sle				if ((s == broken) ||
537130389Sle				    (s->consumer == wp->parity) ||
538130389Sle				    (s->consumer == wp->original))
539130389Sle					continue;
540130389Sle				rbp = gv_new_raid5_bit();
541130389Sle				rbp->consumer = s->consumer;
542130389Sle				rbp->bio = g_new_bio();
543130389Sle				if (rbp->bio == NULL)
544130389Sle					return (ENOMEM);
545130389Sle				rbp->bio->bio_cmd = BIO_READ;
546130389Sle				rbp->buf = g_malloc(wp->length,
547130389Sle				    M_WAITOK | M_ZERO);
548130389Sle				rbp->malloc = 1;
549130389Sle				rbp->bio->bio_data = rbp->buf;
550130389Sle				rbp->bio->bio_offset = wp->offset;
551130389Sle				rbp->bio->bio_length = wp->length;
552130389Sle				rbp->bio->bio_done = gv_raid5_done;
553130389Sle				rbp->bio->bio_caller1 = wp;
554130389Sle				rbp->bio->bio_caller2 = rbp;
555130389Sle				/*
556130389Sle				 * Again, insert at the tail to keep correct
557130389Sle				 * order.
558130389Sle				 */
559130389Sle				TAILQ_INSERT_TAIL(&wp->bits, rbp, list);
560130389Sle				wp->active++;
561130389Sle				wp->rqcount++;
562130389Sle			}
563130389Sle
564130389Sle
565130389Sle		/*
566130389Sle		 * A normal write request goes to the original subdisk, then we
567130389Sle		 * read in all other stripes, recalculate the parity and write
568130389Sle		 * out the parity again.
569130389Sle		 */
570130389Sle		} else {
571130389Sle			wp->buf = g_malloc(wp->length, M_WAITOK | M_ZERO);
572130389Sle			wp->bufmalloc = 1;
573130389Sle			LIST_FOREACH(s, &p->subdisks, in_plex) {
574130389Sle				/* Skip the parity stripe. */
575130389Sle				if (s->consumer == wp->parity)
576130389Sle					continue;
577130389Sle
578130389Sle				rbp = gv_new_raid5_bit();
579130389Sle				rbp->consumer = s->consumer;
580130389Sle				rbp->bio = g_new_bio();
581130389Sle				if (rbp->bio == NULL)
582130389Sle					return (ENOMEM);
583130389Sle				/*
584130389Sle				 * The data for the original stripe is written,
585130389Sle				 * the others need to be read in for the parity
586130389Sle				 * calculation.
587130389Sle				 */
588130389Sle				if (s->consumer == wp->original) {
589130389Sle					rbp->bio->bio_cmd = BIO_WRITE;
590130389Sle					rbp->buf = addr;
591130389Sle				} else {
592130389Sle					rbp->bio->bio_cmd = BIO_READ;
593130389Sle					rbp->buf = g_malloc(wp->length,
594130389Sle					    M_WAITOK | M_ZERO);
595130389Sle					rbp->malloc = 1;
596130389Sle				}
597130389Sle				rbp->bio->bio_data = rbp->buf;
598130389Sle				rbp->bio->bio_offset = wp->offset;
599130389Sle				rbp->bio->bio_length = wp->length;
600130389Sle				rbp->bio->bio_done = gv_raid5_done;
601130389Sle				rbp->bio->bio_caller1 = wp;
602130389Sle				rbp->bio->bio_caller2 = rbp;
603130389Sle				TAILQ_INSERT_HEAD(&wp->bits, rbp, list);
604130389Sle				wp->active++;
605130389Sle				wp->rqcount++;
606130389Sle			}
607130389Sle		}
608130389Sle		break;
609130389Sle	default:
610130389Sle		return (EINVAL);
611130389Sle	}
612130389Sle
613130389Sle	wp->state = VALID;
614130389Sle	return (0);
615130389Sle}
616