geom_vinum_raid5.c revision 135426
1130389Sle/*-
2130389Sle * Copyright (c) 2004 Lukas Ertl
3130389Sle * All rights reserved.
4130389Sle *
5130389Sle * Redistribution and use in source and binary forms, with or without
6130389Sle * modification, are permitted provided that the following conditions
7130389Sle * are met:
8130389Sle * 1. Redistributions of source code must retain the above copyright
9130389Sle *    notice, this list of conditions and the following disclaimer.
10130389Sle * 2. Redistributions in binary form must reproduce the above copyright
11130389Sle *    notice, this list of conditions and the following disclaimer in the
12130389Sle *    documentation and/or other materials provided with the distribution.
13130389Sle *
14130389Sle * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15130389Sle * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16130389Sle * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17130389Sle * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18130389Sle * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19130389Sle * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20130389Sle * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21130389Sle * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22130389Sle * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23130389Sle * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24130389Sle * SUCH DAMAGE.
25130389Sle */
26130389Sle
27130389Sle#include <sys/cdefs.h>
28130389Sle__FBSDID("$FreeBSD: head/sys/geom/vinum/geom_vinum_raid5.c 135426 2004-09-18 13:44:43Z le $");
29130389Sle
30130389Sle#include <sys/param.h>
31130389Sle#include <sys/bio.h>
32130389Sle#include <sys/conf.h>
33130389Sle#include <sys/errno.h>
34130389Sle#include <sys/kernel.h>
35130389Sle#include <sys/kthread.h>
36130389Sle#include <sys/libkern.h>
37130389Sle#include <sys/lock.h>
38130389Sle#include <sys/malloc.h>
39130389Sle#include <sys/mutex.h>
40130389Sle#include <sys/systm.h>
41130389Sle
42130389Sle#include <geom/geom.h>
43130389Sle#include <geom/vinum/geom_vinum_var.h>
44130389Sle#include <geom/vinum/geom_vinum_raid5.h>
45130389Sle#include <geom/vinum/geom_vinum.h>
46130389Sle
47130389Sle/*
48130389Sle * Check if the stripe that the work packet wants is already being used by
49130389Sle * some other work packet.
50130389Sle */
51130389Sleint
52135426Slegv_stripe_active(struct gv_plex *p, struct bio *bp)
53130389Sle{
54135426Sle	struct gv_raid5_packet *wp, *owp;
55135426Sle	int overlap;
56130389Sle
57135426Sle	wp = bp->bio_driver1;
58135426Sle	if (wp->lockbase == -1)
59135426Sle		return (0);
60130389Sle
61135426Sle	overlap = 0;
62135426Sle	TAILQ_FOREACH(owp, &p->packets, list) {
63135426Sle		if (owp == wp)
64135426Sle			break;
65135426Sle		if ((wp->lockbase >= owp->lockbase) &&
66135426Sle		    (wp->lockbase <= owp->lockbase + owp->length)) {
67135426Sle			overlap++;
68135426Sle			break;
69130389Sle		}
70135426Sle		if ((wp->lockbase <= owp->lockbase) &&
71135426Sle		    (wp->lockbase + wp->length >= owp->lockbase)) {
72135426Sle			overlap++;
73135426Sle			break;
74130389Sle		}
75130389Sle	}
76130389Sle
77135426Sle	return (overlap);
78130389Sle}
79130389Sle
/*
 * Build a request group to perform (part of) a RAID5 request.
 *
 * Split off at most one stripe-sized chunk of the original request bp
 * (starting at plex byte offset 'boff', data at 'addr', with 'bcount'
 * bytes still to go) and fill in the work packet 'wp' with the cloned
 * BIOs needed to service that chunk.  Depending on subdisk states the
 * request is classified as NORMAL, DEGRADED (the data subdisk is down)
 * or NOPARITY (the parity subdisk is down), and the matching read/write
 * strategy is set up.
 *
 * Returns 0 on success, ENXIO when the plex cannot service the request
 * (no subdisks, or more than one subdisk failure), ENOMEM when a clone
 * allocation fails, or EINVAL for an unsupported BIO command.
 *
 * NOTE(review): on the ENOMEM paths, clones and data buffers allocated
 * earlier in this call are not freed here — presumably the caller tears
 * down the whole work packet; verify against the plex worker code.
 */
int
gv_build_raid5_req(struct gv_plex *p, struct gv_raid5_packet *wp,
    struct bio *bp, caddr_t addr, off_t boff, off_t bcount)
{
	struct g_geom *gp;
	struct gv_sd *broken, *original, *parity, *s;
	struct gv_bioq *bq;
	struct bio *cbp, *pbp;
	int i, psdno, sdno, type;
	off_t len_left, real_len, real_off, stripeend, stripeoff, stripestart;

	gp = bp->bio_to->geom;

	if (p == NULL || LIST_EMPTY(&p->subdisks))
		return (ENXIO);

	/* We are optimistic and assume that this request will be OK. */
#define	REQ_TYPE_NORMAL		0
#define	REQ_TYPE_DEGRADED	1
#define	REQ_TYPE_NOPARITY	2

	type = REQ_TYPE_NORMAL;
	original = parity = broken = NULL;

	/*
	 * The number of the subdisk containing the parity stripe.  The
	 * parity rotates across subdisks once per full stripe of
	 * (sdcount - 1) data chunks (left-symmetric style layout).
	 */
	psdno = p->sdcount - 1 - ( boff / (p->stripesize * (p->sdcount - 1))) %
	    p->sdcount;
	KASSERT(psdno >= 0, ("gv_build_raid5_request: psdno < 0"));

	/* Offset of the start address from the start of the stripe. */
	stripeoff = boff % (p->stripesize * (p->sdcount - 1));
	KASSERT(stripeoff >= 0, ("gv_build_raid5_request: stripeoff < 0"));

	/* The number of the subdisk where the stripe resides. */
	sdno = stripeoff / p->stripesize;
	KASSERT(sdno >= 0, ("gv_build_raid5_request: sdno < 0"));

	/* At or past parity subdisk: skip over the parity column. */
	if (sdno >= psdno)
		sdno++;

	/* The offset of the stripe on this subdisk. */
	stripestart = (boff - stripeoff) / (p->sdcount - 1);
	KASSERT(stripestart >= 0, ("gv_build_raid5_request: stripestart < 0"));

	/* Reduce stripeoff to the offset within this subdisk's chunk. */
	stripeoff %= p->stripesize;

	/* The offset of the request on this subdisk. */
	real_off = stripestart + stripeoff;

	/* Bytes remaining until the end of this stripe chunk. */
	stripeend = stripestart + p->stripesize;
	len_left = stripeend - real_off;
	KASSERT(len_left >= 0, ("gv_build_raid5_request: len_left < 0"));

	/* Find the right subdisks. */
	i = 0;
	LIST_FOREACH(s, &p->subdisks, in_plex) {
		if (i == sdno)
			original = s;
		if (i == psdno)
			parity = s;
		if (s->state != GV_SD_UP)
			broken = s;
		i++;
	}

	if ((original == NULL) || (parity == NULL))
		return (ENXIO);

	/* Our data stripe is missing. */
	if (original->state != GV_SD_UP)
		type = REQ_TYPE_DEGRADED;
	/* Our parity stripe is missing. */
	if (parity->state != GV_SD_UP) {
		/* We cannot take another failure if we're already degraded. */
		if (type != REQ_TYPE_NORMAL)
			return (ENXIO);
		else
			type = REQ_TYPE_NOPARITY;
	}

	/* Clip the request so it does not cross a stripe boundary. */
	real_len = (bcount <= len_left) ? bcount : len_left;
	wp->length = real_len;
	wp->data = addr;
	wp->lockbase = real_off;

	KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0"));

	switch (bp->bio_cmd) {
	case BIO_READ:
		/*
		 * For a degraded read we need to read in all stripes except
		 * the broken one plus the parity stripe and then recalculate
		 * the desired data.
		 */
		if (type == REQ_TYPE_DEGRADED) {
			/*
			 * Zero the destination buffer; presumably the
			 * completion path (gv_plex_done) XORs the surviving
			 * stripes into it to reconstruct the missing data —
			 * verify in the plex completion code.
			 */
			bzero(wp->data, wp->length);
			LIST_FOREACH(s, &p->subdisks, in_plex) {
				/* Skip the broken subdisk. */
				if (s == broken)
					continue;
				cbp = g_clone_bio(bp);
				if (cbp == NULL)
					return (ENOMEM);
				/* Private bounce buffer, freed on completion. */
				cbp->bio_data = g_malloc(real_len, M_WAITOK);
				cbp->bio_cflags |= GV_BIO_MALLOC;
				cbp->bio_offset = real_off;
				cbp->bio_length = real_len;
				cbp->bio_done = gv_plex_done;
				cbp->bio_caller2 = s->consumer;
				cbp->bio_driver1 = wp;

				GV_ENQUEUE(bp, cbp, pbp);

				/* Track this clone in the packet's bit list. */
				bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
				bq->bp = cbp;
				TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
			}

		/* A normal read can be fulfilled with the original subdisk. */
		} else {
			cbp = g_clone_bio(bp);
			if (cbp == NULL)
				return (ENOMEM);
			cbp->bio_offset = real_off;
			cbp->bio_length = real_len;
			cbp->bio_data = addr;
			/* Plain pass-through read; no packet bookkeeping. */
			cbp->bio_done = g_std_done;
			cbp->bio_caller2 = original->consumer;

			GV_ENQUEUE(bp, cbp, pbp);
		}
		/* Reads don't need stripe locking. */
		wp->lockbase = -1;

		break;

	case BIO_WRITE:
		/*
		 * A degraded write means we cannot write to the original data
		 * subdisk.  Thus we need to read in all valid stripes,
		 * recalculate the parity from the original data, and then
		 * write the parity stripe back out.
		 */
		if (type == REQ_TYPE_DEGRADED) {
			/* Read all subdisks. */
			LIST_FOREACH(s, &p->subdisks, in_plex) {
				/* Skip the broken and the parity subdisk. */
				if ((s == broken) || (s == parity))
					continue;

				cbp = g_clone_bio(bp);
				if (cbp == NULL)
					return (ENOMEM);
				/* Clone of a write, repurposed as a read. */
				cbp->bio_cmd = BIO_READ;
				cbp->bio_data = g_malloc(real_len, M_WAITOK);
				cbp->bio_cflags |= GV_BIO_MALLOC;
				cbp->bio_offset = real_off;
				cbp->bio_length = real_len;
				cbp->bio_done = gv_plex_done;
				cbp->bio_caller2 = s->consumer;
				cbp->bio_driver1 = wp;

				GV_ENQUEUE(bp, cbp, pbp);

				bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
				bq->bp = cbp;
				TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
			}

			/*
			 * Write the parity data.  Seeded with the caller's
			 * new data; not enqueued here — the completion path
			 * issues it via wp->parity once the reads are done.
			 */
			cbp = g_clone_bio(bp);
			if (cbp == NULL)
				return (ENOMEM);
			cbp->bio_data = g_malloc(real_len, M_WAITOK);
			cbp->bio_cflags |= GV_BIO_MALLOC;
			bcopy(addr, cbp->bio_data, real_len);
			cbp->bio_offset = real_off;
			cbp->bio_length = real_len;
			cbp->bio_done = gv_plex_done;
			cbp->bio_caller2 = parity->consumer;
			cbp->bio_driver1 = wp;
			wp->parity = cbp;

		/*
		 * When the parity stripe is missing we just write out the data.
		 */
		} else if (type == REQ_TYPE_NOPARITY) {
			cbp = g_clone_bio(bp);
			if (cbp == NULL)
				return (ENOMEM);
			cbp->bio_offset = real_off;
			cbp->bio_length = real_len;
			cbp->bio_data = addr;
			cbp->bio_done = gv_plex_done;
			cbp->bio_caller2 = original->consumer;
			cbp->bio_driver1 = wp;

			GV_ENQUEUE(bp, cbp, pbp);

			bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
			bq->bp = cbp;
			TAILQ_INSERT_TAIL(&wp->bits, bq, queue);

		/*
		 * A normal write request goes to the original subdisk, then we
		 * read in all other stripes, recalculate the parity and write
		 * out the parity again.
		 */
		} else {
			/* Read old parity. */
			cbp = g_clone_bio(bp);
			if (cbp == NULL)
				return (ENOMEM);
			cbp->bio_cmd = BIO_READ;
			cbp->bio_data = g_malloc(real_len, M_WAITOK);
			cbp->bio_cflags |= GV_BIO_MALLOC;
			cbp->bio_offset = real_off;
			cbp->bio_length = real_len;
			cbp->bio_done = gv_plex_done;
			cbp->bio_caller2 = parity->consumer;
			cbp->bio_driver1 = wp;

			GV_ENQUEUE(bp, cbp, pbp);

			bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
			bq->bp = cbp;
			TAILQ_INSERT_TAIL(&wp->bits, bq, queue);

			/* Read old data. */
			cbp = g_clone_bio(bp);
			if (cbp == NULL)
				return (ENOMEM);
			cbp->bio_cmd = BIO_READ;
			cbp->bio_data = g_malloc(real_len, M_WAITOK);
			cbp->bio_cflags |= GV_BIO_MALLOC;
			cbp->bio_offset = real_off;
			cbp->bio_length = real_len;
			cbp->bio_done = gv_plex_done;
			cbp->bio_caller2 = original->consumer;
			cbp->bio_driver1 = wp;

			GV_ENQUEUE(bp, cbp, pbp);

			bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
			bq->bp = cbp;
			TAILQ_INSERT_TAIL(&wp->bits, bq, queue);

			/* Write new data. */
			cbp = g_clone_bio(bp);
			if (cbp == NULL)
				return (ENOMEM);
			cbp->bio_data = addr;
			cbp->bio_offset = real_off;
			cbp->bio_length = real_len;
			cbp->bio_done = gv_plex_done;
			cbp->bio_caller2 = original->consumer;

			cbp->bio_driver1 = wp;

			/*
			 * We must not write the new data until the old data
			 * was read, so hold this BIO back until we're ready
			 * for it.
			 */
			wp->waiting = cbp;

			/* The final bio for the parity. */
			cbp = g_clone_bio(bp);
			if (cbp == NULL)
				return (ENOMEM);
			/* Zeroed buffer: parity is XOR-accumulated into it. */
			cbp->bio_data = g_malloc(real_len, M_WAITOK | M_ZERO);
			cbp->bio_cflags |= GV_BIO_MALLOC;
			cbp->bio_offset = real_off;
			cbp->bio_length = real_len;
			cbp->bio_done = gv_plex_done;
			cbp->bio_caller2 = parity->consumer;
			cbp->bio_driver1 = wp;

			/* Remember that this is the BIO for the parity data. */
			wp->parity = cbp;
		}
		break;

	default:
		return (EINVAL);
	}

	return (0);
}
370