Deleted Added
full compact
geom_vinum_raid5.c (184292) geom_vinum_raid5.c (190507)
1/*-
1/*-
2 * Copyright (c) 2004 Lukas Ertl
2 * Copyright (c) 2004, 2007 Lukas Ertl
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright

--- 9 unchanged lines hidden (view full) ---

20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27#include <sys/cdefs.h>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright

--- 9 unchanged lines hidden (view full) ---

20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27#include <sys/cdefs.h>
28__FBSDID("$FreeBSD: head/sys/geom/vinum/geom_vinum_raid5.c 184292 2008-10-26 17:20:37Z lulf $");
28__FBSDID("$FreeBSD: head/sys/geom/vinum/geom_vinum_raid5.c 190507 2009-03-28 17:20:08Z lulf $");
29
30#include <sys/param.h>
31#include <sys/bio.h>
29
30#include <sys/param.h>
31#include <sys/bio.h>
32#include <sys/conf.h>
33#include <sys/errno.h>
34#include <sys/kernel.h>
35#include <sys/kthread.h>
36#include <sys/libkern.h>
37#include <sys/lock.h>
38#include <sys/malloc.h>
32#include <sys/lock.h>
33#include <sys/malloc.h>
39#include <sys/mutex.h>
40#include <sys/systm.h>
41
42#include <geom/geom.h>
43#include <geom/vinum/geom_vinum_var.h>
44#include <geom/vinum/geom_vinum_raid5.h>
45#include <geom/vinum/geom_vinum.h>
46
34#include <sys/systm.h>
35
36#include <geom/geom.h>
37#include <geom/vinum/geom_vinum_var.h>
38#include <geom/vinum/geom_vinum_raid5.h>
39#include <geom/vinum/geom_vinum.h>
40
47int gv_raid5_offset(struct gv_plex *, off_t, off_t, off_t *, off_t *,
48 int *, int *);
41static int gv_raid5_offset(struct gv_plex *, off_t, off_t,
42 off_t *, off_t *, int *, int *, int);
43static struct bio * gv_raid5_clone_bio(struct bio *, struct gv_sd *,
44 struct gv_raid5_packet *, caddr_t, int);
45static int gv_raid5_request(struct gv_plex *, struct gv_raid5_packet *,
46 struct bio *, caddr_t, off_t, off_t, int *);
47static int gv_raid5_check(struct gv_plex *, struct gv_raid5_packet *,
48 struct bio *, caddr_t, off_t, off_t);
49static int gv_raid5_rebuild(struct gv_plex *, struct gv_raid5_packet *,
50 struct bio *, caddr_t, off_t, off_t);
49
51
52struct gv_raid5_packet *
53gv_raid5_start(struct gv_plex *p, struct bio *bp, caddr_t addr, off_t boff,
54 off_t bcount)
55{
56 struct bio *cbp;
57 struct gv_raid5_packet *wp, *wp2;
58 struct gv_bioq *bq, *bq2;
59 int err, delay;
60
61 delay = 0;
62 wp = g_malloc(sizeof(*wp), M_WAITOK | M_ZERO);
63 wp->bio = bp;
64 wp->waiting = NULL;
65 wp->parity = NULL;
66 TAILQ_INIT(&wp->bits);
67
68 if (bp->bio_cflags & GV_BIO_REBUILD)
69 err = gv_raid5_rebuild(p, wp, bp, addr, boff, bcount);
70 else if (bp->bio_cflags & GV_BIO_CHECK)
71 err = gv_raid5_check(p, wp, bp, addr, boff, bcount);
72 else
73 err = gv_raid5_request(p, wp, bp, addr, boff, bcount, &delay);
74
75 /* Means we have a delayed request. */
76 if (delay) {
77 g_free(wp);
78 return (NULL);
79 }
80
81 /*
82 * Building the sub-request failed, we probably need to clean up a lot.
83 */
84 if (err) {
85 G_VINUM_LOGREQ(0, bp, "raid5 plex request failed.");
86 TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) {
87 TAILQ_REMOVE(&wp->bits, bq, queue);
88 g_free(bq);
89 }
90 if (wp->waiting != NULL) {
91 if (wp->waiting->bio_cflags & GV_BIO_MALLOC)
92 g_free(wp->waiting->bio_data);
93 g_destroy_bio(wp->waiting);
94 }
95 if (wp->parity != NULL) {
96 if (wp->parity->bio_cflags & GV_BIO_MALLOC)
97 g_free(wp->parity->bio_data);
98 g_destroy_bio(wp->parity);
99 }
100 g_free(wp);
101
102 TAILQ_FOREACH_SAFE(wp, &p->packets, list, wp2) {
103 if (wp->bio != bp)
104 continue;
105
106 TAILQ_REMOVE(&p->packets, wp, list);
107 TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) {
108 TAILQ_REMOVE(&wp->bits, bq, queue);
109 g_free(bq);
110 }
111 g_free(wp);
112 }
113
114 cbp = bioq_takefirst(p->bqueue);
115 while (cbp != NULL) {
116 if (cbp->bio_cflags & GV_BIO_MALLOC)
117 g_free(cbp->bio_data);
118 g_destroy_bio(cbp);
119 cbp = bioq_takefirst(p->bqueue);
120 }
121
122 /* If internal, stop and reset state. */
123 if (bp->bio_cflags & GV_BIO_INTERNAL) {
124 if (bp->bio_cflags & GV_BIO_MALLOC)
125 g_free(cbp->bio_data);
126 g_destroy_bio(bp);
127 /* Reset flags. */
128 p->flags &= ~(GV_PLEX_SYNCING | GV_PLEX_REBUILDING |
129 GV_PLEX_GROWING);
130 return (NULL);
131 }
132 g_io_deliver(bp, err);
133 return (NULL);
134 }
135
136 return (wp);
137}
138
50/*
51 * Check if the stripe that the work packet wants is already being used by
52 * some other work packet.
53 */
54int
55gv_stripe_active(struct gv_plex *p, struct bio *bp)
56{
57 struct gv_raid5_packet *wp, *owp;
58 int overlap;
59
139/*
140 * Check if the stripe that the work packet wants is already being used by
141 * some other work packet.
142 */
143int
144gv_stripe_active(struct gv_plex *p, struct bio *bp)
145{
146 struct gv_raid5_packet *wp, *owp;
147 int overlap;
148
60 wp = bp->bio_driver1;
149 wp = bp->bio_caller2;
61 if (wp->lockbase == -1)
62 return (0);
63
64 overlap = 0;
65 TAILQ_FOREACH(owp, &p->packets, list) {
66 if (owp == wp)
67 break;
68 if ((wp->lockbase >= owp->lockbase) &&

--- 6 unchanged lines hidden (view full) ---

75 overlap++;
76 break;
77 }
78 }
79
80 return (overlap);
81}
82
150 if (wp->lockbase == -1)
151 return (0);
152
153 overlap = 0;
154 TAILQ_FOREACH(owp, &p->packets, list) {
155 if (owp == wp)
156 break;
157 if ((wp->lockbase >= owp->lockbase) &&

--- 6 unchanged lines hidden (view full) ---

164 overlap++;
165 break;
166 }
167 }
168
169 return (overlap);
170}
171
83int
84gv_check_raid5(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp,
172static int
173gv_raid5_check(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp,
85 caddr_t addr, off_t boff, off_t bcount)
86{
87 struct gv_sd *parity, *s;
88 struct gv_bioq *bq;
174 caddr_t addr, off_t boff, off_t bcount)
175{
176 struct gv_sd *parity, *s;
177 struct gv_bioq *bq;
89 struct bio *cbp, *pbp;
178 struct bio *cbp;
90 int i, psdno;
91 off_t real_len, real_off;
92
93 if (p == NULL || LIST_EMPTY(&p->subdisks))
94 return (ENXIO);
95
179 int i, psdno;
180 off_t real_len, real_off;
181
182 if (p == NULL || LIST_EMPTY(&p->subdisks))
183 return (ENXIO);
184
96 gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, &psdno);
185 gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, &psdno, 1);
97
98 /* Find the right subdisk. */
99 parity = NULL;
100 i = 0;
101 LIST_FOREACH(s, &p->subdisks, in_plex) {
102 if (i == psdno) {
103 parity = s;
104 break;

--- 12 unchanged lines hidden (view full) ---

117 wp->data = addr;
118 wp->lockbase = real_off;
119
120 /* Read all subdisks. */
121 LIST_FOREACH(s, &p->subdisks, in_plex) {
122 /* Skip the parity subdisk. */
123 if (s == parity)
124 continue;
186
187 /* Find the right subdisk. */
188 parity = NULL;
189 i = 0;
190 LIST_FOREACH(s, &p->subdisks, in_plex) {
191 if (i == psdno) {
192 parity = s;
193 break;

--- 12 unchanged lines hidden (view full) ---

206 wp->data = addr;
207 wp->lockbase = real_off;
208
209 /* Read all subdisks. */
210 LIST_FOREACH(s, &p->subdisks, in_plex) {
211 /* Skip the parity subdisk. */
212 if (s == parity)
213 continue;
214 /* Skip growing subdisks. */
215 if (s->flags & GV_SD_GROW)
216 continue;
125
217
126 cbp = g_clone_bio(bp);
218 cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1);
127 if (cbp == NULL)
128 return (ENOMEM);
129 cbp->bio_cmd = BIO_READ;
219 if (cbp == NULL)
220 return (ENOMEM);
221 cbp->bio_cmd = BIO_READ;
130 cbp->bio_data = g_malloc(real_len, M_WAITOK);
131 cbp->bio_cflags |= GV_BIO_MALLOC;
132 cbp->bio_offset = real_off;
133 cbp->bio_length = real_len;
134 cbp->bio_done = gv_plex_done;
135 cbp->bio_caller2 = s->consumer;
136 cbp->bio_driver1 = wp;
137
222
138 GV_ENQUEUE(bp, cbp, pbp);
223 bioq_insert_tail(p->bqueue, cbp);
139
140 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
141 bq->bp = cbp;
142 TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
143 }
144
145 /* Read the parity data. */
224
225 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
226 bq->bp = cbp;
227 TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
228 }
229
230 /* Read the parity data. */
146 cbp = g_clone_bio(bp);
231 cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1);
147 if (cbp == NULL)
148 return (ENOMEM);
149 cbp->bio_cmd = BIO_READ;
232 if (cbp == NULL)
233 return (ENOMEM);
234 cbp->bio_cmd = BIO_READ;
150 cbp->bio_data = g_malloc(real_len, M_WAITOK | M_ZERO);
151 cbp->bio_cflags |= GV_BIO_MALLOC;
152 cbp->bio_offset = real_off;
153 cbp->bio_length = real_len;
154 cbp->bio_done = gv_plex_done;
155 cbp->bio_caller2 = parity->consumer;
156 cbp->bio_driver1 = wp;
157 wp->waiting = cbp;
158
159 /*
160 * In case we want to rebuild the parity, create an extra BIO to write
161 * it out. It also acts as buffer for the XOR operations.
162 */
235 wp->waiting = cbp;
236
237 /*
238 * In case we want to rebuild the parity, create an extra BIO to write
239 * it out. It also acts as buffer for the XOR operations.
240 */
163 cbp = g_clone_bio(bp);
241 cbp = gv_raid5_clone_bio(bp, parity, wp, addr, 1);
164 if (cbp == NULL)
165 return (ENOMEM);
242 if (cbp == NULL)
243 return (ENOMEM);
166 cbp->bio_data = addr;
167 cbp->bio_offset = real_off;
168 cbp->bio_length = real_len;
169 cbp->bio_done = gv_plex_done;
170 cbp->bio_caller2 = parity->consumer;
171 cbp->bio_driver1 = wp;
172 wp->parity = cbp;
173
174 return (0);
175}
176
177/* Rebuild a degraded RAID5 plex. */
244 wp->parity = cbp;
245
246 return (0);
247}
248
249/* Rebuild a degraded RAID5 plex. */
178int
179gv_rebuild_raid5(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp,
250static int
251gv_raid5_rebuild(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp,
180 caddr_t addr, off_t boff, off_t bcount)
181{
182 struct gv_sd *broken, *s;
183 struct gv_bioq *bq;
252 caddr_t addr, off_t boff, off_t bcount)
253{
254 struct gv_sd *broken, *s;
255 struct gv_bioq *bq;
184 struct bio *cbp, *pbp;
256 struct bio *cbp;
185 off_t real_len, real_off;
186
187 if (p == NULL || LIST_EMPTY(&p->subdisks))
188 return (ENXIO);
189
257 off_t real_len, real_off;
258
259 if (p == NULL || LIST_EMPTY(&p->subdisks))
260 return (ENXIO);
261
190 gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, NULL);
262 gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, NULL, 1);
191
192 /* Find the right subdisk. */
193 broken = NULL;
194 LIST_FOREACH(s, &p->subdisks, in_plex) {
195 if (s->state != GV_SD_UP)
196 broken = s;
197 }
198

--- 6 unchanged lines hidden (view full) ---

205 return (EINVAL);
206
207 case GV_SD_STALE:
208 if (!(bp->bio_cflags & GV_BIO_REBUILD))
209 return (ENXIO);
210
211 G_VINUM_DEBUG(1, "sd %s is reviving", broken->name);
212 gv_set_sd_state(broken, GV_SD_REVIVING, GV_SETSTATE_FORCE);
263
264 /* Find the right subdisk. */
265 broken = NULL;
266 LIST_FOREACH(s, &p->subdisks, in_plex) {
267 if (s->state != GV_SD_UP)
268 broken = s;
269 }
270

--- 6 unchanged lines hidden (view full) ---

277 return (EINVAL);
278
279 case GV_SD_STALE:
280 if (!(bp->bio_cflags & GV_BIO_REBUILD))
281 return (ENXIO);
282
283 G_VINUM_DEBUG(1, "sd %s is reviving", broken->name);
284 gv_set_sd_state(broken, GV_SD_REVIVING, GV_SETSTATE_FORCE);
285 /* Set this bit now, but should be set at end. */
286 broken->flags |= GV_SD_CANGOUP;
213 break;
214
215 case GV_SD_REVIVING:
216 break;
217
218 default:
219 /* All other subdisk states mean it's not accessible. */
220 return (ENXIO);

--- 6 unchanged lines hidden (view full) ---

227 KASSERT(wp->length >= 0, ("gv_rebuild_raid5: wp->length < 0"));
228
229 /* Read all subdisks. */
230 LIST_FOREACH(s, &p->subdisks, in_plex) {
231 /* Skip the broken subdisk. */
232 if (s == broken)
233 continue;
234
287 break;
288
289 case GV_SD_REVIVING:
290 break;
291
292 default:
293 /* All other subdisk states mean it's not accessible. */
294 return (ENXIO);

--- 6 unchanged lines hidden (view full) ---

301 KASSERT(wp->length >= 0, ("gv_rebuild_raid5: wp->length < 0"));
302
303 /* Read all subdisks. */
304 LIST_FOREACH(s, &p->subdisks, in_plex) {
305 /* Skip the broken subdisk. */
306 if (s == broken)
307 continue;
308
235 cbp = g_clone_bio(bp);
309 /* Skip growing subdisks. */
310 if (s->flags & GV_SD_GROW)
311 continue;
312
313 cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1);
236 if (cbp == NULL)
237 return (ENOMEM);
238 cbp->bio_cmd = BIO_READ;
314 if (cbp == NULL)
315 return (ENOMEM);
316 cbp->bio_cmd = BIO_READ;
239 cbp->bio_data = g_malloc(real_len, M_WAITOK);
240 cbp->bio_cflags |= GV_BIO_MALLOC;
241 cbp->bio_offset = real_off;
242 cbp->bio_length = real_len;
243 cbp->bio_done = gv_plex_done;
244 cbp->bio_caller2 = s->consumer;
245 cbp->bio_driver1 = wp;
246
317
247 GV_ENQUEUE(bp, cbp, pbp);
318 bioq_insert_tail(p->bqueue, cbp);
248
249 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
250 bq->bp = cbp;
251 TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
252 }
253
254 /* Write the parity data. */
319
320 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
321 bq->bp = cbp;
322 TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
323 }
324
325 /* Write the parity data. */
255 cbp = g_clone_bio(bp);
326 cbp = gv_raid5_clone_bio(bp, broken, wp, NULL, 1);
256 if (cbp == NULL)
257 return (ENOMEM);
327 if (cbp == NULL)
328 return (ENOMEM);
258 cbp->bio_data = g_malloc(real_len, M_WAITOK | M_ZERO);
259 cbp->bio_cflags |= GV_BIO_MALLOC;
260 cbp->bio_offset = real_off;
261 cbp->bio_length = real_len;
262 cbp->bio_done = gv_plex_done;
263 cbp->bio_caller2 = broken->consumer;
264 cbp->bio_driver1 = wp;
265 cbp->bio_cflags |= GV_BIO_REBUILD;
266 wp->parity = cbp;
267
268 p->synced = boff;
269
329 cbp->bio_cflags |= GV_BIO_REBUILD;
330 wp->parity = cbp;
331
332 p->synced = boff;
333
334 /* Post notification that we're finished. */
270 return (0);
271}
272
273/* Build a request group to perform (part of) a RAID5 request. */
335 return (0);
336}
337
338/* Build a request group to perform (part of) a RAID5 request. */
274int
275gv_build_raid5_req(struct gv_plex *p, struct gv_raid5_packet *wp,
276 struct bio *bp, caddr_t addr, off_t boff, off_t bcount)
339static int
340gv_raid5_request(struct gv_plex *p, struct gv_raid5_packet *wp,
341 struct bio *bp, caddr_t addr, off_t boff, off_t bcount, int *delay)
277{
278 struct g_geom *gp;
279 struct gv_sd *broken, *original, *parity, *s;
280 struct gv_bioq *bq;
342{
343 struct g_geom *gp;
344 struct gv_sd *broken, *original, *parity, *s;
345 struct gv_bioq *bq;
281 struct bio *cbp, *pbp;
282 int i, psdno, sdno, type;
346 struct bio *cbp;
347 int i, psdno, sdno, type, grow;
283 off_t real_len, real_off;
284
285 gp = bp->bio_to->geom;
286
287 if (p == NULL || LIST_EMPTY(&p->subdisks))
288 return (ENXIO);
289
290 /* We are optimistic and assume that this request will be OK. */
291#define REQ_TYPE_NORMAL 0
292#define REQ_TYPE_DEGRADED 1
293#define REQ_TYPE_NOPARITY 2
294
295 type = REQ_TYPE_NORMAL;
296 original = parity = broken = NULL;
297
348 off_t real_len, real_off;
349
350 gp = bp->bio_to->geom;
351
352 if (p == NULL || LIST_EMPTY(&p->subdisks))
353 return (ENXIO);
354
355 /* We are optimistic and assume that this request will be OK. */
356#define REQ_TYPE_NORMAL 0
357#define REQ_TYPE_DEGRADED 1
358#define REQ_TYPE_NOPARITY 2
359
360 type = REQ_TYPE_NORMAL;
361 original = parity = broken = NULL;
362
298 gv_raid5_offset(p, boff, bcount, &real_off, &real_len, &sdno, &psdno);
363 /* XXX: The resize won't crash with rebuild or sync, but we should still
364 * be aware of it. Also this should perhaps be done on rebuild/check as
365 * well?
366 */
367 /* If we're over, we must use the old. */
368 if (boff >= p->synced) {
369 grow = 1;
370 /* Or if over the resized offset, we use all drives. */
371 } else if (boff + bcount <= p->synced) {
372 grow = 0;
373 /* Else, we're in the middle, and must wait a bit. */
374 } else {
375 bioq_disksort(p->rqueue, bp);
376 *delay = 1;
377 return (0);
378 }
379 gv_raid5_offset(p, boff, bcount, &real_off, &real_len,
380 &sdno, &psdno, grow);
299
300 /* Find the right subdisks. */
301 i = 0;
302 LIST_FOREACH(s, &p->subdisks, in_plex) {
303 if (i == sdno)
304 original = s;
305 if (i == psdno)
306 parity = s;
307 if (s->state != GV_SD_UP)
308 broken = s;
309 i++;
310 }
311
312 if ((original == NULL) || (parity == NULL))
313 return (ENXIO);
314
315 /* Our data stripe is missing. */
316 if (original->state != GV_SD_UP)
317 type = REQ_TYPE_DEGRADED;
381
382 /* Find the right subdisks. */
383 i = 0;
384 LIST_FOREACH(s, &p->subdisks, in_plex) {
385 if (i == sdno)
386 original = s;
387 if (i == psdno)
388 parity = s;
389 if (s->state != GV_SD_UP)
390 broken = s;
391 i++;
392 }
393
394 if ((original == NULL) || (parity == NULL))
395 return (ENXIO);
396
397 /* Our data stripe is missing. */
398 if (original->state != GV_SD_UP)
399 type = REQ_TYPE_DEGRADED;
400
401 /* If synchronizing request, just write it if disks are stale. */
402 if (original->state == GV_SD_STALE && parity->state == GV_SD_STALE &&
403 bp->bio_cflags & GV_BIO_SYNCREQ && bp->bio_cmd == BIO_WRITE) {
404 type = REQ_TYPE_NORMAL;
318 /* Our parity stripe is missing. */
405 /* Our parity stripe is missing. */
319 if (parity->state != GV_SD_UP) {
406 } else if (parity->state != GV_SD_UP) {
320 /* We cannot take another failure if we're already degraded. */
321 if (type != REQ_TYPE_NORMAL)
322 return (ENXIO);
323 else
324 type = REQ_TYPE_NOPARITY;
325 }
326
327 wp->length = real_len;
328 wp->data = addr;
329 wp->lockbase = real_off;
330
331 KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0"));
332
407 /* We cannot take another failure if we're already degraded. */
408 if (type != REQ_TYPE_NORMAL)
409 return (ENXIO);
410 else
411 type = REQ_TYPE_NOPARITY;
412 }
413
414 wp->length = real_len;
415 wp->data = addr;
416 wp->lockbase = real_off;
417
418 KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0"));
419
333 if ((p->flags & GV_PLEX_SYNCING) && (boff + real_len < p->synced))
420 if ((p->flags & GV_PLEX_REBUILDING) && (boff + real_len < p->synced))
334 type = REQ_TYPE_NORMAL;
335
421 type = REQ_TYPE_NORMAL;
422
423 if ((p->flags & GV_PLEX_REBUILDING) && (boff + real_len >= p->synced)) {
424 bioq_disksort(p->rqueue, bp);
425 *delay = 1;
426 return (0);
427 }
428
336 switch (bp->bio_cmd) {
337 case BIO_READ:
338 /*
339 * For a degraded read we need to read in all stripes except
340 * the broken one plus the parity stripe and then recalculate
341 * the desired data.
342 */
343 if (type == REQ_TYPE_DEGRADED) {
344 bzero(wp->data, wp->length);
345 LIST_FOREACH(s, &p->subdisks, in_plex) {
346 /* Skip the broken subdisk. */
347 if (s == broken)
348 continue;
429 switch (bp->bio_cmd) {
430 case BIO_READ:
431 /*
432 * For a degraded read we need to read in all stripes except
433 * the broken one plus the parity stripe and then recalculate
434 * the desired data.
435 */
436 if (type == REQ_TYPE_DEGRADED) {
437 bzero(wp->data, wp->length);
438 LIST_FOREACH(s, &p->subdisks, in_plex) {
439 /* Skip the broken subdisk. */
440 if (s == broken)
441 continue;
349 cbp = g_clone_bio(bp);
442 /* Skip growing if within offset. */
443 if (grow && s->flags & GV_SD_GROW)
444 continue;
445 cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1);
350 if (cbp == NULL)
351 return (ENOMEM);
446 if (cbp == NULL)
447 return (ENOMEM);
352 cbp->bio_data = g_malloc(real_len, M_WAITOK);
353 cbp->bio_cflags |= GV_BIO_MALLOC;
354 cbp->bio_offset = real_off;
355 cbp->bio_length = real_len;
356 cbp->bio_done = gv_plex_done;
357 cbp->bio_caller2 = s->consumer;
358 cbp->bio_driver1 = wp;
359
448
360 GV_ENQUEUE(bp, cbp, pbp);
449 bioq_insert_tail(p->bqueue, cbp);
361
362 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
363 bq->bp = cbp;
364 TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
365 }
366
367 /* A normal read can be fulfilled with the original subdisk. */
368 } else {
450
451 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
452 bq->bp = cbp;
453 TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
454 }
455
456 /* A normal read can be fulfilled with the original subdisk. */
457 } else {
369 cbp = g_clone_bio(bp);
458 cbp = gv_raid5_clone_bio(bp, original, wp, addr, 0);
370 if (cbp == NULL)
371 return (ENOMEM);
459 if (cbp == NULL)
460 return (ENOMEM);
372 cbp->bio_offset = real_off;
373 cbp->bio_length = real_len;
374 cbp->bio_data = addr;
375 cbp->bio_done = g_std_done;
376 cbp->bio_caller2 = original->consumer;
377
461
378 GV_ENQUEUE(bp, cbp, pbp);
462 bioq_insert_tail(p->bqueue, cbp);
379 }
380 wp->lockbase = -1;
381
382 break;
383
384 case BIO_WRITE:
385 /*
386 * A degraded write means we cannot write to the original data
387 * subdisk. Thus we need to read in all valid stripes,
388 * recalculate the parity from the original data, and then
389 * write the parity stripe back out.
390 */
391 if (type == REQ_TYPE_DEGRADED) {
392 /* Read all subdisks. */
393 LIST_FOREACH(s, &p->subdisks, in_plex) {
394 /* Skip the broken and the parity subdisk. */
395 if ((s == broken) || (s == parity))
396 continue;
463 }
464 wp->lockbase = -1;
465
466 break;
467
468 case BIO_WRITE:
469 /*
470 * A degraded write means we cannot write to the original data
471 * subdisk. Thus we need to read in all valid stripes,
472 * recalculate the parity from the original data, and then
473 * write the parity stripe back out.
474 */
475 if (type == REQ_TYPE_DEGRADED) {
476 /* Read all subdisks. */
477 LIST_FOREACH(s, &p->subdisks, in_plex) {
478 /* Skip the broken and the parity subdisk. */
479 if ((s == broken) || (s == parity))
480 continue;
481 /* Skip growing if within offset. */
482 if (grow && s->flags & GV_SD_GROW)
483 continue;
397
484
398 cbp = g_clone_bio(bp);
485 cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1);
399 if (cbp == NULL)
400 return (ENOMEM);
401 cbp->bio_cmd = BIO_READ;
486 if (cbp == NULL)
487 return (ENOMEM);
488 cbp->bio_cmd = BIO_READ;
402 cbp->bio_data = g_malloc(real_len, M_WAITOK);
403 cbp->bio_cflags |= GV_BIO_MALLOC;
404 cbp->bio_offset = real_off;
405 cbp->bio_length = real_len;
406 cbp->bio_done = gv_plex_done;
407 cbp->bio_caller2 = s->consumer;
408 cbp->bio_driver1 = wp;
409
489
410 GV_ENQUEUE(bp, cbp, pbp);
490 bioq_insert_tail(p->bqueue, cbp);
411
412 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
413 bq->bp = cbp;
414 TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
415 }
416
417 /* Write the parity data. */
491
492 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
493 bq->bp = cbp;
494 TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
495 }
496
497 /* Write the parity data. */
418 cbp = g_clone_bio(bp);
498 cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1);
419 if (cbp == NULL)
420 return (ENOMEM);
499 if (cbp == NULL)
500 return (ENOMEM);
421 cbp->bio_data = g_malloc(real_len, M_WAITOK);
422 cbp->bio_cflags |= GV_BIO_MALLOC;
423 bcopy(addr, cbp->bio_data, real_len);
424 cbp->bio_offset = real_off;
425 cbp->bio_length = real_len;
426 cbp->bio_done = gv_plex_done;
427 cbp->bio_caller2 = parity->consumer;
428 cbp->bio_driver1 = wp;
501 bcopy(addr, cbp->bio_data, wp->length);
429 wp->parity = cbp;
430
431 /*
432 * When the parity stripe is missing we just write out the data.
433 */
434 } else if (type == REQ_TYPE_NOPARITY) {
502 wp->parity = cbp;
503
504 /*
505 * When the parity stripe is missing we just write out the data.
506 */
507 } else if (type == REQ_TYPE_NOPARITY) {
435 cbp = g_clone_bio(bp);
508 cbp = gv_raid5_clone_bio(bp, original, wp, addr, 1);
436 if (cbp == NULL)
437 return (ENOMEM);
509 if (cbp == NULL)
510 return (ENOMEM);
438 cbp->bio_offset = real_off;
439 cbp->bio_length = real_len;
440 cbp->bio_data = addr;
441 cbp->bio_done = gv_plex_done;
442 cbp->bio_caller2 = original->consumer;
443 cbp->bio_driver1 = wp;
444
511
445 GV_ENQUEUE(bp, cbp, pbp);
512 bioq_insert_tail(p->bqueue, cbp);
446
447 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
448 bq->bp = cbp;
449 TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
450
451 /*
452 * A normal write request goes to the original subdisk, then we
453 * read in all other stripes, recalculate the parity and write
454 * out the parity again.
455 */
456 } else {
457 /* Read old parity. */
513
514 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
515 bq->bp = cbp;
516 TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
517
518 /*
519 * A normal write request goes to the original subdisk, then we
520 * read in all other stripes, recalculate the parity and write
521 * out the parity again.
522 */
523 } else {
524 /* Read old parity. */
458 cbp = g_clone_bio(bp);
525 cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1);
459 if (cbp == NULL)
460 return (ENOMEM);
461 cbp->bio_cmd = BIO_READ;
526 if (cbp == NULL)
527 return (ENOMEM);
528 cbp->bio_cmd = BIO_READ;
462 cbp->bio_data = g_malloc(real_len, M_WAITOK);
463 cbp->bio_cflags |= GV_BIO_MALLOC;
464 cbp->bio_offset = real_off;
465 cbp->bio_length = real_len;
466 cbp->bio_done = gv_plex_done;
467 cbp->bio_caller2 = parity->consumer;
468 cbp->bio_driver1 = wp;
469
529
470 GV_ENQUEUE(bp, cbp, pbp);
530 bioq_insert_tail(p->bqueue, cbp);
471
472 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
473 bq->bp = cbp;
474 TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
475
476 /* Read old data. */
531
532 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
533 bq->bp = cbp;
534 TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
535
536 /* Read old data. */
477 cbp = g_clone_bio(bp);
537 cbp = gv_raid5_clone_bio(bp, original, wp, NULL, 1);
478 if (cbp == NULL)
479 return (ENOMEM);
480 cbp->bio_cmd = BIO_READ;
538 if (cbp == NULL)
539 return (ENOMEM);
540 cbp->bio_cmd = BIO_READ;
481 cbp->bio_data = g_malloc(real_len, M_WAITOK);
482 cbp->bio_cflags |= GV_BIO_MALLOC;
483 cbp->bio_offset = real_off;
484 cbp->bio_length = real_len;
485 cbp->bio_done = gv_plex_done;
486 cbp->bio_caller2 = original->consumer;
487 cbp->bio_driver1 = wp;
488
541
489 GV_ENQUEUE(bp, cbp, pbp);
542 bioq_insert_tail(p->bqueue, cbp);
490
491 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
492 bq->bp = cbp;
493 TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
494
495 /* Write new data. */
543
544 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
545 bq->bp = cbp;
546 TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
547
548 /* Write new data. */
496 cbp = g_clone_bio(bp);
549 cbp = gv_raid5_clone_bio(bp, original, wp, addr, 1);
497 if (cbp == NULL)
498 return (ENOMEM);
550 if (cbp == NULL)
551 return (ENOMEM);
499 cbp->bio_data = addr;
500 cbp->bio_offset = real_off;
501 cbp->bio_length = real_len;
502 cbp->bio_done = gv_plex_done;
503 cbp->bio_caller2 = original->consumer;
504
552
505 cbp->bio_driver1 = wp;
506
507 /*
508 * We must not write the new data until the old data
509 * was read, so hold this BIO back until we're ready
510 * for it.
511 */
512 wp->waiting = cbp;
513
514 /* The final bio for the parity. */
553 /*
554 * We must not write the new data until the old data
555 * was read, so hold this BIO back until we're ready
556 * for it.
557 */
558 wp->waiting = cbp;
559
560 /* The final bio for the parity. */
515 cbp = g_clone_bio(bp);
561 cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1);
516 if (cbp == NULL)
517 return (ENOMEM);
562 if (cbp == NULL)
563 return (ENOMEM);
518 cbp->bio_data = g_malloc(real_len, M_WAITOK | M_ZERO);
519 cbp->bio_cflags |= GV_BIO_MALLOC;
520 cbp->bio_offset = real_off;
521 cbp->bio_length = real_len;
522 cbp->bio_done = gv_plex_done;
523 cbp->bio_caller2 = parity->consumer;
524 cbp->bio_driver1 = wp;
525
526 /* Remember that this is the BIO for the parity data. */
527 wp->parity = cbp;
528 }
529 break;
530
531 default:
532 return (EINVAL);
533 }
534
535 return (0);
536}
537
564
565 /* Remember that this is the BIO for the parity data. */
566 wp->parity = cbp;
567 }
568 break;
569
570 default:
571 return (EINVAL);
572 }
573
574 return (0);
575}
576
538/* Calculate the offsets in the various subdisks for a RAID5 request. */
539int
577/*
578 * Calculate the offsets in the various subdisks for a RAID5 request. Also take
579 * care of new subdisks in an expanded RAID5 array.
580 * XXX: This assumes that the new subdisks are inserted after the others (which
581 * is okay as long as plex_offset is larger). If subdisks are inserted into the
582 * plexlist before, we get problems.
583 */
584static int
540gv_raid5_offset(struct gv_plex *p, off_t boff, off_t bcount, off_t *real_off,
585gv_raid5_offset(struct gv_plex *p, off_t boff, off_t bcount, off_t *real_off,
541 off_t *real_len, int *sdno, int *psdno)
586 off_t *real_len, int *sdno, int *psdno, int growing)
542{
587{
543 int sd, psd;
588 struct gv_sd *s;
589 int sd, psd, sdcount;
544 off_t len_left, stripeend, stripeoff, stripestart;
545
590 off_t len_left, stripeend, stripeoff, stripestart;
591
592 sdcount = p->sdcount;
593 if (growing) {
594 LIST_FOREACH(s, &p->subdisks, in_plex) {
595 if (s->flags & GV_SD_GROW)
596 sdcount--;
597 }
598 }
599
546 /* The number of the subdisk containing the parity stripe. */
600 /* The number of the subdisk containing the parity stripe. */
547 psd = p->sdcount - 1 - ( boff / (p->stripesize * (p->sdcount - 1))) %
548 p->sdcount;
601 psd = sdcount - 1 - ( boff / (p->stripesize * (sdcount - 1))) %
602 sdcount;
549 KASSERT(psdno >= 0, ("gv_raid5_offset: psdno < 0"));
550
551 /* Offset of the start address from the start of the stripe. */
603 KASSERT(psdno >= 0, ("gv_raid5_offset: psdno < 0"));
604
605 /* Offset of the start address from the start of the stripe. */
552 stripeoff = boff % (p->stripesize * (p->sdcount - 1));
606 stripeoff = boff % (p->stripesize * (sdcount - 1));
553 KASSERT(stripeoff >= 0, ("gv_raid5_offset: stripeoff < 0"));
554
555 /* The number of the subdisk where the stripe resides. */
556 sd = stripeoff / p->stripesize;
557 KASSERT(sdno >= 0, ("gv_raid5_offset: sdno < 0"));
558
559 /* At or past parity subdisk. */
560 if (sd >= psd)
561 sd++;
562
563 /* The offset of the stripe on this subdisk. */
607 KASSERT(stripeoff >= 0, ("gv_raid5_offset: stripeoff < 0"));
608
609 /* The number of the subdisk where the stripe resides. */
610 sd = stripeoff / p->stripesize;
611 KASSERT(sdno >= 0, ("gv_raid5_offset: sdno < 0"));
612
613 /* At or past parity subdisk. */
614 if (sd >= psd)
615 sd++;
616
617 /* The offset of the stripe on this subdisk. */
564 stripestart = (boff - stripeoff) / (p->sdcount - 1);
618 stripestart = (boff - stripeoff) / (sdcount - 1);
565 KASSERT(stripestart >= 0, ("gv_raid5_offset: stripestart < 0"));
566
567 stripeoff %= p->stripesize;
568
569 /* The offset of the request on this subdisk. */
570 *real_off = stripestart + stripeoff;
571
572 stripeend = stripestart + p->stripesize;

--- 4 unchanged lines hidden (view full) ---

577
578 if (sdno != NULL)
579 *sdno = sd;
580 if (psdno != NULL)
581 *psdno = psd;
582
583 return (0);
584}
619 KASSERT(stripestart >= 0, ("gv_raid5_offset: stripestart < 0"));
620
621 stripeoff %= p->stripesize;
622
623 /* The offset of the request on this subdisk. */
624 *real_off = stripestart + stripeoff;
625
626 stripeend = stripestart + p->stripesize;

--- 4 unchanged lines hidden (view full) ---

631
632 if (sdno != NULL)
633 *sdno = sd;
634 if (psdno != NULL)
635 *psdno = psd;
636
637 return (0);
638}
639
640static struct bio *
641gv_raid5_clone_bio(struct bio *bp, struct gv_sd *s, struct gv_raid5_packet *wp,
642 caddr_t addr, int use_wp)
643{
644 struct bio *cbp;
645
646 cbp = g_clone_bio(bp);
647 if (cbp == NULL)
648 return (NULL);
649 if (addr == NULL) {
650 cbp->bio_data = g_malloc(wp->length, M_WAITOK | M_ZERO);
651 cbp->bio_cflags |= GV_BIO_MALLOC;
652 } else
653 cbp->bio_data = addr;
654 cbp->bio_offset = wp->lockbase + s->drive_offset;
655 cbp->bio_length = wp->length;
656 cbp->bio_done = gv_done;
657 cbp->bio_caller1 = s;
658 if (use_wp)
659 cbp->bio_caller2 = wp;
660
661 return (cbp);
662}