/* geom_vinum_plex.c revision 190507 */
1/*-
2 * Copyright (c) 2004, 2007 Lukas Ertl
3 * Copyright (c) 2007, 2009 Ulf Lilleengen
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25 * SUCH DAMAGE.
26 */
27
28#include <sys/cdefs.h>
29__FBSDID("$FreeBSD: head/sys/geom/vinum/geom_vinum_plex.c 190507 2009-03-28 17:20:08Z lulf $");
30
31#include <sys/param.h>
32#include <sys/bio.h>
33#include <sys/lock.h>
34#include <sys/malloc.h>
35#include <sys/systm.h>
36
37#include <geom/geom.h>
38#include <geom/vinum/geom_vinum_var.h>
39#include <geom/vinum/geom_vinum_raid5.h>
40#include <geom/vinum/geom_vinum.h>
41
42static int	gv_check_parity(struct gv_plex *, struct bio *,
43		    struct gv_raid5_packet *);
44static int	gv_normal_parity(struct gv_plex *, struct bio *,
45		    struct gv_raid5_packet *);
46static void	gv_plex_flush(struct gv_plex *);
47static int	gv_plex_offset(struct gv_plex *, off_t, off_t, off_t *, off_t *,
48		    int *, int);
49static int 	gv_plex_normal_request(struct gv_plex *, struct bio *, off_t,
50		    off_t,  caddr_t);
/*
 * Start a request on a plex.  The original bio may be split into several
 * sub-requests (one per subdisk touched, or per RAID-5 work packet); each
 * sub-request is collected on p->bqueue and then issued to the consumer of
 * the drive its target subdisk lives on.
 */
void
gv_plex_start(struct gv_plex *p, struct bio *bp)
{
	struct bio *cbp;
	struct gv_sd *s;
	struct gv_raid5_packet *wp;
	caddr_t addr;
	off_t bcount, boff, len;

	bcount = bp->bio_length;
	addr = bp->bio_data;
	boff = bp->bio_offset;

	/* Walk over the whole length of the request, we might split it up. */
	while (bcount > 0) {
		wp = NULL;

 		/*
		 * RAID5 plexes need special treatment, as a single request
		 * might involve several read/write sub-requests.
 		 */
		if (p->org == GV_PLEX_RAID5) {
			wp = gv_raid5_start(p, bp, addr, boff, bcount);
			/* NULL means the request failed or was delayed. */
 			if (wp == NULL)
 				return;

			len = wp->length;

			/* No outstanding bits: the packet is already done. */
			if (TAILQ_EMPTY(&wp->bits))
				g_free(wp);
			/* Remember packets that cover a locked stripe area. */
			else if (wp->lockbase != -1)
				TAILQ_INSERT_TAIL(&p->packets, wp, list);

		/*
		 * Requests to concatenated and striped plexes go straight
		 * through.
		 */
		} else {
			len = gv_plex_normal_request(p, bp, boff, bcount, addr);
		}
		/* Negative length: the request failed or was delayed. */
		if (len < 0)
			return;

		bcount -= len;
		addr += len;
		boff += len;
	}

	/*
	 * Fire off all sub-requests.  We get the correct consumer (== drive)
	 * to send each request to via the subdisk that was stored in
	 * cbp->bio_caller1.
	 */
	cbp = bioq_takefirst(p->bqueue);
	while (cbp != NULL) {
		/*
		 * RAID5 sub-requests need to come in correct order, otherwise
		 * we trip over the parity, as it might be overwritten by
		 * another sub-request.  We abuse cbp->bio_caller2 to mark
		 * potential overlap situations.
		 */
		if (cbp->bio_caller2 != NULL && gv_stripe_active(p, cbp)) {
			/* Park the bio on the waiting queue. */
			cbp->bio_cflags |= GV_BIO_ONHOLD;
			bioq_disksort(p->wqueue, cbp);
		} else {
			s = cbp->bio_caller1;
			g_io_request(cbp, s->drive_sc->consumer);
		}
		cbp = bioq_takefirst(p->bqueue);
	}
}
123
124static int
125gv_plex_offset(struct gv_plex *p, off_t boff, off_t bcount, off_t *real_off,
126    off_t *real_len, int *sdno, int growing)
127{
128	struct gv_sd *s;
129	int i, sdcount;
130	off_t len_left, stripeend, stripeno, stripestart;
131
132	switch (p->org) {
133	case GV_PLEX_CONCAT:
134		/*
135		 * Find the subdisk where this request starts.  The subdisks in
136		 * this list must be ordered by plex_offset.
137		 */
138		i = 0;
139		LIST_FOREACH(s, &p->subdisks, in_plex) {
140			if (s->plex_offset <= boff &&
141			    s->plex_offset + s->size > boff) {
142				*sdno = i;
143				break;
144			}
145			i++;
146		}
147		if (s == NULL || s->drive_sc == NULL)
148			return (GV_ERR_NOTFOUND);
149
150		/* Calculate corresponding offsets on disk. */
151		*real_off = boff - s->plex_offset;
152		len_left = s->size - (*real_off);
153		KASSERT(len_left >= 0, ("gv_plex_offset: len_left < 0"));
154		*real_len = (bcount > len_left) ? len_left : bcount;
155		break;
156
157	case GV_PLEX_STRIPED:
158		/* The number of the stripe where the request starts. */
159		stripeno = boff / p->stripesize;
160		KASSERT(stripeno >= 0, ("gv_plex_offset: stripeno < 0"));
161
162		/* Take growing subdisks into account when calculating. */
163		sdcount = gv_sdcount(p, (boff >= p->synced));
164
165		if (!(boff + bcount <= p->synced) &&
166		    (p->flags & GV_PLEX_GROWING) &&
167		    !growing)
168			return (GV_ERR_ISBUSY);
169		*sdno = stripeno % sdcount;
170
171		KASSERT(sdno >= 0, ("gv_plex_offset: sdno < 0"));
172		stripestart = (stripeno / sdcount) *
173		    p->stripesize;
174		KASSERT(stripestart >= 0, ("gv_plex_offset: stripestart < 0"));
175		stripeend = stripestart + p->stripesize;
176		*real_off = boff - (stripeno * p->stripesize) +
177		    stripestart;
178		len_left = stripeend - *real_off;
179		KASSERT(len_left >= 0, ("gv_plex_offset: len_left < 0"));
180
181		*real_len = (bcount <= len_left) ? bcount : len_left;
182		break;
183
184	default:
185		return (GV_ERR_PLEXORG);
186	}
187	return (0);
188}
189
/*
 * Prepare a normal plex request: build one cloned BIO covering the largest
 * contiguous piece of the request on a single subdisk and queue it on
 * p->bqueue for gv_plex_start() to issue.
 *
 * Returns the number of bytes covered by the clone, or -1 when the request
 * was either delayed (parked on p->rqueue) or failed (delivered or
 * destroyed here).
 *
 * NOTE(review): real_len is off_t but the return type is int — verify no
 * truncation can occur for very large stripe sizes.
 */
static int
gv_plex_normal_request(struct gv_plex *p, struct bio *bp, off_t boff,
    off_t bcount,  caddr_t addr)
{
	struct gv_sd *s;
	struct bio *cbp;
	off_t real_len, real_off;
	int i, err, sdno;

	s = NULL;
	sdno = -1;
	real_len = real_off = 0;

	err = ENXIO;

	if (p == NULL || LIST_EMPTY(&p->subdisks))
		goto bad;

	err = gv_plex_offset(p, boff, bcount, &real_off,
	    &real_len, &sdno, (bp->bio_pflags & GV_BIO_SYNCREQ));
	/* If the request was blocked, put it into wait. */
	if (err == GV_ERR_ISBUSY) {
		bioq_disksort(p->rqueue, bp);
		return (-1); /* "Fail", and delay request. */
	}
	if (err) {
		err = ENXIO;
		goto bad;
	}
	err = ENXIO;

	/* Find the right subdisk. */
	i = 0;
	LIST_FOREACH(s, &p->subdisks, in_plex) {
		if (i == sdno)
			break;
		i++;
	}

	/* Subdisk not found. */
	if (s == NULL || s->drive_sc == NULL)
		goto bad;

	/* Now check if we can handle the request on this subdisk. */
	switch (s->state) {
	case GV_SD_UP:
		/* If the subdisk is up, just continue. */
		break;
	case GV_SD_DOWN:
		if (bp->bio_cflags & GV_BIO_INTERNAL)
			G_VINUM_DEBUG(0, "subdisk must be in the stale state in"
			    " order to perform administrative requests");
		goto bad;
	case GV_SD_STALE:
		/* Only sync requests may touch a stale subdisk. */
		if (!(bp->bio_cflags & GV_BIO_SYNCREQ)) {
			G_VINUM_DEBUG(0, "subdisk stale, unable to perform "
			    "regular requests");
			goto bad;
		}

		G_VINUM_DEBUG(1, "sd %s is initializing", s->name);
		gv_set_sd_state(s, GV_SD_INITIALIZING, GV_SETSTATE_FORCE);
		break;
	case GV_SD_INITIALIZING:
		/* An initializing subdisk has no valid data to read yet. */
		if (bp->bio_cmd == BIO_READ)
			goto bad;
		break;
	default:
		/* All other subdisk states mean it's not accessible. */
		goto bad;
	}

	/* Clone the bio and adjust the offsets and sizes. */
	cbp = g_clone_bio(bp);
	if (cbp == NULL) {
		err = ENOMEM;
		goto bad;
	}
	cbp->bio_offset = real_off + s->drive_offset;
	cbp->bio_length = real_len;
	cbp->bio_data = addr;
	cbp->bio_done = gv_done;
	cbp->bio_caller1 = s;
	if ((bp->bio_cflags & GV_BIO_SYNCREQ))
		cbp->bio_cflags |= GV_BIO_SYNCREQ;

	/* Store the sub-requests now and let others issue them. */
	bioq_insert_tail(p->bqueue, cbp);
	return (real_len);
bad:
	G_VINUM_LOGREQ(0, bp, "plex request failed.");
	/* Building the sub-request failed. If internal BIO, do not deliver. */
	if (bp->bio_cflags & GV_BIO_INTERNAL) {
		if (bp->bio_cflags & GV_BIO_MALLOC)
			g_free(bp->bio_data);
		g_destroy_bio(bp);
		p->flags &= ~(GV_PLEX_SYNCING | GV_PLEX_REBUILDING |
		    GV_PLEX_GROWING);
		return (-1);
	}
	g_io_deliver(bp, err);
	return (-1);
}
296
297/*
298 * Handle a completed request to a striped or concatenated plex.
299 */
300void
301gv_plex_normal_done(struct gv_plex *p, struct bio *bp)
302{
303	struct bio *pbp;
304
305	pbp = bp->bio_parent;
306	if (pbp->bio_error == 0)
307		pbp->bio_error = bp->bio_error;
308	g_destroy_bio(bp);
309	pbp->bio_inbed++;
310	if (pbp->bio_children == pbp->bio_inbed) {
311		/* Just set it to length since multiple plexes will
312		 * screw things up. */
313		pbp->bio_completed = pbp->bio_length;
314		if (pbp->bio_cflags & GV_BIO_SYNCREQ)
315			gv_sync_complete(p, pbp);
316		else if (pbp->bio_pflags & GV_BIO_SYNCREQ)
317			gv_grow_complete(p, pbp);
318		else
319			g_io_deliver(pbp, pbp->bio_error);
320	}
321}
322
/*
 * Handle a completed request to a RAID-5 plex.  Sub-requests are matched
 * against their work packet; degraded reads XOR the data together, writes
 * fold into the parity bio.  When the packet drains, waiting bios are
 * requeued and the parent request is completed/delivered.
 */
void
gv_plex_raid5_done(struct gv_plex *p, struct bio *bp)
{
	struct gv_softc *sc;
	struct bio *cbp, *pbp;
	struct gv_bioq *bq, *bq2;
	struct gv_raid5_packet *wp;
	off_t completed;
	int i;

	completed = 0;
	sc = p->vinumconf;
	wp = bp->bio_caller2;

	switch (bp->bio_parent->bio_cmd) {
	case BIO_READ:
		/* A plain read without a work packet needs no XOR work. */
		if (wp == NULL) {
			completed = bp->bio_completed;
			break;
		}

		/* Fold this sub-request's data into the degraded-read XOR. */
		TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) {
			if (bq->bp != bp)
				continue;
			TAILQ_REMOVE(&wp->bits, bq, queue);
			g_free(bq);
			for (i = 0; i < wp->length; i++)
				wp->data[i] ^= bp->bio_data[i];
			break;
		}
		/* Last outstanding bit of this packet: tear it down. */
		if (TAILQ_EMPTY(&wp->bits)) {
			completed = wp->length;
			if (wp->lockbase != -1) {
				TAILQ_REMOVE(&p->packets, wp, list);
				/* Bring the waiting bios back into the game. */
				pbp = bioq_takefirst(p->wqueue);
				while (pbp != NULL) {
					mtx_lock(&sc->queue_mtx);
					bioq_disksort(sc->bqueue, pbp);
					mtx_unlock(&sc->queue_mtx);
					pbp = bioq_takefirst(p->wqueue);
				}
			}
			g_free(wp);
		}

		break;

 	case BIO_WRITE:
		/* XXX can this ever happen? */
		if (wp == NULL) {
			completed = bp->bio_completed;
			break;
		}

		/* Check if we need to handle parity data. */
		TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) {
			if (bq->bp != bp)
				continue;
			TAILQ_REMOVE(&wp->bits, bq, queue);
			g_free(bq);
			cbp = wp->parity;
			if (cbp != NULL) {
				/* XOR the written data into the parity bio. */
				for (i = 0; i < wp->length; i++)
					cbp->bio_data[i] ^= bp->bio_data[i];
			}
			break;
		}

		/* Handle parity data. */
		if (TAILQ_EMPTY(&wp->bits)) {
			if (bp->bio_parent->bio_cflags & GV_BIO_CHECK)
				i = gv_check_parity(p, bp, wp);
			else
				i = gv_normal_parity(p, bp, wp);

			/* All of our sub-requests have finished. */
			if (i) {
				completed = wp->length;
				TAILQ_REMOVE(&p->packets, wp, list);
				/* Bring the waiting bios back into the game. */
				pbp = bioq_takefirst(p->wqueue);
				while (pbp != NULL) {
					mtx_lock(&sc->queue_mtx);
					bioq_disksort(sc->bqueue, pbp);
					mtx_unlock(&sc->queue_mtx);
					pbp = bioq_takefirst(p->wqueue);
				}
				g_free(wp);
			}
		}

		break;
	}

	/* Propagate the first error and the completed byte count upward. */
	pbp = bp->bio_parent;
	if (pbp->bio_error == 0)
		pbp->bio_error = bp->bio_error;
	pbp->bio_completed += completed;

	/* When the original request is finished, we deliver it. */
	pbp->bio_inbed++;
	if (pbp->bio_inbed == pbp->bio_children) {
		/* Hand it over for checking or delivery. */
		if (pbp->bio_cmd == BIO_WRITE &&
		    (pbp->bio_cflags & GV_BIO_CHECK)) {
			gv_parity_complete(p, pbp);
		} else if (pbp->bio_cmd == BIO_WRITE &&
		    (pbp->bio_cflags & GV_BIO_REBUILD)) {
			gv_rebuild_complete(p, pbp);
		} else if (pbp->bio_cflags & GV_BIO_INIT) {
			gv_init_complete(p, pbp);
		} else if (pbp->bio_cflags & GV_BIO_SYNCREQ) {
			gv_sync_complete(p, pbp);
		} else if (pbp->bio_pflags & GV_BIO_SYNCREQ) {
			gv_grow_complete(p, pbp);
		} else {
			g_io_deliver(pbp, pbp->bio_error);
		}
	}

	/* Clean up what we allocated. */
	if (bp->bio_cflags & GV_BIO_MALLOC)
		g_free(bp->bio_data);
	g_destroy_bio(bp);
}
452
/*
 * Verify (and possibly rewrite) the parity of a finished RAID-5 check
 * request.  Returns 1 when the work packet is completely finished, 0 when
 * another sub-request (the deferred data write or the parity rewrite) was
 * issued and the packet must stay alive.
 */
static int
gv_check_parity(struct gv_plex *p, struct bio *bp, struct gv_raid5_packet *wp)
{
	struct bio *pbp;
	struct gv_sd *s;
	int err, finished, i;

	err = 0;
	finished = 1;

	if (wp->waiting != NULL) {
		/* A deferred sub-request still has to go to disk first. */
		pbp = wp->waiting;
		wp->waiting = NULL;
		s = pbp->bio_caller1;
		g_io_request(pbp, s->drive_sc->consumer);
		finished = 0;

	} else if (wp->parity != NULL) {
		pbp = wp->parity;
		wp->parity = NULL;

		/* Check if the parity is correct. */
		for (i = 0; i < wp->length; i++) {
			if (bp->bio_data[i] != pbp->bio_data[i]) {
				err = 1;
				break;
			}
		}

		/* The parity is not correct... */
		if (err) {
			/* Signal the mismatch via EAGAIN on the parent. */
			bp->bio_parent->bio_error = EAGAIN;

			/* ... but we rebuild it. */
			if (bp->bio_parent->bio_cflags & GV_BIO_PARITY) {
				s = pbp->bio_caller1;
				g_io_request(pbp, s->drive_sc->consumer);
				finished = 0;
			}
		}

		/*
		 * Clean up the BIO we would have used for rebuilding the
		 * parity.
		 */
		if (finished) {
			bp->bio_parent->bio_inbed++;
			g_destroy_bio(pbp);
		}

	}

	return (finished);
}
507
508static int
509gv_normal_parity(struct gv_plex *p, struct bio *bp, struct gv_raid5_packet *wp)
510{
511	struct bio *cbp, *pbp;
512	struct gv_sd *s;
513	int finished, i;
514
515	finished = 1;
516
517	if (wp->waiting != NULL) {
518		pbp = wp->waiting;
519		wp->waiting = NULL;
520		cbp = wp->parity;
521		for (i = 0; i < wp->length; i++)
522			cbp->bio_data[i] ^= pbp->bio_data[i];
523		s = pbp->bio_caller1;
524		g_io_request(pbp, s->drive_sc->consumer);
525		finished = 0;
526
527	} else if (wp->parity != NULL) {
528		cbp = wp->parity;
529		wp->parity = NULL;
530		s = cbp->bio_caller1;
531		g_io_request(cbp, s->drive_sc->consumer);
532		finished = 0;
533	}
534
535	return (finished);
536}
537
538/* Flush the queue with delayed requests. */
539static void
540gv_plex_flush(struct gv_plex *p)
541{
542	struct gv_softc *sc;
543	struct bio *bp;
544
545	sc = p->vinumconf;
546	bp = bioq_takefirst(p->rqueue);
547	while (bp != NULL) {
548		gv_plex_start(p, bp);
549		bp = bioq_takefirst(p->rqueue);
550	}
551}
552
553int
554gv_sync_request(struct gv_plex *from, struct gv_plex *to, off_t offset,
555    off_t length, int type, caddr_t data)
556{
557	struct gv_softc *sc;
558	struct bio *bp;
559
560	KASSERT(from != NULL, ("NULL from"));
561	KASSERT(to != NULL, ("NULL to"));
562	sc = from->vinumconf;
563	KASSERT(sc != NULL, ("NULL sc"));
564
565	bp = g_new_bio();
566	if (bp == NULL) {
567		G_VINUM_DEBUG(0, "sync from '%s' failed at offset "
568		    " %jd; out of memory", from->name, offset);
569		return (ENOMEM);
570	}
571	bp->bio_length = length;
572	bp->bio_done = gv_done;
573	bp->bio_cflags |= GV_BIO_SYNCREQ;
574	bp->bio_offset = offset;
575	bp->bio_caller1 = from;
576	bp->bio_caller2 = to;
577	bp->bio_cmd = type;
578	if (data == NULL)
579		data = g_malloc(length, M_WAITOK);
580	bp->bio_cflags |= GV_BIO_MALLOC; /* Free on the next run. */
581	bp->bio_data = data;
582
583	/* Send down next. */
584	mtx_lock(&sc->queue_mtx);
585	bioq_disksort(sc->bqueue, bp);
586	mtx_unlock(&sc->queue_mtx);
587	//gv_plex_start(from, bp);
588	return (0);
589}
590
/*
 * Handle a finished plex sync bio: after a read completes, issue the
 * matching write to the target plex; after a write completes, either start
 * the next read chunk or finish the sync and bring the subdisks up.
 * Returns 0 once every plex of the volume is synced, -1 otherwise.
 */
int
gv_sync_complete(struct gv_plex *to, struct bio *bp)
{
	struct gv_plex *from, *p;
	struct gv_sd *s;
	struct gv_volume *v;
	struct gv_softc *sc;
	off_t offset;
	int err;

	g_topology_assert_not();

	err = 0;
	KASSERT(to != NULL, ("NULL to"));
	KASSERT(bp != NULL, ("NULL bp"));
	from = bp->bio_caller2;
	KASSERT(from != NULL, ("NULL from"));
	v = to->vol_sc;
	KASSERT(v != NULL, ("NULL v"));
	sc = v->vinumconf;
	KASSERT(sc != NULL, ("NULL sc"));

	/* If it was a read, write it. */
	if (bp->bio_cmd == BIO_READ) {
		err = gv_sync_request(from, to, bp->bio_offset, bp->bio_length,
	    	    BIO_WRITE, bp->bio_data);
	/* If it was a write, read the next one. */
	} else if (bp->bio_cmd == BIO_WRITE) {
		if (bp->bio_cflags & GV_BIO_MALLOC)
			g_free(bp->bio_data);
		to->synced += bp->bio_length;
		/* If we're finished, clean up. */
		if (bp->bio_offset + bp->bio_length >= from->size) {
			G_VINUM_DEBUG(1, "syncing of %s from %s completed",
			    to->name, from->name);
			/* Update our state. */
			LIST_FOREACH(s, &to->subdisks, in_plex)
				gv_set_sd_state(s, GV_SD_UP, 0);
			gv_update_plex_state(to);
			to->flags &= ~GV_PLEX_SYNCING;
			to->synced = 0;
			gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0);
		} else {
			/* Kick off the read for the next chunk. */
			offset = bp->bio_offset + bp->bio_length;
			err = gv_sync_request(from, to, offset,
			    MIN(bp->bio_length, from->size - offset),
			    BIO_READ, NULL);
		}
	}
	g_destroy_bio(bp);
	/* Clean up if there was an error. */
	if (err) {
		to->flags &= ~GV_PLEX_SYNCING;
		G_VINUM_DEBUG(0, "error syncing plexes: error code %d", err);
	}

	/* Check if all plexes are synced, and lower refcounts. */
	g_topology_lock();
	LIST_FOREACH(p, &v->plexes, in_volume) {
		if (p->flags & GV_PLEX_SYNCING) {
			g_topology_unlock();
			return (-1);
		}
	}
	/* If we came here, all plexes are synced, and we're free. */
	gv_access(v->provider, -1, -1, 0);
	g_topology_unlock();
	G_VINUM_DEBUG(1, "plex sync completed");
	gv_volume_flush(v);
	return (0);
}
665
666/*
667 * Create a new bio struct for the next grow request.
668 */
669int
670gv_grow_request(struct gv_plex *p, off_t offset, off_t length, int type,
671    caddr_t data)
672{
673	struct gv_softc *sc;
674	struct bio *bp;
675
676	KASSERT(p != NULL, ("gv_grow_request: NULL p"));
677	sc = p->vinumconf;
678	KASSERT(sc != NULL, ("gv_grow_request: NULL sc"));
679
680	bp = g_new_bio();
681	if (bp == NULL) {
682		G_VINUM_DEBUG(0, "grow of %s failed creating bio: "
683		    "out of memory", p->name);
684		return (ENOMEM);
685	}
686
687	bp->bio_cmd = type;
688	bp->bio_done = gv_done;
689	bp->bio_error = 0;
690	bp->bio_caller1 = p;
691	bp->bio_offset = offset;
692	bp->bio_length = length;
693	bp->bio_pflags |= GV_BIO_SYNCREQ; /* XXX: misuse of pflags AND syncreq.*/
694	if (data == NULL)
695		data = g_malloc(length, M_WAITOK);
696	bp->bio_cflags |= GV_BIO_MALLOC;
697	bp->bio_data = data;
698
699	mtx_lock(&sc->queue_mtx);
700	bioq_disksort(sc->bqueue, bp);
701	mtx_unlock(&sc->queue_mtx);
702	//gv_plex_start(p, bp);
703	return (0);
704}
705
/*
 * Finish handling of a bio to a growing plex: a completed read is turned
 * into the corresponding write, a completed write either advances to the
 * next chunk or finishes the grow operation and resizes plex and volume.
 */
void
gv_grow_complete(struct gv_plex *p, struct bio *bp)
{
	struct gv_softc *sc;
	struct gv_sd *s;
	struct gv_volume *v;
	off_t origsize, offset;
	int sdcount, err;

	v = p->vol_sc;
	KASSERT(v != NULL, ("gv_grow_complete: NULL v"));
	sc = v->vinumconf;
	KASSERT(sc != NULL, ("gv_grow_complete: NULL sc"));
	err = 0;

	/* If it was a read, write it. */
	if (bp->bio_cmd == BIO_READ) {
		p->synced += bp->bio_length;
		err = gv_grow_request(p, bp->bio_offset, bp->bio_length,
		    BIO_WRITE, bp->bio_data);
	/* If it was a write, read next. */
	} else if (bp->bio_cmd == BIO_WRITE) {
		if (bp->bio_cflags & GV_BIO_MALLOC)
			g_free(bp->bio_data);

		/* Find the real size of the plex. */
		sdcount = gv_sdcount(p, 1);
		s = LIST_FIRST(&p->subdisks);
		KASSERT(s != NULL, ("NULL s"));
		/* Amount of data that existed before the grow started. */
		origsize = (s->size * (sdcount - 1));
		if (bp->bio_offset + bp->bio_length >= origsize) {
			G_VINUM_DEBUG(1, "growing of %s completed", p->name);
			p->flags &= ~GV_PLEX_GROWING;
			LIST_FOREACH(s, &p->subdisks, in_plex) {
				s->flags &= ~GV_SD_GROW;
				gv_set_sd_state(s, GV_SD_UP, 0);
			}
			p->size = gv_plex_size(p);
			gv_update_vol_size(v, gv_vol_size(v));
			gv_set_plex_state(p, GV_PLEX_UP, 0);
			g_topology_lock();
			gv_access(v->provider, -1, -1, 0);
			g_topology_unlock();
			p->synced = 0;
			gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0);
			/* Issue delayed requests. */
			gv_plex_flush(p);
		} else {
			/* Kick off the read for the next chunk. */
			offset = bp->bio_offset + bp->bio_length;
			err = gv_grow_request(p, offset,
			   MIN(bp->bio_length, origsize - offset),
			   BIO_READ, NULL);
		}
	}
	g_destroy_bio(bp);

	if (err) {
		p->flags &= ~GV_PLEX_GROWING;
		G_VINUM_DEBUG(0, "error growing plex: error code %d", err);
	}
}
770
771
772/*
773 * Create an initialization BIO and send it off to the consumer. Assume that
774 * we're given initialization data as parameter.
775 */
776void
777gv_init_request(struct gv_sd *s, off_t start, caddr_t data, off_t length)
778{
779	struct gv_drive *d;
780	struct g_consumer *cp;
781	struct bio *bp, *cbp;
782
783	KASSERT(s != NULL, ("gv_init_request: NULL s"));
784	d = s->drive_sc;
785	KASSERT(d != NULL, ("gv_init_request: NULL d"));
786	cp = d->consumer;
787	KASSERT(cp != NULL, ("gv_init_request: NULL cp"));
788
789	bp = g_new_bio();
790	if (bp == NULL) {
791		G_VINUM_DEBUG(0, "subdisk '%s' init: write failed at offset %jd"
792		    " (drive offset %jd); out of memory", s->name,
793		    (intmax_t)s->initialized, (intmax_t)start);
794		return; /* XXX: Error codes. */
795	}
796	bp->bio_cmd = BIO_WRITE;
797	bp->bio_data = data;
798	bp->bio_done = gv_done;
799	bp->bio_error = 0;
800	bp->bio_length = length;
801	bp->bio_cflags |= GV_BIO_INIT;
802	bp->bio_offset = start;
803	bp->bio_caller1 = s;
804
805	/* Then ofcourse, we have to clone it. */
806	cbp = g_clone_bio(bp);
807	if (cbp == NULL) {
808		G_VINUM_DEBUG(0, "subdisk '%s' init: write failed at offset %jd"
809		    " (drive offset %jd); out of memory", s->name,
810		    (intmax_t)s->initialized, (intmax_t)start);
811		return; /* XXX: Error codes. */
812	}
813	cbp->bio_done = gv_done;
814	cbp->bio_caller1 = s;
815	/* Send it off to the consumer. */
816	g_io_request(cbp, cp);
817}
818
/*
 * Handle a finished initialization BIO: step through the subdisk in
 * 'length'-sized chunks until the whole subdisk has been written, then
 * update its state and drop the consumer reference.
 */
void
gv_init_complete(struct gv_plex *p, struct bio *bp)
{
	struct gv_softc *sc;
	struct gv_drive *d;
	struct g_consumer *cp;
	struct gv_sd *s;
	off_t start, length;
	caddr_t data;
	int error;

	/* Snapshot the bio fields before the bio is destroyed below. */
	s = bp->bio_caller1;
	start = bp->bio_offset;
	length = bp->bio_length;
	error = bp->bio_error;
	data = bp->bio_data;

	KASSERT(s != NULL, ("gv_init_complete: NULL s"));
	d = s->drive_sc;
	KASSERT(d != NULL, ("gv_init_complete: NULL d"));
	cp = d->consumer;
	KASSERT(cp != NULL, ("gv_init_complete: NULL cp"));
	sc = p->vinumconf;
	KASSERT(sc != NULL, ("gv_init_complete: NULL sc"));

	g_destroy_bio(bp);

	/*
	 * First we need to find out if it was okay, and abort if it's not.
	 * Then we need to free previous buffers, find out the correct subdisk,
	 * as well as getting the correct starting point and length of the BIO.
	 */
	if (start >= s->drive_offset + s->size) {
		/* Free the data we initialized. */
		if (data != NULL)
			g_free(data);
		g_topology_assert_not();
		g_topology_lock();
		g_access(cp, 0, -1, 0);
		g_topology_unlock();
		if (error) {
			gv_set_sd_state(s, GV_SD_STALE, GV_SETSTATE_FORCE |
			    GV_SETSTATE_CONFIG);
		} else {
			gv_set_sd_state(s, GV_SD_UP, GV_SETSTATE_CONFIG);
			s->initialized = 0;
			gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0);
			G_VINUM_DEBUG(1, "subdisk '%s' init: finished "
			    "successfully", s->name);
		}
		return;
	}
	/* Not done yet: write the next chunk, reusing the same buffer. */
	s->initialized += length;
	start += length;
	gv_init_request(s, start, data, length);
}
878
879/*
880 * Create a new bio struct for the next parity rebuild. Used both by internal
881 * rebuild of degraded plexes as well as user initiated rebuilds/checks.
882 */
883void
884gv_parity_request(struct gv_plex *p, int flags, off_t offset)
885{
886	struct gv_softc *sc;
887	struct bio *bp;
888
889	KASSERT(p != NULL, ("gv_parity_request: NULL p"));
890	sc = p->vinumconf;
891	KASSERT(sc != NULL, ("gv_parity_request: NULL sc"));
892
893	bp = g_new_bio();
894	if (bp == NULL) {
895		G_VINUM_DEBUG(0, "rebuild of %s failed creating bio: "
896		    "out of memory", p->name);
897		return;
898	}
899
900	bp->bio_cmd = BIO_WRITE;
901	bp->bio_done = gv_done;
902	bp->bio_error = 0;
903	bp->bio_length = p->stripesize;
904	bp->bio_caller1 = p;
905
906	/*
907	 * Check if it's a rebuild of a degraded plex or a user request of
908	 * parity rebuild.
909	 */
910	if (flags & GV_BIO_REBUILD)
911		bp->bio_data = g_malloc(GV_DFLT_SYNCSIZE, M_WAITOK);
912	else if (flags & GV_BIO_CHECK)
913		bp->bio_data = g_malloc(p->stripesize, M_WAITOK | M_ZERO);
914	else {
915		G_VINUM_DEBUG(0, "invalid flags given in rebuild");
916		return;
917	}
918
919	bp->bio_cflags = flags;
920	bp->bio_cflags |= GV_BIO_MALLOC;
921
922	/* We still have more parity to build. */
923	bp->bio_offset = offset;
924	mtx_lock(&sc->queue_mtx);
925	bioq_disksort(sc->bqueue, bp);
926	mtx_unlock(&sc->queue_mtx);
927	//gv_plex_start(p, bp); /* Send it down to the plex. */
928}
929
/*
 * Handle a finished parity write: log mismatches, abort on fatal errors,
 * and otherwise advance p->synced and issue the request for the next
 * stripe until the whole plex has been covered.
 */
void
gv_parity_complete(struct gv_plex *p, struct bio *bp)
{
	struct gv_softc *sc;
	int error, flags;

	error = bp->bio_error;
	flags = bp->bio_cflags;
	/* The next request allocates its own buffer. */
	flags &= ~GV_BIO_MALLOC;

	sc = p->vinumconf;
	KASSERT(sc != NULL, ("gv_parity_complete: NULL sc"));

	/* Clean up what we allocated. */
	if (bp->bio_cflags & GV_BIO_MALLOC)
		g_free(bp->bio_data);
	g_destroy_bio(bp);

	/* EAGAIN is how the check path reports a parity mismatch. */
	if (error == EAGAIN) {
		G_VINUM_DEBUG(0, "parity incorrect at offset 0x%jx",
		    (intmax_t)p->synced);
	}

	/* Any error is fatal, except EAGAIN when we're rebuilding. */
	if (error && !(error == EAGAIN && (flags & GV_BIO_PARITY))) {
		/* Make sure we don't have the lock. */
		g_topology_assert_not();
		g_topology_lock();
		gv_access(p->vol_sc->provider, -1, -1, 0);
		g_topology_unlock();
		G_VINUM_DEBUG(0, "parity check on %s failed at 0x%jx "
		    "errno %d", p->name, (intmax_t)p->synced, error);
		return;
	} else {
		p->synced += p->stripesize;
	}

	if (p->synced >= p->size) {
		/* Make sure we don't have the lock. */
		g_topology_assert_not();
		g_topology_lock();
		gv_access(p->vol_sc->provider, -1, -1, 0);
		g_topology_unlock();
		/* We're finished. */
		G_VINUM_DEBUG(1, "parity operation on %s finished", p->name);
		p->synced = 0;
		gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0);
		return;
	}

	/* Send down next. It will determine if we need to itself. */
	gv_parity_request(p, flags, p->synced);
}
986
/*
 * Handle a finished plex rebuild bio: on error give up and flush delayed
 * requests; otherwise advance by one full stripe row and either finish
 * (bringing subdisks up and saving the config) or request the next row.
 */
void
gv_rebuild_complete(struct gv_plex *p, struct bio *bp)
{
	struct gv_softc *sc;
	struct gv_sd *s;
	int error, flags;
	off_t offset;

	error = bp->bio_error;
	flags = bp->bio_cflags;
	offset = bp->bio_offset;
	/* The next request allocates its own buffer. */
	flags &= ~GV_BIO_MALLOC;
	sc = p->vinumconf;
	KASSERT(sc != NULL, ("gv_rebuild_complete: NULL sc"));

	/* Clean up what we allocated. */
	if (bp->bio_cflags & GV_BIO_MALLOC)
		g_free(bp->bio_data);
	g_destroy_bio(bp);

	if (error) {
		g_topology_assert_not();
		g_topology_lock();
		gv_access(p->vol_sc->provider, -1, -1, 0);
		g_topology_unlock();

		G_VINUM_DEBUG(0, "rebuild of %s failed at offset %jd errno: %d",
		    p->name, (intmax_t)offset, error);
		p->flags &= ~GV_PLEX_REBUILDING;
		p->synced = 0;
		gv_plex_flush(p); /* Flush out remaining rebuild BIOs. */
		return;
	}

	/* Advance one stripe row: one stripe per data subdisk. */
	offset += (p->stripesize * (gv_sdcount(p, 1) - 1));
	if (offset >= p->size) {
		/* We're finished. */
		g_topology_assert_not();
		g_topology_lock();
		gv_access(p->vol_sc->provider, -1, -1, 0);
		g_topology_unlock();

		G_VINUM_DEBUG(1, "rebuild of %s finished", p->name);
		gv_save_config(p->vinumconf);
		p->flags &= ~GV_PLEX_REBUILDING;
		p->synced = 0;
		/* Try to up all subdisks. */
		LIST_FOREACH(s, &p->subdisks, in_plex)
			gv_update_sd_state(s);
		gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0);
		gv_plex_flush(p); /* Flush out remaining rebuild BIOs. */
		return;
	}

	/* Send down next. It will determine if we need to itself. */
	gv_parity_request(p, flags, offset);
}
1047